diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 313b776b0824..849e4606834e 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -1 +1 @@
-Thanks for contributing to TVM!   Please refer to guideline https://docs.tvm.ai/contribute/ for useful information and tips. After the pull request is submitted, please request code reviews from others in the community.
+Thanks for contributing to TVM!   Please refer to guideline https://docs.tvm.ai/contribute/ for useful information and tips. After the pull request is submitted, please request code reviews from [Reviewers](https://github.com/dmlc/tvm/blob/master/CONTRIBUTORS.md#reviewers).
diff --git a/.gitignore b/.gitignore
index 3c968eb3ed47..04dad2039860 100644
--- a/.gitignore
+++ b/.gitignore
@@ -91,10 +91,8 @@ ENV/
 *~
 *.pyc
 *~
-build
 config.mk
 config.cmake
-build_*
 Win32
 *.dir
 perf
@@ -179,15 +177,39 @@ perf
 *.h5
 synset.txt
 cat.jpg
+cat.png
 docs.tgz
 cat.png
 *.mlmodel
+tvm_u.*
+tvm_t.*
 # Mac OS X
 .DS_Store
-build*
 
 # Jetbrain
 .idea
+.ipython
+.jupyter
+.nv
+.pylint.d
+.python_history
+.pytest_cache
+.local
 
 # tmp file
 .nfs*
+
+# keys
+*.pem
+*.p12
+*.pfx
+*.cer
+*.crt
+*.der
+
+# patch sentinel
+patched.txt
+
+# Python type checking
+.mypy_cache/
+.pyre/
diff --git a/.gitmodules b/.gitmodules
index 3f0b222a86c6..8011ec12d24b 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,9 +1,9 @@
 [submodule "dmlc-core"]
-	path = dmlc-core
+	path = 3rdparty/dmlc-core
 	url = https://github.com/dmlc/dmlc-core
 [submodule "HalideIR"]
-	path = HalideIR
+	path = 3rdparty/HalideIR
 	url = https://github.com/dmlc/HalideIR
 [submodule "dlpack"]
-	path = dlpack
+	path = 3rdparty/dlpack
 	url = https://github.com/dmlc/dlpack
diff --git a/3rdparty/HalideIR b/3rdparty/HalideIR
new file mode 160000
index 000000000000..a08e26e5a97f
--- /dev/null
+++ b/3rdparty/HalideIR
@@ -0,0 +1 @@
+Subproject commit a08e26e5a97f4ef4d566a42f6c78704b3f9c7b8a
diff --git a/3rdparty/compiler-rt/builtin_fp16.h b/3rdparty/compiler-rt/builtin_fp16.h
new file mode 100644
index 000000000000..1657d2830119
--- /dev/null
+++ b/3rdparty/compiler-rt/builtin_fp16.h
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2009-2015 by llvm/compiler-rt contributors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+
+ * Copyright (c) 2018 by Contributors
+ * \file builtin_fp16.cc
+ * \brief Functions for conversion between fp32 and fp16, adopted from compiler-rt.
+ */
+
+#include <cstdint>
+
+static inline uint32_t __clz(uint32_t x) {
+  // count leading zeros
+  int n = 32;
+  uint32_t y;
+
+  y = x >>16; if (y) { n = n -16; x = y; }
+  y = x >> 8; if (y) { n = n - 8; x = y; }
+  y = x >> 4; if (y) { n = n - 4; x = y; }
+  y = x >> 2; if (y) { n = n - 2; x = y; }
+  y = x >> 1; if (y) return n - 2;
+  return n - x;
+}
+
+template <typename SRC_T, typename SRC_REP_T, int SRC_SIG_BITS,
+          typename DST_T, typename DST_REP_T, int DST_SIG_BITS>
+static inline DST_T __truncXfYf2__(SRC_T a) {
+  // Various constants whose values follow from the type parameters.
+  // Any reasonable optimizer will fold and propagate all of these.
+  const int srcBits = sizeof(SRC_T) * 8;
+  const int srcExpBits = srcBits - SRC_SIG_BITS - 1;
+  const int srcInfExp = (1 << srcExpBits) - 1;
+  const int srcExpBias = srcInfExp >> 1;
+
+  const SRC_REP_T srcMinNormal = SRC_REP_T(1) << SRC_SIG_BITS;
+  const SRC_REP_T srcSignificandMask = srcMinNormal - 1;
+  const SRC_REP_T srcInfinity = (SRC_REP_T)srcInfExp << SRC_SIG_BITS;
+  const SRC_REP_T srcSignMask = SRC_REP_T(1) << (SRC_SIG_BITS + srcExpBits);
+  const SRC_REP_T srcAbsMask = srcSignMask - 1;
+  const SRC_REP_T roundMask = (SRC_REP_T(1) << (SRC_SIG_BITS - DST_SIG_BITS)) - 1;
+  const SRC_REP_T halfway = SRC_REP_T(1) << (SRC_SIG_BITS - DST_SIG_BITS - 1);
+  const SRC_REP_T srcQNaN = SRC_REP_T(1) << (SRC_SIG_BITS - 1);
+  const SRC_REP_T srcNaNCode = srcQNaN - 1;
+
+  const int dstBits = sizeof(DST_T) * 8;
+  const int dstExpBits = dstBits - DST_SIG_BITS - 1;
+  const int dstInfExp = (1 << dstExpBits) - 1;
+  const int dstExpBias = dstInfExp >> 1;
+
+  const int underflowExponent = srcExpBias + 1 - dstExpBias;
+  const int overflowExponent = srcExpBias + dstInfExp - dstExpBias;
+  const SRC_REP_T underflow = (SRC_REP_T)underflowExponent << SRC_SIG_BITS;
+  const SRC_REP_T overflow = (SRC_REP_T)overflowExponent << SRC_SIG_BITS;
+
+  const DST_REP_T dstQNaN = DST_REP_T(1) << (DST_SIG_BITS - 1);
+  const DST_REP_T dstNaNCode = dstQNaN - 1;
+
+  // Break a into a sign and representation of the absolute value
+  const union { SRC_T f; SRC_REP_T i; } src_rep = {.f = a};
+  const SRC_REP_T aRep = src_rep.i;
+  const SRC_REP_T aAbs = aRep & srcAbsMask;
+  const SRC_REP_T sign = aRep & srcSignMask;
+  DST_REP_T absResult;
+
+  if (aAbs - underflow < aAbs - overflow) {
+    // The exponent of a is within the range of normal numbers in the
+    // destination format.  We can convert by simply right-shifting with
+    // rounding and adjusting the exponent.
+    absResult = aAbs >> (SRC_SIG_BITS - DST_SIG_BITS);
+    absResult -= (DST_REP_T)(srcExpBias - dstExpBias) << DST_SIG_BITS;
+
+    const SRC_REP_T roundBits = aAbs & roundMask;
+    // Round to nearest
+    if (roundBits > halfway)
+      absResult++;
+      // Ties to even
+    else if (roundBits == halfway)
+      absResult += absResult & 1;
+  }
+  else if (aAbs > srcInfinity) {
+    // a is NaN.
+    // Conjure the result by beginning with infinity, setting the qNaN
+    // bit and inserting the (truncated) trailing NaN field.
+    absResult = (DST_REP_T)dstInfExp << DST_SIG_BITS;
+    absResult |= dstQNaN;
+    absResult |= ((aAbs & srcNaNCode) >> (SRC_SIG_BITS - DST_SIG_BITS)) & dstNaNCode;
+  }
+  else if (aAbs >= overflow) {
+    // a overflows to infinity.
+    absResult = (DST_REP_T)dstInfExp << DST_SIG_BITS;
+  }
+  else {
+    // a underflows on conversion to the destination type or is an exact
+    // zero.  The result may be a denormal or zero.  Extract the exponent
+    // to get the shift amount for the denormalization.
+    const int aExp = aAbs >> SRC_SIG_BITS;
+    const int shift = srcExpBias - dstExpBias - aExp + 1;
+
+    const SRC_REP_T significand = (aRep & srcSignificandMask) | srcMinNormal;
+
+    // Right shift by the denormalization amount with sticky.
+    if (shift > SRC_SIG_BITS) {
+      absResult = 0;
+    } else {
+      const bool sticky = significand << (srcBits - shift);
+      SRC_REP_T denormalizedSignificand = significand >> shift | sticky;
+      absResult = denormalizedSignificand >> (SRC_SIG_BITS - DST_SIG_BITS);
+      const SRC_REP_T roundBits = denormalizedSignificand & roundMask;
+      // Round to nearest
+      if (roundBits > halfway)
+        absResult++;
+        // Ties to even
+      else if (roundBits == halfway)
+        absResult += absResult & 1;
+    }
+  }
+
+  // Apply the signbit to (DST_T)abs(a).
+  const DST_REP_T result = absResult | sign >> (srcBits - dstBits);
+  const union { DST_T f; DST_REP_T i; } dst_rep = {.i = result};
+  return dst_rep.f;
+}
+
+template<typename SRC_T, typename SRC_REP_T, int SRC_SIG_BITS,
+         typename DST_T, typename DST_REP_T, int DST_SIG_BITS>
+static inline DST_T __extendXfYf2__(SRC_T a) {
+  // Various constants whose values follow from the type parameters.
+  // Any reasonable optimizer will fold and propagate all of these.
+  const int srcBits = sizeof(SRC_T) * 8;
+  const int srcExpBits = srcBits - SRC_SIG_BITS - 1;
+  const int srcInfExp = (1 << srcExpBits) - 1;
+  const int srcExpBias = srcInfExp >> 1;
+
+  const SRC_REP_T srcMinNormal = SRC_REP_T(1) << SRC_SIG_BITS;
+  const SRC_REP_T srcInfinity = (SRC_REP_T)srcInfExp << SRC_SIG_BITS;
+  const SRC_REP_T srcSignMask = SRC_REP_T(1) << (SRC_SIG_BITS + srcExpBits);
+  const SRC_REP_T srcAbsMask = srcSignMask - 1;
+  const SRC_REP_T srcQNaN = SRC_REP_T(1) << (SRC_SIG_BITS - 1);
+  const SRC_REP_T srcNaNCode = srcQNaN - 1;
+
+  const int dstBits = sizeof(DST_T)*8;
+  const int dstExpBits = dstBits - DST_SIG_BITS - 1;
+  const int dstInfExp = (1 << dstExpBits) - 1;
+  const int dstExpBias = dstInfExp >> 1;
+
+  const DST_REP_T dstMinNormal = DST_REP_T(1) << DST_SIG_BITS;
+
+  // Break a into a sign and representation of the absolute value
+  const union { SRC_T f; SRC_REP_T i; } src_rep = {.f = a};
+  const SRC_REP_T aRep = src_rep.i;
+  const SRC_REP_T aAbs = aRep & srcAbsMask;
+  const SRC_REP_T sign = aRep & srcSignMask;
+  DST_REP_T absResult;
+
+  // If sizeof(SRC_REP_T) < sizeof(int), the subtraction result is promoted
+  // to (signed) int.  To avoid that, explicitly cast to SRC_REP_T.
+  if ((SRC_REP_T)(aAbs - srcMinNormal) < srcInfinity - srcMinNormal) {
+    // a is a normal number.
+    // Extend to the destination type by shifting the significand and
+    // exponent into the proper position and rebiasing the exponent.
+    absResult = (DST_REP_T)aAbs << (DST_SIG_BITS - SRC_SIG_BITS);
+    absResult += (DST_REP_T)(dstExpBias - srcExpBias) << DST_SIG_BITS;
+  }
+
+  else if (aAbs >= srcInfinity) {
+    // a is NaN or infinity.
+    // Conjure the result by beginning with infinity, then setting the qNaN
+    // bit (if needed) and right-aligning the rest of the trailing NaN
+    // payload field.
+    absResult = (DST_REP_T)dstInfExp << DST_SIG_BITS;
+    absResult |= (DST_REP_T)(aAbs & srcQNaN) << (DST_SIG_BITS - SRC_SIG_BITS);
+    absResult |= (DST_REP_T)(aAbs & srcNaNCode) << (DST_SIG_BITS - SRC_SIG_BITS);
+  }
+  else if (aAbs) {
+    // a is denormal.
+    // renormalize the significand and clear the leading bit, then insert
+    // the correct adjusted exponent in the destination type.
+    const int scale = __clz(aAbs) - __clz(srcMinNormal);
+    absResult = (DST_REP_T)aAbs << (DST_SIG_BITS - SRC_SIG_BITS + scale);
+    absResult ^= dstMinNormal;
+    const int resultExponent = dstExpBias - srcExpBias - scale + 1;
+    absResult |= (DST_REP_T)resultExponent << DST_SIG_BITS;
+  }
+  else {
+    // a is zero.
+    absResult = 0;
+  }
+
+  // Apply the signbit to (DST_T)abs(a).
+  const DST_REP_T result = absResult | (DST_REP_T)sign << (dstBits - srcBits);
+  const union { DST_T f; DST_REP_T i; } dst_rep = {.i = result};
+  return dst_rep.f;
+}
diff --git a/3rdparty/dlpack b/3rdparty/dlpack
new file mode 160000
index 000000000000..bee4d1dd8dc1
--- /dev/null
+++ b/3rdparty/dlpack
@@ -0,0 +1 @@
+Subproject commit bee4d1dd8dc1ee4a1fd8fa6a96476c2f8b7492a3
diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core
new file mode 160000
index 000000000000..519d013a213c
--- /dev/null
+++ b/3rdparty/dmlc-core
@@ -0,0 +1 @@
+Subproject commit 519d013a213c0c447a971f51219473ef564d2348
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 39776d53d1f1..363b2056a87a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -29,8 +29,10 @@ tvm_option(USE_ROCM "Build with ROCM" OFF)
 tvm_option(ROCM_PATH "The path to rocm" /opt/rocm)
 tvm_option(USE_RPC "Build with RPC" ON)
 tvm_option(USE_LLVM "Build with LLVM, can be set to specific llvm-config path" OFF)
+tvm_option(USE_STACKVM_RUNTIME "Include stackvm into the runtime" OFF)
 tvm_option(USE_GRAPH_RUNTIME "Build with tiny graph runtime" ON)
 tvm_option(USE_GRAPH_RUNTIME_DEBUG "Build with tiny graph runtime debug mode" OFF)
+tvm_option(USE_SGX "Build with SGX" OFF)
 tvm_option(USE_RTTI "Build with RTTI" ON)
 tvm_option(USE_MSVC_MT "Build with MT" OFF)
 tvm_option(INSTALL_DEV "Install compiler infrastructure" OFF)
@@ -45,21 +47,25 @@ tvm_option(USE_ROCBLAS "Build with ROCM:RoCBLAS" OFF)
 tvm_option(USE_SORT "Build with sort support" OFF)
 tvm_option(USE_NNPACK "Build with nnpack support" OFF)
 tvm_option(USE_RANDOM "Build with random support" OFF)
+tvm_option(USE_ANTLR "Build with ANTLR for Relay parsing" OFF)
 
 # include directories
 include_directories("include")
-include_directories("dlpack/include")
-include_directories("dmlc-core/include")
+include_directories("3rdparty/dlpack/include")
+include_directories("3rdparty/dmlc-core/include")
+include_directories("3rdparty/compiler-rt")
 
 # initial variables
 set(TVM_LINKER_LIBS "")
 set(TVM_RUNTIME_LINKER_LIBS "")
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
 # Generic compilation options
 if(MSVC)
   add_definitions(-DWIN32_LEAN_AND_MEAN)
   add_definitions(-D_CRT_SECURE_NO_WARNINGS)
   add_definitions(-D_SCL_SECURE_NO_WARNINGS)
+  add_definitions(-D_ENABLE_EXTENDED_ALIGNED_STORAGE)
   add_definitions(-DHalide_SHARED)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
@@ -76,8 +82,12 @@ if(MSVC)
 else(MSVC)
   include(CheckCXXCompilerFlag)
   check_cxx_compiler_flag("-std=c++11"    SUPPORT_CXX11)
-  set(CMAKE_C_FLAGS "-O2 -Wall -fPIC ${CMAKE_C_FLAGS}")
-  set(CMAKE_CXX_FLAGS "-O2 -Wall -fPIC -std=c++11 ${CMAKE_CXX_FLAGS}")
+  if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
+    add_compile_options(-Wall -fPIC -std=c++11)
+  else()
+    set(CMAKE_C_FLAGS "-O2 -Wall -fPIC ${CMAKE_C_FLAGS}")
+    set(CMAKE_CXX_FLAGS "-O2 -Wall -fPIC -std=c++11 ${CMAKE_CXX_FLAGS}")
+  endif ()
   if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND
       CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
     set(CMAKE_CXX_FLAGS "-faligned-new ${CMAKE_CXX_FLAGS}")
@@ -85,8 +95,8 @@ else(MSVC)
 endif(MSVC)
 
 # add source group
-FILE(GLOB_RECURSE GROUP_SOURCE "src/*.cc" "HalideIR/src/*.cpp" "nnvm/src/*.cc")
-FILE(GLOB_RECURSE GROUP_INCLUDE "src/*.h" "include/*.h" "HalideIR/src/*.h"
+FILE(GLOB_RECURSE GROUP_SOURCE "src/*.cc" "3rdparty/HalideIR/src/*.cpp" "nnvm/src/*.cc")
+FILE(GLOB_RECURSE GROUP_INCLUDE "src/*.h" "include/*.h" "3rdparty/HalideIR/src/*.h"
                                 "nnvm/src/*.h" "nnvm/include/*.h")
 assign_source_group("Source" ${GROUP_SOURCE})
 assign_source_group("Include" ${GROUP_INCLUDE})
@@ -97,13 +107,18 @@ file(GLOB COMPILER_SRCS
     src/arithmetic/*.cc
     src/autotvm/*.cc
     src/codegen/*.cc
-    src/codegen/stack_vm/*.cc
     src/lang/*.cc
     src/pass/*.cc
     src/op/*.cc
     src/schedule/*.cc
     )
 
+file(GLOB_RECURSE RELAY_SRCS
+    src/relay/*.cc
+    )
+list(APPEND COMPILER_SRCS ${RELAY_SRCS})
+
+
 if(NOT MSVC)
   file(GLOB COMPILER_VERILOG_SRCS src/codegen/verilog/*.cc)
   list(APPEND COMPILER_SRCS ${COMPILER_VERILOG_SRCS})
@@ -120,7 +135,7 @@ file(GLOB_RECURSE NNVM_COMPILER_SRCS
 file(GLOB TOPI_SRCS
     topi/src/*.cc
 )
-file(GLOB_RECURSE HALIDEIR_SRCS HalideIR/src/*.cpp)
+file(GLOB_RECURSE HALIDEIR_SRCS 3rdparty/HalideIR/src/*.cpp)
 list(APPEND COMPILER_SRCS ${HALIDEIR_SRCS})
 file(GLOB RUNTIME_SRCS src/runtime/*.cc)
 
@@ -135,12 +150,25 @@ if(USE_RPC)
   list(APPEND RUNTIME_SRCS ${RUNTIME_RPC_SRCS})
 endif(USE_RPC)
 
+file(GLOB STACKVM_RUNTIME_SRCS src/runtime/stackvm/*.cc)
+file(GLOB STACKVM_CODEGEN_SRCS src/codegen/stackvm/*.cc)
+list(APPEND COMPILER_SRCS ${STACKVM_CODEGEN_SRCS})
+if(USE_STACKVM_RUNTIME)
+  message(STATUS "Build with stackvm support in runtime...")
+  list(APPEND RUNTIME_SRCS ${STACKVM_RUNTIME_SRCS})
+else()
+  list(APPEND COMPILER_SRCS ${STACKVM_RUNTIME_SRCS})
+endif(USE_STACKVM_RUNTIME)
+
 if(USE_GRAPH_RUNTIME)
   message(STATUS "Build with Graph runtime support...")
   file(GLOB RUNTIME_GRAPH_SRCS src/runtime/graph/*.cc)
   list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_SRCS})
 
   if(USE_GRAPH_RUNTIME_DEBUG)
+    message(STATUS "Build with Graph runtime debug support...")
+    file(GLOB RUNTIME_GRAPH_DEBUG_SRCS src/runtime/graph/debug/*.cc)
+    list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_DEBUG_SRCS})
     set_source_files_properties(${RUNTIME_GRAPH_SRCS}
       PROPERTIES COMPILE_DEFINITIONS "TVM_GRAPH_RUNTIME_DEBUG")
   endif(USE_GRAPH_RUNTIME_DEBUG)
@@ -154,7 +182,9 @@ include(cmake/modules/OpenGL.cmake)
 include(cmake/modules/Vulkan.cmake)
 include(cmake/modules/Metal.cmake)
 include(cmake/modules/ROCM.cmake)
+include(cmake/modules/SGX.cmake)
 include(cmake/modules/LLVM.cmake)
+include(cmake/modules/ANTLR.cmake)
 include(cmake/modules/contrib/BLAS.cmake)
 include(cmake/modules/contrib/Random.cmake)
 include(cmake/modules/contrib/Sort.cmake)
@@ -163,6 +193,11 @@ include(cmake/modules/contrib/NNPack.cmake)
 add_library(tvm SHARED ${COMPILER_SRCS} ${RUNTIME_SRCS})
 add_library(tvm_topi SHARED ${TOPI_SRCS})
 add_library(tvm_runtime SHARED ${RUNTIME_SRCS})
+if(NOT USE_SGX STREQUAL "OFF")
+  add_dependencies(tvm sgx_edl)
+  add_dependencies(tvm_runtime sgx_edl tvm_t)
+  install(TARGETS tvm_t ARCHIVE DESTINATION lib${LIB_SUFFIX})
+endif()
 add_library(nnvm_compiler SHARED ${NNVM_COMPILER_SRCS})
 
 target_link_libraries(tvm ${TVM_LINKER_LIBS} ${TVM_RUNTIME_LINKER_LIBS})
@@ -173,7 +208,7 @@ target_link_libraries(nnvm_compiler tvm)
 # Related headers
 target_include_directories(
   tvm
-  PUBLIC "HalideIR/src"
+  PUBLIC "3rdparty/HalideIR/src"
   PUBLIC "topi/include")
 target_include_directories(
   tvm_topi
@@ -186,7 +221,7 @@ target_include_directories(
 # Tests
 set(TEST_EXECS "")
 file(GLOB TEST_SRCS tests/cpp/*.cc)
-find_library(GTEST_LIB gtest)
+find_library(GTEST_LIB gtest "$ENV{GTEST_LIB}")
 
 if(GTEST_LIB)
   foreach(__srcpath ${TEST_SRCS})
@@ -223,12 +258,12 @@ if (INSTALL_DEV)
     PATTERN "*.h"
   )
   install(
-    DIRECTORY "HalideIR/src/." DESTINATION "include/HalideIR"
+    DIRECTORY "3rdparty/HalideIR/src/." DESTINATION "include/HalideIR"
     FILES_MATCHING
     PATTERN "*.h"
   )
   install(
-    DIRECTORY "dlpack/include/." DESTINATION "include"
+    DIRECTORY "3rdparty/dlpack/include/." DESTINATION "include"
     FILES_MATCHING
     PATTERN "*.h"
     )
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 6e3cf55b94b0..23d22686705b 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -5,38 +5,64 @@ contribute to, and influence the direction of the project. We actively invite co
 
 See the [community structure document](http://docs.tvm.ai/contribute/community.html) for the explanation of community structure and contribution guidelines.
 
+
 ## Committers
-- [Tianqi Chen](https://github.com/tqchen) (PMC)
-- [Thierry Moreau](http://homes.cs.washington.edu/~moreau/)
-- [Ziheng Jiang](https://github.com/ZihengJiang)
-- [Haichen Shen](http://homes.cs.washington.edu/~haichen/)
-- [Yizhi Liu](https://github.com/yzhliu)
-
-## Code Owners
-- [Aditya Atluri](https://github.com/adityaatluri) ROCM
-- [Leyuan Wang](https://github.com/Laurawly) TOPI
-- [Yuwei Hu](https://github.com/Huyuwei) TOPI
-- [Zhixun Tan](https://github.com/phisiart) OpenGL/WebGL backend
-- [Nick Hynes](https://github.com/nhynes) SGX and secured computing
+
+We add tag along with committer name to show areas that they are familiar with.
+We do encourage everyone to work anything they are interested in.
+
+- [Aditya Atluri](https://github.com/adityaatluri): @adityaatluri - rocm
+- [Tianqi Chen](https://github.com/tqchen) (PMC): @tqchen - topi, compiler, relay, docs
+- [Yuwei Hu](https://github.com/Huyuwei): @Huyuwei - topi, frontends
+- [Nick Hynes](https://github.com/nhynes): @nhynes: - sgx, rust
+- [Ziheng Jiang](https://github.com/ZihengJiang) (PMC): @ZihengJiang - relay, compiler
+- [Yizhi Liu](https://github.com/yzhliu) (PMC): @yzhliu - jvm, topi, relay
+- [Masahiro Masuda](https://github.com/masahi): @masahi - topi, relay
+- [Thierry Moreau](https://github.com/tmoreau89) (PMC): @tmoreau89 - vta
+- [Siva](https://github.com/srkreddy1238): @srkreddy1238 - frontends, golang
+- [Haichen Shen](https://github.com/icemelon9) (PMC): @icemelon9 - relay, topi
+- [Zhixun Tan](https://github.com/phisiart): @phisiart - opengl, web
+- [Leyuan Wang](https://github.com/Laurawly): @Laurawly: - topi
+- [Eddie Yan](https://github.com/eqy): @eqy - runtime, autotvm, rpc, topi
+- [Lianmin Zheng](https://github.com/merrymercy): @merrymercy - autotvm, topi
 
 ## Reviewers
-- [Masahiro Masuda](https://github.com/masahi)
-- [Kazutaka Morita](https://github.com/kazum)
-- [Pariksheet Pinjari](https://github.com/PariksheetPinjari909)
-- [Siva](https://github.com/srkreddy1238)
-- [Alex Weaver](https://github.com/alex-weaver)
-- [Eddie Yan](https://github.com/eqy)
-- [Joshua Z. Zhang](https://github.com/zhreshold)
-- [Lianmin Zheng](https://github.com/merrymercy)
+
+- [Aditya Atluri](https://github.com/adityaatluri): @adityaatluri
+- [Tianqi Chen](https://github.com/tqchen): @tqchen
+- [Liangfu Chen](https://github.com/liangfu): @liangfu
+- [Zhi Chen](https://github.com/zhiics): @zhiics
+- [Nick Hynes](https://github.com/nhynes): @nhynes
+- [Yuwei Hu](https://github.com/Huyuwei): @Huyuwei
+- [Yizhi Liu](https://github.com/yzhliu) : @yzhliu
+- [Zhixun Tan](https://github.com/phisiart): @phisiart
+- [Zhi Chen](https://github.com/zhiics): @zhiics
+- [Xiaoqiang Dan](https://github.com/xqdan): @xqdan
+- [Ziheng Jiang](https://github.com/ZihengJiang): @ZihengJiang
+- [Wuwei Lin](https://github.com/vinx13): @vinx13
+- [Masahiro Masuda](https://github.com/masahi): @masahi
+- [Sergey Mironov](https://github.com/grwlf): @grwlf
+- [Thierry Moreau](https://github.com/tmoreau89): @tmoreau89
+- [Kazutaka Morita](https://github.com/kazum): @kazum
+- [Tatsuya Nishiyama](https://github.com/nishi-t): @nishi-t
+- [Pariksheet Pinjari](https://github.com/PariksheetPinjari909): @PariksheetPinjari909
+- [Jared Roesch](https://github.com/jroesch): @jroesch
+- [Siva](https://github.com/srkreddy1238): @srkreddy1238
+- [Siju Samuel](https://github.com/siju-samuel): @siju-samuel
+- [Haichen Shen](https://github.com/icemelon9): @icemelon9
+- [Alex Weaver](https://github.com/alex-weaver): @alex-weaver
+- [Yao Wang](https://github.com/kevinthesun): @kevinthesun
+- [Leyuan Wang](https://github.com/Laurawly): @Laurawly
+- [Jian Weng](https://github.com/were): @were
+- [Eddie Yan](https://github.com/eqy): @eqy
+- [Joshua Z. Zhang](https://github.com/zhreshold): @zhreshold
+- [Lianmin Zheng](https://github.com/merrymercy): @merrymercy
+- [Andrew Tulloch](https://github.com/ajtulloch): @ajtulloch
 
 ## List of Contributors
 - [Full List of Contributors](https://github.com/dmlc/tvm/graphs/contributors)
   - To contributors: please add your name to the list.
 - [Qiao Zhang](https://github.com/zhangqiaorjc)
-- [Jian Weng](https://github.com/were)
-- [Masahiro Masuda](https://github.com/masahi)
 - [Haolong Zhang](https://github.com/haolongzhangm)
 - [Cody Hao Yu](https://github.com/comaniac)
 - [Chris Nuernberger](https://github.com/cnuernber)
-- [Tatsuya Nishiyama](https://github.com/nishi-t)
-- [Kazutaka Morita](https://github.com/kazum)
diff --git a/HalideIR b/HalideIR
deleted file mode 160000
index a5a80bdc8232..000000000000
--- a/HalideIR
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit a5a80bdc8232c9dbfe508bb5c46e8f58cdf7ec20
diff --git a/Jenkinsfile b/Jenkinsfile
index bec0d2be5df8..f0c11426a078 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -96,6 +96,9 @@ stage('Build') {
            echo set\\(USE_RPC ON\\) >> config.cmake
            echo set\\(USE_SORT ON\\) >> config.cmake
            echo set\\(USE_GRAPH_RUNTIME ON\\) >> config.cmake
+           echo set\\(USE_STACKVM_RUNTIME ON\\) >> config.cmake
+           echo set\\(USE_GRAPH_RUNTIME_DEBUG ON\\) >> config.cmake
+           echo set\\(USE_ANTLR ON\\) >> config.cmake
            echo set\\(USE_BLAS openblas\\) >> config.cmake
            echo set\\(CMAKE_CXX_COMPILER g++\\) >> config.cmake
            echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake
@@ -110,6 +113,7 @@ stage('Build') {
            echo set\\(USE_OPENCL ON\\) >> config.cmake
            echo set\\(USE_ROCM ON\\) >> config.cmake
            echo set\\(USE_VULKAN ON\\) >> config.cmake
+           echo set\\(USE_GRAPH_RUNTIME_DEBUG ON\\) >> config.cmake
            echo set\\(CMAKE_CXX_COMPILER clang-6.0\\) >> config.cmake
            echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake
            """
@@ -126,7 +130,11 @@ stage('Build') {
            cd build
            cp ../cmake/config.cmake .
            echo set\\(USE_SORT ON\\) >> config.cmake
+           echo set\\(USE_GRAPH_RUNTIME_DEBUG ON\\) >> config.cmake
            echo set\\(USE_LLVM llvm-config-4.0\\) >> config.cmake
+           echo set\\(USE_NNPACK ON\\) >> config.cmake
+           echo set\\(NNPACK_PATH /NNPACK/build/\\) >> config.cmake
+           echo set\\(USE_ANTLR ON\\) >> config.cmake
            echo set\\(CMAKE_CXX_COMPILER g++\\) >> config.cmake
            echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake
            """
@@ -135,6 +143,10 @@ stage('Build') {
         timeout(time: max_time, unit: 'MINUTES') {
           sh "${docker_run} tvmai/ci-cpu ./tests/scripts/task_cpp_unittest.sh"
           sh "${docker_run} tvmai/ci-cpu ./tests/scripts/task_python_vta.sh"
+          sh "${docker_run} tvmai/ci-cpu ./tests/scripts/task_rust.sh"
+          sh "${docker_run} tvmai/ci-cpu ./tests/scripts/task_golang.sh"
+          sh "${docker_run} tvmai/ci-cpu ./tests/scripts/task_python_unittest.sh"
+          sh "${docker_run} tvmai/ci-cpu ./tests/scripts/task_python_integration.sh"
         }
       }
     }
@@ -149,6 +161,7 @@ stage('Build') {
            cp ../cmake/config.cmake .
            echo set\\(USE_SORT ON\\) >> config.cmake
            echo set\\(USE_RPC ON\\) >> config.cmake
+           echo set\\(USE_GRAPH_RUNTIME_DEBUG ON\\) >> config.cmake
            echo set\\(USE_LLVM llvm-config-5.0\\) >> config.cmake
            echo set\\(CMAKE_CXX_COMPILER g++\\) >> config.cmake
            echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake
diff --git a/Makefile b/Makefile
index 2d3d4843c4c0..50048165bb8d 100644
--- a/Makefile
+++ b/Makefile
@@ -4,11 +4,11 @@ ROOTDIR = $(CURDIR)
 	 cython cython2 cython3 web runtime vta
 
 ifndef DMLC_CORE_PATH
-  DMLC_CORE_PATH = $(ROOTDIR)/dmlc-core
+  DMLC_CORE_PATH = $(ROOTDIR)/3rdparty/dmlc-core
 endif
 
 ifndef DLPACK_PATH
-  DLPACK_PATH = $(ROOTDIR)/dlpack
+  DLPACK_PATH = $(ROOTDIR)/3rdparty/dlpack
 endif
 
 INCLUDE_FLAGS = -Iinclude -I$(DLPACK_PATH)/include -I$(DMLC_CORE_PATH)/include
@@ -50,10 +50,10 @@ build/libtvm_web_runtime.js: build/libtvm_web_runtime.bc
 
 # Lint scripts
 cpplint:
-	python3 dmlc-core/scripts/lint.py vta cpp vta/include vta/src
-	python3 dmlc-core/scripts/lint.py topi cpp topi/include;
-	python3 dmlc-core/scripts/lint.py nnvm cpp nnvm/include nnvm/src;
-	python3 dmlc-core/scripts/lint.py tvm cpp include src verilog\
+	python3 3rdparty/dmlc-core/scripts/lint.py vta cpp vta/include vta/src
+	python3 3rdparty/dmlc-core/scripts/lint.py topi cpp topi/include;
+	python3 3rdparty/dmlc-core/scripts/lint.py nnvm cpp nnvm/include nnvm/src;
+	python3 3rdparty/dmlc-core/scripts/lint.py tvm cpp include src verilog\
 	 examples/extension/src examples/graph_executor/src
 
 pylint:
@@ -63,13 +63,17 @@ pylint:
 	python3 -m pylint vta/python/vta --rcfile=$(ROOTDIR)/tests/lint/pylintrc
 
 jnilint:
-	python3 dmlc-core/scripts/lint.py tvm4j-jni cpp jvm/native/src
+	python3 3rdparty/dmlc-core/scripts/lint.py tvm4j-jni cpp jvm/native/src
 
 lint: cpplint pylint jnilint
 
 doc:
 	doxygen docs/Doxyfile
 
+javadoc:
+	# build artifact is in jvm/core/target/site/apidocs
+	cd jvm && mvn javadoc:javadoc
+
 # Cython build
 cython:
 	cd python; python setup.py build_ext --inplace
diff --git a/NEWS.md b/NEWS.md
index 567aabf3fcbd..2c2f616cb2f0 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -9,6 +9,69 @@ Refer to the Roadmap issue for complete list on on-going version features.
 If you check in something that is not reflected in Roadmap issue, please reply
 to that issue so it can get added.
 
+## 0.4
+
+This release features several major improvements. The high-level graph optimizer is now part of TVM repo. Some of the highlights are: Initial support of AutoTVM for automated optimization; customized accelerator backend VTA.
+
+- Tensor operator primitives
+  - Introduce attrs field to operator primitives(e.g. compute) to store additional metadata, the attrs can be used as hint for scheduling
+- Enable embedding of asm micro-kernels
+- Hybrid python programming model
+   - python AST based IR builder interface
+   - support GPU programs
+- AutoTVM, Automated tuning, and scheduling
+   - basic autotvm infra
+    - GPU IR verifier
+   - basic autotuning tutorial
+   - topi integration
+- ARM support
+    - winograd support
+   - initial support of ARM autotuning records
+- TOPI Vision
+   - Generic GPU sort support(useful for vision)
+   - SSD operator support
+- TOPI numpy consistency
+   - Rename all binary operators for numpy consistecy: broadcast_add-> add, broadcast_sub -> substract, broadcast_mul -> multiply, broadcast_div->divide
+   - New operators: slice, LRN, equal, not_equal, less, greater
+   - tutorials on topi
+- Initial low-bit operator support support
+    - Optimized popcount generation on ARM
+    - general bit-serial convolution and GEMM
+    - optimized low bit kernels
+    - parallel optimization
+- New topi backend optimization for intel graphics
+- Adapt AVX schedules for SSE target
+- VTA: customized accelerator backend
+  - custom hardware backend example
+  - tutorials on how to use customized accelerator
+- Initial experimental support for  HLS backend
+- Bugfix in SPIRV code generator for vulkan
+- libdevice support, enable NVPTX backend
+- Introduce NDArrayContainer for managed NDarray
+- RPC and Device API
+   - Support communication between big/small endian machines.
+   - RPC and device API protocol upgrade (this is a non-backward compatible change) to support big-small endian communication. This is a non-backward compatible change, need to use the latest version of TVM runtime with the RPC
+   - graduate rpc from contrib, tvm.contrib.rpc->tvm.rpc
+   -Support tracker in Android RPC, add fault tolerance for AutoTVM
+- BIG.LITTLE aware threadpool
+- tvm4j graph runtime that runs end to end workload in java
+- DLPack support
+   - Support from_dlpack and to_dlpack
+   - Enables bridges to pytorch
+- Enable link of stackvm in runtime
+- Tensorflow graphdef frontend
+- Keras frontend
+   - improved to support reuse layers, add activations
+- ONNX
+   - gather,  LRN
+- CoreML frontend
+   - Support C-RNN and activation functions
+- Fix grads for sum and expand_like
+- Enhanced operator fusion for multiple elemwise branches
+- Separate nnvm fusion and compilation pass
+- Unified build system to cmake, customizable cmake path for vulkan, rocm, cuda
+
+
 ## 0.3
 
 This release features numerous improvements in TOPI and backends. We make the first step toward object detection support in TOPI, featuring operators necessary for YOLO and SSDs. The topi now supports numpy-style API and operator overloading. RPC is significantly improved to support resource allocation and using a pool of devices. We are adding two new backends: WebGL for running GPUs on the browser, and Vulkan for running on next-generation graphics API.
diff --git a/NOTICE b/NOTICE
new file mode 100644
index 000000000000..45468c50ba1b
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1 @@
+TVM End to End Deep Learning Compiler Stack: https://tvm.ai/
diff --git a/README.md b/README.md
index e2fc7b8c45d2..828b0f7e880b 100644
--- a/README.md
+++ b/README.md
@@ -27,6 +27,6 @@ Acknowledgement
 ---------------
 We learnt a lot from the following projects when building TVM.
 - [Halide](https://github.com/halide/Halide): TVM uses [HalideIR](https://github.com/dmlc/HalideIR) as data structure for
-  arithematic simplification and low level lowering. We also learnt and adapted some part of lowering pipeline from Halide.
+  arithmetic simplification and low level lowering. We also learnt and adapted some part of lowering pipeline from Halide.
 - [Loopy](https://github.com/inducer/loopy): use of integer set analysis and its loop transformation primitives.
 - [Theano](https://github.com/Theano/Theano): the design inspiration of symbolic scan operator for recurrence.
diff --git a/apps/android_deploy/README.md b/apps/android_deploy/README.md
index 801ca8bdf95c..2c2951b5332d 100644
--- a/apps/android_deploy/README.md
+++ b/apps/android_deploy/README.md
@@ -2,14 +2,21 @@
 
 This folder contains Android Demo app that allows us to show how to deploy model using TVM runtime api on a Android phone.
 
-You will need [JDK](http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html), [Android SDK](https://developer.android.com/studio/index.html), [Android NDK](https://developer.android.com/ndk) and an Android device to use this.
+You will need [JDK](http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html), [Android SDK](https://developer.android.com/studio/index.html), [Android NDK](https://developer.android.com/ndk) and an Android device to use this. Make sure the `ANDROID_HOME` variable already points to your Android SDK folder or set it using `export ANDROID_HOME=[Path to your Android SDK, e.g., ~/Android/sdk]`. We use [Gradle](https://gradle.org) to build. Please follow [the installation instruction](https://gradle.org/install) for your operating system.
+
+Alternatively, you may execute Docker image we provide wich contains the required packages. Use the command below to build the image and enter interactive session. Note, that building with OpenCL was not tested from Docker.
+
+```bash
+./docker/build.sh demo_android -it bash
+(docker) $ echo $ANDROID_HOME
+(docker) /opt/android-sdk-linux
+```
+
 
 ## Build and Installation
 
 ### Build APK
 
-We use [Gradle](https://gradle.org) to build. Please follow [the installation instruction](https://gradle.org/install) for your operating system.
-
 Before you build the Android application, please refer to [TVM4J Installation Guide](https://github.com/dmlc/tvm/blob/master/jvm/README.md) and install tvm4j-core to your local maven repository. You can find tvm4j dependency declare in `app/build.gradle`. Modify it if it is necessary.
 
 ```
@@ -48,7 +55,6 @@ USE_OPENCL = 0
 Now use Gradle to compile JNI, resolve Java dependencies and build the Android application together with tvm4j. Run following script to generate the apk file.
 
 ```bash
-export ANDROID_HOME=[Path to your Android SDK, e.g., ~/Android/sdk]
 cd apps/android_deploy
 gradle clean build
 ```
diff --git a/apps/android_deploy/app/src/main/java/ml/dmlc/tvm/android/demo/MainActivity.java b/apps/android_deploy/app/src/main/java/ml/dmlc/tvm/android/demo/MainActivity.java
index f3cdefe1c2ff..7d391856f599 100644
--- a/apps/android_deploy/app/src/main/java/ml/dmlc/tvm/android/demo/MainActivity.java
+++ b/apps/android_deploy/app/src/main/java/ml/dmlc/tvm/android/demo/MainActivity.java
@@ -298,7 +298,7 @@ protected Integer doInBackground(Bitmap... bitmaps) {
 
                     // get the function from the module(get output data)
                     Log.i(TAG, "get output data");
-                    NDArray outputNdArray = NDArray.empty(new long[]{1000}, new TVMType("float32"));
+                    NDArray outputNdArray = NDArray.empty(new long[]{1, 1000}, new TVMType("float32"));
                     Function getOutputFunc = graphRuntimeModule.getFunction("get_output");
                     getOutputFunc.pushArg(OUTPUT_INDEX).pushArg(outputNdArray).invoke();
                     float[] output = outputNdArray.asFloatArray();
@@ -630,4 +630,4 @@ public static Matrix getTransformationMatrix(
 
         return matrix;
     }
-}
\ No newline at end of file
+}
diff --git a/apps/android_deploy/app/src/main/jni/Android.mk b/apps/android_deploy/app/src/main/jni/Android.mk
index a99517f90332..da5f499ea706 100644
--- a/apps/android_deploy/app/src/main/jni/Android.mk
+++ b/apps/android_deploy/app/src/main/jni/Android.mk
@@ -20,9 +20,9 @@ LOCAL_SRC_FILES := ml_dmlc_tvm_native_c_api.cc
 LOCAL_LDFLAGS := -L$(SYSROOT)/usr/lib/ -llog
 
 LOCAL_C_INCLUDES := $(ROOT_PATH)/include \
-                    $(ROOT_PATH)/dlpack/include \
-                    $(ROOT_PATH)/dmlc-core/include \
-                    $(ROOT_PATH)/HalideIR/src \
+                    $(ROOT_PATH)/3rdparty/dlpack/include \
+                    $(ROOT_PATH)/3rdparty/dmlc-core/include \
+                    $(ROOT_PATH)/3rdparty/HalideIR/src \
                     $(ROOT_PATH)/topi/include
 
 LOCAL_MODULE = tvm4j_runtime_packed
diff --git a/apps/android_deploy/build.gradle b/apps/android_deploy/build.gradle
index f7bbe2641c9d..1eeb9d686cfb 100644
--- a/apps/android_deploy/build.gradle
+++ b/apps/android_deploy/build.gradle
@@ -3,9 +3,12 @@
 buildscript {
     repositories {
         jcenter()
+        maven {
+            url 'https://maven.google.com'
+        }
     }
     dependencies {
-        classpath 'com.android.tools.build:gradle:2.3.3'
+        classpath 'com.android.tools.build:gradle:3.1.0'
         classpath 'org.apache.httpcomponents:httpclient:4.5.4'
 
         // NOTE: Do not place your application dependencies here; they belong
diff --git a/apps/android_deploy/dev_tools/sign_apk.sh b/apps/android_deploy/dev_tools/sign_apk.sh
index 314f82cdb76c..fd8cee6b927a 100644
--- a/apps/android_deploy/dev_tools/sign_apk.sh
+++ b/apps/android_deploy/dev_tools/sign_apk.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 CURR_DIR=$(cd `dirname $0`; pwd)
-APK_DIR=$CURR_DIR/../app/build/outputs/apk
+APK_DIR=$CURR_DIR/../app/build/outputs/apk/release
 UNSIGNED_APK=$APK_DIR/app-release-unsigned.apk
 SIGNED_APK=$APK_DIR/tvmdemo-release.apk
 jarsigner -verbose -keystore $CURR_DIR/tvmdemo.keystore -signedjar $SIGNED_APK $UNSIGNED_APK 'tvmdemo'
diff --git a/apps/android_rpc/README.md b/apps/android_rpc/README.md
index 41d361c823ed..453263aa824e 100644
--- a/apps/android_rpc/README.md
+++ b/apps/android_rpc/README.md
@@ -104,11 +104,11 @@ You are supposed to find a free "android" in the queue status.
 ...
 
 Queue Status
-----------------------------
-key    	free	pending
-----------------------------
-android	1	0
-----------------------------
+-------------------------------
+key       total  free  pending
+-------------------------------
+android   1      1     0
+-------------------------------
 ```
 
 
@@ -123,18 +123,25 @@ export TVM_NDK_CC=/opt/android-toolchain-arm64/bin/aarch64-linux-android-g++
 python android_rpc_test.py
 ```
 
-This will compile TVM IR to shared libraries (CPU and OpenCL) and run vector addition on your Android device. On my test device, it gives following results.
+This will compile TVM IR to shared libraries (CPU, OpenCL and Vulkan) and run vector addition on your Android device. To verify compiled TVM IR shared libraries on OpenCL target set [`'test_opencl = True'`](https://github.com/dmlc/tvm/blob/master/apps/android_rpc/tests/android_rpc_test.py#L25) and on Vulkan target set [`'test_vulkan = False'`](https://github.com/dmlc/tvm/blob/master/apps/android_rpc/tests/android_rpc_test.py#L27) in  [tests/android_rpc_test.py](https://github.com/dmlc/tvm/blob/master/apps/android_rpc/tests/android_rpc_test.py), by default on CPU target will execute.
+On my test device, it gives following results.
 
 ```bash
-TVM: Initializing cython mode...
-[01:21:43] src/codegen/llvm/codegen_llvm.cc:75: set native vector to be 32 for target aarch64
-[01:21:43] src/runtime/opencl/opencl_device_api.cc:194: Initialize OpenCL platform 'Apple'
-[01:21:43] src/runtime/opencl/opencl_device_api.cc:214: opencl(0)='Iris' cl_device_id=0x1024500
-[01:21:44] src/codegen/llvm/codegen_llvm.cc:75: set native vector to be 32 for target aarch64
-Run GPU test ...
-0.000155807 secs/op
 Run CPU test ...
-0.00139824 secs/op
+0.000962932 secs/op
+
+Run GPU(OpenCL Flavor) test ...
+0.000155807 secs/op
+
+[23:29:34] /home/tvm/src/runtime/vulkan/vulkan_device_api.cc:674: Cannot initialize vulkan: [23:29:34] /home/tvm/src/runtime/vulkan/vulkan_device_api.cc:512: Check failed: __e == VK_SUCCESS Vulan Error, code=-9: VK_ERROR_INCOMPATIBLE_DRIVER
+
+Stack trace returned 10 entries:
+[bt] (0) /home/user/.local/lib/python3.6/site-packages/tvm-0.4.0-py3.6-linux-x86_64.egg/tvm/libtvm.so(dmlc::StackTrace[abi:cxx11]()+0x53) [0x7f477f5399f3]
+.........
+
+You can still compile vulkan module but cannot run locally
+Run GPU(Vulkan Flavor) test ...
+0.000225198 secs/op
 ```
 
 You can define your own TVM operators and test via this RPC app on your Android device to find the most optimized TVM schedule.
diff --git a/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/MainActivity.java b/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/MainActivity.java
index d80008bbe258..2ea4e4cb7528 100644
--- a/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/MainActivity.java
+++ b/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/MainActivity.java
@@ -39,11 +39,9 @@
 
 
 public class MainActivity extends AppCompatActivity {
-  private boolean skipRelaunch = true;
   // wait time before automatic restart of RPC Activity
   public static final int HANDLER_RESTART_DELAY = 5000;
 
-
   private void showDialog(String title, String msg) {
     AlertDialog.Builder builder = new AlertDialog.Builder(this);
     builder.setTitle(title);
@@ -91,7 +89,7 @@ private void setupRelaunch() {
     final Runnable rPCStarter = new Runnable() {
         public void run() {
             if (switchPersistent.isChecked()) {
-              System.err.println("relaunching RPC activity in 5s...");
+              System.err.println("relaunching RPC activity...");
               Intent intent = ((MainActivity) context).updateRPCPrefs();
               startActivity(intent);
             }
@@ -116,6 +114,7 @@ public void onCheckedChanged(CompoundButton buttonView, boolean isChecked) {
         if (isChecked) {
           System.err.println("automatic RPC restart enabled...");
           updateRPCPrefs();
+          setupRelaunch();
         } else {
           System.err.println("automatic RPC restart disabled...");
           updateRPCPrefs();
@@ -123,29 +122,14 @@ public void onCheckedChanged(CompoundButton buttonView, boolean isChecked) {
       }
     });
 
-    Button startRPC = findViewById(R.id.button_start_rpc);
-    startRPC.setOnClickListener(new View.OnClickListener() {
-        public void onClick(View v) {
-            Intent intent = ((MainActivity) context).updateRPCPrefs();
-            startActivity(intent);
-        }
-    });
-
     enableInputView(true);
   }
 
   @Override
   protected void onResume() {
     System.err.println("MainActivity onResume...");
-    System.err.println("skipRelaunch: " + skipRelaunch);
-    // if this is the first time onResume is called, do nothing, otherwise we
-    // may double launch
-    if (!skipRelaunch) {
-        enableInputView(true);
-        setupRelaunch();
-    } else {
-        skipRelaunch = false;
-    }
+    enableInputView(true);
+    setupRelaunch();
     super.onResume();
   }
 
diff --git a/apps/android_rpc/app/src/main/jni/Android.mk b/apps/android_rpc/app/src/main/jni/Android.mk
index a99517f90332..da5f499ea706 100644
--- a/apps/android_rpc/app/src/main/jni/Android.mk
+++ b/apps/android_rpc/app/src/main/jni/Android.mk
@@ -20,9 +20,9 @@ LOCAL_SRC_FILES := ml_dmlc_tvm_native_c_api.cc
 LOCAL_LDFLAGS := -L$(SYSROOT)/usr/lib/ -llog
 
 LOCAL_C_INCLUDES := $(ROOT_PATH)/include \
-                    $(ROOT_PATH)/dlpack/include \
-                    $(ROOT_PATH)/dmlc-core/include \
-                    $(ROOT_PATH)/HalideIR/src \
+                    $(ROOT_PATH)/3rdparty/dlpack/include \
+                    $(ROOT_PATH)/3rdparty/dmlc-core/include \
+                    $(ROOT_PATH)/3rdparty/HalideIR/src \
                     $(ROOT_PATH)/topi/include
 
 LOCAL_MODULE = tvm4j_runtime_packed
diff --git a/apps/android_rpc/app/src/main/jni/Application.mk b/apps/android_rpc/app/src/main/jni/Application.mk
index 5bf52bdaffc0..f142e2995777 100644
--- a/apps/android_rpc/app/src/main/jni/Application.mk
+++ b/apps/android_rpc/app/src/main/jni/Application.mk
@@ -1,9 +1,9 @@
 ifndef config
-	ifneq ("$(wildcard ./config.mk)","")
-	  config ?= config.mk
-	else
-	  config ?= make/config.mk
-	endif
+    ifneq ("$(wildcard ./config.mk)","")
+        config ?= config.mk
+    else
+        config ?= make/config.mk
+    endif
 endif
 
 include $(config)
@@ -16,10 +16,10 @@ APP_STL := c++_static
 
 APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++11 -Oz -frtti
 ifeq ($(USE_OPENCL), 1)
-	APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1
+    APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1
 endif
 
 ifeq ($(USE_VULKAN), 1)
-	APP_CPPFLAGS += -DTVM_VULKAN_RUNTIME=1
-	APP_LDFLAGS += -lvulkan
+    APP_CPPFLAGS += -DTVM_VULKAN_RUNTIME=1
+    APP_LDFLAGS += -lvulkan
 endif
diff --git a/apps/android_rpc/app/src/main/res/layout/content_main.xml b/apps/android_rpc/app/src/main/res/layout/content_main.xml
index 0f2564833ecd..69c1f76030df 100644
--- a/apps/android_rpc/app/src/main/res/layout/content_main.xml
+++ b/apps/android_rpc/app/src/main/res/layout/content_main.xml
@@ -20,6 +20,7 @@
             android:hint="@string/input_address"
             android:layout_width="wrap_content"
             android:layout_height="wrap_content"
+            android:inputType="phone"
             android:background="@android:drawable/editbox_background"/>
     </LinearLayout>
 
@@ -37,6 +38,7 @@
             android:minWidth="100dip"
             android:layout_width="wrap_content"
             android:layout_height="wrap_content"
+            android:inputType="phone"
             android:background="@android:drawable/editbox_background"/>
     </LinearLayout>
 
@@ -76,15 +78,4 @@
             android:textOn="@string/switch_on" />
     </LinearLayout>
 
-    <LinearLayout
-        android:orientation="horizontal"
-        android:layout_width="fill_parent"
-        android:layout_height="wrap_content">
-        <Button
-            android:id="@+id/button_start_rpc"
-            android:layout_height="wrap_content"
-            android:layout_width="wrap_content"
-            android:text="@string/start_rpc" />
-    </LinearLayout>
-
 </LinearLayout>
diff --git a/apps/android_rpc/app/src/main/res/values/strings.xml b/apps/android_rpc/app/src/main/res/values/strings.xml
index 33caa374b496..f1ca2b90a001 100644
--- a/apps/android_rpc/app/src/main/res/values/strings.xml
+++ b/apps/android_rpc/app/src/main/res/values/strings.xml
@@ -9,11 +9,10 @@
     <string name="label_address">Address</string>
     <string name="label_port">Port</string>
     <string name="label_key">Key</string>
-    <string name="label_persistent">Keep RPC Alive</string>
+    <string name="label_persistent">Enable RPC</string>
 
     <string name="switch_on">Enabled</string>
     <string name="switch_off">Disabled</string>
 
-    <string name="start_rpc">Start RPC</string>
     <string name="stop_rpc">Stop RPC</string>
 </resources>
diff --git a/apps/android_rpc/build.gradle b/apps/android_rpc/build.gradle
index f13b8fc9a728..08140708d5ef 100644
--- a/apps/android_rpc/build.gradle
+++ b/apps/android_rpc/build.gradle
@@ -3,9 +3,12 @@
 buildscript {
     repositories {
         jcenter()
+        maven {
+            url 'https://maven.google.com'
+        }
     }
     dependencies {
-        classpath 'com.android.tools.build:gradle:2.3.3'
+        classpath 'com.android.tools.build:gradle:3.1.0'
 
         // NOTE: Do not place your application dependencies here; they belong
         // in the individual module build.gradle files
diff --git a/apps/android_rpc/dev_tools/sign_apk.sh b/apps/android_rpc/dev_tools/sign_apk.sh
index f52faff4d074..7dc6480f4bca 100755
--- a/apps/android_rpc/dev_tools/sign_apk.sh
+++ b/apps/android_rpc/dev_tools/sign_apk.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 CURR_DIR=$(cd `dirname $0`; pwd)
-APK_DIR=$CURR_DIR/../app/build/outputs/apk
+APK_DIR=$CURR_DIR/../app/build/outputs/apk/release
 UNSIGNED_APK=$APK_DIR/app-release-unsigned.apk
 SIGNED_APK=$APK_DIR/tvmrpc-release.apk
 jarsigner -verbose -keystore $CURR_DIR/tvmrpc.keystore -signedjar $SIGNED_APK $UNSIGNED_APK 'tvmrpc'
diff --git a/apps/android_rpc/tests/android_rpc_test.py b/apps/android_rpc/tests/android_rpc_test.py
index cfb04c1ca9a9..44618efd45c1 100644
--- a/apps/android_rpc/tests/android_rpc_test.py
+++ b/apps/android_rpc/tests/android_rpc_test.py
@@ -21,59 +21,92 @@
 arch = "arm64"
 target = "llvm -target=%s-linux-android" % arch
 
+# whether enable to execute test on OpenCL target
+test_opencl = False
+# whether enable to execute test on Vulkan target
+test_vulkan = False
+
 def test_rpc_module():
     # graph
     n = tvm.convert(1024)
     A = tvm.placeholder((n,), name='A')
     B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
+    a_np = np.random.uniform(size=1024).astype(A.dtype)
     temp = util.tempdir()
-    s = tvm.create_schedule(B.op)
-    xo, xi = s[B].split(B.op.axis[0], factor=64)
-    s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
-    s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
-    # Build the dynamic lib.
-    # If we don't want to do metal and only use cpu, just set target to be target
-    f = tvm.build(s, [A, B], "opencl", target_host=target, name="myadd")
-    path_dso1 = temp.relpath("dev_lib2.so")
-    f.export_library(path_dso1, ndk.create_shared)
 
+    # Establish remote connection with target hardware
+    tracker = rpc.connect_tracker(tracker_host, tracker_port)
+    remote = tracker.request(key, priority=0,
+                             session_timeout=60)
+
+    # Compile the Graph for CPU target
     s = tvm.create_schedule(B.op)
     xo, xi = s[B].split(B.op.axis[0], factor=64)
     s[B].parallel(xi)
     s[B].pragma(xo, "parallel_launch_point")
     s[B].pragma(xi, "parallel_barrier_when_finish")
     f = tvm.build(s, [A, B], target, name="myadd_cpu")
-    path_dso2 = temp.relpath("cpu_lib.so")
-    f.export_library(path_dso2, ndk.create_shared)
-
-    tracker = rpc.connect_tracker(tracker_host, tracker_port)
-    remote = tracker.request(key, priority=0,
-                             session_timeout=60)
+    path_dso_cpu = temp.relpath("cpu_lib.so")
+    f.export_library(path_dso_cpu, ndk.create_shared)
 
+    # Execute the portable graph on cpu target
     print('Run CPU test ...')
     ctx = remote.cpu(0)
-    remote.upload(path_dso2)
+    remote.upload(path_dso_cpu)
     f2 = remote.load_module("cpu_lib.so")
-    a_np = np.random.uniform(size=1024).astype(A.dtype)
     a = tvm.nd.array(a_np, ctx)
     b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
     time_f = f2.time_evaluator(f2.entry_name, ctx, number=10)
     cost = time_f(a, b).mean
-    print('%g secs/op' % cost)
+    print('%g secs/op\n' % cost)
     np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
 
+    # Compile the Graph for OpenCL target
+    if test_opencl:
+        s = tvm.create_schedule(B.op)
+        xo, xi = s[B].split(B.op.axis[0], factor=64)
+        s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
+        s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
+        # Build the dynamic lib.
+        # If we don't want to do metal and only use cpu, just set target to be target
+        f = tvm.build(s, [A, B], "opencl", target_host=target, name="myadd")
+        path_dso_cl = temp.relpath("dev_lib_cl.so")
+        f.export_library(path_dso_cl, ndk.create_shared)
+
+        print('Run GPU(OpenCL Flavor) test ...')
+        ctx = remote.cl(0)
+        remote.upload(path_dso_cl)
+        f1 = remote.load_module("dev_lib_cl.so")
+        a = tvm.nd.array(a_np, ctx)
+        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
+        time_f = f1.time_evaluator(f1.entry_name, ctx, number=10)
+        cost = time_f(a, b).mean
+        print('%g secs/op\n' % cost)
+        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
+
+    # Compile the Graph for Vulkan target
+    if test_vulkan:
+        s = tvm.create_schedule(B.op)
+        xo, xi = s[B].split(B.op.axis[0], factor=64)
+        s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
+        s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
+        # Build the dynamic lib.
+        # If we don't want to do metal and only use cpu, just set target to be target
+        f = tvm.build(s, [A, B], "vulkan", target_host=target, name="myadd")
+        path_dso_vulkan = temp.relpath("dev_lib_vulkan.so")
+        f.export_library(path_dso_vulkan, ndk.create_shared)
+
+        print('Run GPU(Vulkan Flavor) test ...')
+        ctx = remote.vulkan(0)
+        remote.upload(path_dso_vulkan)
+        f1 = remote.load_module("dev_lib_vulkan.so")
+        a = tvm.nd.array(a_np, ctx)
+        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
+        time_f = f1.time_evaluator(f1.entry_name, ctx, number=10)
+        cost = time_f(a, b).mean
+        print('%g secs/op\n' % cost)
+        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
 
-    print('Run GPU test ...')
-    ctx = remote.cl(0)
-    remote.upload(path_dso1)
-    f1 = remote.load_module("dev_lib2.so")
-    a_np = np.random.uniform(size=1024).astype(A.dtype)
-    a = tvm.nd.array(a_np, ctx)
-    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
-    time_f = f1.time_evaluator(f1.entry_name, ctx, number=10)
-    cost = time_f(a, b).mean
-    print('%g secs/op' % cost)
-    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
 
 if __name__ == "__main__":
     test_rpc_module()
diff --git a/apps/benchmark/README.md b/apps/benchmark/README.md
index e83e47c46eb7..9806ddc05bae 100644
--- a/apps/benchmark/README.md
+++ b/apps/benchmark/README.md
@@ -6,8 +6,41 @@ See results on wiki page https://github.com/dmlc/tvm/wiki/Benchmark
 
 ## How to Reproduce
 
-### ARM CPU
-We use RPC infrastructure in TVM to make device management easy. So you need to use it for reproducing benchmark results.
+To obtain the best performance, we always do auto-tuning for the specific devices and get
+the parameters for used kernels. To enable easy reproduction of our results, we release
+pre-tuned parameters for popular networks on some common devices.
+TVM will download related tuning cache files during compilation.
+
+If you don't have the following listed devices, you can still run these scripts.
+You can pick the one that is most similar to your device as argument.
+In general, the performance should also be good.
+
+It is recommended that you run tuning by yourself if you have your customized network or devices.
+Please follow the tutorial for
+[NVIDIA GPU](https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_cuda.html),
+[ARM CPU](https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_arm.html),
+[Mobile GPU](https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_mobile_gpu.html).
+
+### NVIDIA GPU
+
+Build TVM with LLVM and CUDA enabled. [Help](https://docs.tvm.ai/install/from_source.html)
+
+```bash
+python3 gpu_imagenet_bench.py --model 1080ti
+python3 gpu_imagenet_bench.py --model titanx
+
+# For NVIDIA Jetson TX2, you can run the following command directly on the board,
+# or use cross compilation and RPC like what we do for ARM CPU.
+python3 gpu_imagenet_bench.py --model tx2
+```
+
+### ARM CPU & Mali GPU
+For embedded devices, we use RPC infrastructure in TVM to make the management easy.
+You need to use it for reproducing benchmark results.
+
+**Note**: We use llvm-4.0 in our tuning environment. Mismatch of the LLVM version during tuning and deployment can influence the performance, so you have to use a same version for reproduction.
+
+0. Build TVM with LLVM enabled. [Help](https://docs.tvm.ai/install/from_source.html)
 
 1. Start an RPC Tracker on the host machine
 ```bash
@@ -19,12 +52,12 @@ python3 -m tvm.exec.rpc_tracker
   * Build tvm runtime on your device [Help](https://docs.tvm.ai/tutorials/nnvm/deploy_model_on_rasp.html#build-tvm-runtime-on-device)
   * Register your device to tracker by
   ```bash
-  python3 -m tvm.exec.rpc_sever --tracker=[HOST_IP]:9190 --key=[DEVICE_KEY]
+  python3 -m tvm.exec.rpc_server --tracker=[HOST_IP]:9190 --key=[DEVICE_KEY]
   ```
   replace `[HOST_IP]` with the IP address of the host machine, `[DEVICE_KEY]` with the name of device.
   
   E.g. Here is an example command for RK3399,
-  `python3 -m tvm.exec.rpc_sever --tracker=10.77.1.123:9190 --key=rk3399`, where 10.77.1.123 is the IP address of the tracker.
+  `python3 -m tvm.exec.rpc_server --tracker=10.77.1.123:9190 --key=rk3399`, where 10.77.1.123 is the IP address of the tracker.
 
 * For Android device
    * Build and install tvm RPC apk on your device [Help](https://github.com/dmlc/tvm/tree/master/apps/android_rpc).
@@ -40,31 +73,39 @@ python3 -m tvm.exec.rpc_tracker
   For our test environment, one sample output can be 
   ```bash
   Queue Status                
-  ------------------------------
-  key            free    pending    
-  ------------------------------
-  mate10pro      1       0   
-  p20pro         2       0  
-  pixel2         2       0 
-  rk3399         2       0
-  rasp3b         8       0
+  ----------------------------------
+  key          total  free  pending    
+  ----------------------------------
+  mate10pro    1      1     0
+  p20pro       2      2     0 
+  pixel2       2      2     0
+  rk3399       2      2     0
+  rasp3b       8      8     0
   ```
 
- 4. Run benchmark  
-  We did auto-tuning for Huawei P20/Mate10 Pro, Google Pixel2, Raspberry Pi3 and Firefly-RK3399,
-  and release pre-tuned parameters in [this repo](https://github.com/uwsaml/tvm-distro).
-  During compilation, TVM will download these operator parameters automatically.
+4. Run benchmark  
+  ```bash
+  # ARM CPU
+  python3 arm_cpu_imagenet_bench.py --model rasp3b --rpc-key rasp3b
+  python3 arm_cpu_imagenet_bench.py --model rk3399 --rpc-key rk3399
+  python3 arm_cpu_imagenet_bench.py --model pixel2 --rpc-key pixel2
+  python3 arm_cpu_imagenet_bench.py --model p20pro --rpc-key p20pro
+  python3 arm_cpu_imagenet_bench.py --model mate10pro --rpc-key mate10pro  
+  ```
 
   ```bash
-  python3 arm_cpu_imagenet_bench.py --device rasp3b --rpc-key rasp3b
-  python3 arm_cpu_imagenet_bench.py --device rk3399 --rpc-key rk3399
-  python3 arm_cpu_imagenet_bench.py --device pixel2 --rpc-key pixel2
-  python3 arm_cpu_imagenet_bench.py --device p20pro --rpc-key p20pro
-  python3 arm_cpu_imagenet_bench.py --device mate10pro --rpc-key mate10pro  
+  # Mali GPU
+  # NOTE: To make the test environment more stable, we close GUI and lock the frequency
+  sudo /etc/init.d/lightdm stop
+  sudo -i
+  echo performance > /sys/class/misc/mali0/device/devfreq/ff9a0000.gpu/governor
+  python3 mobile_gpu_imagenet_bench.py --model rk3399 --rpc-key rk3399
+  python3 mobile_gpu_imagenet_bench.py --model rk3399 --rpc-key rk3399 --dtype float16
   ```
 
-  If your device has a same SoC of the above device, you can reuse these parameters
-  (e.g. use `llvm -device=arm_cpu -mode=rk3399 -target=aarch64-linux-gnu` as target).
-  Otherwise, you need to tune for your own device, please follow this 
-  [tutorial](https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_arm.html).
+### AMD GPU
 
+Build TVM with LLVM and ROCm enabled. [Help](https://docs.tvm.ai/install/from_source.html)
+```bash
+python3 gpu_imagenet_bench.py --model gfx900 --target rocm
+```
diff --git a/apps/benchmark/arm_cpu_imagenet_bench.py b/apps/benchmark/arm_cpu_imagenet_bench.py
index 7baf244e0dae..f84d42bcab82 100644
--- a/apps/benchmark/arm_cpu_imagenet_bench.py
+++ b/apps/benchmark/arm_cpu_imagenet_bench.py
@@ -1,96 +1,91 @@
-"""Benchmark script for performance on ARM CPU.
+"""Benchmark script for ImageNet models on ARM CPU.
 see README.md for the usage and results of this script.
 """
-
 import argparse
-import time
 
 import numpy as np
 
-import nnvm.testing
-import nnvm.compiler
 import tvm
-from tvm import autotvm
 from tvm.contrib.util import tempdir
 import tvm.contrib.graph_runtime as runtime
+import nnvm.compiler
+import nnvm.testing
 
-def get_network(name, batch_size):
-    """Get the symbol definition and random weight of a network"""
-    input_shape = (batch_size, 3, 224, 224)
-    output_shape = (batch_size, 1000)
-
-    if name == 'resnet-18':
-        net, params = nnvm.testing.resnet.get_workload(num_layers=18,
-                                                       batch_size=batch_size, image_shape=(3, 224, 224))
-    elif name == 'mobilenet':
-        net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size)
-    elif name == 'squeezenet v1.1':
-        net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size,
-                                                           version='1.1')
-    elif name == 'vgg-16':
-        net, params = nnvm.testing.vgg.get_workload(batch_size=batch_size, num_layers=16)
+from util import get_network, print_progress
+
+
+def evaluate_network(network, target, target_host, repeat):
+    # connect to remote device
+    tracker = tvm.rpc.connect_tracker(args.host, args.port)
+    remote = tracker.request(args.rpc_key)
+
+    print_progress(network)
+    net, params, input_shape, output_shape = get_network(network, batch_size=1)
+
+    print_progress("%-20s building..." % network)
+    with nnvm.compiler.build_config(opt_level=3):
+        graph, lib, params = nnvm.compiler.build(
+            net, target=target, target_host=target_host,
+            shape={'data': input_shape}, params=params, dtype=dtype)
+
+    tmp = tempdir()
+    if 'android' in str(target):
+        from tvm.contrib import ndk
+        filename = "%s.so" % network
+        lib.export_library(tmp.relpath(filename), ndk.create_shared)
     else:
-        raise RuntimeError("Unsupported network: " + name)
+        filename = "%s.tar" % network
+        lib.export_library(tmp.relpath(filename))
+
+    # upload library and params
+    print_progress("%-20s uploading..." % network)
+    ctx = remote.context(str(target), 0)
+    remote.upload(tmp.relpath(filename))
+
+    rlib = remote.load_module(filename)
+    module = runtime.create(graph, rlib, ctx)
+    data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
+    module.set_input('data', data_tvm)
+    module.set_input(**params)
 
-    return net, params, input_shape, output_shape
+    # evaluate
+    print_progress("%-20s evaluating..." % network)
+    ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=repeat)
+    prof_res = np.array(ftimer().results) * 1000  # multiply 1000 for converting to millisecond
+    print("%-20s %-19s (%s)" % (network, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)))
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--network", type=str, choices=['resnet-18', 'mobilenet', 'squeezenet v1.1', 'vgg-16'])
-    parser.add_argument("--device", type=str, required=True, choices=['rk3399', 'mate10', 'mate10pro', 'p20', 'p20pro', 
-                                                                      'pixel2', 'rasp3b', 'pynq'])
+    parser.add_argument("--network", type=str, choices=
+                        ['resnet-18', 'resnet-34', 'resnet-50',
+                         'vgg-16', 'vgg-19', 'densenet-121', 'inception_v3',
+                         'mobilenet', 'mobilenet_v2', 'squeezenet_v1.0', 'squeezenet_v1.1'],
+                        help='The name of neural network')
+    parser.add_argument("--model", type=str, choices=
+                        ['rk3399', 'mate10', 'mate10pro', 'p20', 'p20pro',
+                         'pixel2', 'rasp3b', 'pynq'], default='rk3399',
+                        help="The model of the test device. If your device is not listed in "
+                             "the choices list, pick the most similar one as argument.")
     parser.add_argument("--host", type=str, default='localhost')
     parser.add_argument("--port", type=int, default=9190)
     parser.add_argument("--rpc-key", type=str, required=True)
-    parser.add_argument("--number", type=int, default=6)
+    parser.add_argument("--repeat", type=int, default=10)
     args = parser.parse_args()
 
     dtype = 'float32'
 
     if args.network is None:
-        networks = ['squeezenet v1.1', 'mobilenet', 'resnet-18', 'vgg-16']
+        networks = ['squeezenet_v1.1', 'mobilenet', 'resnet-18', 'vgg-16']
     else:
         networks = [args.network]
 
-    target = tvm.target.arm_cpu(model=args.device)
-
-    # connect to remote device
-    tracker = tvm.rpc.connect_tracker(args.host, args.port)
-    remote = tracker.request(args.rpc_key)
+    target = tvm.target.arm_cpu(model=args.model)
+    target_host = None
 
     print("--------------------------------------------------")
     print("%-20s %-20s" % ("Network Name", "Mean Inference Time (std dev)"))
     print("--------------------------------------------------")
     for network in networks:
-        net, params, input_shape, output_shape = get_network(network, batch_size=1)
-
-        with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']):
-            graph, lib, params = nnvm.compiler.build(
-                net, target=target, shape={'data': input_shape}, params=params, dtype=dtype)
-
-        tmp = tempdir()
-        if 'android' in str(target):
-            from tvm.contrib import ndk
-            filename = "%s.so" % network
-            lib.export_library(tmp.relpath(filename), ndk.create_shared)
-        else:
-            filename = "%s.tar" % network
-            lib.export_library(tmp.relpath(filename))
-
-        # upload library and params
-        ctx = remote.context(str(target), 0)
-        remote.upload(tmp.relpath(filename))
-        rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
-
-        rlib = remote.load_module(filename)
-        module = runtime.create(graph, rlib, ctx)
-        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
-        module.set_input('data', data_tvm)
-        module.set_input(**rparams)
-
-        # evaluate
-        ftimer = module.module.time_evaluator("run", ctx, number=args.number, repeat=3)
-        prof_res = np.array(ftimer().results) * 1000  # multiply 1000 for converting to millisecond
-        print("%-20s %-19s (%s)" % (network, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)))
+        evaluate_network(network, target, target_host, args.repeat)
 
diff --git a/apps/benchmark/gpu_imagenet_bench.py b/apps/benchmark/gpu_imagenet_bench.py
index fca4e35b6516..17c1fbc435b6 100644
--- a/apps/benchmark/gpu_imagenet_bench.py
+++ b/apps/benchmark/gpu_imagenet_bench.py
@@ -1,80 +1,81 @@
-""" Benchmark script for performance on GPUs.
-
-For example, run the file with:
-`python gpu_imagenet_bench.py --model=mobilenet --target=cuda`.
-For more details about how to set up the inference environment on GPUs,
-please refer to NNVM Tutorial: ImageNet Inference on the GPU
+"""Benchmark script for ImageNet models on GPU.
+see README.md for the usage and results of this script.
 """
-import time
 import argparse
+import threading
+
 import numpy as np
+
 import tvm
+from tvm.contrib.util import tempdir
+import tvm.contrib.graph_runtime as runtime
 import nnvm.compiler
 import nnvm.testing
-from tvm.contrib import util, nvcc
-from tvm.contrib import graph_runtime as runtime
 
-@tvm.register_func
-def tvm_callback_cuda_compile(code):
-    ptx = nvcc.compile_cuda(code, target="ptx")
-    return ptx
+from util import get_network
+
+
+def benchmark(network, target):
+    net, params, input_shape, output_shape = get_network(network, batch_size=1)
+
+    with nnvm.compiler.build_config(opt_level=3):
+        graph, lib, params = nnvm.compiler.build(
+            net, target=target, shape={'data': input_shape}, params=params, dtype=dtype)
+
+    # create runtime
+    ctx = tvm.context(str(target), 0)
+    module = runtime.create(graph, lib, ctx)
+    data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
+    module.set_input('data', data_tvm)
+    module.set_input(**params)
+
+    # evaluate
+    ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=args.repeat)
+    prof_res = np.array(ftimer().results) * 1000  # multiply 1000 for converting to millisecond
+    print("%-20s %-19s (%s)" % (network, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)))
 
-def main():
+
+if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('--model', type=str, required=True,
-                        choices=['resnet', 'mobilenet'],
-                        help="The model type.")
-    parser.add_argument('--target', type=str, required=True,
-                        choices=['cuda', 'rocm', 'opencl', 'metal', 'nvptx'],
-                        help="Compilation target.")
-    parser.add_argument('--opt-level', type=int, default=1, help="Level of optimization.")
-    parser.add_argument('--num-iter', type=int, default=1000, help="Number of iteration during benchmark.")
-    parser.add_argument('--repeat', type=int, default=1, help="Number of repeative times.")
+    parser.add_argument("--network", type=str, choices=
+                        ['resnet-18', 'resnet-34', 'resnet-50',
+                         'vgg-16', 'vgg-19', 'densenet-121', 'inception_v3',
+                         'mobilenet', 'mobilenet_v2', 'squeezenet_v1.0', 'squeezenet_v1.1'],
+                        help='The name of neural network')
+    parser.add_argument("--model", type=str,
+                        choices=['1080ti', 'titanx', 'tx2', 'gfx900'], default='1080ti',
+                        help="The model of the test device. If your device is not listed in "
+                             "the choices list, pick the most similar one as argument.")
+    parser.add_argument("--repeat", type=int, default=600)
+    parser.add_argument("--target", type=str,
+                        choices=['cuda', 'opencl', 'rocm', 'nvptx', 'metal'], default='cuda',
+                        help="The tvm compilation target")
+    parser.add_argument("--thread", type=int, default=1, help="The number of threads to be run.")
     args = parser.parse_args()
-    opt_level = args.opt_level
-    num_iter = args.num_iter
-    ctx = tvm.context(args.target, 0)
-    batch_size = 1
-    num_classes = 1000
-    image_shape = (3, 224, 224)
-
-    data_shape = (batch_size,) + image_shape
-    out_shape = (batch_size, num_classes)
-    if args.model == 'resnet':
-        net, params = nnvm.testing.resnet.get_workload(
-            batch_size=1, image_shape=image_shape)
-    elif args.model == 'mobilenet':
-        net, params = nnvm.testing.mobilenet.get_workload(
-            batch_size=1, image_shape=image_shape)
-    else:
-        raise ValueError('no benchmark prepared for {}.'.format(args.model))
 
-    if args.target == "cuda":
-        unroll = 1400
+    dtype = 'float32'
+
+    if args.network is None:
+        networks = ['resnet-50', 'mobilenet', 'vgg-19', 'inception_v3']
     else:
-        unroll = 128
-    with nnvm.compiler.build_config(opt_level=opt_level):
-        with tvm.build_config(auto_unroll_max_step=unroll,
-                              unroll_explicit=(args.target != "cuda")):
-            graph, lib, params = nnvm.compiler.build(
-                net, args.target, shape={"data": data_shape}, params=params)
-
-    data = np.random.uniform(-1, 1, size=data_shape).astype("float32")
-    module = runtime.create(graph, lib, ctx)
-    module.set_input(**params)
-    module.set_input("data", data)
-    module.run()
-    out = module.get_output(0, tvm.nd.empty(out_shape))
-    out.asnumpy()
-
-    print('benchmark args: {}'.format(args))
-    ftimer = module.module.time_evaluator("run", ctx, num_iter)
-    for i in range(args.repeat):
-        prof_res = ftimer()
-        print(prof_res)
-        # sleep for avoiding device overheat
-        if i + 1 != args.repeat:
-            time.sleep(45)
-
-if __name__ == '__main__':
-    main()
+        networks = [args.network]
+
+    target = tvm.target.create('%s -model=%s' % (args.target, args.model))
+
+    print("--------------------------------------------------")
+    print("%-20s %-20s" % ("Network Name", "Mean Inference Time (std dev)"))
+    print("--------------------------------------------------")
+    for network in networks:
+        if args.thread == 1:
+            benchmark(network, target)
+        else:
+            threads = list()
+            for n in range(args.thread):
+                thread = threading.Thread(target=benchmark, args=([network, target]), name="thread%d" % n)
+                threads.append(thread)
+
+            for thread in threads:
+                thread.start()
+
+            for thread in threads:
+                thread.join()
diff --git a/apps/benchmark/mobile_gpu_imagenet_bench.py b/apps/benchmark/mobile_gpu_imagenet_bench.py
new file mode 100644
index 000000000000..cd3d7eca9f3c
--- /dev/null
+++ b/apps/benchmark/mobile_gpu_imagenet_bench.py
@@ -0,0 +1,88 @@
+"""Benchmark script for ImageNet models on mobile GPU.
+see README.md for the usage and results of this script.
+"""
+import argparse
+
+import numpy as np
+
+import tvm
+from tvm.contrib.util import tempdir
+import tvm.contrib.graph_runtime as runtime
+import nnvm.compiler
+import nnvm.testing
+
+from util import get_network, print_progress
+
+def evaluate_network(network, target, target_host, dtype, repeat):
+    # connect to remote device
+    tracker = tvm.rpc.connect_tracker(args.host, args.port)
+    remote = tracker.request(args.rpc_key)
+
+    print_progress(network)
+    net, params, input_shape, output_shape = get_network(network, batch_size=1, dtype=dtype)
+
+    print_progress("%-20s building..." % network)
+    with nnvm.compiler.build_config(opt_level=3):
+        graph, lib, params = nnvm.compiler.build(
+            net, target=target, target_host=target_host,
+            shape={'data': input_shape}, params=params, dtype=dtype)
+
+    tmp = tempdir()
+    if 'android' in str(target) or 'android' in str(target_host):
+        from tvm.contrib import ndk
+        filename = "%s.so" % network
+        lib.export_library(tmp.relpath(filename), ndk.create_shared)
+    else:
+        filename = "%s.tar" % network
+        lib.export_library(tmp.relpath(filename))
+
+    # upload library and params
+    print_progress("%-20s uploading..." % network)
+    ctx = remote.context(str(target), 0)
+    remote.upload(tmp.relpath(filename))
+
+    rlib = remote.load_module(filename)
+    module = runtime.create(graph, rlib, ctx)
+    data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
+    module.set_input('data', data_tvm)
+    module.set_input(**params)
+
+    # evaluate
+    print_progress("%-20s evaluating..." % network)
+    ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=repeat)
+    prof_res = np.array(ftimer().results) * 1000  # multiply 1000 for converting to millisecond
+    print("%-20s %-19s (%s)" % (network, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--network", type=str, choices=
+                        ['resnet-18', 'resnet-34', 'resnet-50',
+                         'vgg-16', 'vgg-19', 'densenet-121', 'inception_v3',
+                         'mobilenet', 'mobilenet_v2', 'squeezenet_v1.0', 'squeezenet_v1.1'],
+                        help='The name of neural network')
+    parser.add_argument("--model", type=str, choices=
+                        ['rk3399'], default='rk3399',
+                        help="The model of the test device. If your device is not listed in "
+                             "the choices list, pick the most similar one as argument.")
+    parser.add_argument("--host", type=str, default='localhost')
+    parser.add_argument("--port", type=int, default=9190)
+    parser.add_argument("--rpc-key", type=str, required=True)
+    parser.add_argument("--repeat", type=int, default=30)
+    parser.add_argument("--dtype", type=str, default='float32')
+    args = parser.parse_args()
+
+    if args.network is None:
+        networks = ['squeezenet_v1.1', 'mobilenet', 'resnet-18', 'vgg-16']
+    else:
+        networks = [args.network]
+
+    target = tvm.target.mali(model=args.model)
+    target_host = tvm.target.arm_cpu(model=args.model)
+
+    print("--------------------------------------------------")
+    print("%-20s %-20s" % ("Network Name", "Mean Inference Time (std dev)"))
+    print("--------------------------------------------------")
+
+    for network in networks:
+        evaluate_network(network, target, target_host, args.dtype, args.repeat)
diff --git a/apps/benchmark/util.py b/apps/benchmark/util.py
new file mode 100644
index 000000000000..ac732d7945b9
--- /dev/null
+++ b/apps/benchmark/util.py
@@ -0,0 +1,79 @@
+"""Utility for benchmark"""
+
+import sys
+import nnvm
+
+def get_network(name, batch_size, dtype='float32'):
+    """Get the symbol definition and random weight of a network
+    
+    Parameters
+    ----------
+    name: str
+        The name of the network, can be 'resnet-18', 'resnet-50', 'vgg-16', 'inception_v3', 'mobilenet', ...
+    batch_size: int
+        batch size
+    dtype: str
+        Data type
+
+    Returns
+    -------
+    net: nnvm.symbol
+        The NNVM symbol of network definition
+    params: dict
+        The random parameters for benchmark
+    input_shape: tuple
+        The shape of input tensor
+    output_shape: tuple
+        The shape of output tensor
+    """
+    input_shape = (batch_size, 3, 224, 224)
+    output_shape = (batch_size, 1000)
+
+    if name == 'mobilenet':
+        net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size, dtype=dtype)
+    elif name == 'mobilenet_v2':
+        net, params = nnvm.testing.mobilenet_v2.get_workload(batch_size=batch_size, dtype=dtype)
+    elif name == 'inception_v3':
+        input_shape = (batch_size, 3, 299, 299)
+        net, params = nnvm.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
+    elif "resnet" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size, dtype=dtype)
+    elif "vgg" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size, dtype=dtype)
+    elif "densenet" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.densenet.get_workload(num_layers=n_layer, batch_size=batch_size, dtype=dtype)
+    elif "squeezenet" in name:
+        version = name.split("_v")[1]
+        net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size, version=version, dtype=dtype)
+    elif name == 'custom':
+        # an example for custom network
+        from nnvm.testing import utils
+        net = nnvm.sym.Variable('data')
+        net = nnvm.sym.conv2d(net, channels=4, kernel_size=(3,3), padding=(1,1))
+        net = nnvm.sym.flatten(net)
+        net = nnvm.sym.dense(net, units=1000)
+        net, params = utils.create_workload(net, batch_size, (3, 224, 224), dtype=dtype)
+    elif name == 'mxnet':
+        # an example for mxnet model
+        from mxnet.gluon.model_zoo.vision import get_model
+        block = get_model('resnet18_v1', pretrained=True)
+        net, params = nnvm.frontend.from_mxnet(block)
+        net = nnvm.sym.softmax(net)
+    else:
+        raise ValueError("Unsupported network: " + name)
+
+    return net, params, input_shape, output_shape
+
+def print_progress(msg):
+    """print progress message
+    
+    Parameters
+    ----------
+    msg: str
+        The message to print
+    """
+    sys.stdout.write(msg + "\r")
+    sys.stdout.flush()
diff --git a/apps/bundle_deploy/Makefile b/apps/bundle_deploy/Makefile
new file mode 100644
index 000000000000..0bf1613c8d66
--- /dev/null
+++ b/apps/bundle_deploy/Makefile
@@ -0,0 +1,39 @@
+# Makefile Example to bundle TVM modules.
+TVM_ROOT=$(shell cd ../..; pwd)
+NNVM_PATH=nnvm
+DMLC_CORE=${TVM_ROOT}/3rdparty/dmlc-core
+PKG_CFLAGS = -std=c++14 -Oz -fPIC\
+	-I${TVM_ROOT}/include\
+	-I${DMLC_CORE}/include\
+	-I${TVM_ROOT}/3rdparty/dlpack/include\
+
+PKG_LDFLAGS = -L${TVM_ROOT}/build
+
+build_dir := build
+
+test: $(build_dir)/demo $(build_dir)/bundle.so
+	$(build_dir)/demo $(build_dir)/bundle.so
+
+$(build_dir)/demo: demo.cc
+	@mkdir -p $(@D)
+	$(CXX) $(PKG_CFLAGS) -o $@  $^
+
+# Serialize our graph.json file.
+$(build_dir)/graph.json.cc: $(build_dir)/graph.json
+	xxd -i $^  > $@
+
+# Serialize our params.bin file.
+$(build_dir)/params.bin.cc: $(build_dir)/params.bin
+	xxd -i $^  > $@
+
+$(build_dir)/model.o $(build_dir)/graph.json $(build_dir)/params.bin: build_model.py
+	python $< -o $(build_dir)
+
+# Build our bundle against the serialized bundle.cc API, the runtime.cc API, and
+# the serialized graph.json and params.bin
+$(build_dir)/bundle.so: bundle.cc runtime.cc $(build_dir)/model.o $(build_dir)/graph.json.cc $(build_dir)/params.bin.cc
+	@mkdir -p $(@D)
+	$(CXX) $(PKG_CFLAGS) -fvisibility=hidden -o $@  $^ $(PKG_LDFLAGS) -shared
+
+clean:
+	rm -r $(build_dir)
diff --git a/apps/bundle_deploy/README.md b/apps/bundle_deploy/README.md
new file mode 100644
index 000000000000..2db8150b2659
--- /dev/null
+++ b/apps/bundle_deploy/README.md
@@ -0,0 +1,35 @@
+How to Bundle TVM Modules
+=========================
+
+This folder contains an example on how to bundle a TVM module (with the required
+interpreter runtime modules such as `runtime::GraphRuntime`, the graph JSON, and
+the params) into a single, self-contained shared object (`bundle.so`) which
+exposes a C API wrapping the appropriate `runtime::GraphRuntime` instance.
+
+This is useful for cases where we'd like to avoid deploying the TVM runtime
+components to the target host in advance - instead, we simply deploy the bundled
+shared-object to the host, which embeds both the model and the runtime
+components. The bundle should only depend on libc/libc++.
+
+It also contains an example code (`demo.cc`) to load this shared object and
+invoke the packaged TVM model instance. This is a dependency-free binary that
+uses the functionality packaged in `bundle.so` (which means that `bundle.so` can
+be deployed lazily at runtime, instead of at compile time) to invoke TVM
+functionality.
+
+Type the following command to run the sample code under the current folder,
+after building TVM first.
+
+```bash
+make demo
+```
+
+This will:
+
+- Download the mobilenet0.25 model from the MXNet Gluon Model Zoo
+- Compile the model with NNVM
+- Build a `bundle.so` shared object containing the model specification and
+  parameters
+- Build a `demo` executable that `dlopen`'s `bundle.so`, instantiates the
+  contained graph runtime, and invokes the `GraphRuntime::Run` function on a
+  random input, then prints the output tensor to `stderr`.
diff --git a/apps/bundle_deploy/build_model.py b/apps/bundle_deploy/build_model.py
new file mode 100644
index 000000000000..901996b8774e
--- /dev/null
+++ b/apps/bundle_deploy/build_model.py
@@ -0,0 +1,40 @@
+"""Creates a simple TVM modules."""
+
+import argparse
+import os
+import nnvm.compiler
+import nnvm.testing
+import tvm
+import logging
+
+
+def main():
+    logging.basicConfig(level=logging.INFO)
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-o', '--out-dir', default='.')
+    opts = parser.parse_args()
+
+    dshape = (1, 3, 224, 224)
+    from mxnet.gluon.model_zoo.vision import get_model
+    block = get_model('mobilenet0.25', pretrained=True)
+    net, params = nnvm.frontend.from_mxnet(block)
+    net = nnvm.sym.softmax(net)
+
+    with nnvm.compiler.build_config(opt_level=3):
+        graph, lib, params = nnvm.compiler.build(
+            net, 'llvm --system-lib', shape={'data': dshape}, params=params)
+    print(graph.symbol().debug_str())
+    build_dir = os.path.abspath(opts.out_dir)
+    if not os.path.isdir(build_dir):
+        os.makedirs(build_dir)
+
+    lib.save(os.path.join(build_dir, 'model.o'))
+    with open(os.path.join(build_dir, 'graph.json'), 'w') as f_graph_json:
+        f_graph_json.write(graph.json())
+    with open(os.path.join(build_dir, 'params.bin'), 'wb') as f_params:
+        f_params.write(nnvm.compiler.save_param_dict(params))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/apps/bundle_deploy/bundle.cc b/apps/bundle_deploy/bundle.cc
new file mode 100644
index 000000000000..af1ef7225bcb
--- /dev/null
+++ b/apps/bundle_deploy/bundle.cc
@@ -0,0 +1,47 @@
+#include <memory>
+#include <tvm/runtime/c_runtime_api.h>
+#include <tvm/runtime/registry.h>
+
+extern unsigned char build_graph_json[];
+extern unsigned int build_graph_json_len;
+extern unsigned char build_params_bin[];
+extern unsigned int build_params_bin_len;
+
+#define TVM_BUNDLE_FUNCTION __attribute__((visibility("default"))) extern "C"
+
+TVM_BUNDLE_FUNCTION void *tvm_runtime_create() {
+  const std::string json_data(&build_graph_json[0],
+                              &build_graph_json[0] + build_graph_json_len);
+  tvm::runtime::Module mod_syslib =
+      (*tvm::runtime::Registry::Get("module._GetSystemLib"))();
+  int device_type = kDLCPU;
+  int device_id = 0;
+  tvm::runtime::Module mod =
+      (*tvm::runtime::Registry::Get("tvm.graph_runtime.create"))(
+          json_data, mod_syslib, device_type, device_id);
+  TVMByteArray params;
+  params.data = reinterpret_cast<const char *>(&build_params_bin[0]);
+  params.size = build_params_bin_len;
+  mod.GetFunction("load_params")(params);
+  return new tvm::runtime::Module(mod);
+}
+
+TVM_BUNDLE_FUNCTION void tvm_runtime_destroy(void *handle) {
+  delete reinterpret_cast<tvm::runtime::Module *>(handle);
+}
+
+TVM_BUNDLE_FUNCTION void tvm_runtime_set_input(void *handle, const char *name,
+                                               void *tensor) {
+  reinterpret_cast<tvm::runtime::Module *>(handle)->GetFunction("set_input")(
+      name, reinterpret_cast<DLTensor *>(tensor));
+}
+
+TVM_BUNDLE_FUNCTION void tvm_runtime_run(void *handle) {
+  reinterpret_cast<tvm::runtime::Module *>(handle)->GetFunction("run")();
+}
+
+TVM_BUNDLE_FUNCTION void tvm_runtime_get_output(void *handle, int index,
+                                                void *tensor) {
+  reinterpret_cast<tvm::runtime::Module *>(handle)->GetFunction("get_output")(
+      index, reinterpret_cast<DLTensor *>(tensor));
+}
diff --git a/apps/bundle_deploy/demo.cc b/apps/bundle_deploy/demo.cc
new file mode 100644
index 000000000000..c888edcee772
--- /dev/null
+++ b/apps/bundle_deploy/demo.cc
@@ -0,0 +1,66 @@
+#include "tvm/runtime/c_runtime_api.h"
+#include <assert.h>
+#include <dlfcn.h> //dlopen
+#include <dlpack/dlpack.h>
+#include <iostream>
+#include <random>
+#include <vector>
+
+template <typename F> auto getFunc(void *bundle, const char *name) {
+  dlerror();
+  auto *f =
+      reinterpret_cast<typename std::add_pointer<F>::type>(dlsym(bundle, name));
+  assert(!dlerror());
+  return f;
+}
+
+int main(int argc, char **argv) {
+  assert(argc == 2 && "Usage: demo <bundle.so>");
+  auto *bundle = dlopen(argv[1], RTLD_LAZY | RTLD_LOCAL);
+  assert(bundle);
+
+  auto *handle = getFunc<void *()>(bundle, "tvm_runtime_create")();
+
+  std::vector<float> input_storage(1 * 3 * 224 * 224);
+  std::mt19937 gen(0);
+  for (auto &e : input_storage) {
+    e = std::uniform_real_distribution<float>(0.0, 1.0)(gen);
+  }
+
+  std::vector<int64_t> input_shape = {1, 3, 224, 224};
+  DLTensor input;
+  input.data = input_storage.data();
+  input.ctx = DLContext{kDLCPU, 0};
+  input.ndim = 4;
+  input.dtype = DLDataType{kDLFloat, 32, 1};
+  input.shape = input_shape.data();
+  input.strides = nullptr;
+  input.byte_offset = 0;
+  getFunc<void(void *, const char *, void *)>(bundle, "tvm_runtime_set_input")(
+      handle, "data", &input);
+
+  auto *ftvm_runtime_run =
+      (auto (*)(void *)->void)dlsym(bundle, "tvm_runtime_run");
+  assert(!dlerror());
+  ftvm_runtime_run(handle);
+
+  std::vector<float> output_storage(1000);
+  std::vector<int64_t> output_shape = {1, 1000};
+  DLTensor output;
+  output.data = output_storage.data();
+  output.ctx = DLContext{kDLCPU, 0};
+  output.ndim = 2;
+  output.dtype = DLDataType{kDLFloat, 32, 1};
+  output.shape = output_shape.data();
+  output.strides = nullptr;
+  output.byte_offset = 0;
+
+  getFunc<void(void *, int, void *)>(bundle, "tvm_runtime_get_output")(
+      handle, 0, &output);
+  for (auto i = 0; i < output_storage.size(); ++i) {
+    std::cerr << "output[" << i << "]: " << output_storage[i] << std::endl;
+  }
+  getFunc<void(void *)>(bundle, "tvm_runtime_destroy")(handle);
+  dlclose(bundle);
+  return 0;
+}
diff --git a/apps/bundle_deploy/runtime.cc b/apps/bundle_deploy/runtime.cc
new file mode 100644
index 000000000000..2284953b8c16
--- /dev/null
+++ b/apps/bundle_deploy/runtime.cc
@@ -0,0 +1,17 @@
+#include <dlpack/dlpack.h>
+#include <tvm/runtime/module.h>
+#include <tvm/runtime/registry.h>
+#include <tvm/runtime/packed_func.h>
+
+#include "../../src/runtime/c_runtime_api.cc"
+#include "../../src/runtime/cpu_device_api.cc"
+#include "../../src/runtime/workspace_pool.cc"
+#include "../../src/runtime/module_util.cc"
+#include "../../src/runtime/module.cc"
+#include "../../src/runtime/registry.cc"
+#include "../../src/runtime/file_util.cc"
+#include "../../src/runtime/threading_backend.cc"
+#include "../../src/runtime/thread_pool.cc"
+#include "../../src/runtime/ndarray.cc"
+#include "../../src/runtime/system_lib_module.cc"
+#include "../../src/runtime/graph/graph_runtime.cc"
diff --git a/apps/extension/Makefile b/apps/extension/Makefile
index 29b9a1163f16..3a1f8a2160ee 100644
--- a/apps/extension/Makefile
+++ b/apps/extension/Makefile
@@ -2,9 +2,9 @@
 TVM_ROOT=$(shell cd ../..; pwd)
 PKG_CFLAGS = -std=c++11 -O2 -fPIC\
 	-I${TVM_ROOT}/include\
-	-I${TVM_ROOT}/dmlc-core/include\
-	-I${TVM_ROOT}/dlpack/include\
-	-I${TVM_ROOT}/HalideIR/src
+	-I${TVM_ROOT}/3rdparty/dmlc-core/include\
+	-I${TVM_ROOT}/3rdparty/dlpack/include\
+	-I${TVM_ROOT}/3rdparty/HalideIR/src
 
 PKG_LDFLAGS =-L${TVM_ROOT}/lib
 UNAME_S := $(shell uname -s)
diff --git a/apps/extension/python/tvm_ext/__init__.py b/apps/extension/python/tvm_ext/__init__.py
index 5045a9ec02e0..25286f67b4f5 100644
--- a/apps/extension/python/tvm_ext/__init__.py
+++ b/apps/extension/python/tvm_ext/__init__.py
@@ -8,7 +8,9 @@
 def load_lib():
     """Load library, the functions will be registered into TVM"""
     curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-    lib = ctypes.CDLL(os.path.join(curr_path, "../../lib/libtvm_ext.so"))
+    # load in as global so the global extern symbol is visible to other dll.
+    lib = ctypes.CDLL(
+        os.path.join(curr_path, "../../lib/libtvm_ext.so"), ctypes.RTLD_GLOBAL)
     return lib
 
 _LIB = load_lib()
diff --git a/apps/extension/src/tvm_ext.cc b/apps/extension/src/tvm_ext.cc
index bb8b4b694187..362ac62dea3d 100644
--- a/apps/extension/src/tvm_ext.cc
+++ b/apps/extension/src/tvm_ext.cc
@@ -66,6 +66,11 @@ TVM_REGISTER_GLOBAL("device_api.ext_dev")
   });
 }  // namespace tvm_ext
 
+// External function exposed to runtime.
+extern "C" float TVMTestAddOne(float y) {
+  return y + 1;
+}
+
 // This callback approach allows extension allows tvm to extract
 // This way can be helpful when we want to use a header only
 // minimum version of TVM Runtime.
diff --git a/apps/extension/tests/test_ext.py b/apps/extension/tests/test_ext.py
index 628602f0baea..def30803135e 100644
--- a/apps/extension/tests/test_ext.py
+++ b/apps/extension/tests/test_ext.py
@@ -22,7 +22,7 @@ def check_llvm():
         a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
         b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
         f(a, b)
-        np.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1)
+        tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1)
     check_llvm()
 
 
@@ -49,7 +49,27 @@ def test_extract_ext():
     assert fdict["mul"](3, 4) == 12
 
 
+def test_extern_call():
+    n = 10
+    A = tvm.placeholder((n,), name='A')
+    B = tvm.compute((n,), lambda *i: tvm.call_extern("float32", "TVMTestAddOne", A(*i)), name='B')
+    s = tvm.create_schedule(B.op)
+
+    def check_llvm():
+        if not tvm.module.enabled("llvm"):
+            return
+        f = tvm.build(s, [A, B], "llvm")
+        ctx = tvm.cpu(0)
+        # launch the kernel.
+        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
+        b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
+        f(a, b)
+        tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1)
+    check_llvm()
+
+
 if __name__ == "__main__":
+    test_extern_call()
     test_ext_dev()
     test_ext_vec()
     test_bind_add()
diff --git a/apps/howto_deploy/Makefile b/apps/howto_deploy/Makefile
index ad4e56680d21..7accb7dd64ae 100644
--- a/apps/howto_deploy/Makefile
+++ b/apps/howto_deploy/Makefile
@@ -1,12 +1,12 @@
 # Makefile Example to deploy TVM modules.
 TVM_ROOT=$(shell cd ../..; pwd)
 NNVM_PATH=nnvm
-DMLC_CORE=${TVM_ROOT}/dmlc-core
+DMLC_CORE=${TVM_ROOT}/3rdparty/dmlc-core
 
 PKG_CFLAGS = -std=c++11 -O2 -fPIC\
 	-I${TVM_ROOT}/include\
 	-I${DMLC_CORE}/include\
-	-I${TVM_ROOT}/dlpack/include\
+	-I${TVM_ROOT}/3rdparty/dlpack/include\
 
 PKG_LDFLAGS = -L${TVM_ROOT}/build -ldl -lpthread
 
diff --git a/apps/howto_deploy/cpp_deploy.cc b/apps/howto_deploy/cpp_deploy.cc
index 1fd22e5f2b5f..9a6c5ebca703 100644
--- a/apps/howto_deploy/cpp_deploy.cc
+++ b/apps/howto_deploy/cpp_deploy.cc
@@ -1,7 +1,7 @@
 /*!
  *  Copyright (c) 2017 by Contributors
  * \brief Example code on load and run TVM module.s
- * \file cpp_deploy_example.cc
+ * \file cpp_deploy.cc
  */
 #include <cstdio>
 #include <dlpack/dlpack.h>
diff --git a/apps/howto_deploy/tvm_runtime_pack.cc b/apps/howto_deploy/tvm_runtime_pack.cc
index 27f95e9e6065..c4b6e2a2d44e 100644
--- a/apps/howto_deploy/tvm_runtime_pack.cc
+++ b/apps/howto_deploy/tvm_runtime_pack.cc
@@ -8,8 +8,8 @@
  *  - Compile with -std=c++11
  *  - Add the following include path
  *     - /path/to/tvm/include/
- *     - /path/to/tvm/dmlc-core/include/
- *     - /path/to/tvm/dlpack/include/
+ *     - /path/to/tvm/3rdparty/dmlc-core/include/
+ *     - /path/to/tvm/3rdparty/dlpack/include/
  *   - Add -lpthread -ldl to the linked library.
  *   - You are good to go.
  *   - See the Makefile in the same folder for example.
diff --git a/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj b/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj
index d53ed6ba4cb9..60b6e99e7a92 100644
--- a/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj
+++ b/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj
@@ -386,8 +386,8 @@
 				GCC_SYMBOLS_PRIVATE_EXTERN = NO;
 				HEADER_SEARCH_PATHS = (
 					../../include,
-					../../dlpack/include,
-					"../../dmlc-core/include",
+					../../3rdparty/dlpack/include,
+					"../../3rdparty/dmlc-core/include",
 				);
 				INFOPLIST_FILE = tvmrpc/Info.plist;
 				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
@@ -406,8 +406,8 @@
 				GCC_SYMBOLS_PRIVATE_EXTERN = NO;
 				HEADER_SEARCH_PATHS = (
 					../../include,
-					../../dlpack/include,
-					"../../dmlc-core/include",
+					../../3rdparty/dlpack/include,
+					"../../3rdparty/dmlc-core/include",
 				);
 				INFOPLIST_FILE = tvmrpc/Info.plist;
 				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
@@ -422,9 +422,9 @@
 				BUNDLE_LOADER = "$(TEST_HOST)";
 				DEVELOPMENT_TEAM = 3FR42MXLK9;
 				HEADER_SEARCH_PATHS = (
-					../../dlpack/include,
+					../../3rdparty/dlpack/include,
 					../../include,
-					"../../dmlc-core/include",
+					"../../3rdparty/dmlc-core/include",
 				);
 				INFOPLIST_FILE = tvmrpcLauncher/Info.plist;
 				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks @loader_path/Frameworks";
@@ -440,9 +440,9 @@
 				BUNDLE_LOADER = "$(TEST_HOST)";
 				DEVELOPMENT_TEAM = 3FR42MXLK9;
 				HEADER_SEARCH_PATHS = (
-					../../dlpack/include,
+					../../3rdparty/dlpack/include,
 					../../include,
-					"../../dmlc-core/include",
+					"../../3rdparty/dmlc-core/include",
 				);
 				INFOPLIST_FILE = tvmrpcLauncher/Info.plist;
 				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks @loader_path/Frameworks";
diff --git a/apps/pynq_rpc/start_rpc_server.sh b/apps/pynq_rpc/start_rpc_server.sh
index 30b3c9a90d6b..2dce74472414 100755
--- a/apps/pynq_rpc/start_rpc_server.sh
+++ b/apps/pynq_rpc/start_rpc_server.sh
@@ -2,4 +2,5 @@
 PROJROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../../" && pwd )"
 
 export PYTHONPATH=${PYTHONPATH}:${PROJROOT}/python:${PROJROOT}/vta/python
-python -m vta.exec.rpc_server
+export PYTHONPATH=${PYTHONPATH}:/home/xilinx/pynq
+python3 -m vta.exec.rpc_server
diff --git a/apps/rocm_rpc/Makefile b/apps/rocm_rpc/Makefile
index b4e527980941..d4e3ec06ca99 100644
--- a/apps/rocm_rpc/Makefile
+++ b/apps/rocm_rpc/Makefile
@@ -3,12 +3,12 @@ ROCM_PATH=/opt/rocm
 
 TVM_ROOT=$(shell cd ../..; pwd)
 NNVM_PATH=nnvm
-DMLC_CORE=${TVM_ROOT}/dmlc-core
+DMLC_CORE=${TVM_ROOT}/3rdparty/dmlc-core
 
 PKG_CFLAGS = -std=c++11 -O2 -fPIC\
 	-I${TVM_ROOT}/include\
 	-I${DMLC_CORE}/include\
-	-I${TVM_ROOT}/dlpack/include\
+	-I${TVM_ROOT}/3rdparty/dlpack/include\
 	-I${ROCM_PATH}/include
 
 PKG_LDFLAGS = -L${ROCM_PATH}/lib -L${TVM_ROOT}/lib -ldl -lpthread -lhip_hcc -lMIOpen
diff --git a/apps/sgx/Makefile b/apps/sgx/Makefile
index cd7034d4c41b..422d3e4f03ab 100644
--- a/apps/sgx/Makefile
+++ b/apps/sgx/Makefile
@@ -1,13 +1,12 @@
-# Makefile for example to deploy TVM modules in SGX.
-
-TVM_ROOT := $(shell cd ../..; pwd)
-NNVM_PATH := nnvm
-DMLC_CORE := ${TVM_ROOT}/dmlc-core
-
 SGX_SDK ?= /opt/sgxsdk
+RUST_SGX_SDK ?= /opt/rust-sgx-sdk
 SGX_MODE ?= SIM
-SGX_ARCH ?= x64
-SGX_DEBUG ?= 1
+DEBUG ?= true
+NUM_THREADS ?= 4
+
+TVM_DIR ?= $(shell git rev-parse --show-toplevel)
+
+export
 
 sgx_edger8r := $(SGX_SDK)/bin/x64/sgx_edger8r
 sgx_enclave_signer := $(SGX_SDK)/bin/x64/sgx_sign
@@ -20,69 +19,71 @@ trts_library_name := sgx_trts$(sgx_sim)
 tservice_library_name := sgx_tservice$(sgx_sim)
 uservice_library_name := sgx_uae_service$(sgx_sim)
 
-pkg_cflags := -std=c++11 -O2 -fPIC\
-	-I${TVM_ROOT}/include\
-	-I${DMLC_CORE}/include\
-	-I${TVM_ROOT}/dlpack/include\
-	-I.\
-	-DDMLC_LOG_STACK_TRACE=0\
-	-fmax-errors=4
-
-pkg_ldflags := -L${TVM_ROOT}/lib
-
-enclave_include_paths := -I$(SGX_SDK)/include\
-	-I$(SGX_SDK)/include/tlibc\
-	-I$(SGX_SDK)/include/libcxx\
-	-I$(SGX_SDK)/include/stdc++\
+pkg_cflags := -std=c++11 -fPIC \
+	-I$(SGX_SDK)/include \
+	-I$(TVM_DIR)/include \
+	-I$(TVM_DIR)/dlpack/include \
+	-I$(TVM_DIR)/dmlc-core/include
+
+pkg_ldflags := -L$(TVM_DIR)/build -ltvm_runtime
+
+ifneq ($(DEBUG), false)
+	debug := debug
+	enclave_cflags += -Og -g
+	pkg_cflags += -Og -g
+else
+	debug := release
+	enclave_cflags += -O2
+	pkg_cflags += -O2
+endif
 
-enclave_cflags := -static -nostdinc\
-	-fvisibility=hidden -fpie -fstack-protector-strong\
-	-ffunction-sections -fdata-sections\
-	-DDMLC_CXX11_THREAD_LOCAL=0\
-	-include "lib/tvm_t.h"\
-	$(enclave_include_paths)\
+build_dir := build
 
-enclave_cxxflags := -nostdinc++ $(enclave_cflags) -DTVM_SGX_MAX_CONCURRENCY=4
+enclave_cflags := \
+	-I$(SGX_SDK)/include \
+	-I$(SGX_SDK)/include/tlibc \
+	-I$(SGX_SDK)/include/stdport \
+	-I$(SGX_SDK)/include/epid \
+	-I$(TVM_DIR)/include \
+	-I$(TVM_DIR)/dlpack/include \
+	-I$(TVM_DIR)/dmlc-core/include
 
 enclave_ldflags :=\
+	-L$(build_dir) -L$(TVM_DIR)/build \
 	-Wl,--no-undefined -nostdlib -nodefaultlibs -nostartfiles -L$(SGX_SDK)/lib64\
 	-Wl,--whole-archive -l$(trts_library_name) -Wl,--no-whole-archive\
 	-Wl,--start-group\
 	-lsgx_tstdc -lsgx_tstdcxx -lsgx_tcxx -lsgx_tcrypto -lsgx_tkey_exchange -l$(tservice_library_name)\
+	-lenclave -ltvm_t\
 	-Wl,--end-group\
 	-Wl,-Bstatic -Wl,-Bsymbolic -Wl,--no-undefined\
 	-Wl,-pie,-eenclave_entry -Wl,--export-dynamic\
-	-Wl,--defsym,__ImageBase=0 -Wl,--gc-sections
-
-.PHONY: clean all
+	-Wl,--defsym,__ImageBase=0 -Wl,--gc-sections\
+	-Wl,--version-script=enclave/enclave.lds
 
-all: lib/test_addone.signed.so
+.PHONY: enclave clean
 
-# The code library built by TVM
-lib/test_addone_sys.o: prepare_test_libs.py
-	python prepare_test_libs.py
+enclave: $(build_dir)/enclave.signed.so
 
-lib/tvm_t.h: ../../src/runtime/sgx/tvm.edl
-	$(sgx_edger8r) --trusted $< --trusted-dir lib --search-path $(SGX_SDK)/include
-	mv $@ $@.in
-	awk 'NR==4{print "#include <tvm/runtime/c_runtime_api.h>"}1' $@.in > $@
+$(build_dir)/enclave.signed.so: $(build_dir)/enclave.so build/enclave_config.xml enclave/enclave.pem
+	$(sgx_enclave_signer) sign -key enclave/enclave.pem -enclave $< -out $@ -config build/enclave_config.xml
 
-lib/tvm_t.c: lib/tvm_t.h
+enclave/enclave.pem:
+	curl -sSo $@ 'https://gist.githubusercontent.com/nhynes/8a2d80068a92e672f8b0b7d710ceb404/raw/2d5ae5fbe83198ede49465fdc6535065e093543b/tvm_sgx_demo.pem'
 
-lib/tvm_t.o: lib/tvm_t.c
-	$(CC) $(enclave_cflags) $(pkg_cflags) -c $< -o $@ -include $(TVM_ROOT)/include/tvm/runtime/c_runtime_api.h
+build/enclave_config.xml: enclave/enclave_config.xml.in
+	cpp $^ -P -o $@ -DNUM_THREADS=$$(( $(NUM_THREADS) + 1 ))
 
-# The enclave library
-lib/test_addone.so: $(TVM_ROOT)/src/runtime/sgx/trusted/runtime.cc lib/tvm_t.o lib/test_addone_sys.o
-	$(CXX) $^ -o $@ $(pkg_cflags) $(pkg_ldflags) $(enclave_cxxflags) $(enclave_ldflags) -g
+$(build_dir)/enclave.so: $(build_dir)/libenclave.a $(TVM_DIR)/build/libtvm_t.a
+	$(CXX) $< -o $@ $(enclave_ldflags) $(enclave_cflags) -ltvm_t
 
-# The demo enclave signing key
-lib/enclave.pem:
-	curl -Lso $@ https://gist.githubusercontent.com/nhynes/8a2d80068a92e672f8b0b7d710ceb404/raw/2d5ae5fbe83198ede49465fdc6535065e093543b/tvm_sgx_demo.pem
+$(build_dir)/libenclave.a: enclave/target/x86_64-unknown-linux-sgx/$(debug)/libmodel_enclave.a
+	@mkdir -p $(@D)
+	@cp $< $@
 
-# The signed enclave
-lib/test_addone.signed.so: lib/test_addone.so enclave_config.xml lib/enclave.pem
-	$(sgx_enclave_signer) sign -key lib/enclave.pem -enclave $< -out $@ -config enclave_config.xml
+enclave/target/x86_64-unknown-linux-sgx/$(debug)/libmodel_enclave.a: enclave/**/*
+	$(MAKE) -C enclave
 
 clean:
-	rm -rf lib
+	$(MAKE) -s -C enclave clean
+	rm -rf build
diff --git a/apps/sgx/README.md b/apps/sgx/README.md
index 565519d457ce..10989ba4b90d 100644
--- a/apps/sgx/README.md
+++ b/apps/sgx/README.md
@@ -4,13 +4,41 @@ This application demonstrates the use of a simple TVM model in the [Intel SGX](h
 
 ## Prerequisites
 
+1. The TVM premade Docker image
+
+or
+
 1. A GNU/Linux environment
 2. TVM compiled with LLVM and SGX; and the `tvm` Python module
 3. The [Linux SGX SDK](https://github.com/intel/linux-sgx) [link to pre-built libraries](https://01.org/intel-software-guard-extensions/downloads)
+4. [Rust](https://rustup.sh)
+5. The [rust-sgx-sdk](https://github.com/baidu/rust-sgx-sdk)
+6. [xargo](https://github.com/japaric/xargo)
+
+Check out the `/tvm/install/ubuntu_install_sgx.sh` for the commands to get these dependencies.
 
 ## Running the example
 
-`SGX_SDK=/path/to/sgxsdk bash run_example.sh`
+If using Docker, start by running
+
+```
+git clone --recursive https://github.com/dmlc/tvm.git
+docker run --rm -it -v $(pwd)/tvm:/mnt tvmai/ci-cpu /bin/bash
+```
+then, in the container
+```
+cd /mnt
+mkdir build && cd build
+cmake .. -DUSE_LLVM=ON -DUSE_SGX=/opt/sgxsdk -DRUST_SGX_SDK=/opt/rust-sgx-sdk
+make -j4
+cd ..
+pip install -e python -e topi/python -e nnvm/python
+cd apps/sgx
+```
+
+Once TVM is build and installed, just
+
+`./run_example.sh`
 
 If everything goes well, you should see a lot of build messages and below them
 the text `It works!`.
@@ -24,10 +52,9 @@ In this library, one can use other libraries like TVM.
 Building this example performs the following steps:
 
 1. Creates a simple TVM module that computes `x + 1` and save it as a system library.
-2. Builds a minimal TVM runtime pack that can load the module.
-3. Links the TVM module into an SGX enclave along with some code that runs the module.
-4. Compiles and runs an executable that loads the enclave and calls a function
-   which invokes the TVM module.
+2. Builds a TVM runtime that links the module and allows running it using the TVM Python runtime.
+3. Packages the bundle into an SGX enclave
+4. Runs the enclave using the usual TVM Python `module` API
 
 For more information on building, please refer to the `Makefile`.  
 For more information on the TVM module, please refer to `../howto_deploy`.  
diff --git a/apps/sgx/enclave/.rustfmt.toml b/apps/sgx/enclave/.rustfmt.toml
new file mode 120000
index 000000000000..ec1baa2f89be
--- /dev/null
+++ b/apps/sgx/enclave/.rustfmt.toml
@@ -0,0 +1 @@
+../../../rust/.rustfmt.toml
\ No newline at end of file
diff --git a/apps/sgx/enclave/Cargo.toml b/apps/sgx/enclave/Cargo.toml
new file mode 100644
index 000000000000..cb128f3fbf94
--- /dev/null
+++ b/apps/sgx/enclave/Cargo.toml
@@ -0,0 +1,15 @@
+[package]
+name = "model-enclave"
+version = "0.1.0"
+authors = ["Nick Hynes <nhynes@berkeley.edu>"]
+
+[lib]
+crate-type = ["staticlib"]
+
+[dependencies]
+lazy_static = "1.1.0"
+tvm = { path = "../../../rust", default-features = false, features = ["sgx"] }
+
+[profile.release]
+lto = true
+opt-level = 3
diff --git a/apps/sgx/enclave/Makefile b/apps/sgx/enclave/Makefile
new file mode 100644
index 000000000000..a28e05e03b13
--- /dev/null
+++ b/apps/sgx/enclave/Makefile
@@ -0,0 +1,42 @@
+MODEL ?= resnet
+NUM_THREADS ?= 4
+BATCH_SIZE ?= 64
+TRAINING ?= true
+DEBUG ?= false
+
+build_dir := ../build
+
+ifeq ($(DEBUG), false)
+	debug := release
+	xargo_args := --release
+else
+	debug := debug
+endif
+
+target=target/x86_64-unknown-linux-sgx/$(debug)/libmodel-enclave.a
+
+$(target): $(build_dir)/libmodel.a **/* $(TVM_DIR)/rust/patched.txt
+	RUST_TARGET_PATH=$(shell pwd) \
+		RUST_TARGET_DIR=$(shell pwd)/target \
+		RUSTFLAGS="-Z force-unstable-if-unmarked" \
+		TVM_NUM_THREADS=$(NUM_THREADS) \
+		BUILD_DIR=../build \
+		xargo build --target x86_64-unknown-linux-sgx $(xargo_args) -q
+
+$(TVM_DIR)/rust/patched.txt: $(shell pwd)/sgx-deps.diff
+	echo $(TVM_DIR)
+	cd $(TVM_DIR) && git apply $<
+	touch $@
+
+$(build_dir)/libmodel.a: $(build_dir)/model.o
+	$(AR) cr $@ $^
+
+$(build_dir)/model.o: $(build_dir)/model.bc
+	$(CC) -c $< -o $@ -fPIC -O3
+	objcopy --globalize-symbol __tvm_module_startup $@
+
+$(build_dir)/model.bc: src/build_model.py
+	python3 $< -o $(build_dir)
+
+clean:
+	xargo clean
diff --git a/apps/sgx/enclave/Xargo.toml b/apps/sgx/enclave/Xargo.toml
new file mode 100644
index 000000000000..57acf092b4d6
--- /dev/null
+++ b/apps/sgx/enclave/Xargo.toml
@@ -0,0 +1,13 @@
+[dependencies]
+alloc = {}
+panic_unwind = {}
+panic_abort = {}
+
+[dependencies.std]
+path = "/opt/rust-sgx-sdk/xargo/sgx_tstd"
+features = ["backtrace", "stdio", "untrusted_time"]
+stage = 2
+
+[dependencies.xargo_sgx_rand]
+path = "/opt/rust-sgx-sdk/xargo/sgx_rand"
+stage = 3
diff --git a/apps/sgx/enclave/build.rs b/apps/sgx/enclave/build.rs
new file mode 100644
index 000000000000..a3beedaacda6
--- /dev/null
+++ b/apps/sgx/enclave/build.rs
@@ -0,0 +1,9 @@
+use std::env;
+
+fn main() {
+  println!(
+    "cargo:rustc-link-search=native={}",
+    env::var("BUILD_DIR").unwrap()
+  );
+  println!("cargo:rustc-link-lib=static=model");
+}
diff --git a/apps/sgx/enclave/enclave.lds b/apps/sgx/enclave/enclave.lds
new file mode 100644
index 000000000000..e3d9d0ee0d90
--- /dev/null
+++ b/apps/sgx/enclave/enclave.lds
@@ -0,0 +1,9 @@
+enclave.so
+{
+    global:
+        g_global_data_sim;
+        g_global_data;
+        enclave_entry;
+    local:
+        *;
+};
diff --git a/apps/sgx/enclave_config.xml b/apps/sgx/enclave/enclave_config.xml.in
similarity index 50%
rename from apps/sgx/enclave_config.xml
rename to apps/sgx/enclave/enclave_config.xml.in
index 07be0d7a7ad2..630c84c2cc31 100644
--- a/apps/sgx/enclave_config.xml
+++ b/apps/sgx/enclave/enclave_config.xml.in
@@ -1,10 +1,10 @@
 <EnclaveConfiguration>
   <ProdID>0</ProdID>
   <ISVSVN>0</ISVSVN>
-  <StackMaxSize>0x2000</StackMaxSize>
-  <HeapMaxSize>0x2000</HeapMaxSize>
-  <TCSNum>5</TCSNum>
-  <TCSPolicy>1</TCSPolicy>
+  <StackMaxSize>0xf0000</StackMaxSize>
+  <HeapMaxSize>0xf000000</HeapMaxSize>
+  <TCSNum>NUM_THREADS</TCSNum>
+  <TCSPolicy>0</TCSPolicy> <!-- must be "bound" to use thread_local -->
   <DisableDebug>0</DisableDebug>
   <MiscSelect>0</MiscSelect>
   <MiscMask>0xFFFFFFFF</MiscMask>
diff --git a/apps/sgx/enclave/sgx-deps.diff b/apps/sgx/enclave/sgx-deps.diff
new file mode 100644
index 000000000000..1c67e7957f38
--- /dev/null
+++ b/apps/sgx/enclave/sgx-deps.diff
@@ -0,0 +1,13 @@
+diff --git a/rust/Cargo.toml b/rust/Cargo.toml
+index 0819e0c7..e56f4ef2 100644
+--- a/rust/Cargo.toml
++++ b/rust/Cargo.toml
+@@ -14,7 +14,7 @@ default = ["nom/std"]
+ sgx = ["nom/alloc"]
+ 
+ [dependencies]
+-bounded-spsc-queue = "0.4.0"
++bounded-spsc-queue = { git = "https://github.com/nhynes/bounded-spsc-queue", branch = "sgx" }
+ error-chain = { version = "0.12.0", default-features = false }
+ itertools = "0.7.8"
+ lazy_static = "1.1.0"
diff --git a/apps/sgx/enclave/src/build_model.py b/apps/sgx/enclave/src/build_model.py
new file mode 100644
index 000000000000..d1b45cc4a4df
--- /dev/null
+++ b/apps/sgx/enclave/src/build_model.py
@@ -0,0 +1,38 @@
+"""Creates a simple TVM modules."""
+
+import argparse
+import os
+from os import path as osp
+
+import nnvm.compiler
+import nnvm.testing
+import tvm
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-o', '--out-dir', default='.')
+    opts = parser.parse_args()
+
+    # from tutorials/nnvm_quick_start.py
+    dshape = (1, 3, 224, 224)
+    net, params = nnvm.testing.resnet.get_workload(
+        layers=18, batch_size=dshape[0], image_shape=dshape[1:])
+
+    with nnvm.compiler.build_config(opt_level=3):
+        graph, lib, params = nnvm.compiler.build(
+            net, 'llvm --system-lib', shape={'data': dshape}, params=params)
+
+    build_dir = osp.abspath(opts.out_dir)
+    if not osp.isdir(build_dir):
+        os.makedirs(build_dir, exist_ok=True)
+
+    lib.save(osp.join(build_dir, 'model.bc'))
+    with open(osp.join(build_dir, 'graph.json'), 'w') as f_graph_json:
+        f_graph_json.write(graph.json())
+        with open(osp.join(build_dir, 'params.bin'), 'wb') as f_params:
+            f_params.write(nnvm.compiler.save_param_dict(params))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/apps/sgx/enclave/src/lib.rs b/apps/sgx/enclave/src/lib.rs
new file mode 100644
index 000000000000..3310040d3657
--- /dev/null
+++ b/apps/sgx/enclave/src/lib.rs
@@ -0,0 +1,137 @@
+#![feature(try_from)]
+
+#[macro_use]
+extern crate lazy_static;
+#[macro_use]
+extern crate tvm;
+
+use std::{
+  convert::{TryFrom, TryInto},
+  sync::Mutex,
+};
+
+use tvm::{
+  ffi::runtime::DLTensor,
+  runtime::{
+    load_param_dict, sgx, Graph, GraphExecutor, SystemLibModule, TVMArgValue, TVMRetValue, Tensor,
+  },
+};
+
+lazy_static! {
+  static ref SYSLIB: SystemLibModule = { SystemLibModule::default() };
+  static ref MODEL: Mutex<GraphExecutor<'static, 'static>> = {
+    let graph_json = include_str!(concat!("../", env!("BUILD_DIR"), "/graph.json"));
+    let params_bytes = include_bytes!(concat!("../", env!("BUILD_DIR"), "/params.bin"));
+    let params = load_param_dict(params_bytes).unwrap();
+
+    let graph = Graph::try_from(graph_json).unwrap();
+    let mut exec = GraphExecutor::new(graph, &*SYSLIB).unwrap();
+    exec.load_params(params);
+    Mutex::new(exec)
+  };
+}
+
+fn ecall_init(_args: &[TVMArgValue]) -> TVMRetValue {
+  lazy_static::initialize(&MODEL);
+  TVMRetValue::from(0)
+}
+
+fn ecall_main(args: &[TVMArgValue<'static>]) -> TVMRetValue {
+  let mut model = MODEL.lock().unwrap();
+  let inp = args[0].try_into().unwrap();
+  let mut out: Tensor = args[1].try_into().unwrap();
+  model.set_input("data", inp);
+  model.run();
+  sgx::shutdown();
+  out.copy(model.get_output(0).unwrap());
+  TVMRetValue::from(1)
+}
+
+pub mod ecalls {
+  //! todo: generate this using proc_macros
+
+  use super::*;
+
+  use std::{
+    ffi::CString,
+    mem,
+    os::raw::{c_char, c_int, c_void},
+    slice,
+  };
+
+  use tvm::{
+    ffi::runtime::{TVMRetValueHandle, TVMValue},
+    runtime::{
+      sgx::{ocall_packed_func, run_worker, SgxStatus},
+      DataType, PackedFunc,
+    },
+  };
+
+  macro_rules! tvm_ocall {
+    ($func: expr) => {
+      match $func {
+        0 => Ok(()),
+        err => Err(err),
+      }
+    };
+  }
+
+  const ECALLS: &'static [&'static str] = &["__tvm_run_worker__", "__tvm_main__", "init"];
+
+  pub type EcallPackedFunc = Box<Fn(&[TVMArgValue<'static>]) -> TVMRetValue + Send + Sync>;
+
+  lazy_static! {
+    static ref ECALL_FUNCS: Vec<EcallPackedFunc> = {
+      vec![
+        Box::new(run_worker),
+        Box::new(ecall_main),
+        Box::new(ecall_init),
+      ]
+    };
+  }
+
+  extern "C" {
+    fn __tvm_module_startup() -> ();
+    fn tvm_ocall_register_export(name: *const c_char, func_id: c_int) -> SgxStatus;
+  }
+
+  #[no_mangle]
+  pub extern "C" fn tvm_ecall_init(_ret: TVMRetValueHandle) {
+    unsafe {
+      __tvm_module_startup();
+
+      ECALLS.into_iter().enumerate().for_each(|(i, ecall)| {
+        tvm_ocall!(tvm_ocall_register_export(
+          CString::new(*ecall).unwrap().as_ptr(),
+          i as i32
+        ))
+        .expect(&format!("Error registering `{}`", ecall));
+      });
+    }
+  }
+
+  #[no_mangle]
+  pub extern "C" fn tvm_ecall_packed_func(
+    func_id: c_int,
+    arg_values: *const TVMValue,
+    type_codes: *const c_int,
+    num_args: c_int,
+    ret_val: *mut TVMValue,
+    ret_type_code: *mut i64,
+  ) {
+    let args = unsafe {
+      let values = slice::from_raw_parts(arg_values, num_args as usize);
+      let type_codes = slice::from_raw_parts(type_codes, num_args as usize);
+      values
+        .into_iter()
+        .zip(type_codes.into_iter())
+        .map(|(v, t)| TVMArgValue::new(*v, *t as i64))
+        .collect::<Vec<TVMArgValue<'static>>>()
+    };
+    let (rv, tc) = ECALL_FUNCS[func_id as usize](&args).into_tvm_value();
+    unsafe {
+      *ret_val = rv;
+      *ret_type_code = tc;
+    }
+  }
+}
diff --git a/apps/sgx/enclave/x86_64-unknown-linux-sgx.json b/apps/sgx/enclave/x86_64-unknown-linux-sgx.json
new file mode 100644
index 000000000000..6cbb524f4439
--- /dev/null
+++ b/apps/sgx/enclave/x86_64-unknown-linux-sgx.json
@@ -0,0 +1,31 @@
+{
+  "arch": "x86_64",
+  "cpu": "x86-64",
+  "data-layout": "e-m:e-i64:64-f80:128-n8:16:32:64-S128",
+  "dynamic-linking": true,
+  "env": "sgx",
+  "exe-allocation-crate": "alloc_system",
+  "executables": true,
+  "has-elf-tls": true,
+  "has-rpath": true,
+  "linker-flavor": "gcc",
+  "linker-is-gnu": true,
+  "llvm-target": "x86_64-unknown-linux-gnu",
+  "max-atomic-width": 64,
+  "os": "linux",
+  "position-independent-executables": true,
+  "pre-link-args": {
+    "gcc": [
+      "-Wl,--as-needed",
+      "-Wl,-z,noexecstack",
+      "-m64"
+    ]
+  },
+  "relro-level": "full",
+  "stack-probes": true,
+  "target-c-int-width": "32",
+  "target-endian": "little",
+  "target-family": "unix",
+  "target-pointer-width": "64",
+  "vendor": "unknown"
+}
diff --git a/apps/sgx/prepare_test_libs.py b/apps/sgx/prepare_test_libs.py
deleted file mode 100644
index f676f46b7ff0..000000000000
--- a/apps/sgx/prepare_test_libs.py
+++ /dev/null
@@ -1,26 +0,0 @@
-"""Script to prepare test_addone_sys.o"""
-
-from os import path as osp
-
-import tvm
-
-CWD = osp.dirname(osp.abspath(osp.expanduser(__file__)))
-
-
-def main():
-    out_dir = osp.join(CWD, 'lib')
-
-    n = tvm.var('n')
-    A = tvm.placeholder((n,), name='A')
-    B = tvm.compute(A.shape, lambda *i: A(*i) + 1, name='B')
-    s = tvm.create_schedule(B.op)
-    s[B].parallel(s[B].op.axis[0])
-    print(tvm.lower(s, [A, B], simple_mode=True))
-
-    # Compile library in system library mode
-    fadd_syslib = tvm.build(s, [A, B], 'llvm --system-lib')
-    fadd_syslib.save(osp.join(out_dir, 'test_addone_sys.o'))
-
-
-if __name__ == '__main__':
-    main()
diff --git a/apps/sgx/run_example.sh b/apps/sgx/run_example.sh
index 9334b260cbf3..811da3938dd6 100755
--- a/apps/sgx/run_example.sh
+++ b/apps/sgx/run_example.sh
@@ -1,6 +1,10 @@
 #!/bin/bash
 
 sgx_sdk=${SGX_SDK:=/opt/sgxsdk}
-make
-echo "========================="
-LD_LIBRARY_PATH="$sgx_sdk/lib64":${LD_LIBRARY_PATH} TVM_CACHE_DIR=/tmp python test_addone.py
+
+export LD_LIBRARY_PATH="$sgx_sdk/lib64":${LD_LIBRARY_PATH}
+export CC=clang-6.0
+export AR=llvm-ar-6.0
+export TVM_CACHE_DIR=/tmp
+
+make && printf "\n" && python3 run_model.py
diff --git a/apps/sgx/run_model.py b/apps/sgx/run_model.py
new file mode 100644
index 000000000000..232a03524801
--- /dev/null
+++ b/apps/sgx/run_model.py
@@ -0,0 +1,22 @@
+import os.path as osp
+import numpy as np
+import tvm
+
+CWD = osp.abspath(osp.dirname(__file__))
+
+
+def main():
+    ctx = tvm.context('cpu', 0)
+    model = tvm.module.load(osp.join(CWD, 'build', 'enclave.signed.so'))
+    inp = tvm.nd.array(np.ones((1, 3, 224, 224), dtype='float32'), ctx)
+    out = tvm.nd.array(np.empty((1, 1000), dtype='float32'), ctx)
+    model(inp, out)
+    if abs(out.asnumpy().sum() - 1) < 0.001:
+        print('It works!')
+    else:
+        print('It doesn\'t work!')
+        exit(1)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/apps/sgx/test_addone.py b/apps/sgx/test_addone.py
deleted file mode 100644
index 5ddccfa425cc..000000000000
--- a/apps/sgx/test_addone.py
+++ /dev/null
@@ -1,13 +0,0 @@
-import tvm
-import numpy as np
-
-ctx = tvm.context('cpu', 0)
-fadd1 = tvm.module.load('lib/test_addone.signed.so')
-
-n = 10
-x = tvm.nd.array(np.random.uniform(size=n).astype('float32'), ctx)
-y = tvm.nd.array(np.zeros(n, dtype='float32'), ctx)
-fadd1(x, y)
-
-np.testing.assert_allclose(y.asnumpy(), x.asnumpy() + 1)
-print("It works!")
diff --git a/cmake/config.cmake b/cmake/config.cmake
index 85c5102169a9..a97def410ddd 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -62,9 +62,23 @@ set(USE_VULKAN OFF)
 # Whether enable OpenGL runtime
 set(USE_OPENGL OFF)
 
+# Whether to enable SGX runtime
+#
+# Possible values for USE_SGX:
+# - /path/to/sgxsdk: path to Intel SGX SDK
+# - OFF: disable SGX
+#
+# SGX_MODE := HW|SIM
+set(USE_SGX OFF)
+set(SGX_MODE "SIM")
+set(RUST_SGX_SDK "/path/to/rust-sgx-sdk")
+
 # Whether enable RPC runtime
 set(USE_RPC ON)
 
+# Whether embed stackvm into the runtime
+set(USE_STACKVM_RUNTIME OFF)
+
 # Whether enable tiny embedded graph runtime.
 set(USE_GRAPH_RUNTIME ON)
 
@@ -114,3 +128,6 @@ set(USE_ROCBLAS OFF)
 
 # Whether use contrib sort
 set(USE_SORT OFF)
+
+# Build ANTLR parser for Relay text format
+set(USE_ANTLR OFF)
diff --git a/cmake/modules/ANTLR.cmake b/cmake/modules/ANTLR.cmake
new file mode 100644
index 000000000000..72eb5925bda0
--- /dev/null
+++ b/cmake/modules/ANTLR.cmake
@@ -0,0 +1,28 @@
+if(USE_ANTLR)
+  if(EXISTS /usr/local/lib/antlr-4.7.1-complete.jar)
+    set(ANTLR4 "/usr/local/lib/antlr-4.7.1-complete.jar")
+
+    set(RELAY_PARSER_DIR
+      ${CMAKE_CURRENT_SOURCE_DIR}/python/tvm/relay/grammar)
+
+    set(RELAY_PARSER
+      ${RELAY_PARSER_DIR}/py2/RelayVisitor.py
+      ${RELAY_PARSER_DIR}/py2/RelayParser.py
+      ${RELAY_PARSER_DIR}/py2/RelayLexer.py
+
+      ${RELAY_PARSER_DIR}/py3/RelayVisitor.py
+      ${RELAY_PARSER_DIR}/py3/RelayParser.py
+      ${RELAY_PARSER_DIR}/py3/RelayLexer.py)
+
+    # Generate ANTLR grammar for parsing.
+    add_custom_command(OUTPUT ${RELAY_PARSER}
+      COMMAND $ENV{JAVA_HOME}/bin/java -jar ${ANTLR4} -visitor -no-listener -Dlanguage=Python2 ${RELAY_PARSER_DIR}/Relay.g4 -o ${RELAY_PARSER_DIR}/py2
+      COMMAND $ENV{JAVA_HOME}/bin/java -jar ${ANTLR4} -visitor -no-listener -Dlanguage=Python3 ${RELAY_PARSER_DIR}/Relay.g4 -o ${RELAY_PARSER_DIR}/py3
+      DEPENDS ${RELAY_PARSER_DIR}/Relay.g4
+      WORKING_DIRECTORY ${RELAY_PARSER_DIR})
+
+    add_custom_target(relay_parser ALL DEPENDS ${RELAY_PARSER})
+  else()
+    message(FATAL_ERROR "Can't find ANTLR4!")
+  endif()
+endif(USE_ANTLR)
diff --git a/cmake/modules/SGX.cmake b/cmake/modules/SGX.cmake
new file mode 100644
index 000000000000..608d6ff5a4bd
--- /dev/null
+++ b/cmake/modules/SGX.cmake
@@ -0,0 +1,51 @@
+if(NOT USE_SGX STREQUAL "OFF")
+
+  set(_sgx_src ${CMAKE_CURRENT_SOURCE_DIR}/src/runtime/sgx)
+  set(_tvm_u_h ${_sgx_src}/untrusted/tvm_u.h)
+  set(_tvm_t_h ${_sgx_src}/trusted/tvm_t.h)
+  set(_tvm_t_c ${_sgx_src}/trusted/tvm_t.c)
+  set(_tvm_edl ${_sgx_src}/tvm.edl)
+  set(_sgx_ustdc ${RUST_SGX_SDK}/sgx_ustdc)
+
+  set(_urts_lib "sgx_urts")
+  if(NOT SGX_MODE STREQUAL "HW")
+    message(STATUS "Build with SGX support (SIM)")
+    set(_urts_lib "${_urts_lib}_sim")
+  else()
+    message(STATUS "Build with SGX support (HW)")
+  endif()
+
+  # build edge routines
+  add_custom_command(
+    OUTPUT ${_tvm_u_h}
+    COMMAND ${USE_SGX}/bin/x64/sgx_edger8r --untrusted
+      --untrusted --untrusted-dir ${_sgx_src}/untrusted
+      --trusted --trusted-dir ${_sgx_src}/trusted
+      --search-path ${USE_SGX}/include --search-path ${RUST_SGX_SDK}/edl
+      ${_tvm_edl}
+    COMMAND sed -i "4i '#include <tvm/runtime/c_runtime_api.h>'" ${_tvm_u_h}
+    COMMAND sed -i "4i '#include <tvm/runtime/c_runtime_api.h>'" ${_tvm_t_h}
+    DEPENDS ${_tvm_edl}
+  )
+  add_custom_command(
+    OUTPUT ${_sgx_ustdc}/libsgx_ustdc.a
+    COMMAND make
+    WORKING_DIRECTORY ${_sgx_ustdc}
+  )
+  add_custom_target(sgx_edl DEPENDS ${_tvm_u_h} ${_sgx_ustdc}/libsgx_ustdc.a)
+
+  # build trusted library
+  set_source_files_properties(${_tvm_t_c} PROPERTIES GENERATED TRUE)
+  add_library(tvm_t STATIC ${_tvm_t_c})
+  add_dependencies(tvm_t sgx_edl)
+  target_include_directories(tvm_t PUBLIC ${USE_SGX}/include ${USE_SGX}/include/tlibc)
+
+  # add untrusted runtime files
+  include_directories(${USE_SGX}/include)
+  file(GLOB RUNTIME_SGX_SRCS ${_sgx_src}/untrusted/*.c*)
+  list(APPEND TVM_RUNTIME_LINKER_LIBS
+    -lpthread
+    -L${USE_SGX}/lib64 -l${_urts_lib}
+    -L${RUST_SGX_SDK}/sgx_ustdc -lsgx_ustdc)
+  list(APPEND RUNTIME_SRCS ${RUNTIME_SGX_SRCS})
+endif()
diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake
index 43fb700203c7..ea5bb5ae916a 100644
--- a/cmake/modules/VTA.cmake
+++ b/cmake/modules/VTA.cmake
@@ -38,13 +38,10 @@ elseif(PYTHON)
     set_target_properties(vta PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
   endif(APPLE)
 
-  # PYNQ rules
+  # PYNQ rules for Pynq v2.3
   if(${VTA_TARGET} STREQUAL "pynq")
-    find_library(__sds_lib NAMES sds_lib PATHS /usr/lib)
-    find_library(__dma_lib NAMES dma PATHS
-      "/opt/python3.6/lib/python3.6/site-packages/pynq/drivers/"
-      "/opt/python3.6/lib/python3.6/site-packages/pynq/lib/")
-    target_link_libraries(vta ${__sds_lib} ${__dma_lib})
+    find_library(__cma_lib NAMES cma PATH /usr/lib)
+    target_link_libraries(vta ${__cma_lib})
   endif()
 else()
   message(STATUS "Cannot found python in env, VTA build is skipped..")
diff --git a/cmake/modules/contrib/NNPack.cmake b/cmake/modules/contrib/NNPack.cmake
index 82de88a21e63..4bf844d0c468 100644
--- a/cmake/modules/contrib/NNPack.cmake
+++ b/cmake/modules/contrib/NNPack.cmake
@@ -9,6 +9,10 @@ if(USE_NNPACK)
 	include_directories(${PTHREAD_POOL_PATH}/include)
     find_library(NNPACK_CONTRIB_LIB nnpack ${NNPACK_PATH}/lib)
   find_library(NNPACK_PTHREAD_CONTRIB_LIB pthreadpool ${NNPACK_PATH}/lib)
+  find_library(NNPACK_CPUINFO_CONTRIB_LIB cpuinfo ${NNPACK_PATH}/lib)
+  find_library(NNPACK_CLOG_CONTRIB_LIB clog ${NNPACK_PATH}/lib)
   list(APPEND TVM_RUNTIME_LINKER_LIBS ${NNPACK_CONTRIB_LIB})
   list(APPEND TVM_RUNTIME_LINKER_LIBS ${NNPACK_PTHREAD_CONTRIB_LIB})
+  list(APPEND TVM_RUNTIME_LINKER_LIBS ${NNPACK_CPUINFO_CONTRIB_LIB})
+  list(APPEND TVM_RUNTIME_LINKER_LIBS ${NNPACK_CLOG_CONTRIB_LIB})
 endif(USE_NNPACK)
diff --git a/cmake/util/FindCUDA.cmake b/cmake/util/FindCUDA.cmake
index 3ce0cc40a5e5..3a99551358f6 100644
--- a/cmake/util/FindCUDA.cmake
+++ b/cmake/util/FindCUDA.cmake
@@ -56,13 +56,15 @@ macro(find_cuda use_cuda)
     else(MSVC)
       find_library(_CUDA_CUDA_LIBRARY cuda
         PATHS ${CUDA_TOOLKIT_ROOT_DIR}
-        PATH_SUFFIXES lib lib64 targets/x86_64-linux/lib targets/x86_64-linux/lib/stubs)
+        PATH_SUFFIXES lib lib64 targets/x86_64-linux/lib targets/x86_64-linux/lib/stubs lib64/stubs
+        NO_DEFAULT_PATH)
       if(_CUDA_CUDA_LIBRARY)
         set(CUDA_CUDA_LIBRARY ${_CUDA_CUDA_LIBRARY})
       endif()
       find_library(CUDA_NVRTC_LIBRARY nvrtc
         PATHS ${CUDA_TOOLKIT_ROOT_DIR}
-        PATH_SUFFIXES lib lib64 targets/x86_64-linux/lib targets/x86_64-linux/lib/stubs)
+        PATH_SUFFIXES lib lib64 targets/x86_64-linux/lib targets/x86_64-linux/lib/stubs lib64/stubs lib/x86_64-linux-gnu
+        NO_DEFAULT_PATH)
       find_library(CUDA_CUDNN_LIBRARY cudnn
         ${CUDA_TOOLKIT_ROOT_DIR}/lib64
         ${CUDA_TOOLKIT_ROOT_DIR}/lib)
@@ -70,5 +72,11 @@ macro(find_cuda use_cuda)
         ${CUDA_TOOLKIT_ROOT_DIR}/lib64
         ${CUDA_TOOLKIT_ROOT_DIR}/lib)
     endif(MSVC)
+    message(STATUS "Found CUDA_TOOLKIT_ROOT_DIR=" ${CUDA_TOOLKIT_ROOT_DIR})
+    message(STATUS "Found CUDA_CUDA_LIBRARY=" ${CUDA_CUDA_LIBRARY})
+    message(STATUS "Found CUDA_CUDART_LIBRARY=" ${CUDA_CUDART_LIBRARY})
+    message(STATUS "Found CUDA_NVRTC_LIBRARY=" ${CUDA_NVRTC_LIBRARY})
+    message(STATUS "Found CUDA_CUDNN_LIBRARY=" ${CUDA_CUDNN_LIBRARY})
+    message(STATUS "Found CUDA_CUBLAS_LIBRARY=" ${CUDA_CUBLAS_LIBRARY})
   endif(CUDA_FOUND)
 endmacro(find_cuda)
diff --git a/cmake/util/FindLLVM.cmake b/cmake/util/FindLLVM.cmake
index 4bb58d462d12..8497761a7116 100644
--- a/cmake/util/FindLLVM.cmake
+++ b/cmake/util/FindLLVM.cmake
@@ -11,7 +11,7 @@
 # - LLVM_INCLUDE_DIRS
 # - LLVM_LIBS
 # - LLVM_DEFINITIONS
-# - TVM_LLVM_VERISON
+# - TVM_LLVM_VERSION
 #
 macro(find_llvm use_llvm)
   set(LLVM_CONFIG ${use_llvm})
@@ -56,4 +56,9 @@ macro(find_llvm use_llvm)
     separate_arguments(LLVM_LIBS)
     string(STRIP ${TVM_LLVM_VERSION} TVM_LLVM_VERSION)
   endif()
+  if(NOT LLVM_CONFIG STREQUAL "OFF")
+    message(STATUS "Found LLVM_INCLUDE_DIRS=" ${LLVM_INCLUDE_DIRS})
+    message(STATUS "Found LLVM_DEFINITIONS=" ${LLVM_DEFINITIONS})
+    message(STATUS "Found TVM_LLVM_VERSION=" ${TVM_LLVM_VERSION})
+  endif()
 endmacro(find_llvm)
diff --git a/cmake/util/FindROCM.cmake b/cmake/util/FindROCM.cmake
index 235969813382..317fea1b8f4e 100644
--- a/cmake/util/FindROCM.cmake
+++ b/cmake/util/FindROCM.cmake
@@ -21,21 +21,27 @@ macro(find_rocm use_rocm)
   if(IS_DIRECTORY ${__use_rocm})
     set(__rocm_sdk ${__use_rocm})
     message(STATUS "Custom ROCM SDK PATH=" ${__use_rocm})
-   elseif(IS_DIRECTORY $ENV{ROCM_PATH})
-     set(__rocm_sdk $ENV{ROCM_PATH})
-   elseif(IS_DIRECTORY /opt/rocm)
-     set(__rocm_sdk /opt/rocm)
-   else()
-     set(__rocm_sdk "")
-   endif()
+  elseif(IS_DIRECTORY $ENV{ROCM_PATH})
+    set(__rocm_sdk $ENV{ROCM_PATH})
+  elseif(IS_DIRECTORY /opt/rocm)
+    set(__rocm_sdk /opt/rocm)
+  else()
+    set(__rocm_sdk "")
+  endif()
 
-   if(__rocm_sdk)
-     set(ROCM_INCLUDE_DIRS ${__rocm_sdk}/include)
-     find_library(ROCM_HIPHCC_LIBRARY hip_hcc ${__rocm_sdk}/lib)
-     find_library(ROCM_MIOPEN_LIBRARY MIOpen ${__rocm_sdk}/lib)
-     find_library(ROCM_ROCBLAS_LIBRARY rocblas ${__rocm_sdk}/lib)
-     if(ROCM_HIPHCC_LIBRARY)
-       set(ROCM_FOUND TRUE)
-     endif()
-   endif(__rocm_sdk)
+  if(__rocm_sdk)
+    set(ROCM_INCLUDE_DIRS ${__rocm_sdk}/include)
+    find_library(ROCM_HIPHCC_LIBRARY hip_hcc ${__rocm_sdk}/lib)
+    find_library(ROCM_MIOPEN_LIBRARY MIOpen ${__rocm_sdk}/lib)
+    find_library(ROCM_ROCBLAS_LIBRARY rocblas ${__rocm_sdk}/lib)
+    if(ROCM_HIPHCC_LIBRARY)
+      set(ROCM_FOUND TRUE)
+    endif()
+  endif(__rocm_sdk)
+  if(ROCM_FOUND)
+    message(STATUS "Found ROCM_INCLUDE_DIRS=" ${ROCM_INCLUDE_DIRS})
+    message(STATUS "Found ROCM_HIPHCC_LIBRARY=" ${ROCM_HIPHCC_LIBRARY})
+    message(STATUS "Found ROCM_MIOPEN_LIBRARY=" ${ROCM_MIOPEN_LIBRARY})
+    message(STATUS "Found ROCM_ROCBLAS_LIBRARY=" ${ROCM_ROCBLAS_LIBRARY})
+  endif(ROCM_FOUND)
 endmacro(find_rocm)
diff --git a/cmake/util/FindVulkan.cmake b/cmake/util/FindVulkan.cmake
index 0b85e8f47d79..504058c66b62 100644
--- a/cmake/util/FindVulkan.cmake
+++ b/cmake/util/FindVulkan.cmake
@@ -45,11 +45,14 @@ macro(find_vulkan use_vulkan)
   if(Vulkan_FOUND)
     get_filename_component(VULKAN_LIBRARY_PATH ${Vulkan_LIBRARY} DIRECTORY)
     find_library(Vulkan_SPIRV_TOOLS_LIBRARY SPIRV-Tools
-      ${VULKAN_LIBRARY_PATH}/spirv-tools)
+        HINTS ${VULKAN_LIBRARY_PATH} ${VULKAN_LIBRARY_PATH}/spirv-tools)
 
     find_path(_libspirv libspirv.h HINTS ${Vulkan_INCLUDE_DIRS} PATH_SUFFIXES vulkan spirv-tools)
-    find_path(_spirv spirv.hpp HINTS ${Vulkan_INCLUDE_DIRS} PATH_SUFFIXES vulkan spirv/unified1)
-    find_path(_glsl_std GLSL.std.450.h HINTS ${Vulkan_INCLUDE_DIRS} PATH_SUFFIXES vulkan spirv/unified1)
+    find_path(_spirv spirv.hpp HINTS ${Vulkan_INCLUDE_DIRS} PATH_SUFFIXES vulkan SPIRV spirv/unified1)
+    find_path(_glsl_std GLSL.std.450.h HINTS ${Vulkan_INCLUDE_DIRS} PATH_SUFFIXES vulkan SPIRV spirv/unified1)
     list(APPEND Vulkan_INCLUDE_DIRS ${_libspirv} ${_spirv} ${_glsl_std})
+    message(STATUS "Vulkan_INCLUDE_DIRS=" ${Vulkan_INCLUDE_DIRS})
+    message(STATUS "Vulkan_LIBRARY=" ${Vulkan_LIBRARY})
+    message(STATUS "Vulkan_SPIRV_TOOLS_LIBRARY=" ${Vulkan_SPIRV_TOOLS_LIBRARY})
   endif(Vulkan_FOUND)
 endmacro(find_vulkan)
diff --git a/conda/nnvm/meta.yaml b/conda/nnvm/meta.yaml
index a8b47d0de118..9c045c177ff6 100644
--- a/conda/nnvm/meta.yaml
+++ b/conda/nnvm/meta.yaml
@@ -1,4 +1,4 @@
-{% set version = "0.4.dev" %}
+{% set version = "0.5.dev" %}
 
 package:
   name: nnvm
diff --git a/conda/topi/meta.yaml b/conda/topi/meta.yaml
index af2fb4fd4228..4002f577863b 100644
--- a/conda/topi/meta.yaml
+++ b/conda/topi/meta.yaml
@@ -1,4 +1,4 @@
-{% set version = "0.4.dev" %}
+{% set version = "0.5.dev" %}
 
 package:
   name: topi
diff --git a/conda/tvm-libs/meta.yaml b/conda/tvm-libs/meta.yaml
index dbdfd4a7701f..d6902c45a693 100644
--- a/conda/tvm-libs/meta.yaml
+++ b/conda/tvm-libs/meta.yaml
@@ -1,4 +1,4 @@
-{% set version = "0.4.dev" %}
+{% set version = "0.5.dev" %}
 
 package:
   name: tvm-libs
diff --git a/conda/tvm/meta.yaml b/conda/tvm/meta.yaml
index 478e095322eb..fe53b7dd49d9 100644
--- a/conda/tvm/meta.yaml
+++ b/conda/tvm/meta.yaml
@@ -1,4 +1,4 @@
-{% set version = "0.4.dev" %}
+{% set version = "0.5.dev" %}
 
 package:
   name: tvm
diff --git a/dlpack b/dlpack
deleted file mode 160000
index 10892ac964f1..000000000000
--- a/dlpack
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 10892ac964f1af7c81aae145cd3fab78bbccd297
diff --git a/dmlc-core b/dmlc-core
deleted file mode 160000
index e864aa6757cd..000000000000
--- a/dmlc-core
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit e864aa6757cdbe78b1296fe5231fd3050b7802c3
diff --git a/docker/Dockerfile.ci_cpu b/docker/Dockerfile.ci_cpu
index 0f0fc6f04d4c..e6e2dd7a37b0 100644
--- a/docker/Dockerfile.ci_cpu
+++ b/docker/Dockerfile.ci_cpu
@@ -15,6 +15,28 @@ RUN bash /install/ubuntu_install_python_package.sh
 COPY install/ubuntu_install_llvm.sh /install/ubuntu_install_llvm.sh
 RUN bash /install/ubuntu_install_llvm.sh
 
+# SGX deps (build early; changes infrequently)
+COPY install/ubuntu_install_sgx.sh /install/ubuntu_install_sgx.sh
+RUN bash /install/ubuntu_install_sgx.sh
+ENV LD_LIBRARY_PATH /opt/sgxsdk/lib64:${LD_LIBRARY_PATH}
+
+# Rust env (build early; takes a while)
+COPY install/ubuntu_install_rust.sh /install/ubuntu_install_rust.sh
+RUN bash /install/ubuntu_install_rust.sh
+ENV RUSTUP_HOME /opt/rust
+ENV CARGO_HOME /opt/rust
+ENV RUSTC_WRAPPER sccache
+
 # AutoTVM deps
 COPY install/ubuntu_install_redis.sh /install/ubuntu_install_redis.sh
 RUN bash /install/ubuntu_install_redis.sh
+
+# Golang environment
+COPY install/ubuntu_install_golang.sh /install/ubuntu_install_golang.sh
+RUN bash /install/ubuntu_install_golang.sh
+
+# NNPACK deps
+COPY install/ubuntu_install_nnpack.sh /install/ubuntu_install_nnpack.sh
+RUN bash /install/ubuntu_install_nnpack.sh
+
+ENV PATH $PATH:$CARGO_HOME/bin:/usr/lib/go-1.10/bin
diff --git a/docker/Dockerfile.ci_gpu b/docker/Dockerfile.ci_gpu
index c177ef9d420a..708331d3d61a 100644
--- a/docker/Dockerfile.ci_gpu
+++ b/docker/Dockerfile.ci_gpu
@@ -66,6 +66,9 @@ RUN bash /install/ubuntu_install_vulkan.sh
 COPY install/ubuntu_install_redis.sh /install/ubuntu_install_redis.sh
 RUN bash /install/ubuntu_install_redis.sh
 
+COPY install/ubuntu_install_antlr.sh /install/ubuntu_install_antlr.sh
+RUN bash /install/ubuntu_install_antlr.sh
+
 # Environment variables
 ENV PATH=/usr/local/nvidia/bin:${PATH}
 ENV PATH=/usr/local/cuda/bin:${PATH}
diff --git a/docker/Dockerfile.demo_android b/docker/Dockerfile.demo_android
new file mode 100644
index 000000000000..2adcdb42f4e4
--- /dev/null
+++ b/docker/Dockerfile.demo_android
@@ -0,0 +1,36 @@
+# Minimum docker image for demo purposes
+FROM ubuntu:16.04
+
+RUN apt-get update --fix-missing
+
+COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh
+RUN bash /install/ubuntu_install_core.sh
+
+COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh
+RUN bash /install/ubuntu_install_python.sh
+
+COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh
+RUN bash /install/ubuntu_install_python_package.sh
+
+COPY install/ubuntu_install_keras.sh /install/ubuntu_install_keras.sh
+RUN bash /install/ubuntu_install_keras.sh
+
+COPY install/ubuntu_install_java.sh /install/ubuntu_install_java.sh
+RUN bash /install/ubuntu_install_java.sh
+
+COPY install/ubuntu_install_llvm.sh /install/ubuntu_install_llvm.sh
+RUN bash /install/ubuntu_install_llvm.sh
+
+COPY install/ubuntu_install_gradle.sh /install/ubuntu_install_gradle.sh
+RUN bash /install/ubuntu_install_gradle.sh
+
+COPY install/ubuntu_install_androidsdk.sh /install/ubuntu_install_androidsdk.sh
+RUN bash /install/ubuntu_install_androidsdk.sh
+
+# Build TVM
+COPY install/install_tvm_cpu.sh /install/install_tvm_cpu.sh
+RUN bash /install/install_tvm_cpu.sh
+
+# Environment variables
+ENV PYTHONPATH=/usr/tvm/python:/usr/tvm/topi/python:/usr/tvm/nnvm/python/:/usr/tvm/vta/python:${PYTHONPATH}
+
diff --git a/docker/Dockerfile.demo_gpu b/docker/Dockerfile.demo_gpu
index 6f249986e22c..d20293c4ed3d 100644
--- a/docker/Dockerfile.demo_gpu
+++ b/docker/Dockerfile.demo_gpu
@@ -1,6 +1,6 @@
 # Minimum docker image for demo purposes
 # prebuilt-image: tvmai/demo-gpu
-FROM nvidia/cuda:8.0-cudnn7-devel
+FROM nvidia/cuda:9.0-cudnn7-devel
 
 RUN apt-get update --fix-missing
 
diff --git a/docker/Dockerfile.demo_opencl b/docker/Dockerfile.demo_opencl
new file mode 100644
index 000000000000..460b901bf08f
--- /dev/null
+++ b/docker/Dockerfile.demo_opencl
@@ -0,0 +1,65 @@
+# USAGE: sudo docker build libs/tvm -f libs/tvm/docker/Dockerfile.ocl -t l4b/tvm:ocl
+
+# REFERENCE: https://docs.docker.com/engine/reference/builder
+
+FROM ubuntu:18.04
+
+RUN echo "Labelling this image"
+LABEL Description="Docker image for TVM built with OpenCL & OpenGL support"
+
+RUN echo "Preparing to install dependencies"
+RUN apt-get update
+# ENV DEBIAN_FRONTEND noninteractive
+RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections
+
+RUN echo "Installing utility libraries"
+RUN apt-get install -y apt-utils sudo
+RUN apt-get install -y cmake g++ llvm
+RUN apt-get install -y git
+# make wget unzip libtinfo-dev libz-dev libcurl4-openssl-dev
+RUN apt-get install -y libopenblas-dev
+
+# RUN echo "Installing gtest"
+# RUN apt-get install -y libgtest-dev
+# RUN cd /usr/src/gtest && cmake CMakeLists.txt && make && cp *.a /usr/lib
+
+RUN echo "Installing Python"
+RUN apt-get install -y python3-dev python3-pip
+RUN pip3 install setuptools numpy nose-timer cython decorator scipy tornado psutil xgboost
+
+RUN echo "Installing Jupyter notebook"
+RUN pip3 install matplotlib Image Pillow jupyter[notebook]
+
+RUN echo "Installing OpenCL libraries"
+RUN apt-get install -y libviennacl-dev mesa-opencl-icd ocl-icd-opencl-dev clinfo
+RUN apt-get install -y libclblas-dev libclfft-dev libclsparse-dev
+
+RUN echo "Installing OpenGL libraries"
+RUN apt-get install -y libcogl-dev libegl1 libgles1 libglfw3-dev
+# libglew-dev
+
+RUN echo "Upgrading dependencies"
+RUN apt-get upgrade -y
+
+RUN echo "Cloning TVM source & submodules"
+ENV TVM_PAR_DIR="/usr"
+RUN mkdir -p TVM_PAR_DIR && \
+	cd ${TVM_PAR_DIR} && \
+	git clone https://github.com/dmlc/tvm --recursive
+#RUN git submodule update --init --recursive
+
+
+RUN echo "Building TVM"
+#USE_BLAS: "openblas" | "mkl" | "atlas" | "apple" | "none"
+ENV TVM_HOME="/usr/tvm"
+ENV TVM_BUILD_DIR="${TVM_HOME}/build"
+RUN mkdir -p ${TVM_BUILD_DIR} && \
+	cd ${TVM_BUILD_DIR} && \
+	cmake .. -DUSE_BLAS=openblas -DUSE_LLVM=ON -DUSE_OPENCL=ON -DUSE_OPENGL=ON && \
+	make -j6
+
+RUN echo "Building Python package"
+ENV PYTHONPATH=${TVM_HOME}/python:${TVM_HOME}/topi/python:${TVM_HOME}/nnvm/python:${PYTHONPATH}
+RUN cd ${TVM_HOME}/python && python3 setup.py install --user
+RUN cd ${TVM_HOME}/topi/python && python3 setup.py install --user
+RUN cd ${TVM_HOME}/nnvm/python && python3 setup.py install --user
diff --git a/docker/README.md b/docker/README.md
index e9b8b503062f..df9ea42af68a 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -85,3 +85,9 @@ Here are some common use examples to perform CI tasks.
   ```bash
   ./docker/ci_build.sh ci_gpu make -C docs html
   ```
+
+- build golang test suite.
+
+  ```bash
+  ./docker/build.sh ci_cpu tests/scripts/task_golang.sh
+  ```
diff --git a/docker/bash.sh b/docker/bash.sh
index ba935d7ed089..0813edd5527d 100755
--- a/docker/bash.sh
+++ b/docker/bash.sh
@@ -55,5 +55,5 @@ ${DOCKER_BINARY} run --rm --pid=host\
     -e "CI_BUILD_GID=$(id -g)" \
     ${CI_DOCKER_EXTRA_PARAMS[@]} \
     ${DOCKER_IMAGE_NAME}\
-    bash /docker/with_the_same_user \
+    bash --login /docker/with_the_same_user \
     ${COMMAND[@]}
diff --git a/docker/build.sh b/docker/build.sh
index 1d476e52e642..5b6c4450f6e4 100755
--- a/docker/build.sh
+++ b/docker/build.sh
@@ -127,5 +127,5 @@ ${DOCKER_BINARY} run --rm --pid=host \
     -e "CI_BUILD_GID=$(id -g)" \
     ${CI_DOCKER_EXTRA_PARAMS[@]} \
     ${DOCKER_IMG_NAME} \
-    bash docker/with_the_same_user \
+    bash --login docker/with_the_same_user \
     ${COMMAND[@]}
diff --git a/docker/install/install_tvm_cpu.sh b/docker/install/install_tvm_cpu.sh
index 51593e66506e..461ad244d37c 100644
--- a/docker/install/install_tvm_cpu.sh
+++ b/docker/install/install_tvm_cpu.sh
@@ -6,6 +6,8 @@ echo set\(USE_RPC ON\) >> config.cmake
 echo set\(USE_SORT ON\) >> config.cmake
 echo set\(USE_GRAPH_RUNTIME ON\) >> config.cmake
 echo set\(USE_BLAS openblas\) >> config.cmake
+echo set\(USE_SGX /opt/sgxsdk\) >> config.cmake
+echo set\(RUST_SGX_SDK /opt/rust-sgx-sdk\) >> config.cmake
 mkdir -p build
 cd build
 cmake ..
diff --git a/docker/install/ubuntu_install_androidsdk.sh b/docker/install/ubuntu_install_androidsdk.sh
new file mode 100644
index 000000000000..a5c02e573b43
--- /dev/null
+++ b/docker/install/ubuntu_install_androidsdk.sh
@@ -0,0 +1,69 @@
+. /etc/profile
+
+set -o errexit -o nounset
+
+ANDROID_HOME=/opt/android-sdk-linux
+ASDKTOOLS_HOME=/opt/android-sdk-tools
+ASDKTOOLS_VERSION=3859397
+ASDKTOOLS_SHA256=444e22ce8ca0f67353bda4b85175ed3731cae3ffa695ca18119cbacef1c1bea0
+
+wget http://dl.google.com/android/repository/sdk-tools-linux-${ASDKTOOLS_VERSION}.zip -O sdk-tools-linux.zip
+echo "${ASDKTOOLS_SHA256} *sdk-tools-linux.zip" | sha256sum --check -
+unzip sdk-tools-linux.zip
+rm sdk-tools-linux.zip
+mv tools "${ASDKTOOLS_HOME}/"
+# The following popular fix makes sdkmanager honour $http_proxy variables
+mv ${ASDKTOOLS_HOME}/bin/sdkmanager ${ASDKTOOLS_HOME}/bin/sdkmanager-vanilla
+cat >${ASDKTOOLS_HOME}/bin/sdkmanager <<"EOF"
+#!/bin/sh
+if test -n "$http_proxy"; then
+  PROXY_HOST=`echo $http_proxy | sed 's@.*//\(.*\):.*@\1@'`
+  PROXY_PORT=`echo $http_proxy | sed 's@.*//.*:\(.*\)@\1@'`
+  PROXY="--proxy=http --proxy_host=$PROXY_HOST --proxy_port=$PROXY_PORT"
+else
+  PROXY=""
+fi
+exec "`dirname $0`/sdkmanager-vanilla" $PROXY "$@"
+EOF
+for f in ${ASDKTOOLS_HOME}/bin/* ; do
+  chmod +x "$f"
+  ln --symbolic "$f" "/usr/bin/`basename $f`"
+done
+
+
+cat >/install/package-list-minimal.txt <<EOF
+build-tools;26.0.3
+build-tools;27.0.3
+cmake;3.6.4111459
+emulator
+extras;android;gapid;1
+extras;android;gapid;3
+extras;android;m2repository
+extras;google;auto
+extras;google;google_play_services
+extras;google;instantapps
+extras;google;m2repository
+extras;google;market_apk_expansion
+extras;google;market_licensing
+extras;google;simulators
+extras;google;webdriver
+extras;m2repository;com;android;support;constraint;constraint-layout;1.0.2
+extras;m2repository;com;android;support;constraint;constraint-layout-solver;1.0.2
+lldb;2.3
+platforms;android-26
+platforms;android-27
+tools
+ndk-bundle
+EOF
+
+mkdir /root/.android 2>/dev/null || true
+touch /root/.android/repositories.cfg
+yes | sdkmanager --licenses --sdk_root="$ANDROID_HOME"
+sdkmanager --verbose --package_file=/install/package-list-minimal.txt --sdk_root="$ANDROID_HOME"
+test -d "${ANDROID_HOME}/build-tools/27.0.3"
+test -d "${ANDROID_HOME}/ndk-bundle"
+for f in ${ANDROID_HOME}/ndk-bundle/* ; do
+  ln --symbolic "$f" "/usr/bin/`basename $f`"
+done
+echo "export ANDROID_HOME=${ANDROID_HOME}" >> /etc/profile
+
diff --git a/docker/install/ubuntu_install_antlr.sh b/docker/install/ubuntu_install_antlr.sh
new file mode 100644
index 000000000000..d2f2d6a8c48f
--- /dev/null
+++ b/docker/install/ubuntu_install_antlr.sh
@@ -0,0 +1,3 @@
+cd /usr/local/lib
+wget https://www.antlr.org/download/antlr-4.7.1-complete.jar
+cd -
diff --git a/docker/install/ubuntu_install_golang.sh b/docker/install/ubuntu_install_golang.sh
new file mode 100644
index 000000000000..2361ccfbd2e4
--- /dev/null
+++ b/docker/install/ubuntu_install_golang.sh
@@ -0,0 +1,4 @@
+#install the necessary dependancies for golang build
+apt-get update && apt-get install -y golang-1.10-go
+apt-get update && apt-get install -y golang-1.10-doc
+apt-get update && apt-get install -y golint
diff --git a/docker/install/ubuntu_install_gradle.sh b/docker/install/ubuntu_install_gradle.sh
new file mode 100644
index 000000000000..b1535c98cabb
--- /dev/null
+++ b/docker/install/ubuntu_install_gradle.sh
@@ -0,0 +1,17 @@
+. /etc/profile
+
+set -o errexit -o nounset
+
+GRADLE_HOME=/opt/gradle
+GRADLE_VERSION=4.10-rc-2
+GRADLE_SHA256=e90d3c32910e259814bcca82b3911172ecca1ff1ab5ed69b4de3c1df8b378b40
+
+echo "Downloading Gradle"
+wget --output-document=gradle.zip "https://services.gradle.org/distributions/gradle-${GRADLE_VERSION}-bin.zip"
+echo "Checking Gradle hash"
+echo "${GRADLE_SHA256} *gradle.zip" | sha256sum --check -
+echo "Installing Gradle"
+unzip gradle.zip
+rm gradle.zip
+mv "gradle-${GRADLE_VERSION}" "${GRADLE_HOME}/"
+ln --symbolic "${GRADLE_HOME}/bin/gradle" /usr/bin/gradle
diff --git a/docker/install/ubuntu_install_java.sh b/docker/install/ubuntu_install_java.sh
index ba07b2985efb..462edc491627 100644
--- a/docker/install/ubuntu_install_java.sh
+++ b/docker/install/ubuntu_install_java.sh
@@ -1 +1,4 @@
+set -o errexit -o nounset
 apt-get update && apt-get install -y openjdk-8-jdk maven
+test -d "/usr/lib/jvm/java-8-openjdk-amd64/jre"
+echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre" >> /etc/profile
diff --git a/docker/install/ubuntu_install_nnpack.sh b/docker/install/ubuntu_install_nnpack.sh
new file mode 100644
index 000000000000..83225d4aa820
--- /dev/null
+++ b/docker/install/ubuntu_install_nnpack.sh
@@ -0,0 +1,13 @@
+apt-get update && apt-get install -y --no-install-recommends --force-yes git cmake
+
+
+git clone https://github.com/Maratyszcza/NNPACK NNPACK
+cd NNPACK
+# TODO: specific tag?
+git checkout 1e005b0c2
+cd -
+
+mkdir -p NNPACK/build
+cd NNPACK/build
+cmake -DCMAKE_INSTALL_PREFIX:PATH=. -DNNPACK_INFERENCE_ONLY=OFF -DNNPACK_CONVOLUTION_ONLY=OFF -DNNPACK_BUILD_TESTS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON .. && make -j4 && make install
+cd -
diff --git a/docker/install/ubuntu_install_python_package.sh b/docker/install/ubuntu_install_python_package.sh
index 6724116cb720..bd6e67cc1ed9 100644
--- a/docker/install/ubuntu_install_python_package.sh
+++ b/docker/install/ubuntu_install_python_package.sh
@@ -1,3 +1,3 @@
 # install libraries for python package on ubuntu
-pip2 install nose pylint numpy nose-timer cython decorator scipy tornado
-pip3 install nose pylint numpy nose-timer cython decorator scipy tornado typed_ast pytest mypy orderedset antlr4-python3-runtime
+pip2 install nose pylint numpy nose-timer cython decorator scipy tornado typing antlr4-python2-runtime attrs
+pip3 install nose pylint numpy nose-timer cython decorator scipy tornado typed_ast pytest mypy orderedset antlr4-python3-runtime attrs
diff --git a/docker/install/ubuntu_install_rust.sh b/docker/install/ubuntu_install_rust.sh
new file mode 100644
index 000000000000..a8a9bddacf2c
--- /dev/null
+++ b/docker/install/ubuntu_install_rust.sh
@@ -0,0 +1,15 @@
+apt-get update && apt-get install -y --no-install-recommends --force-yes curl
+
+export RUSTUP_HOME=/opt/rust
+export CARGO_HOME=/opt/rust
+# this rustc is one supported by the installed version of rust-sgx-sdk
+curl https://sh.rustup.rs -sSf | sh -s -- -y --no-modify-path --default-toolchain nightly-2018-10-01
+. $CARGO_HOME/env
+rustup toolchain add nightly
+rustup component add rust-src
+cargo +nightly install sccache
+cargo +nightly install rustfmt-nightly --version 0.99.5 --force
+cargo +nightly install xargo
+
+# make rust usable by all users
+chmod -R a+w /opt/rust
diff --git a/docker/install/ubuntu_install_sgx.sh b/docker/install/ubuntu_install_sgx.sh
new file mode 100644
index 000000000000..a8201ac74a97
--- /dev/null
+++ b/docker/install/ubuntu_install_sgx.sh
@@ -0,0 +1,21 @@
+apt-get update && apt-get install -y --no-install-recommends --force-yes \
+    build-essential git cmake \
+    wget python pkg-config software-properties-common \
+    autoconf automake libtool ocaml \
+    protobuf-compiler libprotobuf-dev \
+    libssl-dev libcurl4-openssl-dev curl
+
+git clone https://github.com/intel/linux-sgx.git
+cd linux-sgx
+git checkout sgx_2.2
+curl 'https://gist.githubusercontent.com/nhynes/c770b0e91610f8c020a8d1a803a1e7cb/raw/8f5372d9cb88929b3cc49a384943bb363bc06827/intel-sgx.patch' | git apply
+./download_prebuilt.sh
+make -j4 sdk && make -j4 sdk_install_pkg
+./linux/installer/bin/sgx_linux_x64_sdk*.bin --prefix /opt
+cd -
+
+git clone https://github.com/baidu/rust-sgx-sdk.git /opt/rust-sgx-sdk
+cd /opt/rust-sgx-sdk
+git checkout v1.0.4
+curl 'https://gist.githubusercontent.com/nhynes/37164039c5d3f33aa4f123e4ba720036/raw/5b7fc24d4faa0bd6efce19f8324f79d5562991e0/rust-sgx-sdk.diff' | git apply
+cd -
diff --git a/docker/install/ubuntu_install_tensorflow.sh b/docker/install/ubuntu_install_tensorflow.sh
new file mode 100644
index 000000000000..407954f8fd46
--- /dev/null
+++ b/docker/install/ubuntu_install_tensorflow.sh
@@ -0,0 +1 @@
+pip3 install tensorflow-gpu
diff --git a/docker/with_the_same_user b/docker/with_the_same_user
index 470d64384de6..27f2e66a29d5 100644
--- a/docker/with_the_same_user
+++ b/docker/with_the_same_user
@@ -29,6 +29,7 @@ echo "${CI_BUILD_USER} ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/90-nopasswd-sudo
 HOME=${CI_BUILD_HOME}\
     sudo -u "#${CI_BUILD_UID}" --preserve-env\
     PATH=${PATH}\
+    JAVA_HOME=${JAVA_HOME}\
     LD_LIBRARY_PATH=${LD_LIBRARY_PATH}\
     PYTHONPATH=${PYTHONPATH}\
     HOME=${CI_BUILD_HOME}\
diff --git a/docs/api/python/autotvm.rst b/docs/api/python/autotvm.rst
index f03406dbc720..93d6905077fb 100644
--- a/docs/api/python/autotvm.rst
+++ b/docs/api/python/autotvm.rst
@@ -16,6 +16,11 @@ tvm.autotvm.measure
 
 .. autofunction:: tvm.autotvm.measure.create_measure_batch
 
+.. autoclass:: tvm.autotvm.measure.measure_methods.LocalBuilder
+
+.. autoclass:: tvm.autotvm.measure.measure_methods.RPCRunner
+
+.. autoclass:: tvm.autotvm.measure.measure_methods.LocalRunner
 
 tvm.autotvm.tuner
 ~~~~~~~~~~~~~~~~~
diff --git a/docs/api/python/index.rst b/docs/api/python/index.rst
index 59bd1795b7ec..ddad9d10f8f9 100644
--- a/docs/api/python/index.rst
+++ b/docs/api/python/index.rst
@@ -24,3 +24,4 @@ Python API
    vta/index
    nnvm/index
    hybrid
+   relay/index
diff --git a/docs/api/python/intrin.rst b/docs/api/python/intrin.rst
index 3942c57f1a04..59f695196ce8 100644
--- a/docs/api/python/intrin.rst
+++ b/docs/api/python/intrin.rst
@@ -6,7 +6,10 @@ tvm.intrin
 
    tvm.call_packed
    tvm.call_pure_intrin
+   tvm.call_intrin
    tvm.call_pure_extern
+   tvm.call_extern
+   tvm.call_llvm_intrin
    tvm.register_intrin_rule
    tvm.exp
    tvm.log
@@ -18,7 +21,10 @@ tvm.intrin
 
 .. autofunction:: tvm.call_packed
 .. autofunction:: tvm.call_pure_intrin
+.. autofunction:: tvm.call_intrin
 .. autofunction:: tvm.call_pure_extern
+.. autofunction:: tvm.call_extern
+.. autofunction:: tvm.call_llvm_intrin
 .. autofunction:: tvm.register_intrin_rule
 .. autofunction:: tvm.exp
 .. autofunction:: tvm.log
diff --git a/docs/api/python/nnvm/frontend.rst b/docs/api/python/nnvm/frontend.rst
index f872a6b878e2..eb07a13e8340 100644
--- a/docs/api/python/nnvm/frontend.rst
+++ b/docs/api/python/nnvm/frontend.rst
@@ -10,3 +10,7 @@ nnvm.frontend
 .. autofunction:: nnvm.frontend.from_coreml
 
 .. autofunction:: nnvm.frontend.from_keras
+
+.. autofunction:: nnvm.frontend.from_tensorflow
+
+.. autofunction:: nnvm.frontend.from_darknet
diff --git a/docs/api/python/nnvm/index.rst b/docs/api/python/nnvm/index.rst
index c0e5912c76be..64447bb793fb 100644
--- a/docs/api/python/nnvm/index.rst
+++ b/docs/api/python/nnvm/index.rst
@@ -11,3 +11,4 @@ This document contains the python API to NNVM compiler toolchain.
    symbol
    graph
    top
+   testing
diff --git a/docs/api/python/nnvm/testing.rst b/docs/api/python/nnvm/testing.rst
new file mode 100644
index 000000000000..56783622648d
--- /dev/null
+++ b/docs/api/python/nnvm/testing.rst
@@ -0,0 +1,14 @@
+nnvm.testing
+------------
+
+.. automodule:: nnvm.testing
+
+.. autofunction:: nnvm.testing.ctx_list
+
+nnvm.testing.check_computation
+------------------------------
+
+.. automodule:: nnvm.testing.check_computation
+    :members:
+
+.. include:: testing_new_ops.rst
diff --git a/docs/api/python/nnvm/testing_new_ops.rst b/docs/api/python/nnvm/testing_new_ops.rst
new file mode 100644
index 000000000000..dfe7df485b78
--- /dev/null
+++ b/docs/api/python/nnvm/testing_new_ops.rst
@@ -0,0 +1,135 @@
+Testing new operations
+----------------------
+
+When adding new operations, it is a good idea to test them. Testing
+should be done with the function ``nnvm.testing.check_function``. You
+should provide it with the symbol representing the result of a
+computation and a reference numpy implementation. By default, it will
+also check analytical gradients against numerical gradients if
+analytical gradients are implemented for your operation. You can also
+pass a reference implementation for the gradients, but numerical
+gradients will still be checked. Numerical gradient checking may be
+switched off explicitly, but doing this is not a good idea generally.
+Here is an example testing the logarithm operation:
+
+.. code:: python
+
+    import numpy as np
+    import nnvm
+    import nnvm.symbol as sym
+    from nnvm.testing.check_computation import check_function
+
+    x = sym.Variable("x")
+    y = sym.log(x)
+
+    def forward(x):
+        return np.log(x)
+
+    def backward(head_grads, x):
+        return [1. / x * head_grads]
+
+    dtype = "float32"
+    shape = {'x': (1, 3, 32, 32)}
+    check_function(y, forward, backward, in_range=(0.001, 2.0), dtype=dtype, shape=shape)
+
+If you run the code above, you might get an ``AssertionError`` in rare
+cases. That’s why it is recommended to run new tests a lot of times.
+
+.. code:: python
+
+    for _ in range(10000):
+        check_function(y, forward, backward, in_range=(0.001, 2.0), dtype=dtype, shape=shape)
+
+If you run the code above then sooner or later you will get an exception
+which may look like this:
+
+.. code-block:: text
+
+    AssertionError: Analytical and numerical grads wrt x differ too much
+    analytical grad = [
+            ...
+        ]
+    numerical grad = [
+            ...
+        ]
+    distance > atol*sqrt(n) + rtol*grad_norm
+    distance 308.50885009765625 > 0.01*55.42562584220407 + 0.1*2167.70703125
+
+It means that either you have a mistake in the ``FGradient`` function or
+the numerical error is too high. Generally, if you look at the printed
+gradients and see that they differ only slightly or just in a single
+position, then it is a numerical error. But if the gradients look
+completely different, especially if many corresponding positions have
+different signs, then it must be something wrong with the analytical
+gradient implementation.
+
+Then try to make this error reproducible, and also try to reduce the
+shape of inputs, but not too much, a vector of 10 elements is a
+reasonable choice. Also you won’t need reference functions ``forward``
+and ``backward``, and restricting the number of targets might also be a
+good idea. Since the error may manifest itself only in rare cases, you
+might want to run it in a loop.
+
+.. code:: python
+
+    shape = {'x': (10,)}
+    np.random.seed(42)
+
+    for _ in range(1000):
+        check_function(y, in_range=(0.001, 2.0), dtype=dtype, shape=shape,
+                       numerical_grads=True, only_targets=['llvm'])
+
+Running this code will result in the following:
+
+.. code-block:: text
+
+    check_function failed while checking gradients numerically, here is the main graph
+    Graph(%x, %head_grads_0) {
+      %x, shape=[10], dtype=0
+      %head_grads_0, shape=[10], dtype=0
+      %1 = log(%x), shape=[10], dtype=0
+      %3 = elemwise_div(%head_grads_0, %x), shape=[10], dtype=0
+      ret %1, %3, %head_grads_0
+    }
+    graph_attr_keys = [layout_inputs, dtype_num_unknown_nodes, dtype, shape_num_unknown_nodes, shape]
+
+    Generated inputs:
+    {'x': array([2.5660574e-01, 1.5313280e+00, 1.0232578e-03, 8.3371508e-01,
+           1.0454979e+00, 1.1021420e-01, 1.9461832e+00, 4.5302454e-01,
+           6.0909325e-01, 6.0858107e-01], dtype=float32), 'head_grads_0': array([0.4616029 , 0.00394617, 1.4589603 , 1.9337242 , 0.44936267,
+           1.3264314 , 1.4840508 , 1.6970023 , 0.84583575, 0.60655886],
+          dtype=float32)}
+
+    ...
+
+    AssertionError: Analytical and numerical grads wrt x differ too much
+    analytical grad = [1.7988799e+00 2.5769596e-03 1.4257993e+03 2.3194065e+00 4.2980734e-01
+     1.2035031e+01 7.6254421e-01 3.7459390e+00 1.3886802e+00 9.9667716e-01]
+     numerical grad = [1.7948151e+00 1.9073486e-03 9.9268610e+02 2.3174286e+00 4.2915344e-01
+     1.1980057e+01 7.6198578e-01 3.7412643e+00 1.3866425e+00 9.9563599e-01]
+    distance > atol*sqrt(n) + rtol*grad_norm
+    distance 433.11322021484375 > 0.01*3.1622776601683795 + 0.1*992.7716674804688
+
+In this case the largest difference is in the 2nd position (starting
+from 0) which corresponds to input value ``1.0232578e-03``. This value
+is too close to the singularity, so the numerical derivative gets too
+imprecise. The solution is to shrink the range for ``x``, here, for
+example, ``(0.002, 2.0)`` turned out to be enough. Don’t forget to run
+lots of tests, so that other people don’t get false positives.
+
+.. code:: python
+
+    for _ in range(100):
+        check_function(y, in_range={x: (0.002, 2.0)}, dtype=dtype, shape=(1, 3, 32, 32),
+                       numerical_grads=True, only_targets=['llvm'])
+
+If you need a more precise control over which values get passed to the
+checking function, you can use ``values={x: ...}``:
+
+.. code:: python
+
+    x_val = np.array([1.2594858e+00, 1.0960974e-01, 1.4975418e+00, 6.3585603e-01,
+           1.2692513e-03, 1.0227472e+00, 9.4656967e-02, 5.5306298e-01,
+           1.4142460e+00, 1.2631655e-01], dtype=np.float32)
+    check_function(y, values={x: x_val}, dtype=dtype, shape=shape,
+                   numerical_grads=True, only_targets=['llvm'])
diff --git a/docs/api/python/relay/backend.rst b/docs/api/python/relay/backend.rst
new file mode 100644
index 000000000000..5cbc250b55ba
--- /dev/null
+++ b/docs/api/python/relay/backend.rst
@@ -0,0 +1,16 @@
+tvm.relay.backend
+-----------------
+
+.. automodule:: tvm.relay.backend
+
+Interpreter
+-----------
+
+.. automodule:: tvm.relay.backend.interpreter
+    :members:
+
+.. automodule:: tvm.relay.backend.compile_engine
+    :members:
+
+.. automodule:: tvm.relay.backend.graph_runtime_codegen
+    :members:
diff --git a/docs/api/python/relay/base.rst b/docs/api/python/relay/base.rst
new file mode 100644
index 000000000000..72315dca0193
--- /dev/null
+++ b/docs/api/python/relay/base.rst
@@ -0,0 +1,16 @@
+tvm.relay.base
+--------------
+.. automodule:: tvm.relay.base
+
+.. autofunction:: tvm.relay.base.register_relay_node
+
+.. autofunction:: tvm.relay.base.register_relay_attr_node
+
+.. autoclass:: tvm.relay.base.RelayNode
+    :members:
+
+.. autoclass:: tvm.relay.base.Span
+    :members:
+
+.. autoclass:: tvm.relay.base.Id
+    :members:
diff --git a/docs/api/python/relay/build_module.rst b/docs/api/python/relay/build_module.rst
new file mode 100644
index 000000000000..a278940f0fd5
--- /dev/null
+++ b/docs/api/python/relay/build_module.rst
@@ -0,0 +1,19 @@
+tvm.relay.build_module
+----------------------
+
+.. automodule:: tvm.relay.build_module
+
+.. autofunction:: tvm.relay.build_module.build
+
+.. autofunction:: tvm.relay.build_module.optimize
+
+.. autofunction:: tvm.relay.build_module.create_executor
+
+.. autoclass:: tvm.relay.build_module.BuildConfig
+    :members:
+
+.. autofunction:: tvm.relay.build_module.build_config
+    :members:
+
+.. autoclass:: tvm.relay.build_module.GraphExecutor
+    :members:
diff --git a/docs/api/python/relay/expr.rst b/docs/api/python/relay/expr.rst
new file mode 100644
index 000000000000..540d6bfbab65
--- /dev/null
+++ b/docs/api/python/relay/expr.rst
@@ -0,0 +1,53 @@
+tvm.relay.expr
+--------------
+
+.. automodule:: tvm.relay.expr
+
+.. autofunction:: tvm.relay.expr.var
+
+.. autofunction:: tvm.relay.expr.const
+
+.. autofunction:: tvm.relay.expr.bind
+
+.. autoclass:: tvm.relay.expr.Expr
+    :members:
+
+.. autoclass:: tvm.relay.expr.Constant
+    :members:
+
+.. autoclass:: tvm.relay.expr.Tuple
+    :members:
+
+.. autoclass:: tvm.relay.expr.Var
+    :members:
+
+.. autoclass:: tvm.relay.expr.GlobalVar
+    :members:
+
+.. autoclass:: tvm.relay.expr.Function
+    :members:
+
+.. autoclass:: tvm.relay.expr.Call
+    :members:
+
+.. autoclass:: tvm.relay.expr.Let
+    :members:
+
+.. autoclass:: tvm.relay.expr.If
+    :members:
+
+.. autoclass:: tvm.relay.expr.TupleGetItem
+    :members:
+
+.. autoclass:: tvm.relay.expr.TempExpr
+    :members:
+
+.. autoclass:: tvm.relay.expr.ExprFunctor
+    :members:
+
+.. autoclass:: tvm.relay.expr.ExprMutator
+    :members:
+
+.. autoclass:: tvm.relay.expr.TupleWrapper
+    :members
+
diff --git a/docs/api/python/relay/frontend.rst b/docs/api/python/relay/frontend.rst
new file mode 100644
index 000000000000..a418e042bf3d
--- /dev/null
+++ b/docs/api/python/relay/frontend.rst
@@ -0,0 +1,7 @@
+
+tvm.relay.frontend
+------------------
+
+.. automodule:: tvm.relay.frontend
+
+.. autofunction:: tvm.relay.frontend.from_mxnet
diff --git a/docs/api/python/relay/image.rst b/docs/api/python/relay/image.rst
new file mode 100644
index 000000000000..223213eca8e3
--- /dev/null
+++ b/docs/api/python/relay/image.rst
@@ -0,0 +1,9 @@
+
+tvm.relay.image
+---------------
+
+.. automodule:: tvm.relay.image
+    :members:
+
+.. automodule:: tvm.relay.op.image.image
+    :members:
diff --git a/docs/api/python/relay/index.rst b/docs/api/python/relay/index.rst
new file mode 100644
index 000000000000..da3d3a912dd0
--- /dev/null
+++ b/docs/api/python/relay/index.rst
@@ -0,0 +1,25 @@
+Relay API
+=========
+
+This document contains the Python API for the Relay frontend, optimizer, and
+compiler toolchain.
+
+Relay is the second-generation, high-level intermediate representation (IR) for the TVM
+compiler stack.
+
+.. toctree::
+   :maxdepth: 2
+
+   backend
+   base
+   build_module
+   expr
+   frontend
+   image
+   ir_pass
+   module
+   nn
+   op
+   scope_builder
+   ty
+   vision
diff --git a/docs/api/python/relay/ir_pass.rst b/docs/api/python/relay/ir_pass.rst
new file mode 100644
index 000000000000..d02ef4d94b0a
--- /dev/null
+++ b/docs/api/python/relay/ir_pass.rst
@@ -0,0 +1,4 @@
+tvm.relay.ir_pass
+-----------------
+.. automodule:: tvm.relay.ir_pass
+    :members:
\ No newline at end of file
diff --git a/docs/api/python/relay/module.rst b/docs/api/python/relay/module.rst
new file mode 100644
index 000000000000..ec9642b484ba
--- /dev/null
+++ b/docs/api/python/relay/module.rst
@@ -0,0 +1,7 @@
+tvm.relay.module
+----------------
+
+.. automodule:: tvm.relay.module
+
+.. autoclass:: tvm.relay.module.Module
+    :members:
diff --git a/docs/api/python/relay/nn.rst b/docs/api/python/relay/nn.rst
new file mode 100644
index 000000000000..8e3f47f7bead
--- /dev/null
+++ b/docs/api/python/relay/nn.rst
@@ -0,0 +1,7 @@
+tvm.relay.nn
+------------
+.. automodule:: tvm.relay.nn
+    :members:
+
+.. automodule:: tvm.relay.op.nn.nn
+    :members:
diff --git a/docs/api/python/relay/op.rst b/docs/api/python/relay/op.rst
new file mode 100644
index 000000000000..7413a818f73f
--- /dev/null
+++ b/docs/api/python/relay/op.rst
@@ -0,0 +1,25 @@
+tvm.relay.op
+------------
+.. automodule:: tvm.relay.op
+    :members:
+
+.. automodule:: tvm.relay.op.op
+    :members:
+
+.. automodule:: tvm.relay.op.reduce
+    :members:
+
+.. automodule:: tvm.relay.op.tensor
+    :members:
+
+.. automodule:: tvm.relay.op.transform
+    :members:
+
+.. automodule:: tvm.relay.op.nn.nn
+    :members:
+
+.. automodule:: tvm.relay.op.vision.multibox
+    :members:
+
+.. automodule:: tvm.relay.op.vision.nms
+    :members:
diff --git a/docs/api/python/relay/scope_builder.rst b/docs/api/python/relay/scope_builder.rst
new file mode 100644
index 000000000000..19fca89bf2d2
--- /dev/null
+++ b/docs/api/python/relay/scope_builder.rst
@@ -0,0 +1,7 @@
+tvm.relay.scope_builder
+-----------------------
+
+.. automodule:: tvm.relay.scope_builder
+
+.. autoclass:: tvm.relay.scope_builder.ScopeBuilder
+    :members:
diff --git a/docs/api/python/relay/ty.rst b/docs/api/python/relay/ty.rst
new file mode 100644
index 000000000000..edf15275db03
--- /dev/null
+++ b/docs/api/python/relay/ty.rst
@@ -0,0 +1,32 @@
+tvm.relay.ty
+------------
+
+.. automodule:: tvm.relay.ty
+    :members:
+
+.. autoclass:: tvm.relay.ty.Type
+    :members:
+
+.. autoclass:: tvm.relay.ty.TensorType
+    :members:
+
+.. autoclass:: tvm.relay.ty.Kind
+    :members:
+
+.. autoclass:: tvm.relay.ty.TypeVar
+    :members:
+
+.. autoclass:: tvm.relay.ty.TypeConstraint
+    :members:
+
+.. autoclass:: tvm.relay.ty.TupleType
+    :members:
+
+.. autoclass:: tvm.relay.ty.FuncType
+    :members:
+
+.. autoclass:: tvm.relay.ty.IncompleteType
+    :members:
+
+.. autoclass:: tvm.relay.ty.TypeRelation
+    :members:
diff --git a/docs/api/python/relay/vision.rst b/docs/api/python/relay/vision.rst
new file mode 100644
index 000000000000..7751dd688b15
--- /dev/null
+++ b/docs/api/python/relay/vision.rst
@@ -0,0 +1,12 @@
+
+tvm.relay.vision
+----------------
+
+.. automodule:: tvm.relay.vision
+    :members:
+
+.. automodule:: tvm.relay.op.vision.multibox
+    :members:
+
+.. automodule:: tvm.relay.op.vision.nms
+    :members:
diff --git a/docs/api/python/topi.rst b/docs/api/python/topi.rst
index 7f150ddbf7cd..886822475db9 100644
--- a/docs/api/python/topi.rst
+++ b/docs/api/python/topi.rst
@@ -30,6 +30,7 @@ List of operators
    topi.concatenate
    topi.split
    topi.take
+   topi.gather_nd
    topi.full
    topi.full_like
    topi.nn.relu
@@ -49,6 +50,7 @@ List of operators
    topi.min
    topi.argmax
    topi.argmin
+   topi.prod
    topi.broadcast_to
    topi.add
    topi.subtract
@@ -102,11 +104,13 @@ topi
 .. autofunction:: topi.concatenate
 .. autofunction:: topi.split
 .. autofunction:: topi.take
+.. autofunction:: topi.gather_nd
 .. autofunction:: topi.full
 .. autofunction:: topi.full_like
 .. autofunction:: topi.max
 .. autofunction:: topi.sum
 .. autofunction:: topi.min
+.. autofunction:: topi.prod
 .. autofunction:: topi.broadcast_to
 .. autofunction:: topi.add
 .. autofunction:: topi.subtract
diff --git a/docs/api_links.rst b/docs/api_links.rst
index 909cfe367f29..d9b2406206b3 100644
--- a/docs/api_links.rst
+++ b/docs/api_links.rst
@@ -1,7 +1,8 @@
-Links to C++ and JS API References
+Links to API References
 ==================================
 
 This page contains links to API references that are build with different doc build system.
 
 * `C++ doyxgen API <doxygen/index.html>`_
 * `Javascript jsdoc API <jsdoc/index.html>`_
+* `Java Javadoc API <javadoc/index.html>`_
diff --git a/docs/conf.py b/docs/conf.py
index 989d26f87d3e..717003824703 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -33,7 +33,7 @@
 # General information about the project.
 project = u'tvm'
 author = u'%s developers' % project
-copyright = u'2017, %s' % author
+copyright = u'2018, %s' % author
 github_doc_root = 'https://github.com/tqchen/tvm/tree/master/docs/'
 
 # add markdown parser
@@ -192,6 +192,7 @@ def run_doxygen(folder):
     ['../tutorials/language',
      '../tutorials/optimize',
      '../tutorials/autotvm',
+     '../tutorials/dev',
      '../tutorials/vta',
      '../tutorials/topi',
      '../tutorials/deployment',
diff --git a/docs/contribute/code_guide.rst b/docs/contribute/code_guide.rst
index dc7d998ca37f..d7aef2b60d48 100644
--- a/docs/contribute/code_guide.rst
+++ b/docs/contribute/code_guide.rst
@@ -15,6 +15,7 @@ C++ Code Styles
 - Favor passing by const reference (e.g. ``const Expr&``) over passing by value.
   Except when the function consumes the value by copy constructor or move,
   pass by value is better than pass by const reference in such cases.
+- Favor ``const`` member function when possible.
 
 Python Code Styles
 ------------------
diff --git a/docs/contribute/community.rst b/docs/contribute/community.rst
index 1023cf0ddccc..3a3e5ec3d0fd 100644
--- a/docs/contribute/community.rst
+++ b/docs/contribute/community.rst
@@ -1,51 +1,30 @@
-TVM Community Structure
+TVM Community Guideline
 =======================
 
-TVM adopts the Apache style model and governs by merit. We believe that it is important to create an inclusive community where everyone can use, contribute to, and influence the direction of the project. We actively invite contributors who have earned the merit to be part of the development community. There are several roles in the community:
+TVM adopts the Apache style model and governs by merit. We believe that it is important to create an inclusive community where everyone can use, contribute to, and influence the direction of the project. See `CONTRIBUTORS.md <https://github.com/dmlc/tvm/blob/master/CONTRIBUTORS.md>`_ for the current list of contributors.
 
-- Project Management Committee(PMC) Small group of active committers that moderate the discussion, RFC, manage project releases.
-- Committer Individual who has made substantial contributions to the project and is granted write access to the project and oversees the general direction of the projects.
-- Code Owner Individual who is responsible for a specific area of the codebase.
-- Reviewer Individual who is qualified to review for a specific area of the codebase.
-- Contributor Anyone who contributes to the project.
 
-This document explains responsibility and criteria for each role.
-See `CONTRIBUTORS.md <https://github.com/dmlc/tvm/blob/master/CONTRIBUTORS.md>`_ for the current list of contributors and their roles.
+General Development Process
+---------------------------
+Everyone in the community is welcomed to send patches, documents, and propose new directions to the project. The key guideline here is to enable everyone in the community to get involved and participate the decision and development.  When major changes are proposed, an RFC should be sent to allow discussion by the community. We encourage public discussion, archivable channels such as issues, discuss forum and mailing-list, so that everyone in the community can participate and review the process later.
 
+Code reviews are one of the key ways to ensure the quality of the code. High-quality code reviews prevent technical debt for long-term and are crucial to the success of the project. A pull request needs to be reviewed before it gets merged. A committer who has the expertise of the corresponding area would moderate the pull request and the merge the code when it is ready. The corresponding committer could request multiple reviewers who are familiar with the area of the code. We encourage contributors to request code reviews themselves and help review each other's code -- remember everyone is volunteering their time to the community, high-quality code review itself costs as much as the actual code contribution, you could get your code quickly reviewed if you do others the same favor.
 
-Project Management Committee
-----------------------------
+The community should strive to reach a consensus on technical decisions through discussion. We expect committers and PMCs to moderate technical discussions in a diplomatic way, and provide suggestions with clear technical reasoning when necessary.
 
-The PMC consists of a small group of active committers that moderate the discussion, provide mentorship to committers and code owners and manage the project release. PMC members need to actively manage the general project directions. Note that most major design choices and proposed changes should reach consensus among the committers.
 
-Committer
----------
-
-Committers are individuals who are granted the write access to the project. Committers oversee the general project directions and participate in the evaluation of the RFCs involving major design changes. Here is a list of useful things to do to help become a committer.
-
-- Deep understanding of one or a few modules in the project.
-- Good understanding of general project structure, demonstrated by discussion over RFCs, code reviews and proposals of new features
-- Active history of code reviews that demonstrate a good technical ability
-- Contribution history of high-quality documentation and tutorials to the promote project
-- History of creating clean, maintainable code and including good test cases.
 
-New committers are nominated by current committers from current code owners.
-
-Code Owner
+Committers
 ----------
+Committers are individuals who are granted the write access to the project. A committer is usually responsible for a certain area or several areas of the code where they oversee the code review process. The area of contribution can take all forms, including code contributions and code reviews, documents, education, and outreach. Committers are essential for a high quality and healthy project. The community actively look for new committers from contributors. Here is a list of useful traits that help the community to recognize potential committers:
 
-A code owner is an individual who is responsible for a specific area of the code-base. Code owners are responsible for the areas they are in charge of and oversee the code review process of the corresponding module. Changes to a specific area need to be approved by one of its owners in order to be merged. Once a pull request is approved by the designated code owner, the code can be directly merged into the repo. Code owners are essential for a high quality and healthy codebase.
-
-We welcome new code owners that help to keep good code quality, testing, and documentation in specific areas. Here is a list of useful traits that help the community to recognize potential code owners:
+- Sustained contribution to the project, demonstrated by discussion over RFCs, code reviews and proposals of new features, and other development activities. Being familiar with, and being able to take ownership on one or several areas of the project.
+- Quality of contributions: High-quality, readable code contributions indicated by pull requests that can be merged without a substantial code review.  History of creating clean, maintainable code and including good test cases. Informative code reviews to help other contributors that adhere to a good standard.
+- Community involvement: active participation in the discussion forum, promote the projects via tutorials, talks and outreach. We encourage committers to collaborate broadly, e.g. do code reviews and discuss designs with community members that they do not interact physically.
 
-- High-quality, readable code contributions indicated by pull requests that can be merged without a substantial code review
-- Good coverage of tests and documentation in the contributions
-- Informative code reviews to help other contributors that adhere to a good standard, spot problems in contributions etc.
-- Active participation in the discussion forum
+The Project Management Committee(PMC) consists group of active committers that moderate the discussion, manage the project release, and proposes new committer/PMC members. Potential candidates are usually proposed via an internal discussion among PMCs, followed by a consensus approval, i.e. least 3 +1 votes, and no vetoes. Any veto must be accompanied by reasoning. PMCs should serve the community by upholding the community practices and guidelines TVM a better community for everyone. PMCs should strive to identify new candidates outside of their own organization.
 
-Reviewer
---------
 
-A reviewer is an individual who actively contributed to the project and is willing to participate in the code review of new contributions. We invite reviewers from active contributors. The reviewer invitation will be sent to the potential reviewer’s email, so please log in to the discussion forum so that we can know which email address we could send an invitation to.
-We actively seek reviews from reviewers. High-quality code reviews prevent technical debt for long-term and are crucial to the success of the project.
-A pull request to the project has to be reviewed by a reviewer in order to be merged.
+Reviewers
+---------
+Reviewers are individuals who actively contributed to the project and are willing to participate in the code review of new contributions. We identify reviewers from active contributors. The committers should explicitly solicit reviews from reviewers.  High-quality code reviews prevent technical debt for long-term and are crucial to the success of the project. A pull request to the project has to be reviewed by at least one reviewer in order to be merged.
diff --git a/docs/contribute/pull_request.rst b/docs/contribute/pull_request.rst
index 80a0448c08dd..039ef65c7b13 100644
--- a/docs/contribute/pull_request.rst
+++ b/docs/contribute/pull_request.rst
@@ -11,7 +11,9 @@ This is a quick guide to submit a pull request, please also refer to the detaile
     git fetch upstream
     git rebase upstream/master
 
-- Make sure code style check pass by typing ``make lint``, and all the existing test-cases pass.
+- Make sure code style check pass by typing the following command, and all the existing test-cases pass.
+  - ``docker/bash.sh tvmai/ci-lint ./tests/scripts/task_lint.sh``  
+     (Note: You must install docker beforehand so you can run a docker image.)
 - Add test-cases to cover the new features or bugfix the patch introduces.
 - Document the code you wrote, see more at :ref:`doc_guide`
 - Send the pull request,  fix the problems reported by automatic checks.
@@ -24,3 +26,50 @@ This is a quick guide to submit a pull request, please also refer to the detaile
   - The detailed guidelines and summarizes useful lessons.
 
 - The patch can be merged after the reviewers approve the pull request.
+
+Testing
+-------
+Even though we have hooks to run unit tests automatically for each pull request, It's always recommended to run unit tests
+locally beforehand to reduce reviewers' burden and speedup review process.
+
+C++
+^^^
+.. code:: bash
+
+  # assume you are in tvm source root
+  TVM_ROOT=`pwd`
+
+  # you need to install google test first, gtest will be installed to $TVM_ROOT/lib
+  CACHE_PREFIX=. make -f 3rdparty/dmlc-core/scripts/packages.mk gtest
+
+  mkdir build
+  cd build
+  GTEST_LIB=$TVM_ROOT/lib cmake ..
+  make cpptest -j
+  for test in *_test; do
+    ./$test || exit -1
+  done
+
+Python
+^^^^^^
+If you want to run all tests:
+
+.. code:: bash
+
+  # build tvm
+  make
+
+  ./tests/scripts/task_python_unittest.sh
+
+If you want to run a single test:
+
+.. code:: bash
+
+  # build tvm
+  make
+
+  # let python know where to find tvm related libraries
+  export PYTHONPATH=python:topi/python
+  rm -rf python/tvm/*.pyc python/tvm/*/*.pyc python/tvm/*/*/*.pyc
+
+  TVM_FFI=ctypes python -m nose -v tests/python/unittest/test_pass_storage_rewrite.py
\ No newline at end of file
diff --git a/docs/deploy/aocl_fpga.md b/docs/deploy/aocl_fpga.md
index bd0dae97879d..c9c50dc56be6 100644
--- a/docs/deploy/aocl_fpga.md
+++ b/docs/deploy/aocl_fpga.md
@@ -12,7 +12,7 @@ We use two python scripts for this tutorial.
 import tvm
 
 tgt_host="llvm"
-tgt="aocl -device=s5_ref -mattr=emulator"
+tgt="aocl_sw_emu"
 
 n = tvm.var("n")
 A = tvm.placeholder((n,), name='A')
@@ -38,7 +38,7 @@ import tvm
 import numpy as np
 import os
 
-tgt="aocl -device=s5_ref -mattr=emulator"
+tgt="aocl_sw_emu"
 
 fadd = tvm.module.load("myadd.so")
 fadd_dev = tvm.module.load("myadd.aocx")
@@ -52,7 +52,7 @@ b = tvm.nd.array(np.random.uniform(size=n).astype("float32"), ctx)
 c = tvm.nd.array(np.zeros(n, dtype="float32"), ctx)
 
 fadd(a, b, c)
-np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
+tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
 ```
 
 Setup
diff --git a/docs/deploy/aws_fpga.md b/docs/deploy/aws_fpga.md
index 7554ce7f64cd..9d8af7d97a94 100644
--- a/docs/deploy/aws_fpga.md
+++ b/docs/deploy/aws_fpga.md
@@ -55,7 +55,7 @@ b = tvm.nd.array(np.random.uniform(size=n).astype("float32"), ctx)
 c = tvm.nd.array(np.zeros(n, dtype="float32"), ctx)
 
 fadd(a, b, c)
-np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
+tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
 ```
 
 Setup
@@ -108,10 +108,10 @@ python run.py
 Synthesis
 ---------
 
-- Run synthesis with the following script. `XCL_EMULATION_MODE` must be set to 1 at this stage.
+- Run synthesis with the following script.
 
 ```bash
-export XCL_EMULATION_MODE=1
+unset XCL_EMULATION_MODE
 export XCL_TARGET=hw
 
 python build.py
diff --git a/docs/deploy/nnvm.md b/docs/deploy/nnvm.md
index e4ce14528b53..1e0d17f8b195 100644
--- a/docs/deploy/nnvm.md
+++ b/docs/deploy/nnvm.md
@@ -96,8 +96,8 @@ int main()
     run();
 
     DLTensor* y;
-    int out_ndim = 1;
-    int64_t out_shape[1] = {1000, };
+    int out_ndim = 2;
+    int64_t out_shape[2] = {1, 1000, };
     TVMArrayAlloc(out_shape, out_ndim, dtype_code, dtype_bits, dtype_lanes, device_type, device_id, &y);
 
     // get the function from the module(get output data)
diff --git a/docs/dev/codebase_walkthrough.rst b/docs/dev/codebase_walkthrough.rst
new file mode 100644
index 000000000000..6f5cff8a06d6
--- /dev/null
+++ b/docs/dev/codebase_walkthrough.rst
@@ -0,0 +1,226 @@
+=======================================
+**TVM Codebase Walkthrough by Example**
+=======================================
+
+Getting to know a new codebase can be a challenge. This is especially true for a codebase like that of TVM, where different components interact in non-obvious ways. In this guide, we try to illustrate the key elements that comprise a compilation pipeline with a simple example. For each important step, we show where in the codebase it is implemented. The purpose is to let new developers and interested users dive into the codebase more quickly.
+
+*******************************************
+Codebase Structure Overview
+*******************************************
+
+At the root of the TVM repository, we have following subdirectories that together comprise a bulk of the codebase.
+
+- ``src`` - C++ code for operator compilation and deployment runtimes.
+- ``src/relay`` - Implementation of Relay, a new IR for deep learning framework superseding ``nnvm`` below.
+- ``python`` - Python frontend that wraps C++ functions and objects implemented in ``src``.
+- ``topi`` - Compute definitions and backend schedules for standard neural network operators.
+- ``nnvm`` - C++ code and Python frontend for graph optimization and compilation. After the introduction of Relay, it remains in the codebase for backward compatibility.
+
+Using standard Deep Learning terminologies, ``src/relay`` is the component that manages a computational graph, and nodes in a graph are compiled and executed using infrastructures implemented in the rest of ``src``. ``python`` provides python bindings for the C++ API and driver code that users can use to execute compilation. Operators corresponding to each node are registered in ``src/relay/op``. Implementations for operators are in ``topi``, and they are coded in either C++ or Python.
+
+Relay is the new IR for deep networks that is intended to replace NNVM. If you have used NNVM, Relay provides equivalent or better functionalities. In fact, Relay goes beyond a traditional way of thinking deep networks in terms of computational graphs. But for the purpose of this document, we can think of Relay as a traditional computational graph framework. You can read more about Relay `here <https://docs.tvm.ai/dev/relay_intro.html>`_.
+
+When a user invokes graph compilation by ``relay.build(...)`` (or ``nnvm.compiler.build(...)`` for the older API), the following sequence of actions happens for each node in the graph:
+
+- Look up an operator implementation by querying the operator registry
+- Generate a compute expression and a schdule for the operator
+- Compile the operator into object code
+
+One of the interesting aspects of TVM codebase is that interop between C++ and Python is not unidirectional. Typically, all code that do heavy liftings are implemented in C++, and Python bindings are provided for user interface. This is also true in TVM, but in TVM codebase, C++ code also call into functions defined in a Python module. For example, the convolution operator is implemented in Python, and its implementation is invoked from C++ code in Relay.
+
+*******************************************
+Vector Add Example
+*******************************************
+
+We use a simple example that uses the low level TVM API directly. The example is vector addition, which is covered in detail in `this tutorial <https://docs.tvm.ai/tutorials/get_started.html#sphx-glr-tutorials-get-started-py>`_.
+
+::
+
+   n = 1024
+   A = tvm.placeholder((n,), name='A')
+   B = tvm.placeholder((n,), name='B')
+   C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C")
+
+Here, types of ``A``, ``B``, ``C`` are ``tvm.tensor.Tensor``, defined in ``python/tvm/tensor.py``. The Python ``Tensor`` is backed by C++ ``Tensor``, implemented in ``include/tvm/tensor.h`` and ``src/lang/tensor.cc``. All Python types in TVM can be thought of as a handle to the underlying C++ type with the same name. If you look at the definition of Python ``Tensor`` type below, you can see it is a subclass of ``NodeBase``.
+
+::
+
+   @register_node
+   class Tensor(NodeBase, _expr.ExprOp):
+       """Tensor object, to construct, see function.Tensor"""
+
+       def __call__(self, *indices):
+          ...
+
+The Node system is the basis of exposing C++ types to frontend languages, including Python. The way TVM implements Python wrapping is not straightforward. It is briefly covered in `this document <https://docs.tvm.ai/dev/runtime.html#tvm-node-and-compiler-stack>`_, and details are in ``python/tvm/_ffi/`` if you are interested.
+
+``Tensor`` is created by functions in ``python/tvm/api.py``, which in turn calls into C++ functions exposed in ``src/api/api_lang.cc``. All C++ functions that are callable from Python are exposed in the ``src/api`` subdirectory. For example, the ``tvm.compute()`` function above calls into ``_ComputeOp`` api exposed in ``src/api/api_lang.cc``:
+
+::
+
+   TVM_REGISTER_API("_ComputeOp")
+   .set_body([](TVMArgs args,  TVMRetValue* ret) {
+       *ret = ComputeOpNode::make(args[0],
+                                  args[1],
+                                  args[2],
+                                  args[3],
+                                  args[4]);
+     });
+
+We use ``TVM_REGISTER_*`` macro to expose C++ functions to frontend languages, in the form of `PackedFunc <https://docs.tvm.ai/dev/runtime.html#packedfunc>`_. ``PackedFunc`` is another mechanism by which TVM implements C++ and Python interop. In particular, this is what makes calling Python functions from the C++ codebase very easy.
+
+A ``Tensor`` object has an ``Operation`` object associated with it, defined in ``python/tvm/tensor.py``, ``include/tvm/operation.h``, and ``src/tvm/op`` subdirectory. A ``Tensor`` is an output of its ``Operation`` object. Each ``Operation`` object has in turn ``input_tensors()`` method, which returns a list of input ``Tensor`` to it. This way we can keep track of dependencies between ``Operation``.
+
+We pass the operation corresponding to the output tensor ``C`` to ``tvm.create_schedule()`` function in ``python/tvm/schedule.py``.
+
+::
+
+   s = tvm.create_schedule(C.op)
+
+This function is mapped to the C++ function in ``include/tvm/schedule.h``.
+
+::
+
+   inline Schedule create_schedule(Array<Operation> ops) {
+     return ScheduleNode::make(ops);
+   }
+
+``Schedule`` consists of collections of ``Stage`` and output ``Operation``.
+
+``Stage`` corresponds to one ``Operation``. In the vector add example above, there are two placeholder ops and one compute op, so the schedule ``s`` contains three stages. Each ``Stage`` holds information about a loop nest structure, types of each loop (``Parallel``, ``Vectorized``, ``Unrolled``), and where to execute its computation in the loop nest of the next ``Stage``, if any.
+
+``Schedule`` and ``Stage`` are defined in ``tvm/python/schedule.py``, ``include/tvm/schedule.h``, and ``src/schedule/schedule_ops.cc``.
+
+To keep it simple, we call ``tvm.build(...)`` on the default schedule created by ``create_schedule()`` function above.
+
+::
+
+   target = "cuda"
+   fadd = tvm.build(s, [A, B, C], target)
+
+``tvm.build()``, defined in ``python/tvm/build_module.py``, takes a schedule, input and output ``Tensor``, and a target, and returns a ``tvm.Module`` object, defined in ``python/tvm/module.py``. A ``Module`` object contains a compiled function which can be invoked with function call syntax.
+
+The process of ``tvm.build()`` can be divided into two steps:
+
+- Lowering, where a high level, initial loop nest structures are transformed into a final, low level IR
+- Code generation, where target machine code is generated from the low level IR
+
+Lowering is done by ``tvm.lower()`` function, defined in ``python/tvm/build_module.py``. First, bound inference is peformed, and an initial loop nest structure is created.
+
+::
+
+   def lower(sch,
+             args,
+             name="default_function",
+             binds=None,
+             simple_mode=False):
+      ...
+      bounds = schedule.InferBound(sch)
+      stmt = schedule.ScheduleOps(sch, bounds)
+      ...
+
+Bound inference is the process where all loop bounds and sizes of intermidiate buffers are inferred. If you target the CUDA backend and you use shared memory, its required minimum size is automatically determined here. Bound inference is implemented in ``src/schedule/bound.cc``, ``src/schedule/graph.cc`` and ``src/schedule/message_passing.cc``.
+
+``stmt``, which is the output of ``ScheduleOps()``, represents an initial loop nest structure. If you have applied ``reorder`` or ``split`` primitives to your schedule, then the initial loop nest already reflects that changes. ``ScheduleOps()`` is defined in ``src/schedule/schedule_ops.cc``.
+
+Next, we apply a number of lowering passes to ``stmt``. These passes are implemented in ``src/pass`` subdirectory. For example, if you have applied ``vectorize`` or ``unroll`` primitives to your schedule, they are applied in loop vectorization and unrolling passes below.
+
+::
+
+     ...
+     stmt = ir_pass.VectorizeLoop(stmt)
+     ...
+     stmt = ir_pass.UnrollLoop(
+         stmt,
+         cfg.auto_unroll_max_step,
+         cfg.auto_unroll_max_depth,
+         cfg.auto_unroll_max_extent,
+         cfg.unroll_explicit)
+     ...
+
+After lowering is done, ``build()`` function generates target machine code from the lowered function. This code can contain SSE or AVX instructions if you target x86, or PTX instructions for CUDA target. In addition to target specific machine code, TVM also generates host side code that is responsible for memory management, kernel launch etc.
+
+Code generation is done by ``build_module()`` function, defined in ``python/tvm/codege.py``. On the C++ side, code generation is implemented in ``src/codegen`` subdirectory. ``build_module()`` Python function will reach ``Build()`` function below in ``src/codegen/codegen.cc``:
+
+::
+
+   runtime::Module Build(const Array<LoweredFunc>& funcs,
+                         const std::string& target) {
+     std::string build_f_name = "codegen.build_" + target;
+     const PackedFunc* bf = runtime::Registry::Get(build_f_name);
+     runtime::Module m = (*bf)(funcs, target);
+     return m;
+   }
+
+
+``Build()`` function looks up the code generator for the given target in the ``PackedFunc`` registry, and invokes the function found. For example, ``codegen.build_cuda`` function is registered in ``src/codegen/build_cuda_on.cc``, like this:
+
+::
+
+   TVM_REGISTER_API("codegen.build_cuda")
+   .set_body([](TVMArgs args, TVMRetValue* rv) {
+       *rv = BuildCUDA(args[0]);
+     });
+
+``BuildCUDA()`` above generates CUDA kernel source from the lowered IR using ``CodeGenCUDA`` class defined in ``src/codegen/codegen_cuda.cc``, and compile the kernel using NVRTC. If you target a backend that uses LLVM, which includes x86, ARM, NVPTX and AMDGPU, code generation is done primarily by ``CodeGenLLVM`` class defined in ``src/codegen/llvm/codegen_llvm.cc``. ``CodeGenLLVM`` translates TVM IR into LLVM IR, runs a number of LLVM optimization passes, and generates target machine code.
+
+``Build()`` function in ``src/codegen/codegen.cc`` returns a ``runtime::Module`` object, defined in ``include/tvm/runtime/module.h`` and ``src/runtime/module.cc``. A ``Module`` object is a container for the underlying target specific ``ModuleNode`` object. Each backend implements a subclass of ``ModuleNode`` to add target specific runtime API calls. For example, the CUDA backend implements ``CUDAModuleNode`` class in ``src/runtime/cuda/cuda_module.cc``, which manages CUDA driver API. ``BuildCUDA()`` function above wraps ``CUDAModuleNode`` with ``runtime::Module`` and return it to the Python side. The LLVM backend implements ``LLVMModuleNode`` in ``src/codegen/llvm/llvm_module.cc``, which handles JIT execution of compiled code. Other subclasses of ``ModuleNode`` can be found under subdirectories of ``src/runtime`` corresponding to each backend.
+
+The returned module, which can be thought of as a combination of a compiled function and a device API, can be invoked on TVM's NDArray objects.
+
+::
+
+   ctx = tvm.context(target, 0)
+   a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
+   b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
+   c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
+   fadd(a, b, c)
+   output = c.asnumpy()
+
+Under the hood, TVM allocates device memory and manages memory transfers automatically. To do that, each backend needs to subclass ``DeviceAPI`` class, defined in ``include/tvm/runtime/device_api.h``, and override memory management methods to use device specific API. For example, the CUDA backend implements ``CUDADeviceAPI`` in ``src/runtime/cuda/cuda_device_api.cc`` to use ``cudaMalloc``, ``cudaMemcpy`` etc.
+
+The first time you invoke the compiled module with ``fadd(a, b, c)``, ``GetFunction()`` method of ``ModuleNode`` is called to get a ``PackedFunc`` that can be used for a kernel call. For example, in ``src/runtime/cuda/cuda_module.cc`` the CUDA backend implements ``CUDAModuleNode::GetFunction()`` like this:
+
+::
+
+   PackedFunc CUDAModuleNode::GetFunction(
+         const std::string& name,
+         const std::shared_ptr<ModuleNode>& sptr_to_self) {
+     auto it = fmap_.find(name);
+     const FunctionInfo& info = it->second;
+     CUDAWrappedFunc f;
+     f.Init(this, sptr_to_self, name, info.arg_types.size(), info.thread_axis_tags);
+     return PackFuncVoidAddr(f, info.arg_types);
+   }
+
+The ``PackedFunc``'s overloaded ``operator()`` will be called, which in turn calls ``operator()`` of ``CUDAWrappedFunc`` in ``src/runtime/cuda/cuda_module.cc``, where finally we see the ``cuLaunchKernel`` driver call:
+
+::
+
+   class CUDAWrappedFunc {
+    public:
+     void Init(...)
+     ...
+     void operator()(TVMArgs args,
+                     TVMRetValue* rv,
+                     void** void_args) const {
+       int device_id;
+       CUDA_CALL(cudaGetDevice(&device_id));
+       if (fcache_[device_id] == nullptr) {
+         fcache_[device_id] = m_->GetFunc(device_id, func_name_);
+       }
+       CUstream strm = static_cast<CUstream>(CUDAThreadEntry::ThreadLocal()->stream);
+       ThreadWorkLoad wl = thread_axis_cfg_.Extract(args);
+       CUresult result = cuLaunchKernel(
+           fcache_[device_id],
+           wl.grid_dim(0),
+           wl.grid_dim(1),
+           wl.grid_dim(2),
+           wl.block_dim(0),
+           wl.block_dim(1),
+           wl.block_dim(2),
+           0, strm, void_args, 0);
+     }
+   };
+
+This concludes an overview of how TVM compiles and executes a function. Although we did not detail TOPI or Relay, at the end all neural network operators go through the same compilation process as above. You are encouraged to dive into the details of the rest of the codebase.
diff --git a/docs/dev/debugger.rst b/docs/dev/debugger.rst
new file mode 100644
index 000000000000..a2a850a2dde4
--- /dev/null
+++ b/docs/dev/debugger.rst
@@ -0,0 +1,155 @@
+=================
+**Debugger**
+=================
+
+TVM Debugger is an interface for debugging TVM's computation graph execution. It helps to provide access to graph structures and tensor values at the TVM runtime.
+
+*******************************************
+**Debug Exchange Format**
+*******************************************
+
+**1. Computational Graph**
+==========================
+The optimized graph build by nnvm in json
+serialized format is dumped as it is. This contains the whole
+information about the graph. The UX can either use this graph directly
+or transform this graph to the format UX can understand.
+
+The Graph JSON format is explained below
+
+1. ``nodes``
+Nodes are either placeholders or computational nodes in NNVM graph. The nodes are stored
+as a list. A node contains the below information
+
+-     ``op`` - operation type, ``null`` means it is a placeholder/variable/input node and``tvm_op`` means this node can be executed
+-     ``name`` - Name of the node
+-     ``inputs`` - Position of the inputs for this operation, Inputs is a list of tuples with (nodeid, index, version). (Optional)
+-     ``attrs`` - Attributes of the node which contains the following information
+
+    -     ``flatten_data`` - Whether this data need to be flattened before execution
+    -     ``func_name`` - Fused function name, corresponds to the symbol in the lib generated by NNVM compilation process.
+    -     ``num_inputs`` - Number of inputs for this node
+    -     ``num_outputs`` - Number of outputs this node produces
+
+2. ``arg_nodes``
+arg_nodes is a list of indices of nodes which is placeholder/variable/input or constant/param to the graph.
+
+3. ``heads``
+heads is a list of entries as the output of the graph.
+
+4. ``node_row_ptr``
+node\_row\_ptr stores the history of forward path, so you can skip constructing the entire graph in inference tasks.
+
+5. ``attrs``
+attrs can contain version numbers or similar helpful information.
+
+- ``storage_id`` - Memory slot id for each node in the storage layout.
+- ``dtype`` - Datatype of each node (enum value).
+- ``dltype`` - Datatype of each node in order.
+- ``shape`` - Shape of each node k order.
+- ``device_index`` - Device assignment for each entry in the graph.
+
+Example of dumped graph:
+
+::
+
+    {
+      "nodes": [                                    # List of nodes
+        {
+          "op": "null",                             # operation type = null, this is a placeholder/variable/input or constant/param node
+          "name": "x",                              # Name of the argument node
+          "inputs": []                              # inputs for this node, its none since this is an argument node
+        },
+        {
+          "op": "tvm_op",                           # operation type = tvm_op, this node can be executed
+          "name": "relu0",                          # Name of the node
+          "attrs": {                                # Attributes of the node
+            "flatten_data": "0",                    # Whether this data need to be flattened
+            "func_name": "fuse_l2_normalize_relu",  # Fused function name, corresponds to the symbol in the lib generated by NNVM compilation process
+            "num_inputs": "1",                      # Number of inputs for this node
+            "num_outputs": "1"                      # Number of outputs this node produces
+          },
+          "inputs": [[0, 0, 0]]                     # Position of the inputs for this operation
+        }
+      ],
+      "arg_nodes": [0],                             # Which all nodes in this are argument nodes
+      "node_row_ptr": [0, 1, 2],                    # Row indices for faster depth first search
+      "heads": [[1, 0, 0]],                         # Position of the output nodes for this operation
+      "attrs": {                                    # Attributes for the graph
+        "storage_id": ["list_int", [1, 0]],         # memory slot id for each node in the storage layout
+        "dtype": ["list_int", [0, 0]],              # Datatype of each node (enum value)
+        "dltype": ["list_str", [                    # Datatype of each node in order
+            "float32",
+            "float32"]],
+        "shape": ["list_shape", [                   # Shape of each node k order
+            [1, 3, 20, 20],
+            [1, 3, 20, 20]]],
+        "device_index": ["list_int", [1, 1]],       # Device assignment for each node in order
+      }
+    }
+
+**2. Tensor dumping**
+=====================
+
+The tensor received after execution is in ``tvm.ndarray`` type. All the tensors will
+be saved as binary bytes in serialized format.  The result binary bytes can be loaded by the
+API "load_params".
+
+Example of loading the parameters
+   ::
+    with open(path_params, "rb") as fi:
+        loaded_params = bytearray(fi.read())
+
+    module.load_params(loaded_params)
+
+***************************************
+How to use Debugger?
+***************************************
+
+1. In ``config.cmake`` set the ``USE_GRAPH_RUNTIME_DEBUG`` flag to ``ON``
+
+   ::
+
+       # Whether enable additional graph debug functions
+       set(USE_GRAPH_RUNTIME_DEBUG ON)
+
+2. Do 'make' tvm, so that it will make the ``libtvm_runtime.so``
+
+3. In frontend script file instead of
+   ``from tvm.contrib import graph_runtime`` import the
+   ``debug_runtime``
+   ``from tvm.contrib.debugger import debug_runtime as graph_runtime``
+
+::
+
+    from tvm.contrib.debugger import debug_runtime as graph_runtime
+    m = graph_runtime.create(graph, lib, ctx, dump_root="/tmp/tvmdbg")
+    # set inputs
+    m.set_input('data', tvm.nd.array(data.astype(dtype)))
+    m.set_input(**params)
+    # execute
+    m.run()
+    tvm_out = m.get_output(0, tvm.nd.empty(out_shape, dtype)).asnumpy()
+
+The outputs are dumped to a temporary folder in ``/tmp`` folder or the
+folder specified while creating the runtime.
+
+***************************************
+Sample Output
+***************************************
+
+The below is the output of running  ``tvm/nnvm/tutorials/from_onnnx.py`` with debugger.
+
+::
+
+    Node Name               Ops                                                                  Time(us)   Time(%)  Start Time       End Time         Shape                Inputs  Outputs
+    ---------               ---                                                                  --------   -------  ----------       --------         -----                ------  -------
+    1_NCHW1c                fuse___layout_transform___4                                          56.52      0.02     15:24:44.177475  15:24:44.177534  (1, 1, 224, 224)     1       1
+    _contrib_conv2d_nchwc0  fuse__contrib_conv2d_NCHWc                                           12436.11   3.4      15:24:44.177549  15:24:44.189993  (1, 1, 224, 224, 1)  2       1
+    relu0_NCHW8c            fuse___layout_transform___broadcast_add_relu___layout_transform__    4375.43    1.2      15:24:44.190027  15:24:44.194410  (8, 1, 5, 5, 1, 8)   2       1
+    _contrib_conv2d_nchwc1  fuse__contrib_conv2d_NCHWc_1                                         213108.6   58.28    15:24:44.194440  15:24:44.407558  (1, 8, 224, 224, 8)  2       1
+    relu1_NCHW8c            fuse___layout_transform___broadcast_add_relu___layout_transform__    2265.57    0.62     15:24:44.407600  15:24:44.409874  (64, 1, 1)           2       1
+    _contrib_conv2d_nchwc2  fuse__contrib_conv2d_NCHWc_2                                         104623.15  28.61    15:24:44.409905  15:24:44.514535  (1, 8, 224, 224, 8)  2       1
+    relu2_NCHW2c            fuse___layout_transform___broadcast_add_relu___layout_transform___1  2004.77    0.55     15:24:44.514567  15:24:44.516582  (8, 8, 3, 3, 8, 8)   2       1
+    _contrib_conv2d_nchwc3  fuse__contrib_conv2d_NCHWc_3                                         25218.4    6.9      15:24:44.516628  15:24:44.541856  (1, 8, 224, 224, 8)  2       1
+    reshape1                fuse___layout_transform___broadcast_add_reshape_transpose_reshape    1554.25    0.43     15:24:44.541893  15:24:44.543452  (64, 1, 1)           2       1
diff --git a/docs/dev/index.rst b/docs/dev/index.rst
index f3ab322bfe53..3f4944fe1d52 100644
--- a/docs/dev/index.rst
+++ b/docs/dev/index.rst
@@ -8,6 +8,10 @@ In this part of documentation, we share the rationale for the specific choices m
    :maxdepth: 2
 
    runtime
+   debugger
    nnvm_json_spec
    nnvm_overview
    hybrid_script
+   relay_intro
+   relay_add_op
+   codebase_walkthrough
diff --git a/docs/dev/relay_add_op.rst b/docs/dev/relay_add_op.rst
new file mode 100644
index 000000000000..4a9625ce1198
--- /dev/null
+++ b/docs/dev/relay_add_op.rst
@@ -0,0 +1,145 @@
+Adding an Operator to Relay
+===========================
+
+In order to use TVM operators from within the Relay IR, the
+operators need to be registered in Relay in order to ensure
+that they will be integrated into Relay's type system.
+
+Registering an operator requires three steps:
+
+- Using the ``RELAY_REGISTER_OP`` macro in C++ to register the operator's arity and type information
+- Defining a C++ function to produce a call node for the operator and registering a Python API hook for the function
+- Wrapping the above Python API hook in a neater interface
+
+The file ``src/relay/op/tensor/binary.cc`` provides
+examples of the first two steps, while
+``python/tvm/relay/op/tensor.py`` gives examples of the
+last.
+
+Registering an Operator
+-----------------------
+
+TVM already has an operator registry, but Relay cannot properly
+incorporate TVM operators without additional type information.
+
+To allow for flexibility in registering operators and greater
+expressivity and granularity in expressing types in Relay, operators
+are typed using relations between input and output types. These relations
+are represented as functions that take in a list of input types and
+output types (any of these types may be incomplete) and return a list
+of input and output types that satisfies the relation. Essentially, a
+relation for an operator can enforce all the necessary typing rules
+(namely by inspecting the input types) in addition to computing the
+output type.
+
+For example, see ``src/relay/op/type_relations.h`` and their
+implementations. E.g., ``BroadcastRel`` takes two input types and an
+output type, checks that they are all tensor types with the same underlying
+data type, and finally ensures that the shape of the output type is the
+broadcast of the input types' shapes.
+
+It may be necessary to add another type relation to ``type_relations.h``
+if the existing ones do not capture the behavior of the desired operator.
+
+The ``RELAY_REGISTER_OP`` macro in C++ allows a developer
+to specify the following information about an operator in Relay:
+
+- Arity (number of arguments)
+- Names and descriptions for positional arguments
+- Support level (1 indicates an internal intrinsic; higher numbers indicate less integral or externally supported operators)
+- A type relation for the operator
+
+The below example is from ``binary.cc`` and uses a broadcasting
+add for tensors:
+
+.. code:: c
+
+    RELAY_REGISTER_OP("add")
+        .set_num_inputs(2)
+        .add_argument("lhs", "Tensor", "The left hand side tensor.")
+        .add_argument("rhs", "Tensor", "The right hand side tensor.")
+        .set_support_level(1)
+        .add_type_rel("Broadcast", BroadcastRel);
+
+Creating a Call Node
+--------------------
+
+This step requires simply writing a function that takes
+the arguments to the operator (as Relay expressions) and
+returning a call node to the operator (i.e., the node that
+should be placed into the Relay AST where the call to the
+operator is intended).
+
+At present call attributes and type arguments (the last two fields)
+are not supported, so it suffices to use ``Op::Get`` to fetch
+the operator's information from the operator registry and pass in
+the arguments to the call node, as below.
+
+.. code:: c
+
+    TVM_REGISTER_API("relay.op._make.add")
+        .set_body_typed<Expr(Expr, Expr)>([](Expr lhs, Expr rhs) {
+            static const Op& op = Op::Get("add");
+          return CallNode::make(op, {lhs, rhs}, Attrs(), {});
+        });
+
+Including a Python API Hook
+---------------------------
+
+It is generally the convention in Relay, that functions exported
+through ``TVM_REGISTER_API`` should be wrapped in a separate
+Python function rather than called directly in Python. In the case
+of the functions that produce calls to operators, it may be convenient
+to bundle them, as in ``python/tvm/relay/op/tensor.py``, where
+elementwise operators on tensors are all provided. For example,
+the following is how the add function from the previous section is
+exposed in Python:
+
+.. code:: python
+
+    def add(lhs, rhs):
+        """Elementwise addition.
+
+        Parameters
+        ----------
+        lhs : relay.Expr
+            The left hand side input data
+        rhs : relay.Expr
+            The right hand side input data
+
+        Returns
+        -------
+        result : relay.Expr
+            The computed result.
+        """
+        return _make.add(lhs, rhs)
+
+Note that these Python wrappers might also be good opportunities to
+provide an easier interface to the operator. For example, the
+``concat`` operator is registered as taking only one operator,
+namely a tuple with the tensors to be concatenated, but the Python
+wrapper takes the tensors as arguments and combines them into a tuple
+before producing the call node:
+
+.. code:: python
+
+    def concat(*args):
+        """Concatenate the input tensors along the zero axis.
+
+        Parameters
+        ----------
+        args: list of Tensor
+
+        Returns
+        -------
+        tensor: The concatenated tensor.
+        """
+        tup = Tuple(list(args))
+        return _make.concat(tup)
+
+Summary
+-------
+
+- A TVM operator can be registered in Relay using a relation to express the appropriate type information.
+- Using an operator in Relay requires a function to produce a call node for the operator.
+- It is best to have a simple Python wrapper for producing the call node.
diff --git a/docs/dev/relay_intro.rst b/docs/dev/relay_intro.rst
new file mode 100644
index 000000000000..66b643421a5b
--- /dev/null
+++ b/docs/dev/relay_intro.rst
@@ -0,0 +1,190 @@
+Introduction to Relay IR
+========================
+This article introduces Relay IR -- the second generation of NNVM.
+We expect readers from two kinds of background -- those who have a programming language background and deep learning
+framework developers who are familiar with the computational graph representation.
+
+We briefly summarize the design goal here, and will touch upon these points in the later part of the article.
+
+- Support traditional data flow-style programming and transformations.
+- Support functional-style scoping, let-binding and making it a fully featured differentiable language.
+- Being able to allow the user to mix the two programming styles.
+
+Build a Computational Graph with Relay
+--------------------------------------
+Traditional deep learning frameworks use computational graphs as their intermediate representation.
+A computational graph (or dataflow graph), is a directed acyclic graph (DAG) that represents the computation.
+Though dataflow graphs are limited in terms of the computations they are capable of expressing due to
+lacking control flow, their simplicity makes it easier to implement automatic differentiation and
+compile for heterogeneous execution environments (e.g., executing parts of the graph on specialized hardware).
+
+.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/relay/dataflow.png
+    :align: center
+    :scale: 70%
+
+
+You can use Relay to build a computational (dataflow) graph. Specifically, the above code shows how to
+construct a simple two-node graph. You can find that the syntax of the example is not that different from existing
+computational graph IR like NNVMv1, with the only difference in terms of terminology:
+
+- Existing frameworks usually use graph and subgraph
+- Relay uses function e.g. --  ``fn (%x)``, to indicate the graph
+
+Each dataflow node is a CallNode in Relay. The Relay Python DSL allows you to construct a dataflow graph quickly.
+One thing we want to highlight in the above code -- is that we explicitly constructed an Add node with
+both input point to ``%1``.  When a deep learning framework evaluates the above program, it will compute
+the nodes in topological order, and ``%1`` will only be computed once.
+While this fact is very natural to deep learning framework builders, it is something that might
+surprise a PL researcher in the first place.  If we implement a simple visitor to print out the result and
+treat the result as nested Call expression, it becomes ``log(%x) + log(%x)``.
+
+Such ambiguity is caused by different interpretations of program semantics when there is a shared node in the DAG.
+In a normal functional programming IR, nested expressions are treated as expression trees, without considering the
+fact that the ``%1`` is actually reused twice in ``%2``.
+
+The Relay IR is mindful of this difference. Usually, deep learning framework users build the computational
+graph in this fashion, where a DAG node reuse often occurs. As a result, when we print out the Relay program in
+the text format, we print one CallNode per line and assign a temporary id ``(%1, %2)`` to each CallNode so each common
+node can be referenced in later parts of the program.
+
+Module: Support Multiple Functions (Graphs)
+-------------------------------------------
+So far we have introduced how can we build a dataflow graph as a function. One might naturally ask: Can we support multiple
+functions and enable them to call each other? Relay allows grouping multiple functions together in a module; the code below
+shows an example of a function calling another function.
+
+.. code::
+
+   def @muladd(%x, %y, %z) {
+     %1 = mul(%x, %y)
+     %2 = add(%1, %z)
+     %2
+   }
+   def @myfunc(%x) {
+     %1 = @muladd(%x, 1, 2)
+     %2 = @muladd(%1, 2, 3)
+     %2
+   }
+
+The Module can be viewed as a ``Map<GlobalVar, Function>``. Here GlobalVar is just an id that is used to represent the functions
+in the module. ``@muladd`` and ``@myfunc`` are GlobalVars in the above example. When a CallNode is used to call another function,
+the corresponding GlobalVar is stored in the op field of the CallNode. It contains a level of indirection -- we need to look up
+body of the called function from the module using the corresponding GlobalVar. In this particular case, we could also directly
+store the reference to the Function as op in the CallNode. So, why do we need to introduce GlobalVar? The main reason is that
+GlobalVar decouples the definition/declaration and enables recursion and delayed declaration of the function.
+
+.. code ::
+
+  def @myfunc(%x) {
+    %1 = equal(%x, 1)
+     if (%1) {
+        %x
+     } else {
+       %2 = sub(%x, 1)
+       %3 = @myfunc(%2)
+        %4 = add(%3, %3)
+        %4
+    }
+  }
+
+In the above example, ``@myfunc`` recursively calls itself. Using GlobalVar ``@myfunc`` to represent the function avoids
+the cyclic dependency in the data structure.
+At this point, we have introduced the basic concepts in Relay. Notably, Relay has the following improvements over NNVMv1:
+
+- Succinct text format that eases debugging of writing passes.
+- First-class support for subgraphs-functions, in a joint module, this enables further chance of joint optimizations such as inlining and calling convention specification.
+- Naive front-end language interop, for example, all the data structure can be visited in Python, which allows quick prototyping of optimizations in Python and mixing them with C++ code.
+
+
+Let Binding and Scopes
+----------------------
+
+So far, we have introduced how to build a computational graph in the good old way used in deep learning frameworks.
+This section will talk about a new important construct introduced by Relay -- let bindings.
+
+Let binding is used in every high-level programming language. In Relay, it is a data structure with three
+fields ``Let(var, value, body)``. When we evaluate a let expression, we first evaluate the value part, assign
+it to the var, then return the evaluated result in the body expression.
+
+You can use a sequence of let bindings to construct a logically equivalent program to a dataflow program.
+The code example below shows one program with two forms side by side.
+
+.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/relay/dataflow_vs_func.png
+    :align: center
+    :scale: 70%
+
+
+The nested let binding is called A-normal form, and it is commonly used as IRs in functional programming languages.
+Now, please take a close look at the AST structure. While the two programs are semantically identical
+(so are their textual representations, except that A-normal form has let prefix), their AST structures are different.
+
+Since program optimizations take these AST data structures and transform them, the two different structures will
+affect the compiler code we are going to write. For example, if we want to detect a pattern ``add(log(x), y)``:
+
+- In the data-flow form, we can first access the add node, then directly look at its first argument to see if it is a log
+- In the A-normal form, we cannot directly do the check anymore, because the first input to add is ``%v1`` -- we will need to keep a map from variable to its bound values and look up that map, in order to know that ``%v1`` is a log.
+
+Different data structures will impact how you might write transformations, and we need to keep that in mind.
+So now, as a deep learning framework developer, you might ask, Why do we need let bindings?
+Your PL friends will always tell you that let is important -- as PL is a quite established field,
+there must be some wisdom behind that.
+
+Why We Might Need Let Binding
+-----------------------------
+One key usage of let binding is that it specifies the scope of computation. Let us take a look at the following example,
+which does not use let bindings.
+
+.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/master/images/relay/let_scope.png
+    :align: center
+    :scale: 70%
+
+The problem comes when we try to decide where we should evaluate node ``%1``. In particular, while the text format seems
+to suggest that we should evaluate node ``%1`` outside the if scope, the AST(as shown in the picture) does not suggest so.
+Actually, a dataflow graph never defines its scope of the evaluation. This introduces some ambiguity in the semantics.
+
+This ambiguity becomes more interesting when we have closures. Consider the following program, which returns a closure.
+We don’t know where should we compute ``%1``; it can be either inside or outside the closure.
+
+.. code::
+
+  fn (%x) {
+    %1 = log(%x)
+    %2 = fn(%y) {
+      add(%y, %1)
+    }
+    %2
+  }
+
+A let binding solves this problem, as the computation of the value happens at the let node. In both programs,
+if we change ``%1 = log(%x)`` to ``let %v1 = log(%x)``, we clearly specify the computation location to
+be outside of the if scope and closure. As you can see let-binding gives a more precise specification of the computation site
+and could be useful when we generate backend code (as such specification is in the IR).
+
+On the other hand, the dataflow form, which does not specify the scope of computation, does have its own advantages
+-- namely, we don’t need to worry about where to put the let when we generate the code. The dataflow form also gives more freedom
+to the later passes to decide where to put the evaluation point. As a result, it might not be a bad idea to use data flow
+form of the program in the initial phases of optimizations when you find it is convenient.
+Many optimizations in Relay today are written to optimize dataflow programs.
+
+However, when we lower the IR to an actual runtime program, we need to be precise about the scope of computation.
+In particular, we want to explicitly specify where the scope of computation should happen when we are using
+sub-functions and closures. Let-binding can be used to solve this problem in later stage execution specific optimizations.
+
+
+Implication on IR Transformations
+---------------------------------
+
+Hopefully, by now you are familiar with the two kinds of representations.
+Most functional programming languages do their analysis in A-normal form,
+where the analyzer does not need to be mindful that the expressions are DAGs.
+
+Relay choose to support both the dataflow form and let bindings. We believe that it is important to let the
+framework developer choose the representation they are familiar with.
+This does, however, have some implications on how we write passes:
+
+- If you come from a dataflow background and want to handle lets, keep a map of var to the expressions so you can perform lookup when encountering a var. This likely means a minimum change as we already need a map from expressions to transformed expressions anyway. Note that this will effectively remove all the lets in the program.
+- If you come from a PL background and like A-normal form, we will provide a dataflow to A-normal form pass.
+- For PL folks, when you are implementing something (like a dataflow-to-ANF transformation), be mindful that expressions can be DAGs, and this usually means that we should visit expressions with a ``Map<Expr, Result>`` and only compute the transformed result once, so the resulting expression keeps the common structure.
+
+There are additional advanced concepts such as symbolic shape inference, polymorphic functions
+that are not covered by this material; you are more than welcome to look at other materials.
diff --git a/docs/faq.md b/docs/faq.md
index 54df0ced8fa8..9b735e54d5dd 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -4,7 +4,7 @@ This document contains frequently asked questions.
 
 How to Install
 --------------
-See [Installation](http://tvm.ai/install/)
+See [Installation](http://docs.tvm.ai/install/)
 
 TVM's relation to Other IR/DSL Projects
 ---------------------------------------
diff --git a/docs/frontend/tensorflow.md b/docs/frontend/tensorflow.md
new file mode 100644
index 000000000000..acafbb5bb93e
--- /dev/null
+++ b/docs/frontend/tensorflow.md
@@ -0,0 +1,36 @@
+# Tensorflow Frontend
+Tensorflow frontend helps in importing tensorflow released model into TVM.
+
+This document helps few steps while importing various different models from
+[tensorflow research/slim](https://github.com/tensorflow/models/tree/master/research/slim).
+
+Current frontend is tested with all versions of below models
+- Inception (V1/V2/V3/V4)
+- Resnet (All)
+- Mobilenet (V1/V2 All)
+- Vgg (16/19)
+
+Tensorflow frontend expects a freezed protobuf format as input.
+
+Not all models are released as freezed protobuf. Some of them are checkpoints (.ckpt).
+Please refer to [export](https://github.com/tensorflow/models/tree/master/research/slim#exporting-the-inference-graph) 
+and [freeze](https://github.com/tensorflow/models/tree/master/research/slim#freezing-the-exported-graph) 
+instructions to generate protobuf from checkpoint.
+
+## General Instructions
+
+### Add Shapes:
+While freezing of protobuf add additional option ```add_shapes=True``` to embed output shapes of each node into graph.
+You may use ```nnvm.testing.tf.AddShapesToGraphDef``` from nnvm for the same.
+Please refer to [tensorflow tutorial](https://github.com/dmlc/tvm/blob/master/tutorials/nnvm/from_tensorflow.py).
+
+### Explicit Shape:
+There might be situations where the add_shapes=True may not provide sufficient information about shape.
+You may pass explicit dictionary of input shapes argument for ```from_tensorflow```.
+Please refer to [test cases](https://github.com/dmlc/tvm/blob/master/nnvm/tests/python/frontend/tensorflow/test_forward.py#L36).
+
+### GPU:
+Most of these tensorflow models are released for CPU with NHWC layout.
+To compile for GPU we need to pass extra argument ```layout='NCHW'``` for from_tensorflow.
+This option will do a layout conversion before and after for neural network ops.
+Remaining nnvm build options for GPU compilation remain as it is.
diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst
index 2228e92b2f22..00cb96fe4e19 100644
--- a/docs/install/from_source.rst
+++ b/docs/install/from_source.rst
@@ -42,13 +42,14 @@ The minimal building requirements are
 - A recent c++ compiler supporting C++ 11 (g++-4.8 or higher)
 - CMake 3.5 or higher
 - We highly recommend to build with LLVM to enable all the features.
-- It is possible to build without llvm dependency if we only want to use CUDA/OpenCL
+- It is possible to build TVM without the LLVM dependency if you only want to use CUDA/OpenCL
+- If you want to use the NNVM compiler, then LLVM is required
 
 We use cmake to build the library.
 The configuration of tvm can be modified by `config.cmake`.
 
 
-- First, check the cmake in your system, you do not have cmake
+- First, check the cmake in your system. If you do not have cmake,
   you can obtain the latest version from `official website <https://cmake.org/download/>`_
 - First create a build directory, copy the ``cmake/config.cmake`` to the directory.
 
@@ -67,13 +68,13 @@ The configuration of tvm can be modified by `config.cmake`.
 
   - LLVM 4.0 or higher is needed for build with LLVM. Note that verison of LLVM from default apt may lower than 4.0.
   - Since LLVM takes long time to build from source, you can download pre-built version of LLVM from
-    [LLVM Download Page](http://releases.llvm.org/download.html).
+    `LLVM Download Page <http://releases.llvm.org/download.html>`_.
 
 
     - Unzip to a certain location, modify ``build/config.cmake`` to add ``set(USE_LLVM /path/to/your/llvm/bin/llvm-config)``
     - You can also directly set ``set(USE_LLVM ON)`` and let cmake search for a usable version of LLVM.
 
-  - You can also use [LLVM Nightly Ubuntu Build](https://apt.llvm.org/)
+  - You can also use `LLVM Nightly Ubuntu Build <https://apt.llvm.org/>`_
 
     - Note that apt-package append ``llvm-config`` with version number.
       For example, set ``set(LLVM_CONFIG llvm-config-4.0)`` if you installed 4.0 package
@@ -124,13 +125,15 @@ TVM package
 ~~~~~~~~~~~
 
 The python package is located at `tvm/python`
-There are several ways to install the package:
+There are two ways to install the package:
 
-1. Set the environment variable `PYTHONPATH` to tell python where to find
+Method 1
+   This method is **recommended for developers** who may change the codes.
+
+   Set the environment variable `PYTHONPATH` to tell python where to find
    the library. For example, assume we cloned `tvm` on the home directory
    `~`. then we can added the following line in `~/.bashrc`.
-   It is **recommended for developers** who may change the codes.
-   The changes will be immediately reflected once you pulled the code and rebuild the project (no need to call ``setup`` again)
+   The changes will be immediately reflected once you pull the code and rebuild the project (no need to call ``setup`` again)
 
    .. code:: bash
 
@@ -138,7 +141,8 @@ There are several ways to install the package:
        export PYTHONPATH=$TVM_HOME/python:$TVM_HOME/topi/python:$TVM_HOME/nnvm/python:${PYTHONPATH}
 
 
-2. Install tvm python bindings by `setup.py`:
+Method 2
+   Install tvm python bindings by `setup.py`:
 
    .. code:: bash
 
diff --git a/docs/langref/hybrid_script.rst b/docs/langref/hybrid_script.rst
index fdaed2b5be40..f8da87d8cfd2 100644
--- a/docs/langref/hybrid_script.rst
+++ b/docs/langref/hybrid_script.rst
@@ -22,13 +22,15 @@ you need to use ``tvm.hybrid.script`` decorator to indicate this is a hybrid fun
 
     @tvm.hybrid.script
     def outer_product(a, b, c):
+        c = output_tensor((100, 99), 'float32')
         for i in range(a.shape[0]):
             for j in range(b.shape[0]):
                 c[i, j] = a[i] * b[j]
-    a = numpy.random.rand(100)
-    b = numpy.random.rand(99)
-    c = numpy.zeros((100, 99))
-    outer_product(a, b, c)
+          return c
+    a = numpy.random.randn(100)
+    b = numpy.random.randn(99)
+    c = outer_product(a, b)
+
 
 This decorator will import `Keywords`_ required spontaneously when software emulation.
 After software emulation is done, the imported keywords will be cleaned up. Users do not need
@@ -40,25 +42,25 @@ or ``numpy`` numeric type.
 Backend Compilation
 ~~~~~~~~~~~~~~~~~~~
 
+This function is not encouraged to use, users are encouraged to use the second interface.
 The current parse interface looks like:
 
 .. code-block:: python
 
    a = tvm.placeholder((100, ), name='a')
    b = tvm.placeholder((99, ), name='b')
-   c = tvm.placeholder((100, 99), name='c')
-   tvm.hybrid.parse(outer_product, [a, b, c]) # return an ir root of this function
+   parser = tvm.hybrid.parse(outer_product, [a, b]) # return the parser of this function
 
-If we pass these tvm tensors to this function, it returns a op node:
 
-**Under construction, we are still deciding what kind of node should be returned.**
+If we pass these tvm tensors to this function, it returns a op node:
 
 .. code-block:: python
 
    a = tvm.placeholder((100, ), name='a')
    b = tvm.placeholder((99, ), name='b')
-   c = tvm.placeholder((100, 99), name='c')
-   op = outer_product(a, b, c) # return the corresponding op node
+   c = outer_product(a, b, c) # return the output tensor(s) of the operator
+
+**Under construction, we are still deciding what kind of node should be returned.**
 
 Tuning
 ~~~~~~
diff --git a/docs/langref/index.rst b/docs/langref/index.rst
index 65f78d1d278b..22ca00f7faa5 100644
--- a/docs/langref/index.rst
+++ b/docs/langref/index.rst
@@ -6,4 +6,5 @@ embedded languages in TVM stack.
 .. toctree::
    :maxdepth: 2
 
+   relay_op
    hybrid_script
diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst
new file mode 100644
index 000000000000..f053165470fe
--- /dev/null
+++ b/docs/langref/relay_op.rst
@@ -0,0 +1,252 @@
+Relay Core Tensor Operators
+===========================
+
+This page contains the list of core tensor operator primitives pre-defined in tvm.relay.
+The core tensor operator primitives covers typical workloads in deep learning.
+They can represent workloads in front-end frameworks, and provide basic building blocks for optimization.
+Since deep learning is a fast evolving field and it is that possible to have operators that are not in here.
+
+
+.. note::
+
+   This document will directly list the function signature of
+   these operators in the python frontend.
+
+
+Overview of Operators
+---------------------
+**Level 1: Basic Operators**
+
+This level enables fully connected multi-layer perceptron.
+
+.. autosummary::
+   :nosignatures:
+
+   tvm.relay.log
+   tvm.relay.sqrt
+   tvm.relay.exp
+   tvm.relay.sigmoid
+   tvm.relay.add
+   tvm.relay.expand_dims
+   tvm.relay.concatenate
+   tvm.relay.nn.softmax
+   tvm.relay.nn.log_softmax
+   tvm.relay.subtract
+   tvm.relay.multiply
+   tvm.relay.divide
+   tvm.relay.mod
+   tvm.relay.tanh
+   tvm.relay.nn.relu
+   tvm.relay.nn.dropout
+   tvm.relay.nn.batch_norm
+   tvm.relay.nn.bias_add
+
+
+
+**Level 2: Convolutions**
+
+This level enables typical convnet models.
+
+.. autosummary::
+   :nosignatures:
+
+   tvm.relay.nn.conv2d
+   tvm.relay.nn.conv2d_transpose
+   tvm.relay.nn.dense
+   tvm.relay.nn.max_pool2d
+   tvm.relay.nn.avg_pool2d
+   tvm.relay.nn.global_max_pool2d
+   tvm.relay.nn.global_avg_pool2d
+   tvm.relay.nn.upsampling
+   tvm.relay.nn.batch_flatten
+   tvm.relay.nn.pad
+   tvm.relay.nn.lrn
+   tvm.relay.nn.l2_normalize
+
+
+**Level 3: Additional Math And Transform Operators**
+
+This level enables additional math and transform operators.
+
+.. autosummary::
+   :nosignatures:
+
+   tvm.relay.zeros
+   tvm.relay.nn.leaky_relu
+   tvm.relay.nn.prelu
+   tvm.relay.zeros_like
+   tvm.relay.ones
+   tvm.relay.ones_like
+   tvm.relay.reshape
+   tvm.relay.reshape_like
+   tvm.relay.copy
+   tvm.relay.transpose
+   tvm.relay.squeeze
+   tvm.relay.floor
+   tvm.relay.ceil
+   tvm.relay.trunc
+   tvm.relay.round
+   tvm.relay.abs
+   tvm.relay.negative
+   tvm.relay.take
+   tvm.relay.zeros
+   tvm.relay.zeros_like
+   tvm.relay.ones
+   tvm.relay.ones_like
+   tvm.relay.full
+   tvm.relay.full_like
+   tvm.relay.cast
+   tvm.relay.split
+
+
+**Level 4: Broadcast and Reductions**
+
+.. autosummary::
+   :nosignatures:
+
+   tvm.relay.right_shift
+   tvm.relay.left_shift
+   tvm.relay.equal
+   tvm.relay.not_equal
+   tvm.relay.greater
+   tvm.relay.greater_equal
+   tvm.relay.less
+   tvm.relay.less_equal
+   tvm.relay.maximum
+   tvm.relay.minimum
+   tvm.relay.power
+   tvm.relay.where
+   tvm.relay.argmax
+   tvm.relay.argmin
+   tvm.relay.sum
+   tvm.relay.max
+   tvm.relay.min
+   tvm.relay.mean
+   tvm.relay.prod
+   tvm.relay.strided_slice
+   tvm.relay.broadcast_to
+
+
+**Level 5: Vision/Image Operators**
+
+.. autosummary::
+   :nosignatures:
+
+   tvm.relay.image.resize
+   tvm.relay.vision.multibox_prior
+   tvm.relay.vision.multibox_transform_loc
+   tvm.relay.vision.nms
+
+
+**Level 10: Temporary Operators**
+
+This level support backpropagation of broadcast operators. It is temporary.
+
+.. autosummary::
+   :nosignatures:
+
+   tvm.relay.broadcast_to_like
+   tvm.relay.collapse_sum_like
+   tvm.relay.slice_like
+
+
+Level 1 Definitions
+-------------------
+.. autofunction:: tvm.relay.log
+.. autofunction:: tvm.relay.sqrt
+.. autofunction:: tvm.relay.exp
+.. autofunction:: tvm.relay.sigmoid
+.. autofunction:: tvm.relay.add
+.. autofunction:: tvm.relay.subtract
+.. autofunction:: tvm.relay.multiply
+.. autofunction:: tvm.relay.divide
+.. autofunction:: tvm.relay.mod
+.. autofunction:: tvm.relay.tanh
+.. autofunction:: tvm.relay.concatenate
+.. autofunction:: tvm.relay.expand_dims
+.. autofunction:: tvm.relay.nn.softmax
+.. autofunction:: tvm.relay.nn.log_softmax
+.. autofunction:: tvm.relay.nn.relu
+.. autofunction:: tvm.relay.nn.dropout
+.. autofunction:: tvm.relay.nn.batch_norm
+.. autofunction:: tvm.relay.nn.bias_add
+
+
+Level 2 Definitions
+-------------------
+.. autofunction:: tvm.relay.nn.conv2d
+.. autofunction:: tvm.relay.nn.conv2d_transpose
+.. autofunction:: tvm.relay.nn.dense
+.. autofunction:: tvm.relay.nn.max_pool2d
+.. autofunction:: tvm.relay.nn.avg_pool2d
+.. autofunction:: tvm.relay.nn.global_max_pool2d
+.. autofunction:: tvm.relay.nn.global_avg_pool2d
+.. autofunction:: tvm.relay.nn.upsampling
+.. autofunction:: tvm.relay.nn.batch_flatten
+.. autofunction:: tvm.relay.nn.lrn
+.. autofunction:: tvm.relay.nn.l2_normalize
+
+
+Level 3 Definitions
+-------------------
+.. autofunction:: tvm.relay.nn.leaky_relu
+.. autofunction:: tvm.relay.nn.prelu
+.. autofunction:: tvm.relay.floor
+.. autofunction:: tvm.relay.ceil
+.. autofunction:: tvm.relay.trunc
+.. autofunction:: tvm.relay.round
+.. autofunction:: tvm.relay.abs
+.. autofunction:: tvm.relay.negative
+.. autofunction:: tvm.relay.reshape
+.. autofunction:: tvm.relay.reshape_like
+.. autofunction:: tvm.relay.copy
+.. autofunction:: tvm.relay.squeeze
+.. autofunction:: tvm.relay.transpose
+.. autofunction:: tvm.relay.take
+.. autofunction:: tvm.relay.zeros
+.. autofunction:: tvm.relay.zeros_like
+.. autofunction:: tvm.relay.ones
+.. autofunction:: tvm.relay.ones_like
+.. autofunction:: tvm.relay.full
+.. autofunction:: tvm.relay.full_like
+.. autofunction:: tvm.relay.cast
+.. autofunction:: tvm.relay.split
+
+
+Level 4 Definitions
+-------------------
+.. autofunction:: tvm.relay.right_shift
+.. autofunction:: tvm.relay.left_shift
+.. autofunction:: tvm.relay.equal
+.. autofunction:: tvm.relay.not_equal
+.. autofunction:: tvm.relay.greater
+.. autofunction:: tvm.relay.greater_equal
+.. autofunction:: tvm.relay.less
+.. autofunction:: tvm.relay.less_equal
+.. autofunction:: tvm.relay.maximum
+.. autofunction:: tvm.relay.minimum
+.. autofunction:: tvm.relay.power
+.. autofunction:: tvm.relay.where
+.. autofunction:: tvm.relay.argmax
+.. autofunction:: tvm.relay.argmin
+.. autofunction:: tvm.relay.sum
+.. autofunction:: tvm.relay.max
+.. autofunction:: tvm.relay.min
+.. autofunction:: tvm.relay.mean
+.. autofunction:: tvm.relay.prod
+.. autofunction:: tvm.relay.strided_slice
+
+
+Level 5 Definitions
+-------------------
+.. autofunction:: tvm.relay.image.resize
+.. autofunction:: tvm.relay.vision.multibox_prior
+.. autofunction:: tvm.relay.vision.multibox_transform_loc
+.. autofunction:: tvm.relay.vision.nms
+
+
+Level 10 Definitions
+--------------------
+.. autofunction:: tvm.relay.broadcast_to_like
+.. autofunction:: tvm.relay.collapse_sum_like
+.. autofunction:: tvm.relay.slice_like
diff --git a/docs/nnvm_top.rst b/docs/nnvm_top.rst
index 96a37b779e1e..717ce985e002 100644
--- a/docs/nnvm_top.rst
+++ b/docs/nnvm_top.rst
@@ -29,6 +29,7 @@ This level enables fully connected multi-layer perceptron.
 
    nnvm.symbol.dense
    nnvm.symbol.relu
+   nnvm.symbol.prelu
    nnvm.symbol.tanh
    nnvm.symbol.sigmoid
    nnvm.symbol.exp
@@ -39,6 +40,8 @@ This level enables fully connected multi-layer perceptron.
    nnvm.symbol.elemwise_mul
    nnvm.symbol.elemwise_div
    nnvm.symbol.elemwise_sum
+   nnvm.symbol.elemwise_mod
+   nnvm.symbol.elemwise_pow
    nnvm.symbol.flatten
    nnvm.symbol.concatenate
    nnvm.symbol.expand_dims
@@ -50,6 +53,15 @@ This level enables fully connected multi-layer perceptron.
    nnvm.symbol.log_softmax
    nnvm.symbol.pad
    nnvm.symbol.block_grad
+   nnvm.symbol.matmul
+   nnvm.symbol.resize
+   nnvm.symbol.upsampling
+   nnvm.symbol.take
+   nnvm.symbol.l2_normalize
+   nnvm.symbol.flip
+   nnvm.symbol.lrn
+   nnvm.symbol.where
+   nnvm.symbol.gather_nd
 
 
 **Level 2: Convolutions**
@@ -92,6 +104,7 @@ This level enables typical convnet models.
    nnvm.symbol.__lshift_scalar__
    nnvm.symbol.__rshift_scalar__
 
+
 **Level 4: Broadcast and Reductions**
 
 .. autosummary::
@@ -102,6 +115,8 @@ This level enables typical convnet models.
    nnvm.symbol.sum
    nnvm.symbol.min
    nnvm.symbol.max
+   nnvm.symbol.mean
+   nnvm.symbol.prod
    nnvm.symbol.broadcast_add
    nnvm.symbol.broadcast_sub
    nnvm.symbol.broadcast_mul
@@ -117,11 +132,41 @@ This level enables typical convnet models.
    nnvm.symbol.ones_like
    nnvm.symbol.zeros
    nnvm.symbol.zeros_like
+   nnvm.symbol.slice_like
+   nnvm.symbol.strided_slice
+   nnvm.symbol.argmax
+   nnvm.symbol.argmin
+   nnvm.symbol.collapse_sum
+   nnvm.symbol.broadcast_equal
+   nnvm.symbol.broadcast_greater_equal
+   nnvm.symbol.broadcast_greater
+   nnvm.symbol.broadcast_left_shift
+   nnvm.symbol.broadcast_less_equal
+   nnvm.symbol.broadcast_less
+   nnvm.symbol.broadcast_max
+   nnvm.symbol.broadcast_min
+   nnvm.symbol.broadcast_mod
+   nnvm.symbol.broadcast_not_equal
+   nnvm.symbol.broadcast_pow
+   nnvm.symbol.broadcast_right_shift
+
+
+**Level 5: Vision Operators**
+
+.. autosummary::
+   :nosignatures:
+
+   nnvm.symbol.multibox_prior
+   nnvm.symbol.multibox_transform_loc
+   nnvm.symbol.nms
+   nnvm.symbol.yolo_region
+   nnvm.symbol.yolo_reorg
 
 Detailed Definitions
 --------------------
 .. autofunction:: nnvm.symbol.dense
 .. autofunction:: nnvm.symbol.relu
+.. autofunction:: nnvm.symbol.prelu
 .. autofunction:: nnvm.symbol.tanh
 .. autofunction:: nnvm.symbol.sigmoid
 .. autofunction:: nnvm.symbol.exp
@@ -132,6 +177,8 @@ Detailed Definitions
 .. autofunction:: nnvm.symbol.elemwise_mul
 .. autofunction:: nnvm.symbol.elemwise_div
 .. autofunction:: nnvm.symbol.elemwise_sum
+.. autofunction:: nnvm.symbol.elemwise_mod
+.. autofunction:: nnvm.symbol.elemwise_pow
 .. autofunction:: nnvm.symbol.flatten
 .. autofunction:: nnvm.symbol.concatenate
 .. autofunction:: nnvm.symbol.expand_dims
@@ -143,6 +190,15 @@ Detailed Definitions
 .. autofunction:: nnvm.symbol.log_softmax
 .. autofunction:: nnvm.symbol.pad
 .. autofunction:: nnvm.symbol.block_grad
+.. autofunction:: nnvm.symbol.matmul
+.. autofunction:: nnvm.symbol.resize
+.. autofunction:: nnvm.symbol.upsampling
+.. autofunction:: nnvm.symbol.take
+.. autofunction:: nnvm.symbol.l2_normalize
+.. autofunction:: nnvm.symbol.flip
+.. autofunction:: nnvm.symbol.lrn
+.. autofunction:: nnvm.symbol.where
+.. autofunction:: nnvm.symbol.gather_nd
 
 .. autofunction:: nnvm.symbol.conv2d
 .. autofunction:: nnvm.symbol.conv2d_transpose
@@ -176,6 +232,8 @@ Detailed Definitions
 .. autofunction:: nnvm.symbol.sum
 .. autofunction:: nnvm.symbol.min
 .. autofunction:: nnvm.symbol.max
+.. autofunction:: nnvm.symbol.mean
+.. autofunction:: nnvm.symbol.prod
 .. autofunction:: nnvm.symbol.broadcast_add
 .. autofunction:: nnvm.symbol.broadcast_sub
 .. autofunction:: nnvm.symbol.broadcast_mul
@@ -191,3 +249,26 @@ Detailed Definitions
 .. autofunction:: nnvm.symbol.ones_like
 .. autofunction:: nnvm.symbol.zeros
 .. autofunction:: nnvm.symbol.zeros_like
+.. autofunction:: nnvm.symbol.slice_like
+.. autofunction:: nnvm.symbol.strided_slice
+.. autofunction:: nnvm.symbol.argmax
+.. autofunction:: nnvm.symbol.argmin
+.. autofunction:: nnvm.symbol.collapse_sum
+.. autofunction:: nnvm.symbol.broadcast_equal
+.. autofunction:: nnvm.symbol.broadcast_greater_equal
+.. autofunction:: nnvm.symbol.broadcast_greater
+.. autofunction:: nnvm.symbol.broadcast_left_shift
+.. autofunction:: nnvm.symbol.broadcast_less_equal
+.. autofunction:: nnvm.symbol.broadcast_less
+.. autofunction:: nnvm.symbol.broadcast_max
+.. autofunction:: nnvm.symbol.broadcast_min
+.. autofunction:: nnvm.symbol.broadcast_mod
+.. autofunction:: nnvm.symbol.broadcast_not_equal
+.. autofunction:: nnvm.symbol.broadcast_pow
+.. autofunction:: nnvm.symbol.broadcast_right_shift
+
+.. autofunction:: nnvm.symbol.multibox_prior
+.. autofunction:: nnvm.symbol.multibox_transform_loc
+.. autofunction:: nnvm.symbol.nms
+.. autofunction:: nnvm.symbol.yolo_region
+.. autofunction:: nnvm.symbol.yolo_reorg
diff --git a/docs/vta/install.md b/docs/vta/install.md
index ca5969386e80..4a05f9fd8318 100644
--- a/docs/vta/install.md
+++ b/docs/vta/install.md
@@ -67,7 +67,7 @@ This guide covers the following themes:
 
 Setup your Pynq board based on the [Pynq board getting started tutorial](http://pynq.readthedocs.io/en/latest/getting_started.html).
 You should follow the instructions up to and including the *Turning On the PYNQ-Z1* step (no need to pursue the tutorial beyond this point).
-* Make sure that you've downloaded the latest Pynq image, [PYNQ-Z1 v2.1](http://pynq-testing.readthedocs.io/en/image_v2.2/getting_started/pynq_image.html) (released 21 Feb 2018), and have imaged your SD card with it (we recommend the free [Etcher](https://etcher.io/) program).
+* Make sure that you've downloaded the latest Pynq image, [PYNQ-Z1 v2.3](http://www.pynq.io/board.html) (released October 3rd 2018), and have imaged your SD card with it (we recommend the free [Etcher](https://etcher.io/) program).
 * For this test setup, follow the ["Connect to a Computer"](http://pynq.readthedocs.io/en/latest/getting_started.html#connect-to-a-computer) Ethernet setup instructions. To be able to talk to the board, make sure to [assign your computer a static IP address](http://pynq.readthedocs.io/en/latest/appendix.html#assign-your-computer-a-static-ip)
 
 Once the board is powered on and connected to your development machine, try connecting to it to make sure you've properly set up your Pynq board:
diff --git a/golang/Makefile b/golang/Makefile
new file mode 100644
index 000000000000..54019740c87a
--- /dev/null
+++ b/golang/Makefile
@@ -0,0 +1,64 @@
+.PHONY: clean all
+
+TVM_BASE   = $(CURDIR)/../
+TARGET     = gotvm
+LIBS       = -lm -ldl
+NATIVE_SRC = tvm_runtime_pack.cc
+
+GOPATH=$(CURDIR)/gopath
+GOPATHDIR=${GOPATH}/src/${TARGET}/
+CGO_CPPFLAGS="-I. -I${TVM_BASE}/ -I${TVM_BASE}/3rdparty/dmlc-core/include -I${TVM_BASE}/include -I${TVM_BASE}/3rdparty/dlpack/include/"
+CGO_CXXFLAGS="-std=c++11"
+CGO_CFLAGS="-I${TVM_BASE}"
+CGO_LDFLAGS="-ldl -lm"
+
+all:
+	@mkdir gopath 2>/dev/null || true
+	@mkdir gopath/src 2>/dev/null || true
+	@mkdir gopath/src/$(TARGET) 2>/dev/null || true
+	@cp src/$(TARGET).cc gopath/src/$(TARGET)
+	@cp src/$(TARGET).h gopath/src/$(TARGET)
+	@cp src/$(NATIVE_SRC) gopath/src/$(TARGET)
+	@cp src/*.go gopath/src/$(TARGET)
+	@export GOPATH=$(GOPATH); \
+	export CGO_CPPFLAGS=$(CGO_CPPFLAGS); \
+	export CGO_CXXFLAGS=$(CGO_CXXFLAGS); \
+	export CGO_CFLAGS=$(CGO_CFLAGS); \
+	export CGO_LDFLAGS=$(CGO_LDFLAGS); \
+	(cd $(GOPATHDIR) && go clean -cache \
+	&& golint && go build -o $(TARGET).a \
+	&& go install)
+	@find . -name gotvm.a
+	@#mkdir gopath/doc 2>/dev/null || true
+	@#godoc -html -goroot gopath/ gotvm | grep -v "for documentation on the gotvm command" > gopath/doc/gotvm.html
+	@#echo "Run 'godoc -http=:6060  -goroot=./gopath' for documentation"
+
+samples: all
+	cp gopath/pkg/linux_amd64/gotvm.a sample/ -rfa
+	make -C sample
+
+tests: all
+	@(cd sample; python3 deploy.py)
+	@export GOPATH=$(GOPATH); \
+	export CGO_CPPFLAGS=$(CGO_CPPFLAGS); \
+	export CGO_CXXFLAGS=$(CGO_CXXFLAGS); \
+	export CGO_CFLAGS=$(CGO_CFLAGS); \
+	export CGO_LDFLAGS=$(CGO_LDFLAGS); \
+	(cd $(GOPATHDIR) \
+	&& cp ../../../sample/deploy.so . \
+	&& go test -v)
+
+clean:
+	@if [ -d $(GOPATHDIR) ] ; then \
+	export GOPATH=$(GOPATH); \
+	export CGO_CPPFLAGS=$(CGO_CPPFLAGS); \
+	export CGO_CFLAGS=$(CGO_CFLAGS); \
+	export CGO_LDFLAGS=$(CGO_LDFLAGS); \
+	(cd $(GOPATHDIR) && go clean -cache); fi
+	@rm -rf gopath
+	@make -C sample clean
+
+lint:
+	@(cd src; golint)
+	@python3 ${TVM_BASE}/dmlc-core/scripts/lint.py gotvm cpp src/*.cc
+	@python3 ${TVM_BASE}/dmlc-core/scripts/lint.py gotvm cpp src/*.h
diff --git a/golang/README.md b/golang/README.md
new file mode 100644
index 000000000000..9c152dd7365c
--- /dev/null
+++ b/golang/README.md
@@ -0,0 +1,107 @@
+# gotvm - Golang Frontend for TVM Runtime
+
+This folder contain golang interface for TVM runtime. It brings TVM runtime to Golang.
+
+- It enable c runtime api of tvm exposed to golang.
+- It enables module loading (lib, graph and params) and inference operations.
+
+## Installation
+
+### Requirements
+
+- go compiler (https://golang.org/) version 0.10 or above.
+
+### Modules
+
+- src
+  Module that generates golang package corresponding to the c runtime api exposed from tvm source tree.
+  This process build golang package _gotvm.a_
+
+- samples
+  Sample golang reference application to inference through gotvm package.
+
+### Build
+
+Once the Requirements are installed
+
+To build _gotvm_ package
+
+```bash
+make
+```
+
+To build and run internal tests
+
+```bash
+make tests
+```
+
+To build sample apps.
+
+```bash
+make samples
+```
+
+## Run
+
+To Demonstrates sample TVM module compilation using python and deploy via golang.
+```bash
+./simple
+``` 
+
+To deploy a realtime module with lib, graph and param.
+```bash
+./complex
+```
+
+To demonstrate go function closure conversion to packed function handle.
+
+```bash
+./pack_func_convert
+```
+
+To demonstrate a packed function handle given as an argument.
+
+```bash
+pack_func_handle_arg
+```
+
+To register go function with runtime as a global function.
+
+```bash
+pack_func_register
+```
+
+To demonstrate function closure passed as argument to a function call.
+
+```bash
+./pack_func_closure_arg
+```
+
+To demonstrate function closure returned from a packed function.
+
+```bash
+./pack_func_closure_return
+```
+
+## Documentation
+gotvm.go is documented with sufficient information about gotvm package.
+A html version documentation can be accessed by running below command after building runtime.
+
+```bash
+godoc -http=:6060  -goroot=./gopath
+```
+After above command try http://127.0.0.1:6060 from any browser.
+
+Also please refer to the sample applications under sample folder.
+
+## Docker
+Docker setup may need below additions for dependencies and environment preparation.
+
+Please refer ```docker/install/ubuntu_install_golang.sh``` for the packages dependencies.
+
+go compiler 1.10 on ubuntu doesn't install on standard path, hence an explicit export may be needed as shown below.
+
+```bash
+export PATH="/usr/lib/go-1.10/bin:$PATH"```
+```
diff --git a/golang/sample/Makefile b/golang/sample/Makefile
new file mode 100644
index 000000000000..8ebea49da42f
--- /dev/null
+++ b/golang/sample/Makefile
@@ -0,0 +1,17 @@
+.PHONY: clean all
+
+SOURCES=$(wildcard *.go)
+EXECUTABLE=$(patsubst %.go, %, $(SOURCES))
+
+all: $(EXECUTABLE)
+	@golint
+	@python3 deploy.py
+
+%: %.o
+	@go tool link -linkmode external -extld "g++" -extldflags "-ldl" -o $@ $<
+
+%.o: %.go
+	@go tool compile -pack -o $@ $<
+
+clean:
+	@rm -f $(EXECUTABLE) *.so *.o *.a
diff --git a/golang/sample/complex.go b/golang/sample/complex.go
new file mode 100644
index 000000000000..7a8d0044375c
--- /dev/null
+++ b/golang/sample/complex.go
@@ -0,0 +1,171 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief Sample golang application deployment over tvm.
+ * \file complex.go
+ */
+
+package main
+
+import (
+    "fmt"
+    "io/ioutil"
+    "math/rand"
+    "./gotvm"
+    "runtime"
+)
+
+// NNVM compiled model paths.
+const (
+    modLib    = "./mobilenet.so"
+    modJSON   = "./mobilenet.json"
+    modParams = "./mobilenet.params"
+)
+
+// main
+func main() {
+    defer runtime.GC()
+    // Welcome
+    fmt.Printf("TVM Version   : v%v\n", gotvm.TVMVersion)
+    fmt.Printf("DLPACK Version: v%v\n\n", gotvm.DLPackVersion)
+
+    // Query global functions available
+    funcNames, err := gotvm.FuncListGlobalNames()
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    fmt.Printf("Global Functions:%v\n", funcNames)
+
+    // Import tvm module (so)
+    modp, err := gotvm.LoadModuleFromFile(modLib)
+    if err != nil {
+        fmt.Print(err)
+        fmt.Printf("Please copy tvm compiled modules here and update the sample.go accordingly.\n")
+        fmt.Printf("You may need to update modLib, modJSON, modParams, tshapeIn, tshapeOut\n")
+        return
+    }
+    fmt.Printf("Module Imported:%p\n", modp)
+    bytes, err := ioutil.ReadFile(modJSON)
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    jsonStr := string(bytes)
+
+    // Load module on tvm runtime - call tvm.graph_runtime.create
+    funp, err := gotvm.GetGlobalFunction("tvm.graph_runtime.create")
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    fmt.Printf("Calling tvm.graph_runtime.create\n")
+    // Call function
+    graphrt, err := funp.Invoke(jsonStr, modp, (int64)(gotvm.KDLCPU), (int64)(0))
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    graphmod := graphrt.AsModule()
+    fmt.Printf("Graph runtime Created\n")
+
+    // Array allocation attributes
+    tshapeIn  := []int64{1, 224, 224, 3}
+    tshapeOut := []int64{1, 1001}
+
+    // Allocate input Array
+    inX, err := gotvm.Empty(tshapeIn, "float32", gotvm.CPU(0))
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+
+    // Allocate output Array
+    out, err := gotvm.Empty(tshapeOut)
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    fmt.Printf("Input and Output Arrays allocated\n")
+
+    // Get module function from graph runtime : load_params
+    // Read params
+    bytes, err = ioutil.ReadFile(modParams)
+    if err != nil {
+        fmt.Print(err)
+    }
+
+    // Load Params
+    funp, err = graphmod.GetFunction("load_params")
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    fmt.Printf("Func load_params:%p\n", funp)
+
+    // Call function
+    _, err = funp.Invoke(bytes)
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    fmt.Printf("Module params loaded\n")
+
+    // Set some data in input Array
+    inSlice := make([]float32, (244 * 244 * 3))
+    rand.Seed(10)
+    rand.Shuffle(len(inSlice), func(i, j int) {inSlice[i],
+                                               inSlice[j] = rand.Float32(),
+                                               rand.Float32() })
+    inX.CopyFrom(inSlice)
+
+    // Set Input
+    funp, err = graphmod.GetFunction("set_input")
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+
+    // Call function
+    _, err = funp.Invoke("input", inX)
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+
+    fmt.Printf("Module input is set\n")
+
+    // Run
+    funp, err = graphmod.GetFunction("run")
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+
+    // Call function
+    _, err = funp.Invoke()
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    fmt.Printf("Module Executed \n")
+
+    // Call runtime function get_output
+    funp, err = graphmod.GetFunction("get_output")
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+
+    // Call function
+    _, err = funp.Invoke(int64(0), out)
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    fmt.Printf("Got Module Output \n")
+
+    // Print results
+    outIntf, _ := out.AsSlice()
+    outSlice := outIntf.([]float32)
+    fmt.Printf("Result:%v\n", outSlice[:10])
+}
diff --git a/golang/sample/deploy.py b/golang/sample/deploy.py
new file mode 100644
index 000000000000..065638299bc6
--- /dev/null
+++ b/golang/sample/deploy.py
@@ -0,0 +1,40 @@
+"""
+Get Started with TVM Go
+=======================
+"""
+from __future__ import absolute_import, print_function
+
+import tvm
+import numpy as np
+
+# Global declarations of environment.
+
+tgt_host="llvm"
+tgt="llvm"
+
+######################################################################
+# Describe the Computation
+# ------------------------
+n = tvm.var("n")
+A = tvm.placeholder((n,), name='A')
+B = tvm.placeholder((n,), name='B')
+C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C")
+
+######################################################################
+# Schedule the Computation
+# ------------------------
+s = tvm.create_schedule(C.op)
+
+######################################################################
+# Compilation
+# -----------
+fadd = tvm.build(s, [A, B, C], tgt, target_host=tgt_host, name="myadd")
+
+######################################################################
+# Save Compiled Module
+# --------------------
+from tvm.contrib import cc
+from tvm.contrib import util
+
+fadd.save("deploy.o")
+cc.create_shared("deploy.so", ["deploy.o"])
diff --git a/golang/sample/pack_func_closure_arg.go b/golang/sample/pack_func_closure_arg.go
new file mode 100644
index 000000000000..b31113160586
--- /dev/null
+++ b/golang/sample/pack_func_closure_arg.go
@@ -0,0 +1,57 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief Sample golang application to demonstrate go-closure given to a packed function argument.
+ * \file pack_func_closure_arg.go
+ */
+
+package main
+
+import (
+    "fmt"
+    "./gotvm"
+)
+
+
+// sampleFunctionArg receives a Packed Function handle and calls it.
+func sampleFunctionArg(args ...*gotvm.Value) (retVal interface{}, err error) {
+    // Reveive Packed Function Handle
+    pfunc := args[0].AsFunction()
+    // Call Packed Function
+    retVal, err = pfunc.Invoke(args[1].AsInt64(), args[2].AsInt64())
+    return
+}
+
+// main
+func main() {
+    // Not passing a function name implicitely
+    // picks the name from reflection as "main.sampleDunctionArg"
+    gotvm.RegisterFunction(sampleFunctionArg);
+    fmt.Printf("Registered: sampleFunctionArg\n")
+
+    // Get registered global function.
+    funp, err := gotvm.GetGlobalFunction("main.sampleFunctionArg")
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    fmt.Printf("GetGlobalFunction: main.sampleFunctionArg - Success\n")
+
+    // funccall is a simple golang callback function like C = A + B.
+    funccall := func (args ...*gotvm.Value) (retVal interface{}, err error) {
+        for _, v := range args {
+            fmt.Printf("ARGS:%T : %v\n", v.AsInt64(), v.AsInt64())
+        }
+        val1 := args[0].AsInt64()
+        val2 := args[1].AsInt64()
+        retVal = int64(val1+val2)
+        return
+    }
+
+    // Call function
+    result, err := funp.Invoke(funccall, 30, 50)
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    fmt.Printf("Invoked sampleFunctionArg with function closure arg : Result:%v\n", result.AsInt64())
+}
diff --git a/golang/sample/pack_func_closure_return.go b/golang/sample/pack_func_closure_return.go
new file mode 100644
index 000000000000..98de8e2e5146
--- /dev/null
+++ b/golang/sample/pack_func_closure_return.go
@@ -0,0 +1,57 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief Sample golang application to demonstrate go-closure returned from a callback function.
+ * \file pack_func_closure_return.go
+ */
+
+package main
+
+import (
+    "fmt"
+    "./gotvm"
+)
+
+// sampleFunctionCb returns a function closure which is embed as packed function in TVMValue.
+func sampleFunctionCb(args ...*gotvm.Value) (retVal interface{}, err error) {
+    funccall := func (cargs ...*gotvm.Value) (fret interface{}, ferr error) {
+        for _, v := range cargs {
+            fmt.Printf("ARGS:%T : %v\n", v.AsInt64(), v.AsInt64())
+        }
+        val1 := cargs[0].AsInt64()
+        val2 := cargs[1].AsInt64()
+        fret = int64(val1+val2)
+        return
+    }
+    retVal = funccall
+    return
+}
+
+// main
+func main() {
+    // Not passing a function name implicitely
+    // picks the name from reflection as "main.sampleDunctionCb"
+    gotvm.RegisterFunction(sampleFunctionCb);
+    fmt.Printf("Registered: sampleFunctionCb\n")
+
+    // Get registered global function
+    funp, err := gotvm.GetGlobalFunction("main.sampleFunctionCb")
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    fmt.Printf("GetGlobalFunction: main.sampleFunctionCb - Success\n")
+
+    // Call function
+    result, err := funp.Invoke()
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    fmt.Printf("Invoked main.sampleFunctionCb via Function handle\n")
+
+    pfunc := result.AsFunction()
+    fmt.Printf("Function Handle received via Packed Function call:%T - %v \n", pfunc, pfunc)
+
+    pfuncRet, err := pfunc.Invoke(30, 40)
+    fmt.Printf("Invoked closure inside sampleFunctionCb result:%v\n", pfuncRet.AsInt64())
+}
diff --git a/golang/sample/pack_func_convert.go b/golang/sample/pack_func_convert.go
new file mode 100644
index 000000000000..6748d67fe75f
--- /dev/null
+++ b/golang/sample/pack_func_convert.go
@@ -0,0 +1,44 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief Sample golang application to demonstrate function conversion to packed function.
+ * \file pack_func_convert.go
+ */
+
+package main
+
+import (
+    "fmt"
+    "./gotvm"
+)
+
+// sampleCb is a simple golang callback function like C = A + B.
+func sampleCb(args ...*gotvm.Value) (retVal interface{}, err error) {
+    for _, v := range args {
+        fmt.Printf("ARGS:%T : %v\n", v.AsInt64(), v.AsInt64())
+    }
+    val1 := args[0].AsInt64()
+    val2 := args[1].AsInt64()
+    retVal = int64(val1+val2)
+    return
+}
+
+// main
+func main() {
+    // Welcome
+
+    // Simple convert to a packed function
+    fhandle, err := gotvm.ConvertFunction(sampleCb)
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    fmt.Printf("Converted function\n")
+
+    retVal, err := fhandle.Invoke(10, 20)
+    fmt.Printf("Invoke Completed\n")
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    fmt.Printf("Result:%v\n", retVal.AsInt64())
+}
diff --git a/golang/sample/pack_func_handle_arg.go b/golang/sample/pack_func_handle_arg.go
new file mode 100644
index 000000000000..ad1313f93f5f
--- /dev/null
+++ b/golang/sample/pack_func_handle_arg.go
@@ -0,0 +1,60 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief Sample golang application to demonstrate converted packed
+ * function handle passed to another packed function.
+ * \file pack_func_handle_arg.go
+ */
+
+package main
+
+import (
+    "fmt"
+    "./gotvm"
+)
+
+// sampleCb is a simple golang callback function like C = A + B.
+func sampleCb(args ...*gotvm.Value) (retVal interface{}, err error) {
+    for _, v := range args {
+        fmt.Printf("ARGS:%T : %v\n", v.AsInt64(), v.AsInt64())
+    }
+    val1 := args[0].AsInt64()
+    val2 := args[1].AsInt64()
+    retVal = int64(val1+val2)
+    return
+}
+
+// sampleFunctionArg receives a Packed Function handle and calls it.
+func sampleFunctionArg(args ...*gotvm.Value) (retVal interface{}, err error) {
+    // Reveive Packed Function Handle
+    pfunc := args[0].AsFunction()
+
+    // Call Packed Function
+    retVal, err = pfunc.Invoke(args[1], args[2])
+    return
+}
+
+// main
+func main() {
+    // Simple convert to a packed function
+    fhandle, err := gotvm.ConvertFunction(sampleCb)
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+
+    gotvm.RegisterFunction(sampleFunctionArg);
+    fmt.Printf("Registered: sampleFunctionArg\n")
+
+    funp, err := gotvm.GetGlobalFunction("main.sampleFunctionArg")
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+
+    retVal, err := funp.Invoke(fhandle, 10, 20)
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    fmt.Printf("Result:%v\n", retVal.AsInt64())
+}
diff --git a/golang/sample/pack_func_register.go b/golang/sample/pack_func_register.go
new file mode 100644
index 000000000000..5da67e00c16c
--- /dev/null
+++ b/golang/sample/pack_func_register.go
@@ -0,0 +1,63 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief Sample golang application to demonstrate function register into TVM global functions.
+ * \file pack_func_register.go
+ */
+
+package main
+
+import (
+    "fmt"
+    "./gotvm"
+    "strings"
+)
+
+// sampleCb is a simple golang callback function like C = A + B.
+func sampleCb(args ...*gotvm.Value) (retVal interface{}, err error) {
+    for _, v := range args {
+        fmt.Printf("ARGS:%T : %v\n", v.AsInt64(), v.AsInt64())
+    }
+    val1 := args[0].AsInt64()
+    val2 := args[1].AsInt64()
+    retVal = int64(val1+val2)
+    return
+}
+
+// main
+func main() {
+    // Register sampleCb with TVM packed function system and call and check Global Function List.
+    gotvm.RegisterFunction(sampleCb, "sampleCb");
+    // Query global functions available
+    funcNames, err := gotvm.FuncListGlobalNames()
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+
+    found := 0
+    for ii := range (funcNames) {
+        if strings.Compare(funcNames[ii], "sampleCb") == 0 {
+            found = 1
+        }
+    }
+    if found == 0 {
+        fmt.Printf("Function registerd but, not listed\n")
+        return
+    }
+
+
+    // Get "sampleCb" and verify the call.
+    funp, err := gotvm.GetGlobalFunction("sampleCb")
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+
+    // Call function
+    result, err := funp.Invoke((int64)(10), (int64)(20))
+    if err != nil {
+        fmt.Print(err)
+        return
+    }
+    fmt.Printf("sampleCb result: %v\n", result.AsInt64())
+}
diff --git a/golang/sample/simple.go b/golang/sample/simple.go
new file mode 100644
index 000000000000..ada3963662de
--- /dev/null
+++ b/golang/sample/simple.go
@@ -0,0 +1,72 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief Sample golang application deployment over tvm.
+ * \file simple.go
+ */
+
+package main
+
+import (
+    "fmt"
+    "runtime"
+    "./gotvm"
+    "math/rand"
+)
+
+// NNVM compiled model paths.
+const (
+    modLib    = "./deploy.so"
+)
+
+// main
+func main() {
+    // Welcome
+    defer runtime.GC()
+    fmt.Printf("TVM Version   : v%v\n", gotvm.TVMVersion)
+    fmt.Printf("DLPACK Version: v%v\n\n", gotvm.DLPackVersion)
+
+    // Import tvm module (so)
+    modp, _ := gotvm.LoadModuleFromFile(modLib)
+    fmt.Printf("Module Imported\n")
+
+
+    // Allocate Array for inputs and outputs.
+    // Allocation by explicit type and context.
+    tshapeIn  := []int64{4}
+    inX, _ := gotvm.Empty(tshapeIn, "float32", gotvm.CPU(0))
+
+    // Default allocation on CPU
+    inY, _ := gotvm.Empty(tshapeIn, "float32")
+
+    // Default allocation to type "float32" and on CPU
+    out, _ := gotvm.Empty(tshapeIn)
+    fmt.Printf("Input and Output Arrays allocated\n")
+
+    // Fill Input Data : inX , inY
+    inXSlice := make([]float32, 4)
+    inYSlice := make([]float32, 4)
+    for i := range inXSlice {
+        inXSlice[i] = rand.Float32()
+        inYSlice[i] = rand.Float32()
+    }
+
+
+    // Copy the data on target memory through runtime CopyFrom api.
+    inX.CopyFrom(inXSlice)
+    inY.CopyFrom(inYSlice)
+    fmt.Printf("X: %v\n", inXSlice)
+    fmt.Printf("Y: %v\n", inYSlice)
+
+    // Get function "myadd"
+    funp, _ := modp.GetFunction("myadd")
+
+    // Call function
+    funp.Invoke(inX, inY, out)
+    fmt.Printf("Module function myadd executed\n")
+
+    // Get the output tensor as an interface holding a slice through runtime CopyTo api.
+    outSlice, _ := out.AsSlice()
+
+    // Print results
+    fmt.Printf("Result:%v\n", outSlice.([]float32))
+}
diff --git a/golang/src/array_test.go b/golang/src/array_test.go
new file mode 100644
index 000000000000..6917dd14e373
--- /dev/null
+++ b/golang/src/array_test.go
@@ -0,0 +1,596 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package
+ * \file array_test.go
+ */
+
+
+package gotvm
+
+import (
+    "testing"
+    "unsafe"
+    "math/rand"
+)
+
+// Create an array and check size.
+func TestArrayCreateSize(t *testing.T) {
+    _, err := Empty([]int64{4})
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    _, err = Empty([]int64{4, 5, 6})
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    _, err = Empty([]int64{})
+    if err == nil {
+        t.Error("Expected err for empty Array created, but didn't got !!")
+        return
+    }
+}
+
+// Check array creation via various different arguments.
+func TestArrayCreateArgs(t *testing.T) {
+    _, err := Empty([]int64{4, 2}, "float32", CPU(0))
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    _, err = Empty([]int64{4, 2}, "float32")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    _, err = Empty([]int64{4, 2}, CPU(0))
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    _, err = Empty([]int64{4, 2}, CPU(0), "float32")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+}
+
+// Create an array and check the NDim.
+func TestArrayNDim(t *testing.T) {
+    arr, err := Empty([]int64{4, 5, 6})
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    if 3 != arr.GetNdim() {
+        t.Errorf("GetNdim failed Expected: 3 Got :%v\n", arr.GetNdim())
+        return
+    }
+}
+
+// Create an array and check Shape.
+func TestArrayShape(t *testing.T) {
+    arr, err := Empty([]int64{4, 5, 6})
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    shape := arr.GetShape()
+    if len(shape) != 3 {
+        t.Errorf("Shape slice expected: 3 Got :%v\n", len(shape))
+        return
+    }
+
+    if shape[0] != 4 || shape[1] != 5 || shape[2] != 6 {
+        t.Errorf("Shape values expected {4, 5, 6} Got : %v\n", shape);
+        return
+    }
+}
+
+// Create an array and check created Context.
+func TestArrayCtx(t *testing.T) {
+    // TODO: Could some test cases for other targets
+    arr, err := Empty([]int64{4}, CPU(0))
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    ctx := arr.GetCtx()
+    if ctx.DeviceType != KDLCPU {
+        t.Errorf("Ctx DeviceType expected: %v Got :%v\n", KDLCPU, ctx.DeviceType)
+        return
+    }
+    if ctx.DeviceID != 0 {
+        t.Errorf("Ctx DeviceID expected: %v Got :%v\n", KDLCPU, ctx.DeviceID)
+        return
+    }
+
+    arr, err = Empty([]int64{4}, CPU(2))
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    ctx = arr.GetCtx()
+    if ctx.DeviceType != KDLCPU {
+        t.Errorf("Ctx DeviceType expected: %v Got :%v\n", KDLCPU, ctx.DeviceType)
+        return
+    }
+    if ctx.DeviceID != 2 {
+        t.Errorf("Ctx DeviceID expected: %v Got :%v\n", KDLCPU, ctx.DeviceID)
+        return
+    }
+}
+
+// Create array of different dtypes and check dtypes.
+func TestArrayDType(t *testing.T) {
+    for _, dtype := range  []string{"int8", "int16", "int32", "int64",
+                                    "uint8", "uint16", "uint32", "uint64",
+                                    "float32", "float64"} {
+        arr, err := Empty([]int64{4}, dtype)
+        if err != nil {
+            t.Error(err.Error())
+            return
+        }
+
+        if dtype != arr.GetDType() {
+            t.Errorf("Dtype expected: %v Got :%v\n", dtype, arr.GetDType())
+            return
+        }
+    }
+}
+
+// Copy Int8 data to created Array and verify.
+func TestArrayCopySliceInt8(t *testing.T) {
+    dlen := int64(32)
+    arr, err := Empty([]int64{4, dlen/4}, "int8")
+
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    bdata := make([]byte, dlen)
+    rand.Read(bdata)
+    data := (*[1<<31]int8)(unsafe.Pointer(&bdata[0]))[:dlen:dlen]
+
+    err = arr.CopyFrom(data)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    ret, err := arr.AsSlice()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    switch ret.(type) {
+        case []int8:
+        default:
+            t.Errorf("Expected : %T but got :%T\n", data, ret)
+            return
+    }
+
+    dataRet := ret.([]int8)
+    if len(data) != len(dataRet) {
+            t.Errorf("Data expected Len: %v Got :%v\n", len(data), len(dataRet))
+            return
+    }
+    for i := range data {
+        if data[i] != dataRet[i] {
+            t.Errorf("Data expected: %v Got :%v\n", data, dataRet)
+            return
+        }
+    }
+}
+
+// Copy Int16 data to created Array and verify.
+func TestArrayCopySliceInt16(t *testing.T) {
+    dlen := int64(32)
+    arr, err := Empty([]int64{4, dlen/4}, "int16")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    bdata := make([]byte, dlen*2)
+    rand.Read(bdata)
+    data := (*[1<<31]int16)(unsafe.Pointer(&bdata[0]))[:dlen:dlen]
+
+    err = arr.CopyFrom(data)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    ret, err := arr.AsSlice()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+    switch ret.(type) {
+        case []int16:
+        default:
+            t.Errorf("Expected : %T but got :%T\n", data, ret)
+            return
+    }
+
+    dataRet := ret.([]int16)
+    if len(data) != len(dataRet) {
+            t.Errorf("Data expected Len: %v Got :%v\n", len(data), len(dataRet))
+            return
+    }
+    for i := range data {
+        if data[i] != dataRet[i] {
+            t.Errorf("Data expected: %v Got :%v\n", data, dataRet)
+            return
+        }
+    }
+}
+
+// Copy Int32 data to created Array and verify.
+func TestArrayCopySliceInt32(t *testing.T) {
+    dlen := int64(32)
+    arr, err := Empty([]int64{4, dlen/4}, "int32")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    bdata := make([]byte, dlen*4)
+    rand.Read(bdata)
+    data := (*[1<<31]int32)(unsafe.Pointer(&bdata[0]))[:dlen:dlen]
+
+    err = arr.CopyFrom(data)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    ret, err := arr.AsSlice()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    switch ret.(type) {
+        case []int32:
+        default:
+            t.Errorf("Expected : %T but got :%T\n", data, ret)
+            return
+    }
+    dataRet := ret.([]int32)
+    if len(data) != len(dataRet) {
+            t.Errorf("Data expected Len: %v Got :%v\n", len(data), len(dataRet))
+            return
+    }
+    for i := range data {
+        if data[i] != dataRet[i] {
+            t.Errorf("Data expected: %v Got :%v\n", data, dataRet)
+            return
+        }
+    }
+}
+
+// Copy Int64 data to created Array and verify.
+func TestArrayCopySliceInt64(t *testing.T) {
+    dlen := int64(32)
+    arr, err := Empty([]int64{4, dlen/4}, "int64")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    bdata := make([]byte, dlen*8)
+    rand.Read(bdata)
+    data := (*[1<<31]int64)(unsafe.Pointer(&bdata[0]))[:dlen:dlen]
+
+    err = arr.CopyFrom(data)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    ret, err := arr.AsSlice()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    switch ret.(type) {
+        case []int64:
+        default:
+            t.Errorf("Expected : %T but got :%T\n", data, ret)
+            return
+    }
+    dataRet := ret.([]int64)
+    if len(data) != len(dataRet) {
+            t.Errorf("Data expected Len: %v Got :%v\n", len(data), len(dataRet))
+            return
+    }
+    for i := range data {
+        if data[i] != dataRet[i] {
+            t.Errorf("Data expected: %v Got :%v\n", data, dataRet)
+            return
+        }
+    }
+}
+
+// Copy UInt8 data to created Array and verify.
+func TestArrayCopySliceUInt8(t *testing.T) {
+    dlen := int64(32)
+    arr, err := Empty([]int64{4, dlen/4}, "uint8")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    bdata := make([]byte, dlen)
+    rand.Read(bdata)
+    data := (*[1<<31]uint8)(unsafe.Pointer(&bdata[0]))[:dlen:dlen]
+
+    err = arr.CopyFrom(data)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    ret, err := arr.AsSlice()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    switch ret.(type) {
+        case []uint8:
+        default:
+            t.Errorf("Expected : %T but got :%T\n", data, ret)
+            return
+    }
+    dataRet := ret.([]uint8)
+    if len(data) != len(dataRet) {
+            t.Errorf("Data expected Len: %v Got :%v\n", len(data), len(dataRet))
+            return
+    }
+    for i := range data {
+        if data[i] != dataRet[i] {
+            t.Errorf("Data expected: %v Got :%v\n", data, dataRet)
+            return
+        }
+    }
+}
+
+// Copy UInt16 data to created Array and verify.
+func TestArrayCopySliceUInt16(t *testing.T) {
+    dlen := int64(32)
+    arr, err := Empty([]int64{4, dlen/4}, "uint16")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    bdata := make([]byte, dlen*2)
+    rand.Read(bdata)
+    data := (*[1<<31]uint16)(unsafe.Pointer(&bdata[0]))[:dlen:dlen]
+
+    err = arr.CopyFrom(data)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    ret, err := arr.AsSlice()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    switch ret.(type) {
+        case []uint16:
+        default:
+            t.Errorf("Expected : %T but got :%T\n", data, ret)
+            return
+    }
+    dataRet := ret.([]uint16)
+    if len(data) != len(dataRet) {
+            t.Errorf("Data expected Len: %v Got :%v\n", len(data), len(dataRet))
+            return
+    }
+    for i := range data {
+        if data[i] != dataRet[i] {
+            t.Errorf("Data expected: %v Got :%v\n", data, dataRet)
+            return
+        }
+    }
+}
+
+// Copy UInt32 data to created Array and verify.
+func TestArrayCopySliceUInt32(t *testing.T) {
+    dlen := int64(32)
+    arr, err := Empty([]int64{4, dlen/4}, "uint32")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    bdata := make([]byte, dlen*4)
+    rand.Read(bdata)
+    data := (*[1<<31]uint32)(unsafe.Pointer(&bdata[0]))[:dlen:dlen]
+
+    err = arr.CopyFrom(data)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    ret, err := arr.AsSlice()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    switch ret.(type) {
+        case []uint32:
+        default:
+            t.Errorf("Expected : %T but got :%T\n", data, ret)
+            return
+    }
+    dataRet := ret.([]uint32)
+    if len(data) != len(dataRet) {
+            t.Errorf("Data expected Len: %v Got :%v\n", len(data), len(dataRet))
+            return
+    }
+    for i := range data {
+        if data[i] != dataRet[i] {
+            t.Errorf("Data expected: %v Got :%v\n", data, dataRet)
+            return
+        }
+    }
+}
+
+// Copy UInt64 data to created Array and verify.
+func TestArrayCopySliceUInt64(t *testing.T) {
+    dlen := int64(32)
+    arr, err := Empty([]int64{4, dlen/4}, "uint64")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    bdata := make([]byte, dlen*8)
+    rand.Read(bdata)
+    data := (*[1<<31]uint64)(unsafe.Pointer(&bdata[0]))[:dlen:dlen]
+
+    err = arr.CopyFrom(data)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    ret, err := arr.AsSlice()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    switch ret.(type) {
+        case []uint64:
+        default:
+            t.Errorf("Expected : %T but got :%T\n", data, ret)
+            return
+    }
+    dataRet := ret.([]uint64)
+    if len(data) != len(dataRet) {
+            t.Errorf("Data expected Len: %v Got :%v\n", len(data), len(dataRet))
+            return
+    }
+    for i := range data {
+        if data[i] != dataRet[i] {
+            t.Errorf("Data expected: %v Got :%v\n", data, dataRet)
+            return
+        }
+    }
+}
+
+// Copy Float32 data to created Array and verify.
+func TestArrayCopySliceFloat32(t *testing.T) {
+    dlen := int64(32)
+    arr, err := Empty([]int64{4, dlen/4}, "float32")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    data := make([]float32, dlen)
+
+    for i := range data {
+        data[i] = rand.Float32()
+    }
+
+    err = arr.CopyFrom(data)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    ret, err := arr.AsSlice()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    switch ret.(type) {
+        case []float32:
+        default:
+            t.Errorf("Expected : %T but got :%T\n", data, ret)
+            return
+    }
+    dataRet := ret.([]float32)
+    if len(data) != len(dataRet) {
+            t.Errorf("Data expected Len: %v Got :%v\n", len(data), len(dataRet))
+            return
+    }
+    for i := range data {
+        if data[i] != dataRet[i] {
+            t.Errorf("Data expected: %v \nGot :%v \n", data, dataRet)
+            return
+        }
+    }
+}
+
+// Copy Float64 data to created Array and verify.
+func TestArrayCopySliceFloat64(t *testing.T) {
+    dlen := int64(32)
+    arr, err := Empty([]int64{4, dlen/4}, "float64")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    data := make([]float64, dlen)
+
+    for i := range data {
+        data[i] = rand.Float64()
+    }
+
+    err = arr.CopyFrom(data)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    ret, err := arr.AsSlice()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    switch ret.(type) {
+        case []float64:
+        default:
+            t.Errorf("Expected : %T but got :%T\n", data, ret)
+            return
+    }
+    dataRet := ret.([]float64)
+    if len(data) != len(dataRet) {
+            t.Errorf("Data expected Len: %v Got :%v\n", len(data), len(dataRet))
+            return
+    }
+    for i := range data {
+        if data[i] != dataRet[i] {
+            t.Errorf("Data expected: %v Got :%v\n", data, dataRet)
+            return
+        }
+    }
+}
diff --git a/golang/src/bytearray.go b/golang/src/bytearray.go
new file mode 100644
index 000000000000..e40a630223dc
--- /dev/null
+++ b/golang/src/bytearray.go
@@ -0,0 +1,72 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package source for TVMByteArray interface.
+ * \file bytearray.go
+ */
+
+package gotvm
+
+//#include "gotvm.h"
+import "C"
+
+import (
+    "unsafe"
+)
+
+// ByteArray type wraps the TVMByteArray of C runtime API.
+// 
+// This can be used to hold raw data like params of a model.
+type ByteArray uintptr
+
+// nativeCPtr returns the type freed unitptr for ByteArray.
+func (tbytearray ByteArray) nativeCPtr() (retVal uintptr) {
+	retVal = (uintptr)(tbytearray)
+    return
+}
+
+// SetData is used to intialize ByteArray from a golang string object.
+//
+// This method initialize both data and data size of the underlaying object.
+// This function handles freeing old data object if any before allocating new.
+//
+// `val` is the golang string object from which the ByteArray is initialized.
+func (tbytearray ByteArray) setData(val string) {
+    bufPtr := ((*C.TVMByteArray)(unsafe.Pointer(tbytearray))).data
+    if bufPtr == (*_Ctype_char)(C.NULL) {
+        C.free(unsafe.Pointer(bufPtr))
+    }
+
+    ((*C.TVMByteArray)(unsafe.Pointer(tbytearray))).data = C.CString(val)
+    ((*C.TVMByteArray)(unsafe.Pointer(tbytearray))).size = C.ulong(len(val))
+}
+
+// getData returns the golang byte slice corresponding to the ByteArray.
+func (tbytearray ByteArray) getData() (retVal []byte) {
+	val := ((*C.TVMByteArray)(unsafe.Pointer(tbytearray))).data
+	blen := ((*C.TVMByteArray)(unsafe.Pointer(tbytearray))).size
+	retVal = C.GoBytes(unsafe.Pointer(val), C.int(blen))
+    return
+}
+
+// newByteArray initilizes the native TVMByteArray object with given byte slice
+//
+//`val` is the golang byte array used to initialize.
+//
+// returns newly created ByteArray.
+func newByteArray(val []byte) (retVal ByteArray) {
+    handle := ByteArray(C.malloc(C.sizeof_TVMByteArray))
+    ((*C.TVMByteArray)(unsafe.Pointer(handle))).data = (*_Ctype_char)(C.NULL)
+    ((*C.TVMByteArray)(unsafe.Pointer(handle))).size = 0
+    handle.setData(string(val))
+    retVal = handle
+    return
+}
+
+// deleteTVMByteArray releases the allocated native object of ByteArray.
+//
+// This delete handles freeing of underlaying native data object too.
+func (tbytearray ByteArray) deleteTVMByteArray() {
+    bufPtr := ((*C.TVMByteArray)(unsafe.Pointer(tbytearray))).data
+    C.free(unsafe.Pointer(bufPtr))
+	C.free(unsafe.Pointer(tbytearray.nativeCPtr()))
+}
diff --git a/golang/src/bytearray_test.go b/golang/src/bytearray_test.go
new file mode 100644
index 000000000000..f49e75ee2fa6
--- /dev/null
+++ b/golang/src/bytearray_test.go
@@ -0,0 +1,32 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package
+ * \file bytearray_test.go
+ */
+
+
+package gotvm
+
+import (
+    "testing"
+    "math/rand"
+)
+
+// Check ByteArray creation from byte slice and verify the data.
+func TestByteArrayGet(t *testing.T) {
+    data := make([]byte, 1024)
+    rand.Read(data)
+
+    barr := newByteArray(data)
+    dataRet := barr.getData()
+    if len(data) != len(dataRet) {
+            t.Errorf("Data expected Len: %v Got :%v\n", len(data), len(dataRet))
+            return
+    }
+    for i := range data {
+        if data[i] != dataRet[i] {
+            t.Errorf("Data expected: %v Got :%v at : %v\n", data[i], dataRet[i], i)
+            return
+        }
+    }
+}
diff --git a/golang/src/context.go b/golang/src/context.go
new file mode 100644
index 000000000000..8a3b613ea6b9
--- /dev/null
+++ b/golang/src/context.go
@@ -0,0 +1,89 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package source for TVMContext interface
+ * \file context.go
+ */
+
+package gotvm
+
+//#include "gotvm.h"
+import "C"
+
+// KDLCPU is golang enum correspond to TVM device type kDLCPU.
+var KDLCPU                  = int32(C.kDLCPU)
+// KDLGPU is golang enum correspond to TVM device type kDLGPU.
+var KDLGPU                  = int32(C.kDLGPU)
+// KDLCPUPinned is golang enum correspond to TVM device type kDLCPUPinned.
+var KDLCPUPinned            = int32(C.kDLCPUPinned)
+// KDLOpenCL is golang enum correspond to TVM device type kDLOpenCL.
+var KDLOpenCL               = int32(C.kDLOpenCL)
+// KDLMetal is golang enum correspond to TVM device type kDLMetal.
+var KDLMetal                = int32(C.kDLMetal)
+// KDLVPI is golang enum correspond to TVM device type kDLVPI.
+var KDLVPI                  = int32(C.kDLVPI)
+// KDLROCM is golang enum correspond to TVM device type kDLROCM.
+var KDLROCM                 = int32(C.kDLROCM)
+// KDLSDAccel is golang enum correspond to TVM device type kDLSDAccel.
+var KDLSDAccel              = int32(C.kDLSDAccel)
+// KDLVulkan is golang enum correspond to TVM device type kDLVulkan.
+var KDLVulkan               = int32(C.kDLVulkan)
+// KOpenGL is golang enum correspond to TVM device type kOpenGL.
+var KOpenGL                 = int32(C.kOpenGL)
+// KExtDev is golang enum correspond to TVM device type kDLExtDev.
+var KExtDev                 = int32(C.kDLExtDev)
+
+// Context dtype corresponding to TVMContext aka DLContext
+type Context struct {
+    DeviceType int32
+    DeviceID    int32
+}
+
+// CPU returns the Context object for CPU target on given index
+func CPU(index int32) Context {
+    return Context{KDLCPU, index}
+}
+
+// GPU returns the Context object for GPU target on given index
+func GPU(index int32) Context {
+    return Context{KDLGPU, index}
+}
+
+// CPUPinned returns the Context object for CPUPinned target on given index
+func CPUPinned(index int32) Context {
+    return Context{KDLCPUPinned, index}
+}
+
+// OpenCL returns the Context object for OpenCL target on given index
+func OpenCL(index int32) Context {
+    return Context{KDLOpenCL, index}
+}
+
+// Metal returns the Context object for Metal target on given index
+func Metal(index int32) Context {
+    return Context{KDLMetal, index}
+}
+
+// VPI returns the Context object for VPI target on given index
+func VPI(index int32) Context {
+    return Context{KDLVPI, index}
+}
+
+// ROCM returns the Context object for ROCM target on given index
+func ROCM(index int32) Context {
+    return Context{KDLROCM, index}
+}
+
+// SDAccel returns the Context object for SDAccel target on given index
+func SDAccel(index int32) Context {
+    return Context{KDLSDAccel, index}
+}
+
+// Vulkan returns the Context object for Vulkan target on given index
+func Vulkan(index int32) Context {
+    return Context{KDLVulkan, index}
+}
+
+// OpenGL returns the Context object for OpenGL target on given index
+func OpenGL(index int32) Context {
+    return Context{KOpenGL, index}
+}
diff --git a/golang/src/error.go b/golang/src/error.go
new file mode 100644
index 000000000000..00a24652953c
--- /dev/null
+++ b/golang/src/error.go
@@ -0,0 +1,31 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package source for error related API interface.
+ * \file error.go
+ */
+
+package gotvm
+
+//#include "gotvm.h"
+import "C"
+
+import (
+    "unsafe"
+)
+
+// getTVMLastError returns the detailed error string for any api called in TVM runtime.
+//
+// This is useful when any api returns non zero value.
+//
+// Returns golang string for the corresponding native error message.
+func getTVMLastError() (retVal string) {
+    errStr := C.TVMGetLastError()
+    retVal = C.GoString(errStr)
+    return
+}
+
+func setTVMLastError(errStr string) {
+    cstr := C.CString(errStr)
+    C.TVMAPISetLastError(cstr)
+    C.free(unsafe.Pointer(cstr))
+}
diff --git a/golang/src/error_test.go b/golang/src/error_test.go
new file mode 100644
index 000000000000..2a8c345b424b
--- /dev/null
+++ b/golang/src/error_test.go
@@ -0,0 +1,28 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package
+ * \file error_test.go
+ */
+
+
+package gotvm
+
+import (
+    "testing"
+    "strings"
+)
+
+// Check err receiving from TVM global function.
+func TestErrorTest(t *testing.T) {
+    _, err := LoadModuleFromFile("dummy.so")
+    if err == nil {
+        t.Error("Expected an error, but not received\n")
+        return
+    }
+
+    errStr := err.Error()
+    if !(strings.Contains(errStr, string("cannot open shared object"))) {
+        t.Error("Ah! TVM didn't report an error\n")
+    }
+}
+
diff --git a/golang/src/function.go b/golang/src/function.go
new file mode 100644
index 000000000000..fa1c53a5917f
--- /dev/null
+++ b/golang/src/function.go
@@ -0,0 +1,365 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package source for TVMFunction interface.
+ * \file function.go
+ */
+
+package gotvm
+
+//#include "gotvm.h"
+import "C"
+
+import (
+    "unsafe"
+    "encoding/binary"
+    "errors"
+    "runtime"
+    "reflect"
+    "fmt"
+)
+
+// Function type in golang hold pointer for the TVMFunction handle.
+type Function uintptr
+
+// nativeCPtr returns type freed uintptr for the Function.
+func (tvmfunction Function) nativeCPtr() (retVal uintptr) {
+    retVal = (uintptr)(tvmfunction)
+    return
+}
+
+// Invoke calls the TVM packed function referred by the handle with given arguments.
+func (tvmfunction *Function) Invoke(args ...interface{}) (retVal *Value, err error) {
+    funccall := func (fargs ...interface{}) (*Value, error) {
+        return callNativeFunction(tvmfunction, fargs)
+    }
+    // Check is any args are contain any ValueArray
+    // Possible is it's a args forward from one packed function to another.
+    valueArrayFound := false
+    for ii := range args {
+        switch args[ii].(type) {
+            case []*Value:
+                valueArrayFound = true
+        }
+    }
+
+    if !valueArrayFound {
+        return funccall(args...)
+    }
+    if len(args) != 1 {
+        err = fmt.Errorf("Not supported if packed function args are a mix of []Value and other types")
+        return
+    }
+
+    valArray := args[0].([]*Value)
+    if len(valArray) > 0 {
+        newArgs := make([]interface{}, len(valArray))
+        for ii := range valArray {
+            newVal := newTVMValue()
+            newVal.moveFrom(valArray[ii])
+            newArgs[ii] = newVal
+        }
+
+        return funccall(newArgs...)
+    }
+    return funccall()
+}
+
+// FuncListGlobalNames is used to query global callable packed function names from TVM.
+//
+// returns slice of string holding function names and error if any.
+func FuncListGlobalNames() (retVal []string, err error) {
+    var str string
+    ret := (int32)(C._TVMFuncListGlobalNames(unsafe.Pointer((&str))))
+    if ret != 0 {
+        err = errors.New(getTVMLastError())
+        return
+    }
+
+    str = goStringFromNative(*(*string)(unsafe.Pointer(&str)))
+    bin := binary.LittleEndian
+    size := bin.Uint64([]byte(str[:8]))
+    str = str[8:]
+    retVal = make([]string, size)
+    for i := range retVal {
+        len := bin.Uint64([]byte(str[:8]))
+        str = str[8:]
+        retVal[i] = str[:len]
+        str = str[len:]
+    }
+    return
+}
+
+// GetGlobalFunction is to get handle to the given global function name.
+//
+// `funcname` is the name of global packed function.
+//
+// returns a function closure with signature
+//         func (args ...interface{}) (interface{}, error) and  error if any.
+//
+// The closure function can be used to call Function with arguments directly.
+//
+// Variadic arguments can be any type which can be embed into Value.
+func GetGlobalFunction(funcname string) (retVal *Function, err error) {
+    var funp uintptr
+
+    cfuncname := C.CString(funcname)
+    ret := (int32)(C.TVMFuncGetGlobal(cfuncname,
+                                      (*_Ctype_TVMFunctionHandle)(unsafe.Pointer(&funp))))
+    C.free(unsafe.Pointer(cfuncname))
+
+    if ret != 0 {
+        err = errors.New(getTVMLastError())
+        return
+    }
+
+    handle := new(Function)
+    *handle = Function(funp)
+    finalizer := func(fhandle *Function) {
+        nativeTVMFuncFree(fhandle)
+        fhandle = nil
+    }
+    runtime.SetFinalizer(handle, finalizer)
+    retVal = handle
+    return
+}
+
+// callNativeFunction is routine which calls gotvm native wrapper with given arguments.
+//
+// `handle` is the handle for Function.
+//
+// `args` are the variadic arguments to the Function.
+//
+// returns the interface for the return value from TVM if any and error if any.
+func callNativeFunction(handle *Function, args []interface{}) (retVal *Value, err error) {
+    argsIn := make([]*Value, len(args))
+    var typeCodes []int32
+    if len(args) != 0 {
+        typeCodes = make([]int32, len(args))
+    } else {
+        typeCodes = make([]int32, 1)
+    }
+
+    for ii := range args {
+        argsIn[ii] = newTVMValue()
+        if typeCodes[ii], err = argsIn[ii].setValue(args[ii]); err != nil {
+            return
+        }
+    }
+
+    retVal = newTVMValue()
+    argsOut := []*Value{retVal}
+    retTypeCode := KNull
+    err = nativeTVMFuncCall(handle, argsIn, typeCodes, argsOut, &retTypeCode)
+    if err != nil {
+        retVal = nil
+        return
+    }
+    retVal.isLocal = false
+    retVal.dtype = retTypeCode
+    return
+}
+
+// nativeTVMFuncFree free the function handle allocated in TVM runtime.
+//
+// `funp` is the Function handle to be freed.
+func nativeTVMFuncFree(funp *Function) (retVal int32) {
+    retVal = (int32) (C.TVMFuncFree(C.TVMFunctionHandle(funp.nativeCPtr())))
+    return
+}
+
+// nativeToGoSlice converts native TVMValue array to Golang slice of TVMValue
+//
+//
+func nativeToGoSlice(nargValues (*C.void), argValues []*Value, typeCodes []int32) {
+    for ii := range argValues {
+        C._TVMValueNativeGet(unsafe.Pointer(argValues[ii].nativeCPtr()),
+                             unsafe.Pointer(nargValues),
+                             C.int(int32(ii)))
+        argValues[ii].dtype = typeCodes[ii]
+    }
+}
+
+// nativeFromGoSlice converts golang slice of TVMValue to native TVMValue array.
+//
+//
+func nativeFromGoSlice(argValues []*Value) (nptr (*C.void)) {
+    nargValues := ((uintptr)(C.malloc(C.ulong(C.sizeof_TVMValue * len(argValues)))))
+    for ii := range argValues {
+        C._TVMValueNativeSet(unsafe.Pointer(nargValues),
+                             unsafe.Pointer(argValues[ii].nativeCPtr()),
+                             C.int(int32(ii)))
+    }
+    nptr = (*C.void)(unsafe.Pointer(nargValues))
+    return
+}
+
+// nativeTVMFuncCall executes the function with given arguments
+//
+// `funp` Function handle to the packed function.
+//
+// `argValues` is the slice of Value which are arguments to the packed function.
+//
+// `typeCodes` is the alice of argument type codes corresponding to argValues.
+//
+// `retValues` is return argument which is slice of return values from the packed function.
+//
+// `retTypeCode` is int32 holding type codes for retValue
+//
+// Returns err indicating native error if any.
+func nativeTVMFuncCall(funp *Function, argValues []*Value, typeCodes []int32,
+                 retValues []*Value, retTypeCode *int32) (err error) {
+    nargValues := nativeFromGoSlice(argValues)
+    nretValues := nativeFromGoSlice(retValues)
+	result := (int32)(C.TVMFuncCall(_Ctype_TVMFunctionHandle(*funp),
+                                    (*_Ctype_TVMValue)(unsafe.Pointer(nargValues)),
+                                    (*_Ctype_int)(unsafe.Pointer(&(typeCodes[0]))),
+                                    C.int(len(argValues)),
+                                    (*_Ctype_TVMValue)(unsafe.Pointer(nretValues)),
+                                    (*_Ctype_int)(unsafe.Pointer(retTypeCode))))
+    nativeToGoSlice(nargValues, argValues, typeCodes)
+    nativeToGoSlice(nretValues, retValues, (*[1<<31] int32)(unsafe.Pointer(retTypeCode))[:1:1])
+    C.free(unsafe.Pointer(nargValues))
+    C.free(unsafe.Pointer(nretValues))
+
+    if result != 0 {
+	    err = errors.New(getTVMLastError())
+    }
+    return
+}
+
+// goCallBack is a structure holding the go callback function pointer.
+// This wrapping is necessary as cgo doesn't support
+// passing golang functions type conversion to native.
+type goCallBack struct {
+    cb func (args ...*Value) (interface{}, error)
+}
+
+//export goTVMCallback
+func goTVMCallback(args C.native_voidp, typeCodes C.native_voidp, numArgs int32,
+                   retArg C.native_voidp, resourceHandle C.native_voidp) (ret int32){
+    fcb := (*goCallBack)(resourceHandle)
+    // Make Value Sice from native TVMValue pointer.
+    argValues := make([]*Value, numArgs)
+
+    for ii := range argValues {
+        argValues[ii] = newTVMValue()
+        argValues[ii].isLocal = false
+    }
+
+    // Prepare arguments for golang callback function
+    nativeToGoSlice((*C.void)(unsafe.Pointer(args)), argValues,
+                    (*[1<<31] int32)(unsafe.Pointer(typeCodes))[:numArgs:numArgs])
+    cbargs := argValues
+
+    // Execute the callback
+    retVal, err := fcb.cb(cbargs...)
+    if err != nil {
+        errStr := err.Error()
+        setTVMLastError(errStr)
+        return -1
+    }
+
+    // It's possible a packed function directly return 
+    // the return value of another packed function.
+    //
+    // Inside a packed func :
+    //      ```return pfunc.Invoke(args)```
+    //
+    // In this case pfunc returns nil which is 
+    // returned as an interface holding nil *Value.
+    // Which becomes a valid retVal holding nil *Value.
+    isRetNull := false
+    switch retVal.(type) {
+        case *Value:
+            pRet := retVal.(*Value)
+            if pRet == nil {
+                isRetNull = true
+            }
+    }
+
+    // Handle return value from callback function
+    if retVal != nil && !isRetNull {
+        var retTypeCode int32
+        retValues := []*Value{newTVMValue()}
+
+        retTypeCode, err = retValues[0].setValue(retVal)
+        if err != nil {
+            errStr := err.Error()
+            setTVMLastError(errStr)
+            return -1
+        }
+        nretValues := nativeFromGoSlice(retValues)
+
+        // Handle KStr, KBytes: Local finalizers shouldn't try freeing them.
+        retValues[0].isLocal = false
+
+        apiRet := (int32) (C.TVMCFuncSetReturn(_Ctype_TVMRetValueHandle(retArg),
+                                               (*_Ctype_TVMValue)(unsafe.Pointer(nretValues)),
+                                               (*_Ctype_int)(unsafe.Pointer(&retTypeCode)), 1))
+        C.free(unsafe.Pointer(nretValues))
+        if apiRet != 0 {
+            errStr := string("TVMCFuncSetReturn failed ")
+            setTVMLastError(errStr)
+        }
+    }
+    return
+}
+
+// ConvertFunction converts given golang function to TVM packed function.
+//
+// `args[0]` function pointer for a type ```func (args ...interface{}) (interface{})```
+//
+// Returns Function handle and err if any.
+func ConvertFunction(args ...interface{}) (retVal *Function, err error) {
+    function := args[0].(func (args ...*Value) (interface{}, error))
+    fcb := &goCallBack{cb:function}
+    var funp uintptr
+
+    result := (int32) (C._ConvertFunction(unsafe.Pointer(fcb),
+                                          unsafe.Pointer(&funp)))
+    if result != 0 {
+	    err = errors.New(getTVMLastError())
+    }
+
+    handle := new(Function)
+    *handle = Function(funp)
+    finalizer := func(fhandle *Function) {
+        nativeTVMFuncFree(fhandle)
+        fhandle = nil
+    }
+    runtime.SetFinalizer(handle, finalizer)
+    retVal = handle
+    return
+}
+
+// RegisterFunction registers the golang func in TVM runtime global space.
+//
+// `args[0]` function pointer for a type ```func (args ...interface{}) (interface{})```
+//
+// `args[1]` Optional argument of function name with which it will be registered.
+//           If not passed we use function name from reflection.
+//
+// Returns err indicating native error if any.
+func RegisterFunction(args ...interface{}) (err error) {
+    fhandle, err := ConvertFunction(args...)
+    if err != nil {
+        return
+    }
+
+    funcname := runtime.FuncForPC(reflect.ValueOf(args[0]).Pointer()).Name()
+    if len(args) > 1 {
+        funcname = args[1].(string)
+    }
+
+    cfuncname := C.CString(funcname)
+    result := (int32) (C.TVMFuncRegisterGlobal(cfuncname,
+                                               _Ctype_TVMFunctionHandle(*fhandle),
+                                               0)); // Override = False
+    C.free(unsafe.Pointer(cfuncname))
+    if result != 0 {
+	    err = errors.New(getTVMLastError())
+    }
+    // Clear the finalizer as we don't need to control it anymore.
+    runtime.SetFinalizer(fhandle, nil)
+    return
+}
diff --git a/golang/src/function_test.go b/golang/src/function_test.go
new file mode 100644
index 000000000000..d53822837220
--- /dev/null
+++ b/golang/src/function_test.go
@@ -0,0 +1,331 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package
+ * \file function_test.go
+ */
+
+package gotvm
+
+import (
+    "testing"
+    "reflect"
+    "math/rand"
+    "strings"
+    "fmt"
+)
+
+// Check global function list API
+func TestFunctionGlobals(t *testing.T) {
+    funcNames, err := FuncListGlobalNames()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+    if len(funcNames) < 1 {
+        t.Errorf("Global Function names received:%v\n", funcNames)
+    }
+}
+
+// Check GetFunction API
+func TestFunctionGlobalGet(t *testing.T) {
+    funp, err := GetGlobalFunction("tvm.graph_runtime.create")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+    if reflect.TypeOf(funp).Kind() != reflect.Ptr {
+        t.Error("Function type mis matched\n")
+        return
+    }
+}
+
+func TestFunctionModuleGet(t *testing.T) {
+    modp, err := LoadModuleFromFile("./deploy.so")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+    funp, err := modp.GetFunction("myadd")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+    if reflect.TypeOf(funp).Kind() != reflect.Ptr {
+        t.Error("Function type mis matched\n")
+        return
+    }
+
+    dlen := int64(1024)
+    shape := []int64{dlen}
+    inX, _ := Empty(shape)
+    inY, _ := Empty(shape)
+    out, _ := Empty(shape)
+    dataX := make([]float32, (dlen))
+    dataY := make([]float32, (dlen))
+    outExpected :=  make([]float32, (dlen))
+
+    for i := range dataX {
+        dataX[i] = rand.Float32()
+        dataY[i] = rand.Float32()
+        outExpected[i] = dataX[i] + dataY[i]
+    }
+
+    inX.CopyFrom(dataX)
+    inY.CopyFrom(dataY)
+
+    funp.Invoke(inX, inY, out)
+    outi, _ := out.AsSlice()
+    outSlice := outi.([]float32)
+    if len(outSlice) != len(outExpected) {
+            t.Errorf("Data expected Len: %v Got :%v\n", len(outExpected), len(outSlice))
+            return
+    }
+    for i := range outSlice {
+        if outExpected[i] != outSlice[i] {
+            t.Errorf("Data expected: %v Got :%v at index %v\n", outExpected[i], outSlice[i], i)
+            return
+        }
+    }
+}
+
+// Check FunctionConvert API
+func TestFunctionConvert(t *testing.T) {
+    sampleCb := func (args ...*Value) (retVal interface{}, err error) {
+        val1 := args[0].AsInt64()
+        val2 := args[1].AsInt64()
+        retVal = int64(val1+val2)
+        return
+    }
+
+    fhandle, err := ConvertFunction(sampleCb)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    retVal, err := fhandle.Invoke(10, 20)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    if retVal.AsInt64() != int64(30) {
+        t.Errorf("Expected result :30 got:%v\n", retVal.AsInt64())
+        return
+    }
+}
+
+func TestFunctionError(t *testing.T) {
+    sampleCb := func (args ...*Value) (retVal interface{}, err error) {
+        err = fmt.Errorf("Sample Error XYZABC");
+        return
+    }
+
+    fhandle, err := ConvertFunction(sampleCb)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    _, err = fhandle.Invoke()
+    if err == nil {
+        t.Error("Expected error but didn't received\n")
+        return
+    }
+
+    if  !strings.Contains(err.Error(), string("Sample Error XYZABC")) {
+        t.Errorf("Expected Error should contain :\"Sample Error XYZABC\" got :%v\n", err.Error())
+    }
+}
+
+// Check FunctionRegister
+func TestFunctionRegister(t *testing.T) {
+    sampleCb := func (args ...*Value) (retVal interface{}, err error) {
+        val1 := args[0].AsInt64()
+        val2 := args[1].AsInt64()
+        retVal = int64(val1+val2)
+        return
+    }
+
+    RegisterFunction(sampleCb, "TestFunctionRegister.sampleCb");
+    // Query global functions available
+    funcNames, err := FuncListGlobalNames()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    found := 0
+    for ii := range (funcNames) {
+        if strings.Compare(funcNames[ii], "TestFunctionRegister.sampleCb") == 0 {
+            found = 1
+        }
+    }
+    if found == 0 {
+        t.Error("Registered function not found in global function list.")
+        return
+    }
+
+    // Get "sampleCb" and verify the call.
+    funp, err := GetGlobalFunction("TestFunctionRegister.sampleCb")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    // Call function
+    result, err := funp.Invoke((int64)(10), (int64)(20))
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+    if result.AsInt64() != int64(30) {
+        t.Errorf("Expected result :30 got:%v\n", result.AsInt64())
+        return
+    }
+}
+
+// Check packed function receiving go-closure as argument.
+func TestFunctionClosureArg(t *testing.T) {
+    // sampleFunctionArg receives a Packed Function handle and calls it.
+    sampleFunctionArg := func (args ...*Value) (retVal interface{}, err error) {
+        // Reveive Packed Function Handle
+        pfunc := args[0].AsFunction()
+
+        // Call Packed Function by Value
+        ret, err := pfunc.Invoke(args[1], args[2])
+        if err != nil {
+            return
+        }
+
+        // Call Packed Function with extracted values
+        ret1, err := pfunc.Invoke(args[1].AsInt64(), args[2].AsInt64())
+        if err != nil {
+            return
+        }
+        if ret1.AsInt64() != ret.AsInt64() {
+            err = fmt.Errorf("Invoke with int64 didn't match with Value\n")
+            return
+        }
+        retVal = ret
+        return
+    }
+
+    RegisterFunction(sampleFunctionArg, "TestFunctionClosureArg.sampleFunctionArg");
+    funp, err := GetGlobalFunction("TestFunctionClosureArg.sampleFunctionArg")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    // funccall is a simple golang callback function like C = A + B.
+    funccall := func (args ...*Value) (retVal interface{}, err error) {
+        val1 := args[0].AsInt64()
+        val2 := args[1].AsInt64()
+        retVal = int64(val1+val2)
+        return
+    }
+
+    // Call function
+    result, err := funp.Invoke(funccall, 30, 50)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    if result.AsInt64() != int64(80) {
+        t.Errorf("Expected result :80 got:%v\n", result.AsInt64())
+        return
+    }
+}
+
+// Check packed function returning a go-closure.
+func TestFunctionClosureReturn(t *testing.T) {
+    // sampleFunctionCb returns a function closure which is embed as packed function in TVMValue.
+    sampleFunctionCb := func (args ...*Value) (retVal interface{}, err error) {
+        funccall := func (cargs ...*Value) (fret interface{}, ferr error) {
+            val1 := cargs[0].AsInt64()
+            val2 := cargs[1].AsInt64()
+            fret = int64(val1+val2)
+            return
+        }
+        retVal = funccall
+        return
+    }
+
+    RegisterFunction(sampleFunctionCb, "TestFunctionClosureReturn.sampleFunctionCb");
+    funp, err := GetGlobalFunction("TestFunctionClosureReturn.sampleFunctionCb")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    // Call function
+    result, err := funp.Invoke()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    pfunc := result.AsFunction()
+    pfuncRet, err := pfunc.Invoke(30, 40)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+    if pfuncRet.AsInt64() != int64(70) {
+        t.Errorf("Expected result :70 got:%v\n", pfuncRet.AsInt64())
+        return
+    }
+}
+
+// Check packed function with no arguments and no return values.
+func TestFunctionNoArgsReturns(t *testing.T) {
+    sampleFunction := func (args ...*Value) (retVal interface{}, err error) {
+        return
+    }
+
+    fhandle, err := ConvertFunction(sampleFunction)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    _, err = fhandle.Invoke()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+}
+
+// Check packed function returning a go-closure with no arg and returns.
+func TestFunctionNoArgsReturns2(t *testing.T) {
+    // sampleFunctionCb returns a function closure which is embed as packed function in TVMValue.
+    sampleFunctionCb := func (args ...*Value) (retVal interface{}, err error) {
+        funccall := func (cargs ...*Value) (fret interface{}, ferr error) {
+            return
+        }
+        retVal = funccall
+        return
+    }
+
+    funp, err := ConvertFunction(sampleFunctionCb)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    // Call function
+    result, err := funp.Invoke()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    pfunc := result.AsFunction()
+    _, err = pfunc.Invoke()
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+}
diff --git a/golang/src/gotvm.cc b/golang/src/gotvm.cc
new file mode 100644
index 000000000000..cf84e670df79
--- /dev/null
+++ b/golang/src/gotvm.cc
@@ -0,0 +1,195 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm native interface definition
+ * \file gotvm.cxx
+ */
+
+// Standard includes
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <stdint.h>
+
+// golang string compatible definition
+typedef struct { char *p; int n; } _gostring_;
+#include <string>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// TVM runtime C interface
+#include <tvm/runtime/c_runtime_api.h>
+#include <dlpack/dlpack.h>
+
+/*!
+ * \brief Convert native char array to _gostring_ structure.
+ * _gostring_ structure represents the same memory footprint as golang string object.
+ *
+ * \param p is char pointer to a char array.
+ * \param l is the size of the char array. this method exclusively need length as
+ * its possible to have a bytearray in a string.
+ *
+ * \return _gostring_ object corresponding to native char array.
+ * Caller is responsible to free the memory block allocated here.
+ */
+static _gostring_ _native_to_gostring(const char *p, size_t l) {
+  _gostring_ ret;
+  ret.p = reinterpret_cast<char*>(malloc(l));
+  if (NULL == ret.p) {
+    ret.n = 0;
+    return ret;
+  }
+  memcpy(ret.p, p, l);
+  ret.n = l;
+  return ret;
+}
+
+/*!
+ * \brief embeds a 64bit uint value inside a string to serialize the data.
+ *
+ * \param s is string object.
+ * \param off is the offset in the string object.
+ * \param v is the uint64_t value which need to embed into given string.
+ */
+static void putuint64(std::string *s, size_t off, uint64_t v) {
+    for (int i = 0; i < 8; i++) {
+        (*s)[off + i] = (v >> (i * 8)) & 0xff;
+    }
+}
+
+// TVM runtime C interface wrappers
+
+/*!
+ * \brief Native interface to query TVM_VERSION in golang string format.
+ *
+ * \return char pointer to TVM-VERSION
+ */
+const char* _TVM_VERSION(void) {
+  const char *version = TVM_VERSION;
+  return version;
+}
+
+/*!
+ * \brief Native interface for getting TVMGlobal function list.
+ *
+ * \param names return by argument to return the function names.
+ * We wrap all strings into single string joined by (len+string)
+ * which is unpacked and processed in golang.
+ *
+ * \return c_runtime_api return status.
+ */
+int _TVMFuncListGlobalNames(_gostring_* names) {
+  int names_size;
+  char **names_array;
+  int result;
+
+  result = TVMFuncListGlobalNames(&names_size, (char const ***)&names_array);
+  if (result) {
+    return result;
+  }
+
+  size_t tot = 8;
+  for (int ii = 0; ii < names_size ; ++ii) {
+    tot += 8 + strlen(names_array[ii]);
+  }
+
+  std::string str;
+  str.resize(tot);
+  putuint64(&str, 0, names_size);
+  size_t off = 8;
+  for (int64_t ii = 0; ii < names_size ; ++ii) {
+    putuint64(&str, off, strlen(names_array[ii]));
+    off += 8;
+    str.replace(off, strlen(names_array[ii]), names_array[ii]);
+    off += strlen(names_array[ii]);
+  }
+  *names = _native_to_gostring(str.data(), str.size());
+  if (str.size() != names->n) {
+    TVMAPISetLastError("malloc failed during _native_to_gostring");
+    result = 1;
+  }
+  return result;
+}
+
+// Helpers for TVMValue
+
+/*!
+ * \brief Native helper to copy TVMValue from golang slice to native array.
+ * this helper is need as underlying momory for golang slice is not continueous.
+ *
+ * \param to_ptr is the native pointer of TVMValue array.
+ * \param from_ptr pointer to TVMValue in golang slice.
+ * \param array index in native array.
+ */
+void _TVMValueNativeSet(void* to_ptr, void* from_ptr, int ind) {
+  TVMValue *from_p = reinterpret_cast<TVMValue*>(from_ptr);
+  TVMValue *to_p = reinterpret_cast<TVMValue*>(to_ptr);
+  memcpy(to_p+ind, from_p, sizeof(TVMValue));
+}
+
+/*!
+ * \brief Native helper to copy TVMValue from golang slice to native array.
+ * this helper is need as underlying momory for golang slice is not continueous.
+ *
+ * \param to_ptr pointer to TVMValue in golang slice.
+ * \param from_ptr is the native pointer of TVMValue array.
+ * \param array index in native array.
+ */
+void _TVMValueNativeGet(void* to_ptr, void* from_ptr, int ind) {
+  TVMValue *from_p = reinterpret_cast<TVMValue*>(from_ptr);
+  TVMValue *to_p = reinterpret_cast<TVMValue*>(to_ptr);
+  memcpy(to_p, from_p+ind, sizeof(TVMValue));
+}
+
+extern int goTVMCallback(void*, void*, int, void*, void*);
+
+/*!
+ * \brief _TVMCallback is the TVM runtime callback function for PackedFunction system.
+ *
+ * \param args is an array of TVMValue
+ * \param type_codes is an array of int
+ * \param num_args is int representing number of in arguments
+ * \param ret is the return value handle to set the packed function return.
+ * \param resource_handle is the golang private data pointer.
+ *
+ * \returns the error status as TVM_DLL
+ */
+int _TVMCallback(TVMValue* args,
+                 int* type_codes,
+                 int num_args,
+                 TVMRetValueHandle ret,
+                 void* resource_handle) {
+    return goTVMCallback(args, type_codes, num_args, ret, resource_handle);
+}
+
+/*!
+ * _TVMPackedCFuncFinalizer is finalizer for packed function system.
+ *
+ */
+void _TVMPackedCFuncFinalizer(void* resource_handle) {
+    return;
+}
+
+/*!
+ * /brief _ConvertFunction creates a packed function for with given resource handle.
+ *
+ * /param fptr is the pointer to golang resource handle.
+ * /param *fhandle is the return argument holding packed function.
+ *
+ * /return is an int indicating the return status.
+ */
+int _ConvertFunction(void* fptr, TVMFunctionHandle *fhandle) {
+  int ret = TVMFuncCreateFromCFunc(_TVMCallback,
+                                   fptr,
+                                   _TVMPackedCFuncFinalizer,
+                                   fhandle);
+  return ret;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
diff --git a/golang/src/gotvm.go b/golang/src/gotvm.go
new file mode 100644
index 000000000000..3f7aac93d769
--- /dev/null
+++ b/golang/src/gotvm.go
@@ -0,0 +1,24 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package
+ * \file gotvm.go
+ */
+
+
+// Package gotvm is TVM runtime interface definition for golang.
+//
+// Application need to import this package to access the c_runtime_api exposed by TVM.
+package gotvm
+
+//#include "gotvm.h"
+import "C"
+
+// DLPackVersion is the dlpack version of tvm runtime.
+var DLPackVersion           = int(C.DLPACK_VERSION)
+// TVMVersion is the TVM runtime version.
+var TVMVersion              = getTVMVersion()
+
+func getTVMVersion() (retStr string) {
+    retStr = C.GoString(C._TVM_VERSION())
+    return
+}
diff --git a/golang/src/gotvm.h b/golang/src/gotvm.h
new file mode 100644
index 000000000000..e4487a362cca
--- /dev/null
+++ b/golang/src/gotvm.h
@@ -0,0 +1,42 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm native interface declaration.
+ * \file gotvm.h
+ *
+ * These declarations are in cgo interface definition while calling API
+ * across golang and native C boundaries.
+ */
+
+#ifndef GOTVM_GOTVM_H_
+#define GOTVM_GOTVM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <tvm/runtime/c_runtime_api.h>
+#include <dlpack/dlpack.h>
+
+// Some type definitions for golang "C"
+typedef void* native_voidp;
+
+// Version
+extern char* _TVM_VERSION(void);
+
+// Wrappers : For incompatible cgo API.
+// To handle array of strings wrapped into __gostring__
+extern int _TVMFuncListGlobalNames(void*);
+// To handle TVMValue slice to/from native sequential TVMValue array.
+extern void _TVMValueNativeSet(void* to, void* from, int index);
+extern void _TVMValueNativeGet(void* to, void* from, int index);
+
+// Callbacks
+extern int _ConvertFunction(void* fptr, void* funp);
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // GOTVM_GOTVM_H_
diff --git a/golang/src/gotvm_test.go b/golang/src/gotvm_test.go
new file mode 100644
index 000000000000..5058de400ba7
--- /dev/null
+++ b/golang/src/gotvm_test.go
@@ -0,0 +1,30 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package
+ * \file gotvm_test.go
+ */
+
+
+package gotvm
+
+import (
+    "testing"
+    "reflect"
+)
+
+// Check TVMVersion API
+func TestTVMVersion(t *testing.T) {
+    if len(TVMVersion) == 0 {
+        t.Error("TVMVersion not set\n")
+    }
+    if reflect.TypeOf(TVMVersion).Kind() != reflect.String {
+        t.Error("TVMVersion type mismatch\n")
+    }
+}
+
+// Check DLPackVersion API
+func TestDLPackVersion(t *testing.T) {
+    if reflect.TypeOf(DLPackVersion).Kind() != reflect.Int {
+        t.Error("TVMVersion type mismatch\n")
+    }
+}
diff --git a/golang/src/module.go b/golang/src/module.go
new file mode 100644
index 000000000000..422cb6be20ff
--- /dev/null
+++ b/golang/src/module.go
@@ -0,0 +1,121 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package source for TVMModule interface.
+ * \file module.go
+ */
+
+package gotvm
+
+//#include "gotvm.h"
+import "C"
+
+import (
+    "errors"
+    "runtime"
+    "unsafe"
+)
+
+// Module type in golang hold pointer for the TVMModule handle.
+//
+// Module initialization happen through TVMModLoadFromFile api in TVM runtime.
+type Module uintptr
+
+// nativeCPtr returns type freed uintptr for the Module.
+func (tvmmodule *Module) nativeCPtr() (retVal uintptr) {
+    retVal = (uintptr)(*tvmmodule)
+    return
+}
+
+// LoadModuleFromFile loads the given module in TVM runtime.
+//
+// `modpath` is the path to tvm module.
+//
+// `args` is an optional arguments of ["dll", "dylib", "dso", "so"] with default value "so"
+//
+// returns pointer to Module and err or if any.
+func LoadModuleFromFile(modpath string, args ...interface{}) (retVal *Module, err error) {
+    modtype := "so"
+    if len(args) > 0 {
+       modtype  = args[0].(string)
+    }
+    var modp uintptr
+
+    cmodpath := C.CString(modpath)
+    cmodtype := C.CString(modtype)
+
+    ret := (int32)(C.TVMModLoadFromFile(cmodpath,
+                                        cmodtype,
+                                        (*_Ctype_TVMModuleHandle)(unsafe.Pointer(&modp))))
+
+    C.free(unsafe.Pointer(cmodpath))
+    C.free(unsafe.Pointer(cmodtype))
+
+    if ret != 0 {
+        err = errors.New(getTVMLastError())
+        return
+    }
+
+    handle := new(Module)
+    *handle = Module(modp)
+    finalizer := func(mhandle *Module) {
+        nativeTVMModFree(mhandle)
+        mhandle = nil
+    }
+    runtime.SetFinalizer(handle, finalizer)
+    retVal = handle
+    return
+}
+
+// nativeTVMModFree free the module handle allocated in TVM runtime.
+//
+// `modp` is the Module handle to be freed.
+func nativeTVMModFree(modp *Module) (retVal int32) {
+    retVal = (int32) (C.TVMModFree(C.TVMModuleHandle(modp.nativeCPtr())))
+    return
+}
+
+// GetFunction returns the function pointer from the module for given function name.
+//
+// `tvmmodule` is handle for Module
+//
+// `funcname` function name in module.
+//
+// `args` variadic args of `queryImport`
+//
+// returns function closure with signature
+//         func (args ...interface{}) (interface{}, error) and error if any.
+//
+// The closure function can be used to call Function with arguments directly.
+//
+// Variadic arguments can be any type which can be embed into Value.
+func (tvmmodule *Module) GetFunction (
+      funcname string, args ...interface{}) (
+      retVal *Function, err error){
+    queryImports := int32(1)
+    if len(args) > 0 {
+        queryImports = int32(args[1].(int))
+    }
+
+    var funp uintptr
+    cfuncname := C.CString(funcname)
+    ret := (int32)(C.TVMModGetFunction((_Ctype_TVMModuleHandle)(*tvmmodule),
+                                       cfuncname,
+                                       C.int(queryImports),
+                                       (*_Ctype_TVMFunctionHandle)(unsafe.Pointer(&funp))))
+    C.free(unsafe.Pointer(cfuncname))
+
+    if ret != 0 {
+        err = errors.New(getTVMLastError())
+        return
+    }
+
+    handle := new(Function)
+    *handle = Function(funp)
+    finalizer := func(fhandle *Function) {
+        nativeTVMFuncFree(fhandle)
+        fhandle = nil
+    }
+    runtime.SetFinalizer(handle, finalizer)
+    retVal = handle
+    return
+}
diff --git a/golang/src/module_test.go b/golang/src/module_test.go
new file mode 100644
index 000000000000..fac094438e96
--- /dev/null
+++ b/golang/src/module_test.go
@@ -0,0 +1,93 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package
+ * \file module_test.go
+ */
+
+
+package gotvm
+
+import (
+    "testing"
+    "reflect"
+)
+
+// Check module loading - dll
+func TestModuleTestLoad1(t *testing.T) {
+    // dll
+    mod, err := LoadModuleFromFile("./deploy.so", "dll")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+    if reflect.TypeOf(mod).Kind() != reflect.Ptr {
+        t.Error("Module type mis matched\n")
+        return
+    }
+}
+
+// Check module loading - dylib
+func TestModuleTestLoad2(t *testing.T) {
+    // dylib
+    mod, err := LoadModuleFromFile("./deploy.so", "dylib")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+    if reflect.TypeOf(mod).Kind() != reflect.Ptr {
+        t.Error("Module type mis matched\n")
+        return
+    }
+}
+
+func TestModuleTestLoad3(t *testing.T) {
+    // dso
+    mod, err := LoadModuleFromFile("./deploy.so", "dso")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+    if reflect.TypeOf(mod).Kind() != reflect.Ptr {
+        t.Error("Module type mis matched\n")
+        return
+    }
+}
+
+// Check module loading - so
+func TestModuleTestLoad4(t *testing.T) {
+    // so
+    mod, err := LoadModuleFromFile("./deploy.so", "so")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+    if reflect.TypeOf(mod).Kind() != reflect.Ptr {
+        t.Error("Module type mis matched\n")
+        return
+    }
+}
+
+// Check module loading - default (so)
+func TestModuleTestLoad5(t *testing.T) {
+    // default type as so
+    mod, err := LoadModuleFromFile("./deploy.so")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+    if reflect.TypeOf(mod).Kind() != reflect.Ptr {
+        t.Error("Module type mis matched\n")
+        return
+    }
+}
+
+// Check module loading err
+func TestModuleTestLoadErr(t *testing.T) {
+    // Unknown file should return error
+    _, err := LoadModuleFromFile("xyzabc.so")
+    if err == nil {
+        t.Error("Expected an error, but not received\n")
+        return
+    }
+}
+
diff --git a/golang/src/ndarray.go b/golang/src/ndarray.go
new file mode 100644
index 000000000000..ceae7e58c203
--- /dev/null
+++ b/golang/src/ndarray.go
@@ -0,0 +1,329 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package source for TVMArray aka DLTensor
+ * \file ndarray.go
+ */
+
+package gotvm
+
+//#include "gotvm.h"
+import "C"
+
+import (
+    "unsafe"
+    "fmt"
+    "errors"
+    "runtime"
+    "reflect"
+)
+
+// Array type in golang hold pointer for the TVMArray object from dlpack.
+//
+// Array initialization happen through Empty api
+type Array uintptr
+
+// nativeCPtr returns type freed uintptr for the Array.
+func (parray Array) nativeCPtr() (retVal uintptr) {
+    retVal = (uintptr)(parray)
+    return
+}
+
+func (parray Array) nativeCopyFrom(data unsafe.Pointer, datalen int) (err error) {
+    ret := C.TVMArrayCopyFromBytes((*_Ctype_TVMArray)(unsafe.Pointer(parray.nativeCPtr())),
+                                   data,
+                                   C.ulong(datalen))
+    if ret != 0 {
+        err = errors.New(getTVMLastError())
+    }
+    return
+}
+
+// CopyFrom copies given golang data slice into Array.
+//
+// `val` is interface homding a slice of Array data type.
+//
+// returns err is any.
+// TOD: Use reflections for better handling
+func (parray Array) CopyFrom(val interface{}) (err error) {
+    var data unsafe.Pointer
+    var datalen int
+    dtype := ((*_Ctype_TVMArray)(unsafe.Pointer(parray))).dtype
+
+    switch val.(type) {
+        case []int8:
+            sliceVal := val.([]int8)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            return parray.nativeCopyFrom(data, datalen)
+        case []int16:
+            sliceVal := val.([]int16)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            return parray.nativeCopyFrom(data, datalen)
+        case []int32:
+            sliceVal := val.([]int32)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            return parray.nativeCopyFrom(data, datalen)
+        case []int64:
+            sliceVal := val.([]int64)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            return parray.nativeCopyFrom(data, datalen)
+        case []uint8:
+            sliceVal := val.([]uint8)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            return parray.nativeCopyFrom(data, datalen)
+         case []uint16:
+            sliceVal := val.([]uint16)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            return parray.nativeCopyFrom(data, datalen)
+        case []uint32:
+            sliceVal := val.([]uint32)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            return parray.nativeCopyFrom(data, datalen)
+        case []uint64:
+            sliceVal := val.([]uint64)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            return parray.nativeCopyFrom(data, datalen)
+        case []float32:
+            sliceVal := val.([]float32)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            return parray.nativeCopyFrom(data, datalen)
+        case []float64:
+            sliceVal := val.([]float64)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            return parray.nativeCopyFrom(data, datalen)
+        default:
+            err = fmt.Errorf("Given type not supported : %v\n", reflect.TypeOf(val))
+            return
+    }
+    return
+}
+
+func (parray Array) nativeCopyTo (data unsafe.Pointer, datalen int) (err error){
+    ret := C.TVMArrayCopyToBytes((*_Ctype_TVMArray)(unsafe.Pointer(parray.nativeCPtr())),
+                                  unsafe.Pointer(data),
+                                  C.ulong(datalen))
+
+    if ret != 0 {
+        err = errors.New(getTVMLastError())
+    }
+   return
+}
+
+// AsSlice returns the unitptr of for the data inside Array.
+//
+// returns the slice of array inside Array and err of any.
+// TOD: Use reflections for better handling
+func (parray Array) AsSlice() (retVal interface{}, err error) {
+    shape := parray.GetShape()
+    size := int64(1)
+    var data unsafe.Pointer
+    var datalen int
+
+    for ii := range shape {
+        size *= shape[ii]
+    }
+    dtype := ((*_Ctype_TVMArray)(unsafe.Pointer(parray))).dtype
+
+    switch parray.GetDType() {
+        case "int8":
+            sliceVal := make([]int8, size)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            err = parray.nativeCopyTo(data, datalen)
+            retVal = sliceVal
+        case "int16":
+            sliceVal := make([]int16, size)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            err = parray.nativeCopyTo(data, datalen)
+            retVal = sliceVal
+        case "int32":
+            sliceVal := make([]int32, size)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            err = parray.nativeCopyTo(data, datalen)
+            retVal = sliceVal
+        case "int64":
+            sliceVal := make([]int64, size)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            err = parray.nativeCopyTo(data, datalen)
+            retVal = sliceVal
+        case "uint8":
+            sliceVal := make([]uint8, size)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            err = parray.nativeCopyTo(data, datalen)
+            retVal = sliceVal
+        case "uint16":
+            sliceVal := make([]uint16, size)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            err = parray.nativeCopyTo(data, datalen)
+            retVal = sliceVal
+        case "uint32":
+            sliceVal := make([]uint32, size)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            err = parray.nativeCopyTo(data, datalen)
+            retVal = sliceVal
+        case "uint64":
+            sliceVal := make([]uint64, size)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            err = parray.nativeCopyTo(data, datalen)
+            retVal = sliceVal
+        case "float32":
+            sliceVal := make([]float32, size)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            err = parray.nativeCopyTo(data, datalen)
+            retVal = sliceVal
+        case "float64":
+            sliceVal := make([]float64, size)
+            data = unsafe.Pointer(&sliceVal[0])
+            datalen = len(sliceVal) * int(dtype.bits / 8)
+            err = parray.nativeCopyTo(data, datalen)
+            retVal = sliceVal
+        default:
+            err = fmt.Errorf("Given type not supported : %v\n", parray.GetDType())
+            return
+    }
+    return
+}
+
+// GetNdim returns the number of dimentions in Array
+func (parray Array) GetNdim() (retVal int32) {
+    retVal = int32(((*_Ctype_TVMArray)(unsafe.Pointer(parray))).ndim)
+    return
+}
+
+// GetShape returns the number of dimentions in Array
+func (parray Array) GetShape() (retVal []int64) {
+    shapePtr := (*C.int64_t)(((*_Ctype_TVMArray)(unsafe.Pointer(parray))).shape)
+    ndim := parray.GetNdim()
+
+    shapeSlice := (*[1<<31] int64)(unsafe.Pointer(shapePtr))[:ndim:ndim]
+    retVal = make([]int64, ndim)
+    copy(retVal, shapeSlice)
+    return
+}
+
+// GetDType returns the number of dimentions in Array
+func (parray Array) GetDType() (retVal string) {
+    ret := ((*_Ctype_TVMArray)(unsafe.Pointer(parray))).dtype
+    retVal, _ = dtypeFromTVMType(*(*pTVMType)(unsafe.Pointer(&ret)))
+    return
+}
+
+// GetCtx returns the number of dimentions in Array
+func (parray Array) GetCtx() (retVal Context) {
+    ret := ((*_Ctype_TVMArray)(unsafe.Pointer(parray))).ctx
+    retVal = *(*Context)(unsafe.Pointer(&ret))
+    return
+}
+
+// nativeTVMArrayAlloc is used to allocate TVMArray from given attributes.
+//
+// `shape` is int64 slice holding shape of the Array to be created.
+//
+// `ndim` is the rank of the Array to be created.
+//
+// `dtypeCode`, `dtypeBits` and `dtypeLanes` describe the data type in Array.
+//
+// `deviceType` indicates the device on whose memory the Array to allocated.
+//
+// `deviceID` indicates device index if multiple devices of same type present.
+//
+// return argument holding native pointer to newly created Array and error is any.
+func nativeTVMArrayAlloc(shape []int64, ndim int32,
+                   dtypeCode int32, dtypeBits int32, dtypeLanes int32,
+                   deviceType int32, deviceID int32) (retVal uintptr, err error) {
+    ret := (int32)(C.TVMArrayAlloc((*_Ctype_long)(&(shape[0])),
+                                   C.int(ndim),
+                                   C.int(dtypeCode),
+                                   C.int(dtypeBits),
+                                   C.int(dtypeLanes),
+                                   C.int(deviceType),
+                                   C.int(deviceID),
+                                   (*_Ctype_TVMArrayHandle)(unsafe.Pointer(&retVal))))
+    if ret != 0 {
+        err = errors.New(getTVMLastError())
+        return
+    }
+    return
+}
+
+// Empty is used to allocate TVM empty array of given epecification.
+//
+// `shape` is int64 slice holding shape of the Array
+//
+// `args` is variadic args for
+//
+//        `args[0]` is string for data type. Default value is 'float32'
+//
+//        `args[1]` is Context. Default value is '{KDLCPU, 0}'
+//
+// returns pointer to Array on successful execution and error if any.
+func Empty(shape []int64, args ...interface{}) (parray *Array, err error) {
+    typeName := "float32"
+    ctx := Context{KDLCPU, 0}
+
+    if len(shape) < 1 {
+        err = fmt.Errorf("Invalid shape for Array creation: %v\n", len(shape))
+        return
+    }
+
+    for i, val := range args {
+        switch val.(type) {
+            case string:
+                typeName = args[i].(string)
+            case Context:
+                ctx = args[i].(Context)
+            default:
+                err = fmt.Errorf("Invalid Optional Argument Type: %T\n", val)
+                return
+        }
+    }
+
+    tvmType, err := dtypeToTVMType(typeName)
+    if err != nil {
+        return
+    }
+    ndim := int32(len(shape))
+    newArray, err := nativeTVMArrayAlloc(shape, ndim, int32(tvmType.code),
+                                    int32(tvmType.bits), int32(tvmType.lanes),
+                                    ctx.DeviceType, ctx.DeviceID)
+    if err != nil {
+        return
+    }
+    handle := new(Array)
+    *handle = Array(newArray)
+
+    finalizer := func (ahandle *Array) {
+        nativeTVMArrayFree(*ahandle)
+        ahandle = nil
+    }
+    runtime.SetFinalizer(handle, finalizer)
+    parray = handle
+    return
+}
+
+// nativeTVMArrayFree is used to release the Array.
+//
+// `parray` is the Array handle.
+//
+// `ret` indicates the status of this api execution.
+func nativeTVMArrayFree(parray Array) (retVal int32) {
+    retVal = (int32)(C.TVMArrayFree((*_Ctype_TVMArray)(unsafe.Pointer(parray.nativeCPtr()))))
+    return
+}
diff --git a/golang/src/tvm_runtime_pack.cc b/golang/src/tvm_runtime_pack.cc
new file mode 100644
index 000000000000..718a79eb7445
--- /dev/null
+++ b/golang/src/tvm_runtime_pack.cc
@@ -0,0 +1,49 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief This is an all in one TVM runtime file.
+ * \file tvm_runtime_pack.cc
+ */
+#include "src/runtime/c_runtime_api.cc"
+#include "src/runtime/cpu_device_api.cc"
+#include "src/runtime/workspace_pool.cc"
+#include "src/runtime/module_util.cc"
+#include "src/runtime/module.cc"
+#include "src/runtime/registry.cc"
+#include "src/runtime/file_util.cc"
+#include "src/runtime/threading_backend.cc"
+#include "src/runtime/thread_pool.cc"
+#include "src/runtime/ndarray.cc"
+
+// NOTE: all the files after this are optional modules
+// that you can include remove, depending on how much feature you use.
+
+// Likely we only need to enable one of the following
+// If you use Module::Load, use dso_module
+// For system packed library, use system_lib_module
+#include "src/runtime/dso_module.cc"
+#include "src/runtime/system_lib_module.cc"
+
+// Graph runtime
+#include "src/runtime/graph/graph_runtime.cc"
+
+// Uncomment the following lines to enable RPC
+// #include "../../src/runtime/rpc/rpc_session.cc"
+// #include "../../src/runtime/rpc/rpc_event_impl.cc"
+// #include "../../src/runtime/rpc/rpc_server_env.cc"
+
+// These macros enables the device API when uncommented.
+#define TVM_CUDA_RUNTIME 1
+#define TVM_METAL_RUNTIME 1
+#define TVM_OPENCL_RUNTIME 1
+
+// Uncomment the following lines to enable Metal
+// #include "../../src/runtime/metal/metal_device_api.mm"
+// #include "../../src/runtime/metal/metal_module.mm"
+
+// Uncomment the following lines to enable CUDA
+// #include "../../src/runtime/cuda/cuda_device_api.cc"
+// #include "../../src/runtime/cuda/cuda_module.cc"
+
+// Uncomment the following lines to enable OpenCL
+// #include "../../src/runtime/opencl/opencl_device_api.cc"
+// #include "../../src/runtime/opencl/opencl_module.cc"
diff --git a/golang/src/type.go b/golang/src/type.go
new file mode 100644
index 000000000000..27364295bf8b
--- /dev/null
+++ b/golang/src/type.go
@@ -0,0 +1,72 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package for TVMType interface
+ * \file type.go
+ */
+
+package gotvm
+
+//#include "gotvm.h"
+import "C"
+
+import (
+    "fmt"
+)
+
+// pTVMType corresponding to data types.
+type pTVMType struct {
+    code uint8
+    bits uint8
+    lanes uint16
+}
+
+// data type to pTVMType mapping
+var dtypeMap = map[string] pTVMType {
+    "int8": pTVMType{0, 8, 1},
+    "int16": pTVMType{0, 16, 1},
+    "int32": pTVMType{0, 32, 1},
+    "int64": pTVMType{0, 64, 1},
+    "uint8": pTVMType{1, 8, 1},
+    "uint16": pTVMType{1, 16, 1},
+    "uint32": pTVMType{1, 32, 1},
+    "uint64": pTVMType{1, 64, 1},
+    "float32": pTVMType{2, 32, 1},
+    "float64": pTVMType{2, 64, 1},
+}
+
+// dtypeFromTVMType return the pTVMType corresponding to given dtype
+//
+// `dtype` string for the given data type.
+func dtypeFromTVMType(tvmtype pTVMType) (retVal string, err error) {
+    for k, v := range dtypeMap {
+        if v.code == tvmtype.code && v.bits == tvmtype.bits && v.lanes == tvmtype.lanes {
+            retVal = k
+            return
+        }
+    }
+
+    err = fmt.Errorf("Cannot map TVMType:%v to dtype", tvmtype)
+    return
+}
+
+// dtypeToTVMType return the pTVMType corresponding to given dtype
+//
+// `dtype` string for the given data type.
+func dtypeToTVMType(args ...interface{}) (tvmtype pTVMType, err error) {
+    dtype := args[0].(string)
+    lanes := 1
+
+    if len(args) == 2 {
+        lanes = args[1].(int)
+    }
+
+    for k, v := range dtypeMap {
+        if k == dtype {
+            tvmtype = v
+            tvmtype.lanes = uint16(lanes)
+            return
+        }
+    }
+    err = fmt.Errorf("Cannot map dtype:%v to TVMType", dtype)
+    return
+}
diff --git a/golang/src/util.go b/golang/src/util.go
new file mode 100644
index 000000000000..aa5a6016c97f
--- /dev/null
+++ b/golang/src/util.go
@@ -0,0 +1,24 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package source for common utilities
+ * \file util.go
+ */
+
+package gotvm
+
+//#include "gotvm.h"
+import "C"
+
+import (
+    "unsafe"
+)
+
+// Native string map for go string
+type nativeGoString struct { p uintptr; n int32 }
+
+func goStringFromNative (s string) (retStr string) {
+    p := *(*nativeGoString)(unsafe.Pointer(&s))
+    retStr = string((*[0x7fffffff]byte)(unsafe.Pointer(p.p))[:p.n])
+    C.free(unsafe.Pointer(p.p))
+    return
+}
diff --git a/golang/src/value.go b/golang/src/value.go
new file mode 100644
index 000000000000..2a953560f237
--- /dev/null
+++ b/golang/src/value.go
@@ -0,0 +1,360 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package source for TVMValue interface
+ * \file value.go
+ */
+
+package gotvm
+
+//#include "gotvm.h"
+import "C"
+
+import (
+    "fmt"
+    "runtime"
+    "unsafe"
+)
+
+// KHandle is golang type code for TVM enum kHandle.
+var KHandle                 = int32(C.kHandle)
+// KNull is golang type code for TVM kNull.
+var KNull                   = int32(C.kNull)
+// KTVMType is golang type code for TVM kTVMType.
+var KTVMType                = int32(C.kTVMType)
+// KTVMContext is golang type code for TVM kTVMContext.
+var KTVMContext             = int32(C.kTVMContext)
+// KArrayHandle is golang type code for TVM kArrayHandle.
+var KArrayHandle            = int32(C.kArrayHandle)
+// KNodeHandle is golang type code for TVM kNodeHandle.
+var KNodeHandle             = int32(C.kNodeHandle)
+// KModuleHandle is gonag type code for TVM kModuleHandle.
+var KModuleHandle           = int32(C.kModuleHandle)
+// KFuncHandle is gonalg type code for TVM kFuncHandle.
+var KFuncHandle             = int32(C.kFuncHandle)
+// KStr is golang type code for TVM kStr.
+var KStr                    = int32(C.kStr)
+// KBytes is golang type code for TVM kBytes.
+var KBytes                  = int32(C.kBytes)
+// KNDArrayContainer is golang typecode for kNDArrayContainer.
+var KNDArrayContainer       = int32(C.kNDArrayContainer)
+// KExtBegin is golang enum corresponding to TVM kExtBegin.
+var KExtBegin               = int32(C.kExtBegin)
+// KNNVMFirst is golang enum corresponding to TVM kNNVMFirst.
+var KNNVMFirst              = int32(C.kNNVMFirst)
+// KNNVMLast is golang enum corresponding to TVM kNNVMLast.
+var KNNVMLast               = int32(C.kNNVMLast)
+// KExtReserveEnd is golang enum corresponding to TVM kExtReserveEnd.
+var KExtReserveEnd          = int32(C.kExtReserveEnd)
+// KExtEnd is golang enum corresponding to TVM kExtEnd.
+var KExtEnd                 = int32(C.kExtEnd)
+// KDLInt is golang type code for TVM kDLInt.
+var KDLInt                  = int32(C.kDLInt)
+// KDLUInt is golang type code for TVM kDLUInt.
+var KDLUInt                 = int32(C.kDLUInt)
+// KDLFloat is golang type code for TVM kDLFloat.
+var KDLFloat                = int32(C.kDLFloat)
+
+// Value Typemap for union exposed by TVM runtime API.
+//
+// gotvm maps it to a uintptr and then dynamically allocates memory by newTVMValue method.
+type Value struct {
+    nptr  uintptr
+    dtype int32
+    isLocal bool
+}
+
+// AsInt64 returns the int64 value inside the Value.
+func (tvmval *Value)  AsInt64() (retVal int64) {
+    retVal = tvmval.getVInt64()
+    return
+}
+
+// AsFloat64 returns the Float64 value inside the Value.
+func (tvmval *Value)  AsFloat64() (retVal float64) {
+    retVal = tvmval.getVFloat64()
+    return
+}
+
+// AsModule returns the Module inside the Value.
+func (tvmval *Value)  AsModule() (retVal *Module) {
+    mhandle := tvmval.getVMHandle()
+    retVal = &mhandle
+    return
+}
+
+// AsFunction returns the Function inside the Value.
+func (tvmval *Value)  AsFunction() (retVal *Function) {
+    fhandle := tvmval.getVFHandle()
+    retVal = &fhandle
+
+    return
+}
+
+// AsBytes returns the byte slice value inside the Value.
+func (tvmval *Value)  AsBytes() (retVal []byte) {
+    retVal = tvmval.getVBHandle().getData()
+    return
+}
+
+// AsStr returns the golang string in the Value.
+func (tvmval *Value) AsStr() (retVal string) {
+    str := tvmval.getVStr()
+    retVal = str
+    return
+}
+
+// nativeCPtr return the unitptr corresponding to Value type.
+func (tvmval *Value) nativeCPtr() (ret uintptr) {
+    ret = (uintptr)(tvmval.nptr)
+    return
+}
+
+// moveFrom copies the tvmval from other Value object.
+func (tvmval *Value) moveFrom(fromval *Value) () {
+    C.memcpy(unsafe.Pointer(tvmval.nativeCPtr()),
+             unsafe.Pointer(fromval.nativeCPtr()),
+             C.sizeof_TVMValue)
+
+    // Move the dtype too.
+    tvmval.dtype = fromval.dtype
+    fromval.dtype = KNull
+    return
+}
+
+// setVInt64 initializes the Value object with given int64 value.
+//
+// `val` is the int64 value to initialize the Value
+func (tvmval *Value) setVInt64(val int64) {
+    valp := (*C.int64_t)(unsafe.Pointer(tvmval.nativeCPtr()))
+    *valp = C.int64_t(val)
+    tvmval.dtype = KDLInt
+    return
+}
+
+
+// getVInt64 returns the int64 value inside the Value.
+func (tvmval *Value) getVInt64() (retVal int64) {
+    valp := (*C.int64_t)(unsafe.Pointer(tvmval.nativeCPtr()))
+    retVal = int64(*valp)
+    return
+}
+
+// setVFloat64 initializes the Value object with given float64 value.
+//
+// `val` is the float64 value to initialize the Value.
+func (tvmval *Value) setVFloat64(val float64) {
+    valp := (*C.double)(unsafe.Pointer(tvmval.nativeCPtr()))
+    *valp = C.double(val)
+    tvmval.dtype = KDLFloat
+    return
+}
+
+// getVFloat64 returns the float64 value inside Value.
+func (tvmval *Value) getVFloat64() (retVal float64) {
+    valp := (*C.double)(unsafe.Pointer(tvmval.nativeCPtr()))
+    retVal = float64(*valp)
+    return
+}
+
+// setVHandle initializes the handle inside the Value.
+//
+// Can be used to store any uintptr type object like
+// module handle, function handle and any object's nativeCPtr.
+//
+// `val` is the uintptr type of given handle.
+func (tvmval *Value) setVHandle(val uintptr) {
+    valp := (**C.void)(unsafe.Pointer(tvmval.nativeCPtr()))
+    *valp = (*C.void)(unsafe.Pointer(val))
+}
+
+// getVHandle returns the uintptr handle
+func (tvmval *Value) getVHandle() (retVal uintptr) {
+    valp := (**C.void)(unsafe.Pointer(tvmval.nativeCPtr()))
+    retVal = uintptr(unsafe.Pointer(*valp))
+    return
+}
+
+// setVStr intializes the Value with given golang string object.
+//
+// `val` is the golang string object used to initialize the Value.
+func (tvmval *Value) setVStr(val string) {
+    valp := (**C.char)(unsafe.Pointer(tvmval.nativeCPtr()))
+    *valp = C.CString(val)
+    tvmval.dtype = KStr
+    return
+}
+
+
+// getVStr returns the golang string for the native string inside Value.
+func (tvmval *Value) getVStr() (retVal string) {
+    valp := (**C.char)(unsafe.Pointer(tvmval.nativeCPtr()))
+    retVal = C.GoString(*valp)
+    return
+}
+
+// unSetVStr release the memory allocated in setVStr
+func (tvmval *Value) unSetVStr() {
+    valp := (**C.char)(unsafe.Pointer(tvmval.nativeCPtr()))
+	C.free(unsafe.Pointer(*valp))
+    tvmval.dtype = KNull
+}
+
+// setVAHandle is used to set Array handle in Value.
+//
+// Application can call the setVHandle with nativeCPtr instead too.
+// This is a wrapper to accept Array directly.
+func (tvmval *Value) setVAHandle(ptvmarray Array) {
+    tvmval.setVHandle(ptvmarray.nativeCPtr())
+    tvmval.dtype = KArrayHandle
+    return
+}
+
+// getVAHandle is used to get Array handle in Value.
+func (tvmval *Value) getVAHandle() (retVal Array) {
+	retVal = (Array)(tvmval.getVHandle())
+    return
+}
+
+// setVMHandle is used to set Module handle in Value.
+//
+// Application can call the setVHandle with nativeCPtr instead too.
+// This is a wrapper to accept Module directly.
+func (tvmval *Value) setVMHandle(tvmmodule Module) {
+    tvmval.setVHandle(tvmmodule.nativeCPtr())
+    tvmval.dtype = KModuleHandle
+    return
+}
+
+// getVMHandle is used to get Module handle in Value.
+func (tvmval *Value) getVMHandle() (retVal Module) {
+	retVal = (Module)(tvmval.getVHandle())
+    return
+}
+
+// setVFHandle is used to set Function handle in Value.
+//
+// Application can call the setVHandle with nativeCPtr instead.
+// This is a wrapper to accept Function directly.
+func (tvmval *Value) setVFHandle(tvmfunction Function) {
+    tvmval.setVHandle(tvmfunction.nativeCPtr())
+    tvmval.dtype = KFuncHandle
+    return
+}
+
+// getVFHandle is used to get Function handle in Value.
+func (tvmval *Value) getVFHandle() (retVal Function) {
+	retVal = (Function)(tvmval.getVHandle())
+    return
+}
+
+// setVBHandle is used to set ByteArray handle in Value.
+//
+// Application can call the setVHandle with nativeCPtr instead.
+// This is a wrapper to accept ByteArray directly.
+func (tvmval *Value) setVBHandle(tbytearray ByteArray) {
+    tvmval.setVHandle(tbytearray.nativeCPtr())
+    tvmval.dtype = KBytes
+    return
+}
+
+// getVBHandle is used to get ByteArray handle in Value.
+func (tvmval *Value) getVBHandle() (retVal ByteArray) {
+	retVal = (ByteArray)(tvmval.getVHandle())
+    return
+}
+
+// setValue is used to set the given value in Value.
+//
+// `val` is value of types accepted by Value container or native union.
+func (tvmval *Value) setValue(val interface{}) (retVal int32, err error) {
+    retVal = KNull
+    switch val.(type) {
+        case string:
+            tvmval.setVStr(val.(string))
+        case uint8:
+            tvmval.setVInt64(int64(val.(uint8)))
+        case uint16:
+            tvmval.setVInt64(int64(val.(uint16)))
+        case uint32:
+            tvmval.setVInt64(int64(val.(uint32)))
+        case uint64:
+            tvmval.setVInt64(int64(val.(uint64)))
+        case int:
+            tvmval.setVInt64(int64(val.(int)))
+        case int8:
+            tvmval.setVInt64(int64(val.(int8)))
+        case int16:
+            tvmval.setVInt64(int64(val.(int16)))
+        case int32:
+            tvmval.setVInt64(int64(val.(int32)))
+        case int64:
+            tvmval.setVInt64(val.(int64))
+        case float32:
+            tvmval.setVFloat64(float64(val.(float32)))
+        case float64:
+            tvmval.setVFloat64(val.(float64))
+        case *Module:
+            tvmval.setVMHandle(*(val.(*Module)))
+        case *Function:
+            tvmval.setVFHandle(*(val.(*Function)))
+        case *ByteArray:
+            tvmval.setVBHandle(*(val.(*ByteArray)))
+        case []byte:
+            barray := newByteArray(val.([]byte))
+            tvmval.setVBHandle(barray)
+        case *Array:
+            tvmval.setVAHandle(*(val.(*Array)))
+        case func (args ...*Value) (interface{}, error):
+            fhandle, apierr := ConvertFunction(val)
+            if apierr != nil {
+                err = fmt.Errorf("Given value Type not defined for Value: %v : %T\n", val, val);
+                return
+            }
+            tvmval.setVFHandle(*fhandle)
+
+            // Clear the finalizer as we don't need to control it anymore.
+            runtime.SetFinalizer(fhandle, nil)
+        case *Value:
+            tvmval.moveFrom(val.(*Value))
+        case Value:
+            fromval := val.(Value)
+            tvmval.moveFrom(&fromval)
+        default:
+            err = fmt.Errorf("Given value Type not defined for Value: %v : %T\n", val, val);
+    }
+    retVal = tvmval.dtype
+    return
+}
+
+// newTVMValue initialize the TVMValue native object.
+//
+// This is intended to use as intermediate type between native and golang types.
+// Allocated from FuncCall or Callback to handle conversions.
+func newTVMValue() (retVal *Value) {
+    handle := new(Value)
+
+    handle.nptr = (uintptr(C.malloc(C.sizeof_TVMValue)))
+    handle.dtype = KNull
+    handle.isLocal = true
+    finalizer := func(vhandle *Value) {
+        vhandle.deleteTVMValue()
+        vhandle = nil
+    }
+    runtime.SetFinalizer(handle, finalizer)
+    retVal = handle
+    return
+}
+
+// deleteTVMValue free the native Value object which is allocated in newTVMValue.
+func (tvmval Value) deleteTVMValue() {
+    if tvmval.isLocal == true {
+        if tvmval.dtype == KStr {
+            tvmval.unSetVStr()
+        }
+        if tvmval.dtype == KBytes {
+            tvmval.getVBHandle().deleteTVMByteArray()
+        }
+    }
+
+	C.free(unsafe.Pointer(tvmval.nativeCPtr()))
+}
diff --git a/golang/src/value_test.go b/golang/src/value_test.go
new file mode 100644
index 000000000000..251af82cb7b9
--- /dev/null
+++ b/golang/src/value_test.go
@@ -0,0 +1,237 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief gotvm package
+ * \file value_test.go
+ */
+
+package gotvm
+
+import (
+    "testing"
+    "math/rand"
+    "strings"
+)
+
+// Check Int64 Value looping via packed function calling another packed function.
+func TestValueLoopInt64(t *testing.T) {
+    // Receive a function Handle and argument and echo the Value on the handle.
+    sampleFunctionLoop := func (args ...*Value) (retVal interface{}, err error) {
+        // Reveive Packed Function Handle
+        pfunc := args[0].AsFunction()
+        newArgs := args[1:]
+
+        // Call Packed Function by Value
+        return pfunc.Invoke(newArgs)
+    }
+
+    fhandle, err := ConvertFunction(sampleFunctionLoop)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    // funccall is a simple golang callback function like C = A + B.
+    funccall := func (args ...*Value) (retVal interface{}, err error) {
+        retVal = args[0]
+        return
+    }
+
+    result := rand.Int63()
+    retVal, err := fhandle.Invoke(funccall, result)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+    if retVal.AsInt64() != result {
+        t.Errorf("Expected : %v got:%v\n", result, retVal.AsInt64())
+        return
+    }
+}
+
+// Check Int32 Value looping via packed function calling another packed function.
+func TestValueLoopInt32(t *testing.T) {
+    // Receive a function Handle and argument and echo the Value on the handle.
+    sampleFunctionLoop := func (args ...*Value) (retVal interface{}, err error) {
+        // Reveive Packed Function Handle
+        pfunc := args[0].AsFunction()
+        newArgs := args[1:]
+
+        // Call Packed Function by Value
+        return pfunc.Invoke(newArgs)
+    }
+
+    fhandle, err := ConvertFunction(sampleFunctionLoop)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    // funccall is a simple golang callback function like C = A + B.
+    funccall := func (args ...*Value) (retVal interface{}, err error) {
+        retVal = args[0]
+        return
+    }
+
+    result := rand.Int31()
+    retVal, err := fhandle.Invoke(funccall, result)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    if retVal.AsInt64() != int64(result) {
+        t.Errorf("Expected : %v got:%v\n", result, retVal.AsInt64())
+        return
+    }
+}
+
+// Check Float32 Value looping via packed function calling another packed function.
+func TestValueLoopFloat32(t *testing.T) {
+    // Receive a function Handle and argument and echo the Value on the handle.
+    sampleFunctionLoop := func (args ...*Value) (retVal interface{}, err error) {
+        // Reveive Packed Function Handle
+        pfunc := args[0].AsFunction()
+        newArgs := args[1:]
+        // Call Packed Function by Value
+        return pfunc.Invoke(newArgs)
+    }
+
+    fhandle, err := ConvertFunction(sampleFunctionLoop)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    // funccall is a simple golang callback function like C = A + B.
+    funccall := func (args ...*Value) (retVal interface{}, err error) {
+        retVal = args[0]
+        return
+    }
+
+    result := rand.Float32()
+    retVal, err := fhandle.Invoke(funccall, result)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    if retVal.AsFloat64() != float64(result) {
+        t.Errorf("Expected : %v got:%v\n", result, retVal.AsInt64())
+        return
+    }
+}
+
+// Check Float64 Value looping via packed function calling another packed function.
+func TestValueLoopFloat64(t *testing.T) {
+    // Receive a function Handle and argument and echo the Value on the handle.
+    sampleFunctionLoop := func (args ...*Value) (retVal interface{}, err error) {
+        // Reveive Packed Function Handle
+        pfunc := args[0].AsFunction()
+        newArgs := args[1:]
+        // Call Packed Function by Value
+        return pfunc.Invoke(newArgs)
+    }
+
+    fhandle, err := ConvertFunction(sampleFunctionLoop)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    // funccall is a simple golang callback function like C = A + B.
+    funccall := func (args ...*Value) (retVal interface{}, err error) {
+        retVal = args[0]
+        return
+    }
+
+    result := rand.Float64()
+    retVal, err := fhandle.Invoke(funccall, result)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    if retVal.AsFloat64() != result {
+        t.Errorf("Expected : %v got:%v\n", result, retVal.AsInt64())
+        return
+    }
+}
+
+func TestValueLoopString(t *testing.T) {
+    // Receive a function Handle and argument and echo the Value on the handle.
+    sampleFunctionLoop := func (args ...*Value) (retVal interface{}, err error) {
+        // Reveive Packed Function Handle
+        pfunc := args[0].AsFunction()
+        argStr := args[1].AsStr()
+        // Call Packed Function by Value
+        return pfunc.Invoke(argStr)
+    }
+
+    fhandle, err := ConvertFunction(sampleFunctionLoop)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    // funccall is a simple golang callback function like C = A + B.
+    funccall := func (args ...*Value) (retVal interface{}, err error) {
+        retVal =  args[0].AsStr()
+        return
+    }
+
+    retVal, err := fhandle.Invoke(funccall, "TestString")
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    vStr := retVal.AsStr()
+    if strings.Compare(vStr, string("TestString")) != 0  {
+        t.Errorf("Expected : %v got:%v\n", string("TestString"), vStr)
+        return
+    }
+}
+
+// Check []byte Value looping via packed function calling another packed function.
+func TestValueLoopByteSlice(t *testing.T) {
+    // Receive a function Handle and argument and echo the Value on the handle.
+    sampleFunctionLoop := func (args ...*Value) (retVal interface{}, err error) {
+        // Reveive Packed Function Handle
+        pfunc := args[0].AsFunction()
+        argBytes := args[1].AsBytes()
+        // Call Packed Function by Value
+        return pfunc.Invoke(argBytes)
+    }
+
+    fhandle, err := ConvertFunction(sampleFunctionLoop)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    // funccall is a simple golang callback function like C = A + B.
+    funccall := func (args ...*Value) (retVal interface{}, err error) {
+        retVal = args[0].AsBytes()
+        return
+    }
+
+    result := make([]byte, 1024)
+    rand.Read(result)
+    retVal, err := fhandle.Invoke(funccall, result)
+    if err != nil {
+        t.Error(err.Error())
+        return
+    }
+
+    received := retVal.AsBytes()
+    if len(result) != len(received) {
+            t.Errorf("Data expected Len: %v Got :%v\n", len(result), len(received))
+            return
+    }
+    for i := range result {
+        if result[i] != received[i] {
+            t.Errorf("Data expected: %v Got :%v at index %v\n", result[i], received[i], i)
+            return
+        }
+    }
+}
diff --git a/include/tvm/api_registry.h b/include/tvm/api_registry.h
index 93bff2762481..1532872397c3 100644
--- a/include/tvm/api_registry.h
+++ b/include/tvm/api_registry.h
@@ -1,16 +1,18 @@
 /*!
  *  Copyright (c) 2017 by Contributors
  * \file tvm/api_registry.h
- * \brief This files include necessary headers to
- *  be used to register an global API function.
+ * \brief This file contains utilities related to
+ *  the TVM's global function registry.
  */
 #ifndef TVM_API_REGISTRY_H_
 #define TVM_API_REGISTRY_H_
 
-#include "./base.h"
-#include "./packed_func_ext.h"
-#include "./runtime/registry.h"
+#include <string>
+#include "base.h"
+#include "packed_func_ext.h"
+#include "runtime/registry.h"
 
+namespace tvm {
 /*!
  * \brief Register an API function globally.
  * It simply redirects to TVM_REGISTER_GLOBAL
@@ -24,4 +26,113 @@
  */
 #define TVM_REGISTER_API(OpName) TVM_REGISTER_GLOBAL(OpName)
 
+/*!
+ * \brief Node container of EnvFunc
+ * \sa EnvFunc
+ */
+class EnvFuncNode : public Node {
+ public:
+  /*! \brief Unique name of the global function */
+  std::string name;
+  /*! \brief The internal packed function */
+  PackedFunc func;
+  /*! \brief constructor */
+  EnvFuncNode() {}
+
+  void VisitAttrs(AttrVisitor* v) final {
+    v->Visit("name", &name);
+  }
+
+  static constexpr const char* _type_key = "EnvFunc";
+  TVM_DECLARE_NODE_TYPE_INFO(EnvFuncNode, Node);
+};
+
+/*!
+ * \brief A serializable function backed by TVM's global environment.
+ *
+ * This is a wrapper to enable serializable global PackedFunc.
+ * An EnvFunc is saved by its name in the global registry
+ * under the assumption that the same function is registered during load.
+ */
+class EnvFunc : public NodeRef {
+ public:
+  EnvFunc() {}
+  explicit EnvFunc(NodePtr<Node> n) : NodeRef(n) {}
+  /*! \return The internal global function pointer */
+  const EnvFuncNode* operator->() const {
+    return static_cast<EnvFuncNode*>(node_.get());
+  }
+  /*!
+   * \brief Invoke the function.
+   * \param args The arguments
+   * \returns The return value.
+   */
+  template<typename... Args>
+  runtime::TVMRetValue operator()(Args&&... args) const {
+    const EnvFuncNode* n = operator->();
+    CHECK(n != nullptr);
+    return n->func(std::forward<Args>(args)...);
+  }
+  /*!
+   * \brief Get a global function based on the name.
+   * \param name The name of the global function.
+   * \return The created global function.
+   * \note The function can be unique
+   */
+  TVM_DLL static EnvFunc Get(const std::string& name);
+  /*! \brief specify container node */
+  using ContainerType = EnvFuncNode;
+};
+
+/*!
+ * \brief Please refer to \ref TypedEnvFuncAnchor "TypedEnvFunc<R(Args..)>"
+ */
+template<typename FType>
+class TypedEnvFunc;
+
+/*!
+ * \anchor TypedEnvFuncAnchor
+ * \brief A typed version of EnvFunc.
+ * It is backed by a GlobalFuncNode internally.
+ *
+ * \tparam R The return value of the function.
+ * \tparam Args The argument signature of the function.
+ * \sa EnvFunc
+ */
+template<typename R, typename... Args>
+class TypedEnvFunc<R(Args...)> : public NodeRef {
+ public:
+  /*! \brief short hand for this function type */
+  using TSelf = TypedEnvFunc<R(Args...)>;
+  TypedEnvFunc() {}
+  explicit TypedEnvFunc(NodePtr<Node> n) : NodeRef(n) {}
+  /*!
+   * \brief Assign global function to a TypedEnvFunc
+   * \param other Another global function.
+   * \return reference to self.
+   */
+  TSelf& operator=(const EnvFunc& other) {
+    this->node_ = other.node_;
+    return *this;
+  }
+  /*! \return The internal global function pointer */
+  const EnvFuncNode* operator->() const {
+    return static_cast<EnvFuncNode*>(node_.get());
+  }
+  /*!
+   * \brief Invoke the function.
+   * \param args The arguments
+   * \returns The return value.
+   */
+  R operator()(Args... args) const {
+    const EnvFuncNode* n = operator->();
+    CHECK(n != nullptr);
+    return runtime::detail::typed_packed_call_dispatcher<R>
+        ::run(n->func, std::forward<Args>(args)...);
+  }
+  /*! \brief specify container node */
+  using ContainerType = EnvFuncNode;
+};
+
+}  // namespace tvm
 #endif  // TVM_API_REGISTRY_H_
diff --git a/include/tvm/arithmetic.h b/include/tvm/arithmetic.h
index 6a3c395fd404..fe0405264c51 100644
--- a/include/tvm/arithmetic.h
+++ b/include/tvm/arithmetic.h
@@ -9,7 +9,7 @@
 #include <vector>
 #include <unordered_map>
 #include <memory>
-#include "./expr.h"
+#include "expr.h"
 
 namespace tvm {
 
@@ -38,7 +38,7 @@ class IntSet : public NodeRef {
   /*! \brief constructor */
   IntSet() {}
   // constructor from not container.
-  explicit IntSet(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit IntSet(NodePtr<Node> n) : NodeRef(n) {}
   /*!
    * \brief access the internal node container
    * \return the pointer to the internal node container
diff --git a/include/tvm/attrs.h b/include/tvm/attrs.h
new file mode 100644
index 000000000000..cc1abe6e57de
--- /dev/null
+++ b/include/tvm/attrs.h
@@ -0,0 +1,855 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/attrs.h
+ * \brief TVM attribute module
+ *
+ *  This module enables declaration of named attributes
+ *  which support default value setup and bound checking.
+ *
+ * \code
+ *   struct MyAttrs : public tvm::AttrsNode<MyAttrs> {
+ *     float learning_rate;
+ *     int num_hidden;
+ *     std::string name;
+ *     // declare attribute fields in header file
+ *     TVM_DECLARE_ATTRS(MyAttrs, "attrs.MyAttrs") {
+ *       TVM_ATTR_FIELD(num_hidden).set_lower_bound(1);
+ *       TVM_ATTR_FIELD(learning_rate).set_default(0.01f);
+ *       TVM_ATTR_FIELD(name).set_default("hello");
+ *     }
+ *   };
+ *   // register it in cc file
+ *   TVM_REGISTER_NODE_TYPE(MyAttrs);
+ * \endcode
+ *
+ * \sa AttrsNode, TVM_DECLARE_ATTRS, TVM_ATTR_FIELD
+ */
+#ifndef TVM_ATTRS_H_
+#define TVM_ATTRS_H_
+
+#include <dmlc/common.h>
+#include <unordered_map>
+#include <vector>
+#include <functional>
+#include <type_traits>
+#include <string>
+#include "ir.h"
+#include "base.h"
+#include "expr.h"
+#include "packed_func_ext.h"
+
+namespace tvm {
+/*!
+ * \brief Declare an attribute function.
+ * \param ClassName The name of the class.
+ * \param TypeKey The type key to be used by the TVM node system.
+ */
+#define TVM_DECLARE_ATTRS(ClassName, TypeKey)                   \
+  static constexpr const char* _type_key = TypeKey;             \
+  TVM_DECLARE_NODE_TYPE_INFO(ClassName, ::tvm::BaseAttrsNode);  \
+  template<typename FVisit>                                     \
+  void __VisitAttrs__(FVisit& __fvisit__)  // NOLINT(*)
+
+
+/*!
+ * \brief Declare an attribute field.
+ * \param FieldName The field name.
+ */
+#define TVM_ATTR_FIELD(FieldName) \
+  __fvisit__(#FieldName, &FieldName)
+
+
+/*!
+ * \brief Create a NodeRef type that represents null.
+ * \tparam TNodeRef the type to be created.
+ * \return A instance that will represent None.
+ */
+template<typename TNodeRef>
+inline TNodeRef NullValue() {
+  return TNodeRef(NodePtr<Node>(nullptr));
+}
+
+template<>
+inline Type NullValue<Type>() {
+  return Type(Type::Handle, 0, 0);
+}
+
+/*! \brief Error thrown during attribute checking. */
+struct AttrError : public dmlc::Error {
+  /*!
+   * \brief constructor
+   * \param msg error message
+   */
+  explicit AttrError(const std::string &msg)
+      : dmlc::Error(msg) {}
+};
+
+/*!
+ * \brief Information about attribute fields in string representations.
+ */
+class AttrFieldInfoNode : public Node {
+ public:
+  /*! \brief name of the field */
+  std::string name;
+  /*! \brief type docstring information in str. */
+  std::string type_info;
+  /*! \brief detailed description of the type */
+  std::string description;
+
+  void VisitAttrs(AttrVisitor* v) final {
+    v->Visit("name", &name);
+    v->Visit("type_info", &type_info);
+    v->Visit("description", &description);
+  }
+  static constexpr const char* _type_key = "AttrFieldInfo";
+  TVM_DECLARE_NODE_TYPE_INFO(AttrFieldInfoNode, Node);
+};
+
+/*! \brief AttrFieldInfo */
+TVM_DEFINE_NODE_REF(AttrFieldInfo, AttrFieldInfoNode);
+
+class AttrsHashHandler;
+class AttrsEqualHandler;
+/*!
+ * \brief Content-aware Equality comparator for attrs.
+ *
+ * This comparator will recursively deep compare the following Attributes.
+ *
+ * - IntImm, UIntImm, FloatImm, StringImm
+ * - Any subclass of BaseAttrsNode
+ * - Array of Attributes.
+ * - Map from string to Attributes.
+ */
+class AttrsEqual {
+ public:
+  bool operator()(const double& lhs, const double& rhs) const {
+    return lhs == rhs;
+  }
+  bool operator()(const int64_t& lhs, const int64_t& rhs) const {
+    return lhs == rhs;
+  }
+  bool operator()(const uint64_t& lhs, const uint64_t& rhs) const {
+    return lhs == rhs;
+  }
+  bool operator()(const int& lhs, const int& rhs) const {
+    return lhs == rhs;
+  }
+  bool operator()(const bool& lhs, const bool& rhs) const {
+    return lhs == rhs;
+  }
+  bool operator()(const std::string& lhs, const std::string& rhs) const {
+    return lhs == rhs;
+  }
+  bool operator()(const Type& lhs, const Type& rhs) const {
+    return lhs == rhs;
+  }
+  // node comparator
+  TVM_DLL bool operator()(const NodeRef& lhs, const NodeRef& rhs) const;
+
+ protected:
+  friend class AttrsEqualHandler;
+  /*! \brief internal handle. */
+  AttrsEqualHandler* handler_{nullptr};
+};
+
+/*!
+ * \brief Content-aware hash function.
+ *
+ * This hash functor will recursively hash the content of the Attributes.
+ * It is guaranteed that if AttrsEqual(a, b) == true, then AttrsHash(a) == AttrsHash(b);
+ */
+class AttrsHash {
+ public:
+  size_t operator()(const double& value) const {
+    return std::hash<double>()(value);
+  }
+  size_t operator()(const int64_t& value) const {
+    return std::hash<int64_t>()(value);
+  }
+  size_t operator()(const uint64_t& value) const {
+    return std::hash<uint64_t>()(value);
+  }
+  size_t operator()(const int& value) const {
+    return std::hash<int>()(value);
+  }
+  size_t operator()(const bool& value) const {
+    return std::hash<bool>()(value);
+  }
+  size_t operator()(const std::string& value) const {
+    return std::hash<std::string>()(value);
+  }
+  size_t operator()(const Type& value) const {
+    return std::hash<int>()(
+        static_cast<int>(value.code()) |
+        (static_cast<int>(value.bits()) << 8) |
+        (static_cast<int>(value.lanes()) << 16));
+  }
+  TVM_DLL size_t operator()(const NodeRef& value) const;
+
+ private:
+  friend class AttrsHashHandler;
+  /*! \brief internal handle. */
+  AttrsHashHandler* handler_{nullptr};
+};
+
+/*!
+ * \brief Base class of all attribute class
+ * \note Do not subclass AttrBaseNode directly,
+ *       subclass AttrsNode instead.
+ * \sa AttrsNode
+ */
+class BaseAttrsNode : public Node {
+ public:
+  using TVMArgs = runtime::TVMArgs;
+  using TVMRetValue = runtime::TVMRetValue;
+  /*!
+   * \brief Initialize the attributes by sequence of arguments
+   * \param args The postional arguments in the form
+   *        [key0, value0, key1, value1, ..., key_n, value_n]
+   */
+  template<typename... Args>
+  inline void InitBySeq(Args&& ...args);
+  /*!
+   * \brief Print readible docstring to ostream, add newline.
+   * \param os the stream to print the docstring to.
+   */
+  inline void PrintDocString(std::ostream &os) const;  // NOLINT(*)
+  /*!
+   * \brief Visit attributes that do not equal the default value.
+   *
+   * \note This is useful to extract fields for concise printing.
+   * \param v The visitor
+   */
+  TVM_DLL virtual void VisitNonDefaultAttrs(AttrVisitor* v) = 0;
+  /*!
+   * \brief Get the field information
+   * \return The fields in the Attrs.
+   */
+  TVM_DLL virtual Array<AttrFieldInfo> ListFieldInfo() const = 0;
+  /*!
+   * \brief Initialize the attributes by arguments.
+   * \param kwargs The key value pairs for initialization.
+   *        [key0, value0, key1, value1, ..., key_n, value_n]
+   * \param allow_unknown Whether allow additional unknown fields.
+   * \note This function throws when the required field is not present.
+   */
+  TVM_DLL virtual void InitByPackedArgs(const TVMArgs& kwargs, bool allow_unknown = false) = 0;
+  /*!
+   * \brief Whether this attribute's content equals to another node.
+   * \param other The pointer to another node.
+   * \param equal The equal comparator
+   * \return The comparison result.
+   */
+  TVM_DLL virtual bool ContentEqual(
+      const Node* other, AttrsEqual equal) const = 0;
+  /*!
+   * \brief Content aware hash.
+   * \param hasher The hasher to run the hash.
+   * \return the hash result.
+   */
+  TVM_DLL virtual size_t ContentHash(AttrsHash hasher) const = 0;
+
+  static constexpr const char* _type_key = "Attrs";
+  TVM_DECLARE_BASE_NODE_INFO(BaseAttrsNode, Node);
+};
+
+/*! \brief Base attribute container for all attributes */
+class Attrs : public NodeRef {
+ public:
+  // normal constructor
+  Attrs() {}
+  // construct from shared ptr.
+  explicit Attrs(NodePtr<Node> n) : NodeRef(n) {}
+
+  /*! \return The attribute node */
+  const BaseAttrsNode* operator->() const {
+    return ptr();
+  }
+  /*! \brief specify container node */
+  using ContainerType = BaseAttrsNode;
+
+ private:
+  /*! \return the internal attribute node */
+  const BaseAttrsNode* ptr() const {
+    return static_cast<const BaseAttrsNode*>(node_.get());
+  }
+};
+
+/*!
+ * \brief Specialized attribute type that is backed by a map.
+ *  The DictAttrsNode implements the Attrs behavior,
+ *  its fields are directly accessible via object.field_name
+ *  like other normal nodes.
+ */
+class DictAttrsNode : public BaseAttrsNode {
+ public:
+  /*! \brief internal attrs map */
+  Map<std::string, NodeRef> dict;
+  /*!
+   * \brief Consruct a Attrs backed by DictAttrsNode.
+   * \param dict The attributes.
+   * \return The dict attributes.
+   */
+  TVM_DLL static Attrs make(Map<std::string, NodeRef> dict);
+  // implementations
+  void VisitAttrs(AttrVisitor* v) final;
+  void VisitNonDefaultAttrs(AttrVisitor* v) final;
+  void InitByPackedArgs(const runtime::TVMArgs& args, bool allow_unknown) final;
+  Array<AttrFieldInfo> ListFieldInfo() const final;
+  bool ContentEqual(const Node* other, AttrsEqual equal) const final;
+  size_t ContentHash(AttrsHash hasher) const final;
+  // type info
+  static constexpr const char* _type_key = "DictAttrs";
+  TVM_DECLARE_NODE_TYPE_INFO(DictAttrsNode, BaseAttrsNode);
+};
+
+
+// Namespace containing detail implementations
+namespace detail {
+using runtime::TVMArgValue;
+
+// helper entry that does nothing in set_default/bound/describe calls.
+struct AttrNopEntry {
+  using TSelf = AttrNopEntry;
+
+  TSelf& describe(DMLC_ATTRIBUTE_UNUSED const char* str) {
+    return *this;
+  }
+  template<typename T>
+  TSelf& set_default(DMLC_ATTRIBUTE_UNUSED const T& value) {
+    return *this;
+  }
+  template<typename T>
+  TSelf& set_lower_bound(DMLC_ATTRIBUTE_UNUSED const T& begin) {
+    return *this;
+  }
+  template<typename T>
+  TSelf& set_upper_bound(DMLC_ATTRIBUTE_UNUSED const T& end) {
+    return *this;
+  }
+};
+
+// Wrapper for normal visitor.
+class AttrNormalVisitor {
+ public:
+  explicit AttrNormalVisitor(AttrVisitor* visitor)
+      : visitor_(visitor) {
+  }
+  template<typename T>
+  AttrNopEntry operator()(const char* key, T* value) {
+    visitor_->Visit(key, value);
+    return AttrNopEntry();
+  }
+
+ private:
+  AttrVisitor* visitor_;
+};
+
+// Wrapper for normal visitor.
+class AttrsEqualVisitor {
+ public:
+  bool result_{true};
+  // constructor
+  AttrsEqualVisitor(const Node* lhs, const Node* rhs, const AttrsEqual& equal)
+      : lhs_(lhs), rhs_(rhs), equal_(equal) {
+  }
+  template<typename T>
+  AttrNopEntry operator()(const char* key, T* lhs_value) {
+    if (!result_) return AttrNopEntry();
+    const T* rhs_value =
+        reinterpret_cast<const T*>(
+            reinterpret_cast<const char*>(rhs_) +
+            (reinterpret_cast<const char*>(lhs_value) -
+             reinterpret_cast<const char*>(lhs_)));
+    if (!equal_(*lhs_value, *rhs_value)) {
+      result_ = false;
+    }
+    return AttrNopEntry();
+  }
+
+ private:
+  const Node* lhs_;
+  const Node* rhs_;
+  const AttrsEqual& equal_;
+};
+
+class AttrsHashVisitor {
+ public:
+  explicit AttrsHashVisitor(const AttrsHash& hasher)
+      : hasher_(hasher) {}
+
+  size_t result_{0};
+
+  template<typename T>
+  AttrNopEntry operator()(const char* key, T* value) {
+    result_ = dmlc::HashCombine(result_, hasher_(*value));
+    return AttrNopEntry();
+  }
+
+ private:
+  const AttrsHash& hasher_;
+};
+
+// helper entry that does initialization, set default.
+template<typename T>
+struct AttrInitEntry {
+  // The attributes
+  using TSelf = AttrInitEntry<T>;
+  // The type key
+  const char* type_key_;
+  // field name
+  const char* key_;
+  // internal value.
+  T* value_;
+  // whether the value is missing.
+  bool value_missing_{true};
+  // If the value is still missing in destruction time throw an error.
+  ~AttrInitEntry() DMLC_THROW_EXCEPTION {
+    if (value_missing_) {
+      std::ostringstream os;
+      os << type_key_ << ": Cannot find required field \'" << key_
+         << "\' during initialization";
+      throw AttrError(os.str());
+    }
+  }
+  // override fields.
+  // This function sets the lower bound of the attribute
+  TSelf& set_lower_bound(DMLC_ATTRIBUTE_UNUSED const T& begin) {
+    if (this->value_missing_)  return *this;
+    const T& val = *value_;
+    if (begin > val) {
+      std::ostringstream os;
+      os << type_key_ << "." << key_ << ": "
+         << "value " << val
+         << " is smaller than the lower bound " << begin;
+      throw AttrError(os.str());
+    }
+    return *this;
+  }
+  // This function sets the upper bound of the attribute
+  TSelf& set_upper_bound(DMLC_ATTRIBUTE_UNUSED const T& end) {
+    if (this->value_missing_)  return *this;
+    const T& val = *value_;
+    if (val > end) {
+      std::ostringstream os;
+      os << type_key_ << "." << key_ << ": "
+         << "value " << val
+         << " is bigger than the upper bound " << end;
+      throw AttrError(os.str());
+    }
+    return *this;
+  }
+  // set default when
+  TSelf& set_default(DMLC_ATTRIBUTE_UNUSED const T& value) {
+    if (!value_missing_) return *this;
+    *value_ = value;
+    value_missing_ = false;
+    return *this;
+  }
+  TSelf& describe(DMLC_ATTRIBUTE_UNUSED const char* str) {
+    return *this;
+  }
+};
+
+// Template function to allow smart conversion
+// from Expr types into the constants.
+template<typename T>
+inline void SetValue(T* ptr, const TVMArgValue& val) {
+  *ptr = val.operator T();
+}
+template<typename T>
+inline void SetIntValue(T* ptr, const TVMArgValue& val) {
+  if (val.type_code() == kDLInt) {
+    *ptr = static_cast<T>(val.value().v_int64);
+  } else {
+    Expr expr = val;
+    CHECK(expr.defined());
+    if (const ir::IntImm* op = expr.as<ir::IntImm>()) {
+      *ptr = static_cast<T>(op->value);
+    } else if (const ir::UIntImm* op = expr.as<ir::UIntImm>()) {
+      *ptr = static_cast<T>(op->value);
+    } else {
+      LOG(FATAL) << "Expect int value, but get " << expr->type_key();
+    }
+  }
+}
+template<>
+inline void SetValue<std::string>(std::string* ptr, const TVMArgValue& val) {
+  if (val.type_code() == kStr) {
+    *ptr = val.operator std::string();
+  } else {
+    Expr expr = val;
+    const ir::StringImm* op = expr.as<ir::StringImm>();
+    CHECK(op != nullptr);
+    *ptr = op->value;
+  }
+}
+template<>
+inline void SetValue(Type* ptr, const TVMArgValue& val) {
+  *ptr = val.operator Type();
+}
+template<>
+inline void SetValue<double>(double* ptr, const TVMArgValue& val) {
+  if (val.type_code() == kDLFloat || val.type_code() == kDLInt) {
+    *ptr = val.operator double();
+  } else {
+    Expr expr = val;
+    CHECK(expr.defined());
+    if (const ir::IntImm* op = expr.as<ir::IntImm>()) {
+      *ptr = static_cast<double>(op->value);
+    } else if (const ir::IntImm* op = expr.as<ir::IntImm>()) {
+      *ptr = static_cast<double>(op->value);
+    } else if (const ir::UIntImm* op = expr.as<ir::UIntImm>()) {
+      *ptr = static_cast<double>(op->value);
+    } else {
+      LOG(FATAL) << "Expect float value, but get " << expr->type_key();
+    }
+  }
+}
+template<>
+inline void SetValue<int>(int* ptr, const TVMArgValue& val) {
+  SetIntValue(ptr, val);
+}
+template<>
+inline void SetValue<int64_t>(int64_t* ptr, const TVMArgValue& val) {
+  SetIntValue(ptr, val);
+}
+template<>
+inline void SetValue<uint64_t>(uint64_t* ptr, const TVMArgValue& val) {
+  SetIntValue(ptr, val);
+}
+template<>
+inline void SetValue<bool>(bool* ptr, const TVMArgValue& val) {
+  SetIntValue(ptr, val);
+}
+
+// Visitor for value initialization
+template<typename FFind>
+class AttrInitVisitor {
+ public:
+  // Counter of number of matched attributes during visit.
+  // This is used to decide if there is additional unmatched attributes.
+  size_t hit_count_{0};
+  // constructor
+  AttrInitVisitor(const char* type_key, FFind ffind)
+      : type_key_(type_key), ffind_(ffind) {
+  }
+
+  template<typename T>
+  AttrInitEntry<T> operator()(const char* key, T* value) {
+    TVMArgValue val;
+    AttrInitEntry<T> opt;
+    opt.type_key_ = type_key_;
+    opt.key_ = key;
+    opt.value_ = value;
+    if (ffind_(key, &val)) {
+      SetValue(value, val);
+      opt.value_missing_ = false;
+      ++hit_count_;
+    } else {
+      opt.value_missing_ = true;
+    }
+    return opt;
+  }
+
+ private:
+  // the type key
+  const char* type_key_;
+  FFind ffind_;
+};
+
+template<typename FFind>
+inline AttrInitVisitor<FFind> CreateInitVisitor(
+    const char* type_key,
+    FFind ffind) {
+  return AttrInitVisitor<FFind>(type_key, ffind);
+}
+
+/*!
+ * \brief Helper struct to get the type name known to tvm.
+ * \tparam T the type we are interested in.
+ */
+template<typename T>
+struct TypeName {
+  static constexpr const char* value = T::ContainerType::_type_key;
+};
+
+template<>
+struct TypeName<int> {
+  static constexpr const char* value = "int";
+};
+
+template<>
+struct TypeName<int64_t> {
+  static constexpr const char* value = "int64";
+};
+
+template<>
+struct TypeName<uint64_t> {
+  static constexpr const char* value = "uint64_t";
+};
+
+template<>
+struct TypeName<Type> {
+  static constexpr const char* value = "Type";
+};
+
+template<>
+struct TypeName<std::string> {
+  static constexpr const char* value = "str";
+};
+
+template<>
+struct TypeName<bool> {
+  static constexpr const char* value = "bool";
+};
+
+template<>
+struct TypeName<void*> {
+  static constexpr const char* value = "handle";
+};
+
+template<>
+struct TypeName<double> {
+  static constexpr const char* value = "double";
+};
+
+class AttrDocEntry {
+ public:
+  using TSelf = AttrDocEntry;
+
+  explicit AttrDocEntry(NodePtr<AttrFieldInfoNode> info)
+      : info_(info) {
+  }
+  TSelf& describe(DMLC_ATTRIBUTE_UNUSED const char* str) {
+    info_->description = str;
+    return *this;
+  }
+  template<typename T>
+  TSelf& set_default(DMLC_ATTRIBUTE_UNUSED const T& value) {
+    std::ostringstream os;
+    os << info_->type_info << ", default=" << value;
+    info_->type_info = os.str();
+    return *this;
+  }
+  template<typename T>
+  TSelf& set_lower_bound(DMLC_ATTRIBUTE_UNUSED T begin) {
+    return *this;
+  }
+  template<typename T>
+  TSelf& set_upper_bound(DMLC_ATTRIBUTE_UNUSED T end) {
+    return *this;
+  }
+
+ private:
+  NodePtr<AttrFieldInfoNode> info_;
+};
+
+class AttrDocVisitor {
+ public:
+  template<typename T>
+  AttrDocEntry operator()(const char* key, T* v) {
+    NodePtr<AttrFieldInfoNode> info
+        = make_node<AttrFieldInfoNode>();
+    info->name = key;
+    info->type_info = TypeName<T>::value;
+    fields_.push_back(AttrFieldInfo(info));
+    return AttrDocEntry(info);
+  }
+
+  Array<AttrFieldInfo> fields_;
+};
+
+class AttrExistVisitor {
+ public:
+  std::string key_;
+  bool exist_{false};
+
+  template<typename T>
+  AttrNopEntry operator()(const char* key, T* v) {
+    if (exist_) return AttrNopEntry();
+    if (key == key_) exist_ = true;
+    return AttrNopEntry();
+  }
+};
+
+template<typename T>
+struct AttrTriggerNonDefaultEntry {
+  using TSelf = AttrTriggerNonDefaultEntry<T>;
+  // constructor
+  AttrTriggerNonDefaultEntry(
+      AttrVisitor* visitor, const char* key, T* data)
+      : visitor_(visitor), key_(key), data_(data) {}
+
+  ~AttrTriggerNonDefaultEntry() DMLC_THROW_EXCEPTION {
+    if (trigger_) {
+      visitor_->Visit(key_, data_);
+    }
+  }
+  TSelf& describe(DMLC_ATTRIBUTE_UNUSED const char* str) {
+    return *this;
+  }
+  TSelf& set_default(const T& value) {
+    if (AttrsEqual()(value, *data_)) {
+      trigger_ = false;
+    }
+    return *this;
+  }
+  TSelf& set_lower_bound(DMLC_ATTRIBUTE_UNUSED const T& begin) {
+    return *this;
+  }
+  TSelf& set_upper_bound(DMLC_ATTRIBUTE_UNUSED const T& end) {
+    return *this;
+  }
+
+ private:
+  AttrVisitor* visitor_;
+  const char * key_;
+  T *data_;
+  bool trigger_{true};
+};
+
+class AttrNonDefaultVisitor {
+ public:
+  explicit AttrNonDefaultVisitor(AttrVisitor* visitor)
+      : visitor_(visitor) {
+  }
+  template<typename T>
+  AttrTriggerNonDefaultEntry<T>
+  operator()(const char* key, T* value) {
+    return AttrTriggerNonDefaultEntry<T>(visitor_, key, value);
+  }
+
+ private:
+  AttrVisitor* visitor_;
+};
+}  // namespace detail
+
+/*!
+ * \brief The base class of the all the
+ *  Use "curiously recurring template pattern".
+ *
+ * \tparam DerivedType The final attribute type.
+ */
+template<typename DerivedType>
+class AttrsNode : public BaseAttrsNode {
+ public:
+  void VisitAttrs(AttrVisitor* v) final {
+    ::tvm::detail::AttrNormalVisitor vis(v);
+    self()->__VisitAttrs__(vis);
+  }
+
+  void VisitNonDefaultAttrs(AttrVisitor* v) final {
+    ::tvm::detail::AttrNonDefaultVisitor vis(v);
+    self()->__VisitAttrs__(vis);
+  }
+
+  void InitByPackedArgs(const runtime::TVMArgs& args, bool allow_unknown) final {
+    CHECK_EQ(args.size() % 2, 0);
+    const int kLinearSearchBound = 16;
+    int hit_count = 0;
+    // applies two stratgies to lookup
+    if (args.size() < kLinearSearchBound) {
+      // linear search.
+      auto ffind = [&args](const char* key, runtime::TVMArgValue* val) {
+        for (int i = 0; i < args.size(); i += 2) {
+          CHECK_EQ(args.type_codes[i], kStr);
+          if (!std::strcmp(key, args.values[i].v_str)) {
+            *val = args[i + 1];
+            return true;
+          }
+        }
+        return false;
+      };
+      auto vis = ::tvm::detail::CreateInitVisitor(DerivedType::_type_key, ffind);
+      self()->__VisitAttrs__(vis);
+      hit_count = vis.hit_count_;
+    } else {
+      // construct a map then do lookup.
+      std::unordered_map<std::string, runtime::TVMArgValue> kwargs;
+      for (int i = 0; i < args.size(); i += 2) {
+        CHECK_EQ(args.type_codes[i], kStr);
+        kwargs[args[i].operator std::string()] = args[i + 1];
+      }
+      auto ffind = [&kwargs](const char *key, runtime::TVMArgValue* val) {
+        auto it = kwargs.find(key);
+        if (it != kwargs.end()) {
+          *val = it->second;
+          return true;
+        }
+        return false;
+      };
+      auto vis = ::tvm::detail::CreateInitVisitor(DerivedType::_type_key, ffind);
+      self()->__VisitAttrs__(vis);
+      hit_count = vis.hit_count_;
+    }
+    // error handling, slow path
+    if (hit_count * 2 != args.size() && !allow_unknown) {
+      for (int i = 0; i < args.size(); i += 2) {
+        ::tvm::detail::AttrExistVisitor visitor;
+        visitor.key_ = args[i].operator std::string();
+        self()->__VisitAttrs__(visitor);
+        if (!visitor.exist_) {
+          std::ostringstream os;
+          os << DerivedType::_type_key
+             << ": does not have field \'" << visitor.key_
+             << "\', Possible fields:\n";
+          os << "----------------\n";
+          this->PrintDocString(os);
+          throw AttrError(os.str());
+        }
+      }
+    }
+  }
+
+  Array<AttrFieldInfo> ListFieldInfo() const final {
+    ::tvm::detail::AttrDocVisitor visitor;
+    self()->__VisitAttrs__(visitor);
+    return visitor.fields_;
+  }
+
+  bool ContentEqual(const Node* other, AttrsEqual equal) const final {
+    DerivedType* pself = self();
+    if (pself == other) return true;
+    if (other == nullptr) return false;
+    if (pself->type_index() != other->type_index()) return false;
+    ::tvm::detail::AttrsEqualVisitor visitor(pself, other, equal);
+    self()->__VisitAttrs__(visitor);
+    return visitor.result_;
+  }
+
+  size_t ContentHash(AttrsHash hasher) const final {
+    ::tvm::detail::AttrsHashVisitor visitor(hasher);
+    visitor.result_ = std::hash<std::string>()(this->type_key());
+    self()->__VisitAttrs__(visitor);
+    return visitor.result_;
+  }
+
+ private:
+  DerivedType* self() const {
+    return const_cast<DerivedType*>(
+        static_cast<const DerivedType*>(this));
+  }
+};
+
+
+template<typename... Args>
+inline void BaseAttrsNode::InitBySeq(Args&& ...args) {
+  runtime::PackedFunc pf([this](const TVMArgs& args, TVMRetValue *rv) {
+      this->InitByPackedArgs(args);
+    });
+  pf(std::forward<Args>(args)...);
+}
+
+inline void BaseAttrsNode::PrintDocString(std::ostream &os) const { // NOLINT(*)
+  Array<AttrFieldInfo> entry = this->ListFieldInfo();
+  for (AttrFieldInfo info : entry) {
+    os << info->name << " : " << info->type_info << '\n';
+    if (info->description.length() != 0) {
+      os << "    " << info->description << '\n';
+    }
+  }
+}
+
+}  // namespace tvm
+#endif  // TVM_ATTRS_H_
diff --git a/include/tvm/base.h b/include/tvm/base.h
index 1d7cf8add3ca..7104688aa169 100644
--- a/include/tvm/base.h
+++ b/include/tvm/base.h
@@ -8,11 +8,11 @@
 
 #include <dmlc/logging.h>
 #include <dmlc/registry.h>
-#include <tvm/node.h>
+#include <tvm/node/node.h>
 #include <string>
 #include <memory>
 #include <functional>
-#include "./runtime/registry.h"
+#include "runtime/registry.h"
 
 namespace tvm {
 
@@ -25,7 +25,7 @@ using ::tvm::AttrVisitor;
   class TypeName : public ::tvm::NodeRef {                       \
    public:                                                       \
     TypeName() {}                                                 \
-    explicit TypeName(std::shared_ptr<::tvm::Node> n) : NodeRef(n) {}   \
+    explicit TypeName(::tvm::NodePtr<::tvm::Node> n) : NodeRef(n) {}     \
     const NodeName* operator->() const {                          \
       return static_cast<const NodeName*>(node_.get());           \
     }                                                             \
@@ -48,7 +48,7 @@ std::string SaveJSON(const NodeRef& node);
  *
  * \return The shared_ptr of the Node.
  */
-std::shared_ptr<Node> LoadJSON_(std::string json_str);
+NodePtr<Node> LoadJSON_(std::string json_str);
 
 /*!
  * \brief Load the node from json string.
@@ -68,26 +68,72 @@ inline NodeType LoadJSON(const std::string& json_str) {
   return NodeType(LoadJSON_(json_str));
 }
 
-/*! \brief typedef the factory function of data iterator */
-using NodeFactory = std::function<std::shared_ptr<Node> ()>;
 /*!
- * \brief Registry entry for NodeFactory
+ * \brief Registry entry for NodeFactory.
+ *
+ *  There are two types of Nodes that can be serialized.
+ *  The normal node requires a registration a creator function that
+ *  constructs an empty Node of the corresponding type.
+ *
+ *  The global singleton(e.g. global operator) where only global_key need to be serialized,
+ *  in this case, FGlobalKey need to be defined.
  */
-struct NodeFactoryReg
-    : public dmlc::FunctionRegEntryBase<NodeFactoryReg,
-                                        NodeFactory> {
+struct NodeFactoryReg {
+  /*!
+   * \brief creator function.
+   * \param global_key Key that identifies a global single object.
+   *        If this is not empty then FGlobalKey
+   * \return The created function.
+   */
+  using FCreate = std::function<NodePtr<Node>(const std::string& global_key)>;
+  /*!
+   * \brief Global key function, only needed by global objects.
+   * \param node The node pointer.
+   * \return node The global key to the node.
+   */
+  using FGlobalKey = std::function<std::string(const Node* node)>;
+  /*! \brief registered name */
+  std::string name;
+  /*!
+   * \brief The creator function
+   */
+  FCreate fcreator = nullptr;
+  /*!
+   * \brief The global key function.
+   */
+  FGlobalKey fglobal_key = nullptr;
+  // setter of creator
+  NodeFactoryReg& set_creator(FCreate f) {  // NOLINT(*)
+    this->fcreator = f;
+    return *this;
+  }
+  // setter of creator
+  NodeFactoryReg& set_global_key(FGlobalKey f) {  // NOLINT(*)
+    this->fglobal_key = f;
+    return *this;
+  }
+  // global registry singleton
+  TVM_DLL static ::dmlc::Registry<::tvm::NodeFactoryReg> *Registry();
 };
 
+/*!
+ * \brief Register a Node type
+ * \note This is necessary to enable serialization of the Node.
+ */
 #define TVM_REGISTER_NODE_TYPE(TypeName)                                \
   static DMLC_ATTRIBUTE_UNUSED ::tvm::NodeFactoryReg & __make_Node ## _ ## TypeName ## __ = \
-      ::dmlc::Registry<::tvm::NodeFactoryReg>::Get()->__REGISTER__(TypeName::_type_key) \
-      .set_body([]() { return std::make_shared<TypeName>(); })
+      ::tvm::NodeFactoryReg::Registry()->__REGISTER__(TypeName::_type_key) \
+      .set_creator([](const std::string&) { return ::tvm::make_node<TypeName>(); })
+
+
+#define TVM_STRINGIZE_DETAIL(x) #x
+#define TVM_STRINGIZE(x) TVM_STRINGIZE_DETAIL(x)
+#define TVM_DESCRIBE(...) describe(__VA_ARGS__ "\n\nFrom:" __FILE__ ":" TVM_STRINGIZE(__LINE__))
+/*!
+ * \brief Macro to include current line as string
+ */
+#define TVM_ADD_FILELINE "\n\nDefined in " __FILE__ ":L" TVM_STRINGIZE(__LINE__)
 
-TVM_DLL::dmlc::Registry<::tvm::NodeFactoryReg > * GetTVMNodeFactoryRegistry();
 
-#define TVM_EXTERNAL_REGISTER_NODE_TYPE(TypeName)                                \
-  static DMLC_ATTRIBUTE_UNUSED ::tvm::NodeFactoryReg & __make_Node ## _ ## TypeName ## __ = \
-      ::tvm::GetTVMNodeFactoryRegistry()->__REGISTER__(TypeName::_type_key) \
-      .set_body([]() { return std::make_shared<TypeName>(); })
 }  // namespace tvm
 #endif  // TVM_BASE_H_
diff --git a/include/tvm/buffer.h b/include/tvm/buffer.h
index 41fa1fa804a8..2c72db169a2d 100644
--- a/include/tvm/buffer.h
+++ b/include/tvm/buffer.h
@@ -6,11 +6,12 @@
 #ifndef TVM_BUFFER_H_
 #define TVM_BUFFER_H_
 
-#include <tvm/container.h>
 #include <string>
 
-#include "./base.h"
-#include "./expr.h"
+#include "base.h"
+#include "expr.h"
+#include "ir_operator.h"
+#include "tvm/node/container.h"
 
 namespace tvm {
 
@@ -31,7 +32,7 @@ enum class AccessMask : int {
 class Buffer : public NodeRef {
  public:
   Buffer() {}
-  explicit Buffer(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit Buffer(NodePtr<Node> n) : NodeRef(n) {}
   /*!
    * \brief Return a new buffer that is equivalent with current one
    *  but always add stride field.
diff --git a/include/tvm/build_module.h b/include/tvm/build_module.h
index 96b876fe92f0..8bb3345a5eb3 100644
--- a/include/tvm/build_module.h
+++ b/include/tvm/build_module.h
@@ -9,12 +9,11 @@
 #include <string>
 #include <vector>
 #include <utility>
-#include "./runtime/packed_func.h"
-#include "./schedule_pass.h"
-#include "./lowered_func.h"
+#include "runtime/packed_func.h"
+#include "schedule_pass.h"
+#include "lowered_func.h"
 
 namespace tvm {
-using namespace tvm::runtime;
 
 /*!
 * \brief Container for target device information.
@@ -40,7 +39,7 @@ class TargetNode : public Node {
   Array<Expr> libs_array;
 
   /*! \return the full device string to pass to codegen::Build */
-  EXPORT std::string str() const;
+  TVM_DLL const std::string& str() const;
 
   void VisitAttrs(AttrVisitor* v) final {
     v->Visit("target_name", &target_name);
@@ -54,41 +53,45 @@ class TargetNode : public Node {
   }
 
   /*! \brief Get the keys for this target as a vector of string */
-  EXPORT std::vector<std::string> keys() const;
+  TVM_DLL std::vector<std::string> keys() const;
 
   /*! \brief Get the options for this target as a vector of string */
-  EXPORT std::vector<std::string> options() const;
+  TVM_DLL std::vector<std::string> options() const;
 
   /*! \brief Get the keys for this target as an unordered_set of string */
-  EXPORT std::unordered_set<std::string> libs() const;
+  TVM_DLL std::unordered_set<std::string> libs() const;
 
   static constexpr const char* _type_key = "Target";
   TVM_DECLARE_NODE_TYPE_INFO(TargetNode, Node);
+
+ private:
+  /*! \brief Internal string repr. */
+  mutable std::string str_repr_;
 };
 
 class Target : public NodeRef {
  public:
   Target() {}
-  explicit Target(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit Target(NodePtr<Node> n) : NodeRef(n) {}
 
   /*!
   * \brief Create a Target given a string
   * \param target_str the string to parse
   */
-  EXPORT static Target create(const std::string& target_str);
+  TVM_DLL static Target create(const std::string& target_str);
 
   /*!
   * \brief Push a new target context onto the thread local stack. The Target on top of
   * the stack is used to determine which specialization to use when invoking a GenericFunc.
   * \param target The target to set as the current context.
   */
-  EXPORT static void EnterTargetScope(const tvm::Target& target);
+  TVM_DLL static void EnterTargetScope(const tvm::Target& target);
 
   /*!
   * \brief Pop a target off the thread local context stack, restoring the previous target
   * as the current context.
   */
-  EXPORT static void ExitTargetScope();
+  TVM_DLL static void ExitTargetScope();
 
   /*!
   * \brief Get the current target context from thread local storage.
@@ -98,7 +101,7 @@ class Target : public NodeRef {
   * \return The target that is the current context. The target may not be defined if
   * allow_not_defined is true.
   */
-  EXPORT static tvm::Target current_target(bool allow_not_defined = true);
+  TVM_DLL static tvm::Target current_target(bool allow_not_defined = true);
 
   inline const TargetNode* operator->() const {
       return static_cast<const TargetNode*>(node_.get());
@@ -130,39 +133,39 @@ struct TargetContext {
 /*! \brief This namespace provides functions to construct Target instances */
 namespace target {
 /*! \return A target for LLVM */
-EXPORT Target llvm(const std::vector<std::string>& options =
+TVM_DLL Target llvm(const std::vector<std::string>& options =
                    std::vector<std::string>());
 
 /*! \return A target for CUDA */
-EXPORT Target cuda(const std::vector<std::string>& options =
+TVM_DLL Target cuda(const std::vector<std::string>& options =
                    std::vector<std::string>());
 
 /*! \return A target for ROCm */
-EXPORT Target rocm(const std::vector<std::string>& options =
+TVM_DLL Target rocm(const std::vector<std::string>& options =
                    std::vector<std::string>());
 
 /*! \return A target for OpenCL */
-EXPORT Target opencl(const std::vector<std::string>& options =
+TVM_DLL Target opencl(const std::vector<std::string>& options =
                      std::vector<std::string>());
 
 /*! \return A target for Metal */
-EXPORT Target metal(const std::vector<std::string>& options =
+TVM_DLL Target metal(const std::vector<std::string>& options =
                     std::vector<std::string>());
 
 /*! \return A target for rasp */
-EXPORT Target rasp(const std::vector<std::string>& options =
+TVM_DLL Target rasp(const std::vector<std::string>& options =
                    std::vector<std::string>());
 
 /*! \return A target for Mali */
-EXPORT Target mali(const std::vector<std::string>& options =
+TVM_DLL Target mali(const std::vector<std::string>& options =
                    std::vector<std::string>());
 
 /*! \return A target for Intel Graphics */
-EXPORT Target intel_graphics(const std::vector<std::string>& options =
+TVM_DLL Target intel_graphics(const std::vector<std::string>& options =
                              std::vector<std::string>());
 
 /*! \return A target for stackvm */
-EXPORT Target stackvm(const std::vector<std::string>& options =
+TVM_DLL Target stackvm(const std::vector<std::string>& options =
                       std::vector<std::string>());
 
 }  // namespace target
@@ -212,11 +215,14 @@ class BuildConfigNode : public Node {
   bool partition_const_loop = false;
 
   /*! \brief Whether to dump the IR of each pass (only when building from python) */
-  std::vector< std::pair<int, PackedFunc> > add_lower_pass;
+  std::vector< std::pair<int, runtime::PackedFunc> > add_lower_pass;
 
   /*! \brief Whether to dump the IR of each pass (only when building from python) */
   bool dump_pass_ir = false;
 
+  /*! \brief Whether to instrument loads and stores with check for out of the bounds. */
+  bool instrument_bound_checkers = false;
+
   void VisitAttrs(AttrVisitor* v) final {
     v->Visit("data_alignment", &data_alignment);
     v->Visit("offset_factor", &offset_factor);
@@ -229,6 +235,7 @@ class BuildConfigNode : public Node {
     v->Visit("detect_global_barrier", &detect_global_barrier);
     v->Visit("partition_const_loop", &partition_const_loop);
     v->Visit("dump_pass_ir", &dump_pass_ir);
+    v->Visit("instrument_bound_checkers", &instrument_bound_checkers);
   }
 
   static constexpr const char* _type_key = "BuildConfig";
@@ -241,7 +248,7 @@ class BuildConfigNode : public Node {
 class BuildConfig : public ::tvm::NodeRef {
  public:
   BuildConfig() {}
-  explicit BuildConfig(std::shared_ptr<::tvm::Node> n) : NodeRef(n) {}
+  explicit BuildConfig(NodePtr<::tvm::Node> n) : NodeRef(n) {}
 
   const BuildConfigNode* operator->() const {
     return static_cast<const BuildConfigNode*>(node_.get());
@@ -255,20 +262,20 @@ class BuildConfig : public ::tvm::NodeRef {
    * \brief Push a new BuildConfig context onto the thread local stack.
    * \param build_config The configuration to set as the current context.
    */
-  EXPORT static void EnterBuildConfigScope(const tvm::BuildConfig& build_config);
+  TVM_DLL static void EnterBuildConfigScope(const tvm::BuildConfig& build_config);
 
   /*!
    * \brief Pop a build config off the thread local context stack, restoring the previous
    * configuration as the current context.
    */
-  EXPORT static void ExitBuildConfigScope();
+  TVM_DLL static void ExitBuildConfigScope();
 
   /*!
    * \brief Get the current BuildConfig context from thread local storage, or a default
    * configuration if a BuildConfig scope has not been entered.
    * \return The configuration that is the current context.
    */
-  EXPORT static tvm::BuildConfig Current();
+  TVM_DLL static tvm::BuildConfig Current();
 
   using ContainerType = BuildConfigNode;
 };
@@ -297,7 +304,7 @@ struct BuildConfigContext {
 * \brief Construct a BuildConfig containing a new BuildConfigNode
 * \return The new BuildConfig
 */
-EXPORT BuildConfig build_config();
+TVM_DLL BuildConfig build_config();
 
 /*!
 * \brief Build a LoweredFunc given a schedule, args and binds
@@ -308,11 +315,11 @@ EXPORT BuildConfig build_config();
 * \param config The build configuration.
 * \return The lowered function.
 */
-EXPORT Array<LoweredFunc> lower(Schedule sch,
-                                const Array<Tensor>& args,
-                                const std::string& name,
-                                const std::unordered_map<Tensor, Buffer>& binds,
-                                const BuildConfig& config);
+TVM_DLL Array<LoweredFunc> lower(Schedule sch,
+                                 const Array<Tensor>& args,
+                                 const std::string& name,
+                                 const std::unordered_map<Tensor, Buffer>& binds,
+                                 const BuildConfig& config);
 
 /*!
 * \brief Build a device and host module for a specific target from an array of lowered functions.
@@ -322,10 +329,10 @@ EXPORT Array<LoweredFunc> lower(Schedule sch,
 * \param config The build configuration.
 * \return The built module.
 */
-EXPORT runtime::Module build(const Array<LoweredFunc>& funcs,
-                             const Target& target,
-                             const Target& target_host,
-                             const BuildConfig& config);
+TVM_DLL runtime::Module build(const Array<LoweredFunc>& funcs,
+                              const Target& target,
+                              const Target& target_host,
+                              const BuildConfig& config);
 
 class GenericFuncNode;
 
@@ -335,7 +342,7 @@ class GenericFuncNode;
 class GenericFunc : public NodeRef {
  public:
   GenericFunc() {}
-  explicit GenericFunc(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit GenericFunc(NodePtr<Node> n) : NodeRef(n) {}
 
   /*!
    * \brief Set the default function implementaiton.
@@ -344,7 +351,7 @@ class GenericFunc : public NodeRef {
    * false, an error will be logged if the call would override a previously registered function.
    * \return reference to self.
    */
-  TVM_DLL GenericFunc& set_default(const PackedFunc value,
+  TVM_DLL GenericFunc& set_default(const runtime::PackedFunc value,
                                    bool allow_override = false);
   /*!
    * \brief Register a specialized function
@@ -355,7 +362,7 @@ class GenericFunc : public NodeRef {
    * \return reference to self.
    */
   TVM_DLL GenericFunc& register_func(const std::vector<std::string>& tags,
-                                     const PackedFunc value,
+                                     const runtime::PackedFunc value,
                                      bool allow_override = false);
   /*!
    * \brief Call generic function by directly passing in unpacked format.
@@ -372,14 +379,15 @@ class GenericFunc : public NodeRef {
    * \endcode
    */
   template<typename... Args>
-  inline TVMRetValue operator()(Args&& ...args) const;
+  inline runtime::TVMRetValue operator()(Args&& ...args) const;
   /*!
    * \brief Invoke the relevant function for the current target context, set by set_target_context.
    * Arguments are passed in packed format.
    * \param args The arguments to pass to the function.
    * \param ret The return value
    */
-  TVM_DLL void CallPacked(TVMArgs args, TVMRetValue* ret) const;
+  TVM_DLL void CallPacked(runtime::TVMArgs args,
+                          runtime::TVMRetValue* ret) const;
 
   /*!
    * \brief Find or register the GenericFunc instance corresponding to the give name
@@ -412,14 +420,14 @@ class GenericFunc : public NodeRef {
 };
 
 template<typename... Args>
-inline TVMRetValue GenericFunc::operator()(Args&& ...args) const {
+inline runtime::TVMRetValue GenericFunc::operator()(Args&& ...args) const {
   const int kNumArgs = sizeof...(Args);
   const int kArraySize = kNumArgs > 0 ? kNumArgs : 1;
   TVMValue values[kArraySize];
   int type_codes[kArraySize];
-  detail::for_each(TVMArgsSetter(values, type_codes),
+  runtime::detail::for_each(TVMArgsSetter(values, type_codes),
     std::forward<Args>(args)...);
-  TVMRetValue rv;
+  runtime::TVMRetValue rv;
   CallPacked(TVMArgs(values, type_codes, kNumArgs), &rv);
   return rv;
 }
@@ -432,9 +440,9 @@ class GenericFuncNode : public Node {
   /*! \brief name of the function */
   std::string name_;
   /* \brief the generic builder */
-  PackedFunc generic_func_;
+  runtime::PackedFunc generic_func_;
   /* \brief map from keys to registered functions */
-  std::unordered_map<std::string, PackedFunc> dispatch_dict_;
+  std::unordered_map<std::string, runtime::PackedFunc> dispatch_dict_;
 
   static constexpr const char* _type_key = "GenericFunc";
   TVM_DECLARE_NODE_TYPE_INFO(GenericFuncNode, Node);
diff --git a/include/tvm/c_dsl_api.h b/include/tvm/c_dsl_api.h
index 6f15ef9a3e80..027a3952d9d4 100644
--- a/include/tvm/c_dsl_api.h
+++ b/include/tvm/c_dsl_api.h
@@ -14,7 +14,7 @@
 #ifndef TVM_C_DSL_API_H_
 #define TVM_C_DSL_API_H_
 
-#include "./runtime/c_runtime_api.h"
+#include "runtime/c_runtime_api.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/include/tvm/channel.h b/include/tvm/channel.h
index 28d9b5f7ce4a..051b57a194c4 100644
--- a/include/tvm/channel.h
+++ b/include/tvm/channel.h
@@ -17,7 +17,7 @@ class Channel : public NodeRef {
  public:
   /*! \brief default constructor  */
   Channel() {}
-  explicit Channel(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit Channel(NodePtr<Node> n) : NodeRef(n) {}
   /*!
    * \brief access the internal node container
    * \return the pointer to the internal node container
diff --git a/include/tvm/codegen.h b/include/tvm/codegen.h
index 6b5116a143cc..fca88de6a238 100644
--- a/include/tvm/codegen.h
+++ b/include/tvm/codegen.h
@@ -7,11 +7,11 @@
 #define TVM_CODEGEN_H_
 
 #include <string>
-#include "./base.h"
-#include "./expr.h"
-#include "./lowered_func.h"
-#include "./api_registry.h"
-#include "./runtime/packed_func.h"
+#include "base.h"
+#include "expr.h"
+#include "lowered_func.h"
+#include "api_registry.h"
+#include "runtime/packed_func.h"
 
 namespace tvm {
 /*! \brief namespace for lowlevel IR pass and codegen */
diff --git a/include/tvm/expr.h b/include/tvm/expr.h
index 8c789f8df1dc..35083cafae81 100644
--- a/include/tvm/expr.h
+++ b/include/tvm/expr.h
@@ -7,12 +7,11 @@
 #define TVM_EXPR_H_
 
 #include <ir/Expr.h>
-#include <ir/IROperator.h>
 #include <ir/IRPrinter.h>
 #include <string>
 #include <algorithm>
-#include "./base.h"
-#include "./runtime/c_runtime_api.h"
+#include "base.h"
+#include "runtime/c_runtime_api.h"
 
 namespace tvm {
 
@@ -30,18 +29,11 @@ using HalideIR::VarExpr;
 using HalideIR::IR::RangeNode;
 using HalideIR::IR::FunctionRef;
 using HalideIR::IR::FunctionBaseNode;
+using HalideIR::Internal::IntImm;
 using HalideIR::Internal::Stmt;
 using HalideIR::Internal::IRPrinter;
 using HalideIR::Internal::Variable;
 
-using HalideIR::Internal::make_const;
-using HalideIR::Internal::make_zero;
-using HalideIR::Internal::as_const_int;
-using HalideIR::Internal::as_const_uint;
-using HalideIR::Internal::const_true;
-using HalideIR::Internal::const_false;
-using HalideIR::Internal::is_no_op;
-
 inline Type TVMShapeIndexType() {
   if (std::is_signed<tvm_index_t>::value) {
     return Int(sizeof(tvm_index_t) * 8);
@@ -65,6 +57,8 @@ inline TVMType Type2TVMType(Type t) {
 // Get number of bytes considering vector type.
 inline int GetVectorBytes(Type dtype) {
   int data_bits = dtype.bits() * dtype.lanes();
+  // allow bool to exist
+  if (dtype == Bool()) return 1;
   CHECK_EQ(data_bits % 8, 0U)
       << "Need to load/store by multiple of bytes";
   return data_bits / 8;
@@ -75,7 +69,7 @@ class Var : public HalideIR::VarExpr {
  public:
   EXPORT explicit Var(const std::string& name_hint = "v",
                Type t = Int(32)) : VarExpr(name_hint, t) {}
-  explicit Var(std::shared_ptr<Node> n) : VarExpr(n) {}
+  explicit Var(NodePtr<Node> n) : VarExpr(n) {}
   explicit Var(VarExpr v) : VarExpr(v) {}
   /*!
    * \brief Make a new copy of var with same type, append suffix
@@ -90,6 +84,51 @@ class Var : public HalideIR::VarExpr {
 };
 
 
+/*!
+ * \brief Container of constant integer (IntImm).
+ *
+ * This is used to store and automate type check
+ * attributes that must be constant integer.
+ */
+class Integer : public Expr {
+ public:
+  Integer() : Expr() {}
+  /*!
+   * \brief constructor from node.
+   */
+  explicit Integer(NodePtr<Node> node) : Expr(node) {}
+  /*!
+   * \brief Construct integer from int value.
+   */
+  Integer(int value) : Expr(value) {}  // NOLINT(*)
+  /*!
+   * \brief Assign an expression to integer.
+   * \param other another expression.
+   */
+  Integer& operator=(const Integer& other) {
+    node_ = other.node_;
+    return *this;
+  }
+  /*!
+   * \brief Get pointer to the internal value.
+   * \return the content of the integer.
+   */
+  const IntImm* operator->() const {
+    return static_cast<const IntImm*>(node_.get());
+  }
+  /*!
+   * \brief convert to int64_t
+   */
+  operator int64_t() const {
+    CHECK(node_ != nullptr)
+        << " Trying get reference a null Integer";
+    return (*this)->value;
+  }
+  /*! \brief type indicate the container type */
+  using ContainerType = IntImm;
+};
+
+
 /*! \brief container class of iteration variable. */
 class IterVarNode;
 
@@ -106,7 +145,7 @@ class Range : public HalideIR::IR::Range {
  public:
   /*! \brief constructor */
   Range() {}
-  explicit Range(std::shared_ptr<Node> n) : HalideIR::IR::Range(n) {}
+  explicit Range(NodePtr<Node> n) : HalideIR::IR::Range(n) {}
   /*!
    * \brief constructor by begin and end
    * \param begin The begin of the range.
@@ -117,6 +156,8 @@ class Range : public HalideIR::IR::Range {
   TVM_DLL static Range make_by_min_extent(Expr min, Expr extent);
 };
 
+using Region = Array<Range>;
+
 /*!
  * \brief Type of iteration variable.
  *  Each IterVar have a specific type.
@@ -196,7 +237,7 @@ class IterVar : public NodeRef {
   // construct a new iter var without a domain
   IterVar() {}
   // construct from shared ptr.
-  explicit IterVar(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit IterVar(NodePtr<Node> n) : NodeRef(n) {}
   /*!
    * \brief access the internal node container
    * \return the pointer to the internal node container
@@ -230,6 +271,13 @@ using Domain = Array<Range>;
 
 // print functions for expr
 TVM_DLL std::ostream& operator<<(std::ostream& os, const NodeRef& n);  // NOLINT(*)
+
+/*!
+ * \brief Dump the node to stderr, used for debug purposes.
+ * \param node The input node
+ */
+TVM_DLL void Dump(const NodeRef& node);
+
 // definition of Node.
 /*!
  * \brief An iteration variable representing an iteration
diff --git a/include/tvm/ir.h b/include/tvm/ir.h
index 9ea16131188d..adaffa77dae6 100644
--- a/include/tvm/ir.h
+++ b/include/tvm/ir.h
@@ -10,8 +10,9 @@
 #include <ir/IR.h>
 #include <type_traits>
 #include <string>
-#include "./base.h"
-#include "./expr.h"
+#include "base.h"
+#include "expr.h"
+#include "runtime/util.h"
 
 namespace tvm {
 namespace ir {
@@ -27,7 +28,7 @@ struct CommReducerNode;
 
 struct CommReducer : public NodeRef {
   CommReducer() {}
-  explicit CommReducer(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit CommReducer(NodePtr<Node> n) : NodeRef(n) {}
   /*!
    * \brief access the internal node container
    * \return the pointer to the internal node container
@@ -205,6 +206,8 @@ constexpr const char* scan_init_scope = "scan_init_scope";
  *  This gives hint to require stride of dim to be k * align + offset.
  */
 constexpr const char* buffer_dim_align = "buffer_dim_align";
+/*! \brief Mark stores/loads with theirs bounds.  */
+constexpr const char* buffer_bound = "buffer_bound";
 /*!
  * \brief Bind the buffer specification to the region of the op
  *  When this scope occurs, the stmt.node is a Array<NodeRef> = [buffer, tensor]
@@ -236,6 +239,11 @@ constexpr const char* pipeline_exec_scope = "pipeline_exec_scope";
  */
 constexpr const char* opengl_stage_scope = "opengl_stage_scope";
 
+/*!
+ * \brief Mark that it is in the device scope.
+ */
+constexpr const char* device_scope = "device_scope";
+
 /*!
  * \brief Check if attr_key is a pragma key extension
  * \param attr_key The attr key to be compared
@@ -449,25 +457,6 @@ constexpr const char* tvm_global_barrier_kinit = "tvm_global_barrier_kinit";
  */
 constexpr const char* tvm_thread_allreduce = "tvm_thread_allreduce";
 
-/*! \brief The kind of structure field info */
-enum TVMStructFieldKind : int {
-  // array head address
-  kArrAddr,
-  kArrData,
-  kArrShape,
-  kArrStrides,
-  kArrNDim,
-  kArrTypeCode,
-  kArrTypeBits,
-  kArrTypeLanes,
-  kArrByteOffset,
-  kArrDeviceId,
-  kArrDeviceType,
-  kArrKindBound_,
-  // TVMValue field
-  kTVMValueContent,
-  kTVMValueKindBound_
-};
 }   // namespace intrinsic
 
 // Reuse IR node defintiion from HalideIR
@@ -513,8 +502,6 @@ using HalideIR::Internal::Block;
 using HalideIR::Internal::IfThenElse;
 using HalideIR::Internal::Evaluate;
 using HalideIR::Internal::Shuffle;
-// ir functions
-using HalideIR::Internal::is_const_power_of_two_integer;
 
 /*!
  * \brief Create a type annotation expression
diff --git a/include/tvm/ir_functor_ext.h b/include/tvm/ir_functor_ext.h
index 3784608c8da1..43868114307d 100644
--- a/include/tvm/ir_functor_ext.h
+++ b/include/tvm/ir_functor_ext.h
@@ -6,8 +6,8 @@
 #ifndef TVM_IR_FUNCTOR_EXT_H_
 #define TVM_IR_FUNCTOR_EXT_H_
 
-#include <tvm/ir_functor.h>
-#include "./ir.h"
+#include "tvm/node/ir_functor.h"
+#include "ir.h"
 
 namespace tvm {
 namespace ir {
diff --git a/include/tvm/ir_mutator.h b/include/tvm/ir_mutator.h
index b8aae3638149..6cc80d55352b 100644
--- a/include/tvm/ir_mutator.h
+++ b/include/tvm/ir_mutator.h
@@ -6,10 +6,10 @@
 #ifndef TVM_IR_MUTATOR_H_
 #define TVM_IR_MUTATOR_H_
 
-#include <tvm/ir_functor.h>
 #include <unordered_map>
-#include "./expr.h"
-#include "./ir.h"
+#include "expr.h"
+#include "ir.h"
+#include "tvm/node/ir_functor.h"
 
 namespace tvm {
 namespace ir {
diff --git a/include/tvm/ir_operator.h b/include/tvm/ir_operator.h
index 947c3b736d80..5abd95b8c166 100644
--- a/include/tvm/ir_operator.h
+++ b/include/tvm/ir_operator.h
@@ -1,24 +1,426 @@
 /*!
- *  Copyright (c) 2017 by Contributors
+ *  Copyright (c) 2018 by Contributors
  * \file tvm/ir_operator.h
- * \brief Common operators of Expr
+ * \brief Common operators defined for Expr.
+ *
+ * \note Most of the operator defined here perform simple constant folding
+ *   when the type is int32 or int64 for simplifying the index expressions.
  */
 #ifndef TVM_IR_OPERATOR_H_
 #define TVM_IR_OPERATOR_H_
 
 #include <algorithm>
-#include "./expr.h"
-#include "./ir.h"
+#include <type_traits>
+#include "expr.h"
+#include "ir.h"
 
 namespace tvm {
+/*!
+ * \brief Make a const value with certain data type.
+ * \param t The target type.
+ * \param value The input value
+ * \return the result expression.
+ * \tparam ValueType The constant value type
+ */
+template<typename ValueType,
+         typename = typename std::enable_if<std::is_pod<ValueType>::value>::type>
+inline Expr make_const(Type t, ValueType value);
+/*!
+ * \brief Make a const zero expr.
+ * \param t The target type.
+ * \return the result expression.
+ */
+inline Expr make_zero(Type t);
+/*!
+ * \brief Make a constant true expression.
+ * \param lanes The number of lanes in the bool
+ * \return The result expression.
+ */
+inline Expr const_true(int lanes = 1) {
+  return make_const(UInt(1, lanes), 1);
+}
+/*!
+ * \brief Make a constant false expression.
+ * \param lanes The number of lanes in the bool
+ * \return The result expression.
+ */
+inline Expr const_false(int lanes = 1) {
+  return make_const(UInt(1, lanes), 0);
+}
+/*!
+ * \brief Get x as constant int expression.
+ * \param x The expression
+ * \return the address to the int expression,
+ *         return nullptr, if x is not IntImm.
+ */
+inline const int64_t* as_const_int(const Expr& x) {
+  if (!x.defined()) return nullptr;
+  if (const ir::IntImm* op = x.as<ir::IntImm>()) {
+    return &(op->value);
+  } else {
+    return nullptr;
+  }
+}
+
+/*!
+ * \brief Get x as constant uint expression.
+ * \param x The expression
+ * \return the address to the int expression,
+ *         return nullptr, if x is not UIntImm.
+ */
+inline const uint64_t* as_const_uint(const Expr& x) {
+  if (!x.defined()) return nullptr;
+  if (const ir::UIntImm* op = x.as<ir::UIntImm>()) {
+    return &(op->value);
+  } else {
+    return nullptr;
+  }
+}
 
-using HalideIR::likely;
-using HalideIR::likely_if_innermost;
-// functions
-using HalideIR::cast;
-using HalideIR::min;
-using HalideIR::max;
-using HalideIR::select;
+/*!
+ * \brief Check whether x is a constant integer expression.
+ * \param x The input argument
+ * \param value the value to be compared against.
+ * \return whether x is constant expression.
+ */
+inline bool is_const_int(const Expr& x, int64_t value);
+
+/*!
+ * \brief Check whether stmt is nop.
+ * \param stmt The input statement
+ * \return whether stmt is nop
+ */
+inline bool is_no_op(const Stmt& stmt);
+
+/*!
+ * \brief Check whether x is a constant integer 1
+ * \param x The input argument.
+ * \note This only return true for integer types.
+ * \return whether x is constant 1
+ */
+inline bool is_one(const Expr& x) {
+  return is_const_int(x, 1);
+}
+
+/*!
+ * \brief Check whether x is a constant integer 0
+ * \param x The input argument
+ * \return whether x is constant 0
+ * \note This only return true for integer types.
+ */
+inline bool is_zero(const Expr& x) {
+  return is_const_int(x, 0);
+}
+
+/*!
+ * \brief Check whether x is a constant.
+ * \note This only return true for integer types.
+ * \return whether x is constant
+ */
+inline bool is_const(const Expr& x);
+
+/*!
+ * \brief Check whether x is a constant power of two
+ * If x is power of two, write the power to the shift.
+ *
+ * \param x The input expression.
+ * \param shift The output shift if x is power of two.
+ * \return whether x is constant power of two
+ */
+TVM_DLL bool is_const_power_of_two_integer(const Expr& x, int* shift);
+
+/*!
+ * \brief cast value to type.
+ *
+ * \param t the target type.
+ * \param value The value
+ * \return The result expression.
+ * \note This function may return value if the type is the same.
+ */
+TVM_DLL Expr cast(const Type& t, Expr value);
+/*!
+ * \brief perform reinterpret cast value to type.
+ *
+ * \param t the target type.
+ * \param value The value
+ * \return The result expression.
+ * \note This function may return value if the type is the same.
+ */
+TVM_DLL Expr reinterpret(const Type& t, Expr value);
+/*!
+ * \brief add operator
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator+(Expr a, Expr b);
+/*!
+ * \brief subtraction operator
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator-(Expr a, Expr b);
+/*!
+ * \brief negation.
+ *
+ * \param a input.
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator-(Expr a);
+/*!
+ * \brief multiplication operator
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator*(Expr a, Expr b);
+/*!
+ * \brief division operator
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator/(Expr a, Expr b);
+/*!
+ * \brief mod operator
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator%(Expr a, Expr b);
+/*!
+ * \brief left shift operator
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator<<(Expr a, Expr b);
+/*!
+ * \brief right shift operator
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator>>(Expr a, Expr b);
+/*!
+ * \brief greater
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator>(Expr a, Expr b);
+/*!
+ * \brief greater_equal
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator>=(Expr a, Expr b);
+/*!
+ * \brief less
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator<(Expr a, Expr b);
+/*!
+ * \brief less_equal
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator<=(Expr a, Expr b);
+/*!
+ * \brief equal
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator==(Expr a, Expr b);
+/*!
+ * \brief not_equal
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator!=(Expr a, Expr b);
+/*!
+ * \brief and
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note This operator does eager constant folding.
+ */
+TVM_DLL Expr operator&&(Expr a, Expr b);
+/*!
+ * \brief or
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note This operator does eager constant folding.
+ */
+TVM_DLL Expr operator||(Expr a, Expr b);
+/*!
+ * \brief not
+ *
+ * \param a left operand
+ * \return The result expression.
+ * \note This operator does eager constant folding.
+ */
+TVM_DLL Expr operator!(Expr a);
+/*!
+ * \brief take maximum of two values
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr max(Expr a, Expr b);
+/*!
+ * \brief take minimum of two values
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr min(Expr a, Expr b);
+/*!
+ * \brief right shift
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator>>(Expr a, Expr b);
+/*!
+ * \brief left shift
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator<<(Expr a, Expr b);
+/*!
+ * \brief take bitwise and of two values
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator&(Expr a, Expr b);
+/*!
+ * \brief take bitwise or of two values
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator|(Expr a, Expr b);
+/*!
+ * \brief take bitwise xor of two values
+ *
+ * \param a left operand
+ * \param b right operand
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator^(Expr a, Expr b);
+/*!
+ * \brief take bitwise negation of two values
+ *
+ * \param a the input expression.
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr operator~(Expr a);
+/*!
+ * \brief select result by condition
+ *
+ * \param cond The condition
+ * \param true_value The value when results are true.
+ * \param false_value The value when results are false.
+ * \return The result expression.
+ * \note this function does eager constant folding for
+ *       index types(int32, int64) when possible.
+ */
+TVM_DLL Expr select(Expr cond, Expr true_value, Expr false_value);
+/*!
+ * \brief Mark condition as likely.
+ * \param cond The condition
+ * \return The marked expression.
+ */
+TVM_DLL Expr likely(Expr cond);
+/*!
+ * \brief Calculate power(x, y)
+ * \param x The left operand.
+ * \param y The right operand.
+ */
+TVM_DLL Expr pow(Expr x, Expr y);
+/*!
+ * \brief Calculate absolute value of x.
+ * \param x The input data
+ *
+ * \return The aboslute value of input data x
+ */
+TVM_DLL Expr abs(Expr x);
 
 /*!
  * \brief sum of of source expression over axis
@@ -41,14 +443,19 @@ TVM_DLL Expr max(Expr source, Array<IterVar> axis);
  */
 TVM_DLL Expr min(Expr source, Array<IterVar> axis);
 
+/*!
+ * \brief product of of source expression over axis
+ * \param source The source expression.
+ * \param axis List of iteration variables that will be used for reduction.
+ */
+TVM_DLL Expr prod(Expr source, Array<IterVar> axis);
 
-// Unary intrinsic operators
+// Intrinsic operators
 #define TVM_DECLARE_INTRIN_UNARY(OpName)                                \
   inline Expr OpName(Expr x) {                                          \
     return ir::Call::make(x.type(), #OpName, {x}, ir::Call::PureIntrinsic); \
   }                                                                     \
 
-
 TVM_DECLARE_INTRIN_UNARY(exp);
 TVM_DECLARE_INTRIN_UNARY(tanh);
 TVM_DECLARE_INTRIN_UNARY(sigmoid);
@@ -58,38 +465,152 @@ TVM_DECLARE_INTRIN_UNARY(floor);
 TVM_DECLARE_INTRIN_UNARY(ceil);
 TVM_DECLARE_INTRIN_UNARY(round);
 TVM_DECLARE_INTRIN_UNARY(trunc);
+TVM_DECLARE_INTRIN_UNARY(popcount);
 
-/*!
- * \brief Calculate power(x, y)
- * \param x The left operand.
- * \param y The right operand.
- */
-inline Expr pow(Expr x, Expr y) {
-  match_types(x, y);
-  CHECK(x.type().is_float()) << "power only applies to float";
-  return ir::Call::make(x.type(), "pow", { x, y }, ir::Call::PureIntrinsic);
+
+// Implementation details after this
+inline bool is_const(const Expr& x) {
+  if (x.as<ir::IntImm>() || x.as<ir::UIntImm>()) {
+    return true;
+  } else if (const auto* op = x.as<ir::Broadcast>()) {
+    const Expr& val = op->value;
+    if (val.as<ir::IntImm>() || val.as<ir::UIntImm>()) {
+      return true;
+    }
+  }
+  return false;
 }
 
-/*!
- * \brief Calculate absolute value of x, elementwise
- * \param x The input data
- *
- * \return The aboslute value of input data x
- */
-inline Expr abs(Expr x) {
-  if (x.type().is_int()) {
-    return select(x >= make_zero(x.type()), x, -x);
-  } else if (x.type().is_float()) {
-    return ir::Call::make(x.type(), "fabs", {x}, ir::Call::PureIntrinsic);
-  } else if (x.type().is_uint()) {
-    return x;
+inline bool is_positive_const(const Expr& a) {
+  if (const ir::IntImm* op = a.as<ir::IntImm>()) {
+    return op->value > 0;
+  } else if (const ir::UIntImm* op = a.as<ir::UIntImm>()) {
+    return op->value > 0;
   } else {
-    LOG(WARNING) << "Warning: Data type " << x.type()
-      <<" not supported for absolute op. Skipping absolute op...";
-    return x;
+    return false;
   }
 }
 
-}  // namespace tvm
+inline bool is_negative_const(const Expr& a) {
+  if (const ir::IntImm* op = a.as<ir::IntImm>()) {
+    return op->value < 0;
+  } else {
+    return false;
+  }
+}
+
+inline bool is_const_int(const Expr& x, int64_t value) {
+  if (const auto* op = x.as<ir::IntImm>()) {
+    return op->value == value;
+  } else if (const auto* op = x.as<ir::UIntImm>()) {
+    return op->value == static_cast<uint64_t>(value);
+  } else if (const auto* op = x.as<ir::Broadcast>()) {
+    const Expr& val = op->value;
+    if (const auto* opv = val.as<ir::IntImm>()) {
+      return opv->value == value;
+    } else if (const auto* opv = val.as<ir::UIntImm>()) {
+      return opv->value == static_cast<uint64_t>(value);
+    }
+  }
+  return false;
+}
 
+inline bool is_no_op(const Stmt& stmt) {
+  if (!stmt.defined()) return true;
+  if (const auto* op = stmt.as<ir::Evaluate>()) {
+    return is_const(op->value);
+  }
+  return false;
+}
+
+template<typename ValueType>
+inline Expr MakeConstScalar(Type t, ValueType value) {
+  if (t.is_int()) return ir::IntImm::make(t, static_cast<int64_t>(value));
+  if (t.is_uint()) return ir::UIntImm::make(t, static_cast<uint64_t>(value));
+  if (t.is_float()) return ir::FloatImm::make(t, static_cast<double>(value));
+  LOG(FATAL) << "cannot make const for type " << t;
+  return Expr();
+}
+
+template<typename ValueType, typename>
+inline Expr make_const(Type t, ValueType value) {
+  if (t.lanes() == 1) {
+    return MakeConstScalar(t, value);
+  } else {
+    return ir::Broadcast::make(
+        MakeConstScalar(t.element_of(), value), t.lanes());
+  }
+}
+
+inline Expr make_zero(Type t) {
+  if (t.is_handle()) {
+    return reinterpret(t, make_const(UInt(64), 0));
+  }
+  return make_const(t, 0);
+}
+
+// additional const expression overloading
+#define TVM_DEFINE_ASSIGN_OP_OVERLOAD(Name, OpFunc)            \
+  inline Expr Name(Expr& a, Expr b) {                          \
+    a = OpFunc(a, b);                                          \
+    return a;                                                  \
+  }
+
+#define TVM_DEFINE_BINOP_CONST_VAL_OVERLOAD(Name)              \
+  inline Expr Name(const Expr& a, float b) {                   \
+    return Name(a, Expr(b));                                   \
+  }                                                            \
+  inline Expr Name(float a, const Expr& b) {                   \
+    return Name(Expr(a), b);                                   \
+  }                                                            \
+  inline Expr Name(int a, const Expr& b) {                     \
+    return Name(make_const(b.type(), a), b);                   \
+  }                                                            \
+  inline Expr Name(const Expr& a, int b) {                     \
+    return Name(a, make_const(a.type(), b));                   \
+  }
+
+#define TVM_DEFINE_LOGICAL_OP_CONST_VAL_OVERLOAD(Name)                  \
+  inline Expr Name(const Expr& a, bool b) {                             \
+    return Name(a, Expr(b));                                            \
+  }                                                                     \
+  inline Expr Name(bool a, const Expr& b) {                             \
+    return Name(Expr(a), b);                                            \
+  }
+
+#define TVM_DEFINE_INT_OP_CONST_VAL_OVERLOAD(Name)                      \
+  inline Expr Name(const Expr& a, int b) {                              \
+    return Name(a, make_const(a.type(), b));                            \
+  }                                                                     \
+  inline Expr Name(int a, const Expr& b) {                              \
+    return Name(make_const(b.type(), a), b);                            \
+  }
+
+
+TVM_DEFINE_ASSIGN_OP_OVERLOAD(operator+=, operator+);
+TVM_DEFINE_ASSIGN_OP_OVERLOAD(operator-=, operator-);
+TVM_DEFINE_ASSIGN_OP_OVERLOAD(operator*=, operator*);
+TVM_DEFINE_ASSIGN_OP_OVERLOAD(operator/=, operator/);
+TVM_DEFINE_BINOP_CONST_VAL_OVERLOAD(operator+);
+TVM_DEFINE_BINOP_CONST_VAL_OVERLOAD(operator-);
+TVM_DEFINE_BINOP_CONST_VAL_OVERLOAD(operator*);
+TVM_DEFINE_BINOP_CONST_VAL_OVERLOAD(operator/);
+TVM_DEFINE_BINOP_CONST_VAL_OVERLOAD(max);
+TVM_DEFINE_BINOP_CONST_VAL_OVERLOAD(min);
+TVM_DEFINE_BINOP_CONST_VAL_OVERLOAD(operator>);  // NOLINT(*)
+TVM_DEFINE_BINOP_CONST_VAL_OVERLOAD(operator>=);
+TVM_DEFINE_BINOP_CONST_VAL_OVERLOAD(operator<);  // NOLINT(*)
+TVM_DEFINE_BINOP_CONST_VAL_OVERLOAD(operator<=);
+// integer related ops
+TVM_DEFINE_INT_OP_CONST_VAL_OVERLOAD(operator%);
+TVM_DEFINE_INT_OP_CONST_VAL_OVERLOAD(operator>>); // NOLINT(*)
+TVM_DEFINE_INT_OP_CONST_VAL_OVERLOAD(operator<<); // NOLINT(*)
+TVM_DEFINE_INT_OP_CONST_VAL_OVERLOAD(operator&);
+TVM_DEFINE_INT_OP_CONST_VAL_OVERLOAD(operator|);
+TVM_DEFINE_INT_OP_CONST_VAL_OVERLOAD(operator^);
+// logical ops
+TVM_DEFINE_LOGICAL_OP_CONST_VAL_OVERLOAD(operator&&);
+TVM_DEFINE_LOGICAL_OP_CONST_VAL_OVERLOAD(operator||);
+
+}  // namespace tvm
 #endif  // TVM_IR_OPERATOR_H_
diff --git a/include/tvm/ir_pass.h b/include/tvm/ir_pass.h
index d875621a3f5e..68bfe53407c8 100644
--- a/include/tvm/ir_pass.h
+++ b/include/tvm/ir_pass.h
@@ -9,15 +9,14 @@
 #ifndef TVM_IR_PASS_H_
 #define TVM_IR_PASS_H_
 
-#include <tvm/ir_functor.h>
 #include <arithmetic/Simplify.h>
 #include <unordered_map>
 #include <vector>
 #include <string>
-#include "./expr.h"
-#include "./buffer.h"
-#include "./schedule.h"
-#include "./lowered_func.h"
+#include "expr.h"
+#include "buffer.h"
+#include "schedule.h"
+#include "lowered_func.h"
 
 namespace tvm {
 namespace ir {
@@ -182,11 +181,13 @@ Stmt Inline(Stmt stmt,
  * \param extern_buffer Map specifies external
  *    buffer assignment of input and outputs.
  * \param cache_line_size The size of CPU cache line.
+ * \param create_bound_attribute Whether to create bound attributes.
  * \return Transformed stmt.
  */
 Stmt StorageFlatten(Stmt stmt,
                     Map<Tensor, Buffer> extern_buffer,
-                    int cache_line_size);
+                    int cache_line_size,
+                    bool create_bound_attribute = false);
 
 /*!
  * \brief Remove No Op from the Stmt.
@@ -218,7 +219,7 @@ Stmt NarrowChannelAccess(Stmt stmt);
  * \param auto_max_step The maximum step before stop attach automatic unroll
  * \param auto_max_depth The maximum depth before stop attach automatic unroll
  * \param auto_max_extent The maximum extent of the loop we can unroll,
- *                        this is an legacy option that donot take the loop total steps into account.
+ *                     this is an legacy option that do not take the loop total steps into account.
  * \param explicit_unroll Whether explicitly unroll the loop, or leave unroll annotation to codegen.
  * \return Transformed stmt.
  */
@@ -235,6 +236,13 @@ Stmt UnrollLoop(Stmt stmt,
  */
 Stmt VectorizeLoop(Stmt stmt);
 
+/*!
+* \brief instruments bound checkers.
+* \param stmt The statment to be instrumented.
+* \return Instrumented Stmt.
+*/
+Stmt InstrumentBoundCheckers(Stmt stmt);
+
 /*!
  * \brief Inject virtual thread loops into stmt.
  * \param stmt The statment to be transformed.
@@ -327,6 +335,15 @@ Stmt RewriteUnsafeSelect(Stmt stmt);
  */
 Stmt LowerStorageAccessInfo(Stmt stmt);
 
+/*!
+ * \brief Decorate the stmt with a device scope, this is helpful for 
+ * hardware accelerator without thread blocks.
+ *
+ * \param stmt The stmt to be trasnformed
+ * \return Transformed stmt.
+ */
+Stmt DecorateDeviceScope(Stmt stmt);
+
 /*!
  * \brief Make an user callable API LoweredFunc.
  *
diff --git a/include/tvm/ir_visitor.h b/include/tvm/ir_visitor.h
index 8919b0f7a5c2..755f15078ce2 100644
--- a/include/tvm/ir_visitor.h
+++ b/include/tvm/ir_visitor.h
@@ -6,8 +6,8 @@
 #ifndef TVM_IR_VISITOR_H_
 #define TVM_IR_VISITOR_H_
 
-#include <tvm/ir_functor.h>
-#include "./ir.h"
+#include "ir.h"
+#include "tvm/node/ir_functor.h"
 
 namespace tvm {
 namespace ir {
diff --git a/include/tvm/lowered_func.h b/include/tvm/lowered_func.h
index 19f7e27f1c75..5cb59fd47712 100644
--- a/include/tvm/lowered_func.h
+++ b/include/tvm/lowered_func.h
@@ -7,13 +7,13 @@
 #ifndef TVM_LOWERED_FUNC_H_
 #define TVM_LOWERED_FUNC_H_
 
-#include <tvm/container.h>
 #include <ir/FunctionBase.h>
 #include <string>
 
-#include "./base.h"
-#include "./expr.h"
-#include "./tensor.h"
+#include "base.h"
+#include "expr.h"
+#include "tensor.h"
+#include "tvm/node/container.h"
 
 namespace tvm {
 
@@ -27,7 +27,7 @@ class LoweredFuncNode;
 class LoweredFunc : public FunctionRef {
  public:
   LoweredFunc() {}
-  explicit LoweredFunc(std::shared_ptr<Node> n) : FunctionRef(n) {}
+  explicit LoweredFunc(NodePtr<Node> n) : FunctionRef(n) {}
   /*!
    * \brief access the internal node container
    * \return the pointer to the internal node container
diff --git a/include/tvm/operation.h b/include/tvm/operation.h
index d13680531af9..02cd0d016f39 100644
--- a/include/tvm/operation.h
+++ b/include/tvm/operation.h
@@ -9,12 +9,12 @@
 #include <string>
 #include <vector>
 #include <unordered_map>
-#include "./expr.h"
-#include "./ir_operator.h"
-#include "./tensor.h"
-#include "./schedule.h"
-#include "./arithmetic.h"
-#include "./buffer.h"
+#include "expr.h"
+#include "ir_operator.h"
+#include "tensor.h"
+#include "schedule.h"
+#include "arithmetic.h"
+#include "buffer.h"
 
 namespace tvm {
 
@@ -49,7 +49,7 @@ class OperationNode : public FunctionBaseNode {
   }
   /*!
    * \return The list of iteration variable at root
-   * \note root_iter_vars dedides the shape of the outputs.
+   * \note root_iter_vars decides the shape of the outputs.
    */
   virtual Array<IterVar> root_iter_vars() const = 0;
   /*!
@@ -239,6 +239,74 @@ class TVM_DLL ComputeOpNode : public OperationNode {
   TVM_DECLARE_NODE_TYPE_INFO(ComputeOpNode, OperationNode);
 };
 
+/*!
+ * \brief A TenorCompute op that compute a tensor with an tensor intrinsic.
+ */
+class TensorComputeOpNode : public OperationNode {
+ public:
+  /*! \brief IterVar on each axis */
+  Array<IterVar> axis;
+  /*! \brief IterVar on each reduction axis, if the intrin will use the reduce axis */
+  Array<IterVar> reduce_axis;
+  /*! \brief number of axes that can be scheduled */
+  int schedulable_ndim;
+  /*! \brief TensorIntrin used to compute */
+  TensorIntrin intrin;
+  /*! \brief input tensors of intrin */
+  Array<Tensor> inputs;
+  /*! \brief region of input tensors */
+  Array<Region> input_regions;
+  /*! \brief constructor */
+  TensorComputeOpNode() {}
+  // override functions
+  int num_outputs() const final;
+  Array<IterVar> root_iter_vars() const final;
+  Type output_dtype(size_t i) const final;
+  Array<Expr> output_shape(size_t i) const final;
+  Array<Tensor> InputTensors() const final;
+  Operation ReplaceInputs(
+      const Operation& self,
+      const std::unordered_map<Tensor, Tensor>& rmap) const final;
+  void PropBoundToInputs(
+      const Operation& self,
+      const std::unordered_map<const Variable*, IntSet>& dom_map,
+      std::unordered_map<Tensor, TensorDom>* out_dom_map) const final;
+  void GatherBound(
+      const Operation& self,
+      const std::unordered_map<Tensor, TensorDom>& tensor_dom,
+      std::unordered_map<IterVar, Range>* out_dom_map) const final;
+  Stmt BuildRealize(
+      const Stage& stage,
+      const std::unordered_map<IterVar, Range>& realize_map,
+      const Stmt& body) const final;
+  Stmt BuildProvide(
+      const Stage& stage,
+      const std::unordered_map<IterVar, Range>& dom_map,
+      bool debug_keep_trivial_loop) const final;
+
+  void VisitAttrs(AttrVisitor* v) final {
+    v->Visit("name", &name);
+    v->Visit("tag", &tag);
+    v->Visit("axis", &axis);
+    v->Visit("reduce_axis", &reduce_axis);
+    v->Visit("schedulable_ndim", &schedulable_ndim);
+    v->Visit("intrin", &intrin);
+    v->Visit("inputs", &inputs);
+    v->Visit("input_regions", &input_regions);
+  }
+  static Operation make(std::string name,
+                        std::string tag,
+                        Array<IterVar> axis,
+                        Array<IterVar> reduce_axis,
+                        int schedulable_ndim,
+                        TensorIntrin intrin,
+                        Array<Tensor> tensors,
+                        Array<Region> regions);
+
+  static constexpr const char* _type_key = "TensorComputeOp";
+  TVM_DECLARE_NODE_TYPE_INFO(TensorComputeOpNode, OperationNode);
+};
+
 /*!
  * \brief Symbolic scan.
  */
@@ -326,7 +394,7 @@ class ExternOpNode : public OperationNode {
  public:
   /*! \brief The input tensors */
   Array<Tensor> inputs;
-  /*! \brief Symbolic placeholder representationinputs */
+  /*! \brief Symbolic placeholder representation of inputs */
   Array<Buffer> input_placeholders;
   /*! \brief Symbolic placeholder representation of outputs */
   Array<Buffer> output_placeholders;
@@ -366,6 +434,8 @@ class ExternOpNode : public OperationNode {
     v->Visit("tag", &tag);
     v->Visit("attrs", &attrs);
     v->Visit("inputs", &inputs);
+    v->Visit("input_placeholders", &input_placeholders);
+    v->Visit("output_placeholders", &output_placeholders);
     v->Visit("body", &body);
   }
   EXPORT static Operation make(std::string name,
@@ -380,6 +450,69 @@ class ExternOpNode : public OperationNode {
   TVM_DECLARE_NODE_TYPE_INFO(ExternOpNode, OperationNode);
 };
 
+/*!
+ * \brief A computation operator that generated by hybrid script.
+ */
+class HybridOpNode : public OperationNode {
+ public:
+  /*! \brief The input tensors */
+  Array<Tensor> inputs;
+  /*! \brief Symbolic placeholder representation of outputs */
+  Array<Tensor> outputs;
+  /*! \brief the statement that generates the computation. This is
+   * slightly different from the body in ExternOpNode. All the output
+   * tensors keep its own name specified by users in the script.
+   * However, when compilation, these tensors will be placed by those
+   * actual output tensors. */
+  Stmt body;
+
+  /*! \brief constructor */
+  HybridOpNode() {}
+  // override functions
+  int num_outputs() const final;
+  Array<IterVar> root_iter_vars() const final;
+  Type output_dtype(size_t i) const final;
+  Array<Expr> output_shape(size_t i) const final;
+  Array<Tensor> InputTensors() const final;
+  Operation ReplaceInputs(
+      const Operation& self,
+      const std::unordered_map<Tensor, Tensor>& rmap) const final;
+  void PropBoundToInputs(
+      const Operation& self,
+      const std::unordered_map<const Variable*, IntSet>& dom_map,
+      std::unordered_map<Tensor, TensorDom>* out_dom_map) const final;
+  void GatherBound(
+      const Operation& self,
+      const std::unordered_map<Tensor, TensorDom>& tensor_dom,
+      std::unordered_map<IterVar, Range>* out_dom_map) const final;
+  Stmt BuildRealize(
+      const Stage& stage,
+      const std::unordered_map<IterVar, Range>& realize_map,
+      const Stmt& body) const final;
+  Stmt BuildProvide(
+      const Stage& stage,
+      const std::unordered_map<IterVar, Range>& dom_map,
+      bool debug_keep_trivial_loop) const final;
+
+  void VisitAttrs(AttrVisitor* v) final {
+    v->Visit("name", &name);
+    v->Visit("tag", &tag);
+    v->Visit("attrs", &attrs);
+    v->Visit("inputs", &inputs);
+    v->Visit("outputs", &outputs);
+    v->Visit("body", &body);
+  }
+  EXPORT static Operation make(std::string name,
+                               std::string tag,
+                               Map<std::string, NodeRef> attrs,
+                               Array<Tensor> inputs,
+                               Array<Tensor> outputs,
+                               Stmt body);
+
+  static constexpr const char* _type_key = "HybridOp";
+  TVM_DECLARE_NODE_TYPE_INFO(HybridOpNode, OperationNode);
+};
+
 /*! \brief The compute function to specify the input source of a Tensor */
 using FCompute = std::function<Expr (const Array<Var>& i)>;
 
diff --git a/include/tvm/packed_func_ext.h b/include/tvm/packed_func_ext.h
index 95964547ef8e..45366f3ad55a 100644
--- a/include/tvm/packed_func_ext.h
+++ b/include/tvm/packed_func_ext.h
@@ -10,12 +10,13 @@
 #include <sstream>
 #include <string>
 #include <memory>
+#include <limits>
 #include <type_traits>
 
-#include "./base.h"
-#include "./expr.h"
-#include "./tensor.h"
-#include "./runtime/packed_func.h"
+#include "base.h"
+#include "expr.h"
+#include "tensor.h"
+#include "runtime/packed_func.h"
 
 namespace tvm {
 using runtime::TVMArgs;
@@ -34,6 +35,8 @@ struct NodeTypeChecker {
     // It can be turned off, but will make non strict checking.
     // TODO(tqchen) possibly find alternative to turn of RTTI
     using ContainerType = typename T::ContainerType;
+    // always allow nullptr.
+    if (sptr == nullptr) return true;
     return sptr->derived_from<ContainerType>();
   }
   static inline void PrintName(std::ostringstream& os) { // NOLINT(*)
@@ -45,7 +48,7 @@ struct NodeTypeChecker {
 template<typename T>
 struct NodeTypeChecker<Array<T> > {
   static inline bool Check(Node* sptr) {
-    if (sptr == nullptr) return false;
+    if (sptr == nullptr) return true;
     if (!sptr->is_type<ArrayNode>()) return false;
     ArrayNode* n = static_cast<ArrayNode*>(sptr);
     for (const auto& p : n->data) {
@@ -63,7 +66,7 @@ struct NodeTypeChecker<Array<T> > {
 template<typename V>
 struct NodeTypeChecker<Map<std::string, V> > {
   static inline bool Check(Node* sptr) {
-    if (sptr == nullptr) return false;
+    if (sptr == nullptr) return true;
     if (!sptr->is_type<StrMapNode>()) return false;
     StrMapNode* n = static_cast<StrMapNode*>(sptr);
     for (const auto& kv : n->data) {
@@ -82,7 +85,7 @@ struct NodeTypeChecker<Map<std::string, V> > {
 template<typename K, typename V>
 struct NodeTypeChecker<Map<K, V> > {
   static inline bool Check(Node* sptr) {
-    if (sptr == nullptr) return false;
+    if (sptr == nullptr) return true;
     if (!sptr->is_type<MapNode>()) return false;
     MapNode* n = static_cast<MapNode*>(sptr);
     for (const auto& kv : n->data) {
@@ -114,9 +117,9 @@ inline TNodeRef TVMArgValue::AsNodeRef() const {
   static_assert(
       std::is_base_of<NodeRef, TNodeRef>::value,
       "Conversion only works for NodeRef");
-  if (type_code_ == kNull) return TNodeRef();
+  if (type_code_ == kNull) return TNodeRef(NodePtr<Node>(nullptr));
   TVM_CHECK_TYPE_CODE(type_code_, kNodeHandle);
-  std::shared_ptr<Node>& sptr = *ptr<std::shared_ptr<Node> >();
+  NodePtr<Node>& sptr = *ptr<NodePtr<Node> >();
   CHECK(NodeTypeChecker<TNodeRef>::Check(sptr.get()))
       << "Expected type " << NodeTypeName<TNodeRef>()
       << " but get " << sptr->type_key();
@@ -126,13 +129,15 @@ inline TNodeRef TVMArgValue::AsNodeRef() const {
 inline TVMArgValue::operator HalideIR::Expr() const {
   if (type_code_ == kNull) return Expr();
   if (type_code_ == kDLInt) {
+    CHECK_LE(value_.v_int64, std::numeric_limits<int>::max());
+    CHECK_GE(value_.v_int64, std::numeric_limits<int>::min());
     return Expr(static_cast<int>(value_.v_int64));
   }
   if (type_code_ == kDLFloat) {
     return Expr(static_cast<float>(value_.v_float64));
   }
   TVM_CHECK_TYPE_CODE(type_code_, kNodeHandle);
-  std::shared_ptr<Node>& sptr = *ptr<std::shared_ptr<Node> >();
+  NodePtr<Node>& sptr = *ptr<NodePtr<Node> >();
   if (sptr->is_type<IterVarNode>()) {
     return IterVar(sptr)->var;
   }
@@ -145,27 +150,41 @@ inline TVMArgValue::operator HalideIR::Expr() const {
   return Expr(sptr);
 }
 
-inline std::shared_ptr<Node>& TVMArgValue::node_sptr() {
+inline TVMArgValue::operator tvm::Integer() const {
+  if (type_code_ == kNull) return Integer();
+  if (type_code_ == kDLInt) {
+    CHECK_LE(value_.v_int64, std::numeric_limits<int>::max());
+    CHECK_GE(value_.v_int64, std::numeric_limits<int>::min());
+    return Integer(static_cast<int>(value_.v_int64));
+  }
+  NodePtr<Node>& sptr = *ptr<NodePtr<Node> >();
+  CHECK(NodeTypeChecker<Integer>::Check(sptr.get()))
+      << "Expected type " << NodeTypeName<Expr>()
+      << " but get " << sptr->type_key();
+  return Integer(sptr);
+}
+
+inline NodePtr<Node>& TVMArgValue::node_sptr() {
   TVM_CHECK_TYPE_CODE(type_code_, kNodeHandle);
-  return *ptr<std::shared_ptr<Node> >();
+  return *ptr<NodePtr<Node> >();
 }
 
 
 template<typename TNodeRef, typename>
 inline bool TVMArgValue::IsNodeType() const {
   TVM_CHECK_TYPE_CODE(type_code_, kNodeHandle);
-  std::shared_ptr<Node>& sptr =
-      *ptr<std::shared_ptr<Node> >();
+  NodePtr<Node>& sptr =
+      *ptr<NodePtr<Node> >();
   return NodeTypeChecker<TNodeRef>::Check(sptr.get());
 }
 
 // extensions for TVMRetValue
 inline TVMRetValue& TVMRetValue::operator=(
-    const std::shared_ptr<Node>& other) {
+    const NodePtr<Node>& other) {
   if (other.get() == nullptr) {
     SwitchToPOD(kNull);
   } else {
-    SwitchToClass<std::shared_ptr<Node> >(kNodeHandle, other);
+    SwitchToClass<NodePtr<Node> >(kNodeHandle, other);
   }
   return *this;
 }
@@ -174,7 +193,7 @@ inline TVMRetValue& TVMRetValue::operator=(const NodeRef& other) {
   if (!other.defined()) {
     SwitchToPOD(kNull);
   } else {
-    SwitchToClass<std::shared_ptr<Node> >(kNodeHandle, other.node_);
+    SwitchToClass<NodePtr<Node> >(kNodeHandle, other.node_);
   }
   return *this;
 }
@@ -186,7 +205,7 @@ inline TNodeRef TVMRetValue::AsNodeRef() const {
       "Conversion only works for NodeRef");
   if (type_code_ == kNull) return TNodeRef();
   TVM_CHECK_TYPE_CODE(type_code_, kNodeHandle);
-  std::shared_ptr<Node>& sptr = *ptr<std::shared_ptr<Node> >();
+  NodePtr<Node>& sptr = *ptr<NodePtr<Node> >();
   CHECK(NodeTypeChecker<TNodeRef>::Check(sptr.get()))
       << "Expected type " << NodeTypeName<TNodeRef>()
       << " but get " << sptr->type_key();
@@ -195,7 +214,7 @@ inline TNodeRef TVMRetValue::AsNodeRef() const {
 
 inline void TVMArgsSetter::operator()(size_t i, const NodeRef& other) const {  // NOLINT(*)
   if (other.defined()) {
-    values_[i].v_handle = const_cast<std::shared_ptr<Node>*>(&(other.node_));
+    values_[i].v_handle = const_cast<NodePtr<Node>*>(&(other.node_));
     type_codes_[i] = kNodeHandle;
   } else {
     type_codes_[i] = kNull;
diff --git a/include/tvm/relay/attrs/debug.h b/include/tvm/relay/attrs/debug.h
new file mode 100644
index 000000000000..8243dc0a3b91
--- /dev/null
+++ b/include/tvm/relay/attrs/debug.h
@@ -0,0 +1,29 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/attrs/debug.h
+ * \brief Auxiliary attributes for debug operators.
+ */
+#ifndef TVM_RELAY_ATTRS_DEBUG_H_
+#define TVM_RELAY_ATTRS_DEBUG_H_
+
+#include <tvm/attrs.h>
+#include <string>
+
+namespace tvm {
+namespace relay {
+
+/*!
+ * \brief Options for the debug operators.
+ */
+struct DebugAttrs : public tvm::AttrsNode<DebugAttrs> {
+  EnvFunc debug_func;
+
+  TVM_DECLARE_ATTRS(DebugAttrs, "relay.attrs.DebugAttrs") {
+    TVM_ATTR_FIELD(debug_func)
+        .describe("The function to use when debugging.");
+  }
+};
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_ATTRS_DEBUG_H_
diff --git a/include/tvm/relay/attrs/image.h b/include/tvm/relay/attrs/image.h
new file mode 100644
index 000000000000..527bb647314f
--- /dev/null
+++ b/include/tvm/relay/attrs/image.h
@@ -0,0 +1,41 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/attrs/image.h
+ * \brief Auxiliary attributes for image operators.
+ */
+#ifndef TVM_RELAY_ATTRS_IMAGE_H_
+#define TVM_RELAY_ATTRS_IMAGE_H_
+
+#include <tvm/attrs.h>
+#include <string>
+
+namespace tvm {
+namespace relay {
+
+/*! \brief Attributes used in image resize operator */
+struct ResizeAttrs : public tvm::AttrsNode<ResizeAttrs> {
+  Array<IndexExpr> size;
+  std::string layout;
+  std::string method;
+  bool align_corners;
+
+  TVM_DECLARE_ATTRS(ResizeAttrs, "relay.attrs.ResizeAttrs") {
+    TVM_ATTR_FIELD(size).set_default(NullValue<Array<IndexExpr> >())
+        .describe("Output Size.");
+    TVM_ATTR_FIELD(layout).set_default("NCHW")
+        .describe("Dimension ordering of input data. Can be 'NCHW', 'NHWC', etc."
+                  "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
+                  "dimensions respectively. Resize is applied on the 'H' and"
+                  "'W' dimensions.");
+    TVM_ATTR_FIELD(method).set_default("BILINEAR")
+        .describe("Specify the mode to use for scaling."
+                  "NEAREST_NEIGHBOR -  Nearest Neighbor"
+                  "BILINEAR - Bilinear Interpolation");
+    TVM_ATTR_FIELD(align_corners).set_default(false)
+        .describe("Should be true to preserve the values at the corner pixels");
+  }
+};
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_ATTRS_IMAGE_H_
diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h
new file mode 100644
index 000000000000..724749368aa9
--- /dev/null
+++ b/include/tvm/relay/attrs/nn.h
@@ -0,0 +1,371 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/attrs/nn.h
+ * \brief Auxiliary attributes for nn operators.
+ */
+#ifndef TVM_RELAY_ATTRS_NN_H_
+#define TVM_RELAY_ATTRS_NN_H_
+
+#include <tvm/attrs.h>
+#include <string>
+
+namespace tvm {
+namespace relay {
+
+/*!
+ * \brief Add a 1D Tensor to an axis of a data.
+ *
+ * \note bias_add is a special add operator that is in nn
+ *   and enables automatic derivation of bias's shape.
+ *   You can directly use add for more generalized case.
+ */
+struct BiasAddAttrs : public tvm::AttrsNode<BiasAddAttrs> {
+  int axis;
+
+  TVM_DECLARE_ATTRS(BiasAddAttrs, "relay.attrs.BiasAddAttrs") {
+    TVM_ATTR_FIELD(axis)
+        .describe("The axis to add the bias")
+        .set_default(1);
+  }
+};
+
+/*! \brief Attributes used in convolution operators */
+struct Conv2DAttrs : public tvm::AttrsNode<Conv2DAttrs> {
+  Array<IndexExpr> strides;
+  Array<IndexExpr> padding;
+  Array<IndexExpr> dilation;
+  int groups;
+  IndexExpr channels;
+  Array<IndexExpr> kernel_size;
+  std::string data_layout;
+  std::string weight_layout;
+  std::string out_layout;
+  DataType out_dtype;
+
+  TVM_DECLARE_ATTRS(Conv2DAttrs, "relay.attrs.Conv2DAttrs") {
+    TVM_ATTR_FIELD(strides).set_default(Array<IndexExpr>({1, 1}))
+        .describe("Specifies the strides of the convolution.");
+    TVM_ATTR_FIELD(padding).set_default(Array<IndexExpr>({0, 0}))
+        .describe("If padding is non-zero, then the input is implicitly zero-padded"
+                  "on both sides for padding number of points");
+    TVM_ATTR_FIELD(dilation).set_default(Array<IndexExpr>({1, 1}))
+        .describe("Specifies the dilation rate to use for dilated convolution.");
+    TVM_ATTR_FIELD(groups).set_default(1)
+        .describe("Controls the connections between inputs and outputs."
+                  "At groups=1, all inputs are convolved to all outputs."
+                  "At groups=2, the operation becomes equivalent to having two convolution"
+                  "layers side by side, each seeing half the input channels, and producing"
+                  "half the output channels, and both subsequently concatenated.");
+    TVM_ATTR_FIELD(channels)
+        .describe("The number of output channels in the convolution."
+                  " If it is not set, inferred by shape of the weight.")
+        .set_default(NullValue<IndexExpr>());
+    TVM_ATTR_FIELD(kernel_size)
+        .describe("Specifies the dimensions of the convolution window.")
+        .set_default(NullValue<Array<IndexExpr> >());
+    TVM_ATTR_FIELD(data_layout).set_default("NCHW")
+        .describe("Dimension ordering of input data. Can be 'NCHW', 'NHWC', etc."
+                  "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
+                  "dimensions respectively. Convolution is applied on the 'H' and"
+                  "'W' dimensions.");
+    TVM_ATTR_FIELD(weight_layout).set_default("OIHW")
+        .describe("Dimension ordering of weight. Can be 'OIHW', 'OIHW16o16i', etc."
+                  "'O', 'I', 'H', 'W' stands for num_filter, input_channel, height, and width"
+                  "dimensions respectively.");
+    TVM_ATTR_FIELD(out_layout).set_default("")
+        .describe("Dimension ordering of output. Can be 'NCHW', 'NHWC', etc."
+                  "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
+                  "dimensions respectively. Default to be same as input layout.");
+
+    // use 0 bits to indicate none.
+    TVM_ATTR_FIELD(out_dtype)
+        .set_default(NullValue<DataType>())
+        .describe("Output data type, set to explicit type under mixed precision setting");
+  }
+};
+
+/*! \brief Attributes used in softmax operators */
+struct SoftmaxAttrs : public tvm::AttrsNode<SoftmaxAttrs> {
+  int axis;
+
+  TVM_DECLARE_ATTRS(SoftmaxAttrs, "relay.attrs.SoftmaxAttrs") {
+      TVM_ATTR_FIELD(axis).set_default(-1)
+          .describe("The axis to sum over when computing softmax.");
+  }
+};
+
+/*! \brief Attributes used in transposed convolution operator */
+struct Conv2DTransposeAttrs : public tvm::AttrsNode<Conv2DTransposeAttrs> {
+  IndexExpr channels;
+  Array<IndexExpr> kernel_size;
+  Array<IndexExpr> strides;
+  Array<IndexExpr> padding;
+  Array<IndexExpr> output_padding;
+  Array<IndexExpr> dilation;
+  int groups;
+  std::string data_layout;
+  std::string weight_layout;
+  std::string out_layout;
+  DataType out_dtype;
+
+  TVM_DECLARE_ATTRS(Conv2DTransposeAttrs, "relay.attrs.Conv2DTransposeAttrs") {
+    TVM_ATTR_FIELD(channels)
+      .set_default(NullValue<IndexExpr>())
+      .describe("The dimensionality of the output space"
+                "i.e. the number of output channels in the convolution.");
+    TVM_ATTR_FIELD(kernel_size)
+      .describe("The dimensions of the convolution window.")
+      .set_default(NullValue<Array<IndexExpr> >());
+    TVM_ATTR_FIELD(strides).set_default(Array<IndexExpr>({1, 1}))
+      .describe("The strides of the convolution.");
+    TVM_ATTR_FIELD(output_padding).set_default(Array<IndexExpr>({0, 0}))
+      .describe("Zero-padding added to one side of the output.");
+    TVM_ATTR_FIELD(padding).set_default(Array<IndexExpr>({0, 0}))
+      .describe("If padding is non-zero, then the input is implicitly zero-padded"
+                "on both sides for padding number of points");
+    TVM_ATTR_FIELD(dilation).set_default(Array<IndexExpr>({1, 1}))
+      .describe("Specifies the dilation rate to use for dilated convolution.");
+    TVM_ATTR_FIELD(groups).set_default(1)
+      .describe("Controls the connections between inputs and outputs."
+                "At groups=1, all inputs are convolved to all outputs."
+                "At groups=2, the operation becomes equivalent to having two convolution"
+                "layers side by side, each seeing half the input channels, and producing"
+                "half the output channels, and both subsequently concatenated.");
+    TVM_ATTR_FIELD(data_layout).set_default("NCHW")
+      .describe("Dimension ordering of data. Can be 'NCHW', 'NHWC', etc."
+                "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
+                "dimensions respectively. Convolution is applied on the 'H' and"
+                "'W' dimensions.");
+    TVM_ATTR_FIELD(weight_layout).set_default("OIHW")
+      .describe("Dimension ordering of data and weight. Can be 'OIHW', 'OIHW16o16i', etc."
+                "'O', 'I', 'H', 'W' stands for num_filter, input_channel, height, and width"
+                "dimensions respectively.");
+    TVM_ATTR_FIELD(out_layout).set_default("")
+        .describe("Dimension ordering of output. Can be 'NCHW', 'NHWC', etc."
+                      "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
+                      "dimensions respectively. Default to be same as input layout.");
+    TVM_ATTR_FIELD(out_dtype)
+        .set_default(NullValue<DataType>())
+        .describe("Output data type, set to explicit type under mixed precision setting");
+  }
+};
+
+/*! \brief Attributes for max pool operator */
+struct MaxPool2DAttrs : public tvm::AttrsNode<MaxPool2DAttrs> {
+  Array<IndexExpr> pool_size;
+  Array<IndexExpr> strides;
+  Array<IndexExpr> padding;
+  std::string layout;
+  bool ceil_mode;
+
+  TVM_DECLARE_ATTRS(MaxPool2DAttrs, "relay.attrs.MaxPool2DAttrs") {
+    TVM_ATTR_FIELD(pool_size)
+      .describe("Size of the pooling windows.");
+    TVM_ATTR_FIELD(strides).set_default(Array<IndexExpr>({1, 1}))
+      .describe("Specifies the strides of the convolution.");
+    TVM_ATTR_FIELD(padding).set_default(Array<IndexExpr>({0, 0}))
+      .describe("If padding is non-zero, then the input is implicitly zero-padded"
+                "Padding support both symmetric and asymmetric as"
+                "one int : same padding used on all sides"
+                "two int : bottom, right will use same padding as top, left"
+                "four int : padding width in the order of (top, left, bottom, right)");
+    TVM_ATTR_FIELD(layout).set_default("NCHW")
+      .describe("Dimension ordering of data and weight. Can be 'NCHW', 'NHWC', etc."
+                "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
+                "dimensions respectively. Convolution is applied on the 'H' and"
+                "'W' dimensions.");
+    TVM_ATTR_FIELD(ceil_mode).set_default(false)
+      .describe("When true, will use ceil instead of floor to compute the output shape.");
+  }
+};
+
+/*! \brief Attributes for avg pool operator */
+struct AvgPool2DAttrs : public tvm::AttrsNode<AvgPool2DAttrs> {
+  Array<IndexExpr> pool_size;
+  Array<IndexExpr> strides;
+  Array<IndexExpr> padding;
+  std::string layout;
+  bool ceil_mode;
+  bool count_include_pad;
+
+  TVM_DECLARE_ATTRS(AvgPool2DAttrs, "relay.attrs.AvgPool2DAttrs") {
+    TVM_ATTR_FIELD(pool_size)
+      .describe("Size of the pooling windows.");
+    TVM_ATTR_FIELD(strides).set_default(Array<IndexExpr>({1, 1}))
+      .describe("Specifies the strides of the convolution.");
+    TVM_ATTR_FIELD(padding).set_default(Array<IndexExpr>({0, 0}))
+      .describe("If padding is non-zero, then the input is implicitly zero-padded"
+                "Padding support both symmetric and asymmetric as"
+                "one int : same padding used on all sides"
+                "two int : bottom, right will use same padding as top, left"
+                "four int : padding width in the order of (top, left, bottom, right)");
+    TVM_ATTR_FIELD(layout).set_default("NCHW")
+      .describe("Dimension ordering of data and weight. Can be 'NCHW', 'NHWC', etc."
+                "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
+                "dimensions respectively. Convolution is applied on the 'H' and"
+                "'W' dimensions.");
+    TVM_ATTR_FIELD(ceil_mode).set_default(false)
+      .describe("When true, will use ceil instead of floor to compute the output shape.");
+    TVM_ATTR_FIELD(count_include_pad).set_default(false)
+      .describe("When true, will include padding to compute the average");
+  }
+};
+
+/*! \brief Attributes for global pool operator */
+struct GlobalPool2DAttrs : public tvm::AttrsNode<GlobalPool2DAttrs> {
+  std::string layout;
+
+  TVM_DECLARE_ATTRS(GlobalPool2DAttrs, "relay.attrs.GlobalPool2DAttrs") {
+    TVM_ATTR_FIELD(layout).set_default("NCHW")
+      .describe("Dimension ordering of data and weight. Can be 'NCHW', 'NHWC', etc."
+                "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
+                "dimensions respectively. Convolution is applied on the 'H' and"
+                "'W' dimensions.");
+  }
+};
+
+
+/*! \brief Attributes for dense operator */
+struct DenseAttrs : public tvm::AttrsNode<DenseAttrs> {
+  IndexExpr units;
+
+  TVM_DECLARE_ATTRS(DenseAttrs, "relay.attrs.DenseAttrs") {
+    TVM_ATTR_FIELD(units)
+        .describe("Number of hidden units of the dense transformation.");
+  }
+};
+
+
+/*! \brief Attributes for upsampling operator */
+struct UpSamplingAttrs : public tvm::AttrsNode<UpSamplingAttrs> {
+  int scale;
+  std::string layout;
+  std::string method;
+
+  TVM_DECLARE_ATTRS(UpSamplingAttrs, "relay.attrs.UpSamplingAttrs") {
+    TVM_ATTR_FIELD(scale)
+        .describe("Should be true to preserve the values at the corner pixels");
+    TVM_ATTR_FIELD(layout).set_default("NCHW")
+        .describe("Dimension ordering of input data. Can be 'NCHW', 'NHWC', etc."
+                  "'N', 'C', 'H', 'W' stands for batch, channel, height, and width"
+                  "dimensions respectively. Upsampling is applied on the 'H' and"
+                  "'W' dimensions.");
+    TVM_ATTR_FIELD(method).set_default("NEAREST_NEIGHBOR")
+        .describe("Specify the mode to use for scaling."
+                  "NEAREST_NEIGHBOR -  Nearest Neighbor"
+                  "BILINEAR - Bilinear Interpolation");
+  }
+};
+
+/*! \brief Attributes used for the padding operator */
+struct PadAttrs : public tvm::AttrsNode<PadAttrs> {
+  double pad_value;
+  Array<Array<IndexExpr> > pad_width;
+
+  TVM_DECLARE_ATTRS(PadAttrs, "relay.attrs.PadAttrs") {
+    TVM_ATTR_FIELD(pad_value).set_default(0.0)
+      .describe("Specifies the strides of the convolution.");
+    TVM_ATTR_FIELD(pad_width)
+      .describe("Number of values padded to the edges of each axis, "
+                "in the format of ((before_1, after_1), ..., (before_N, after_N))");
+  }
+};
+
+
+/*! \brief Attributes for leaky relu operator */
+struct LeakyReluAttrs : public tvm::AttrsNode<LeakyReluAttrs> {
+  double alpha;
+
+  TVM_DECLARE_ATTRS(LeakyReluAttrs, "relay.attrs.LeakyReluAttrs") {
+    TVM_ATTR_FIELD(alpha).set_lower_bound(0.0).set_default(0.25)
+        .describe("Slope coefficient for the negative half axis.");
+  }
+};
+
+
+/*! \brief Attributes for prelu operator */
+struct PReluAttrs : public tvm::AttrsNode<PReluAttrs> {
+  int axis;
+
+  TVM_DECLARE_ATTRS(PReluAttrs, "relay.attrs.PReluAttrs") {
+    TVM_ATTR_FIELD(axis).set_default(1)
+        .describe("Specify which shape axis the channel is specified.");
+  }
+};
+
+
+/*! \brief Attributes used in dropout operator */
+struct DropoutAttrs : public tvm::AttrsNode<DropoutAttrs> {
+  double rate;
+  TVM_DECLARE_ATTRS(DropoutAttrs, "relay.attrs.DropoutAttrs") {
+    TVM_ATTR_FIELD(rate)
+      .describe("Fraction of the input that gets dropped out during training time")
+      .set_default(0.5);
+  }
+};  // struct DropoutAttrs
+
+/*! \brief Attributes used in batch_norm operator */
+struct BatchNormAttrs : public tvm::AttrsNode<BatchNormAttrs> {
+  int axis;
+  double epsilon;
+  bool center;
+  bool scale;
+
+  TVM_DECLARE_ATTRS(BatchNormAttrs, "relay.attrs.BatchNormAttrs") {
+    TVM_ATTR_FIELD(axis)
+      .describe("Specify which shape axis denotes the channel.")
+      .set_default(1);
+    TVM_ATTR_FIELD(epsilon)
+      .describe("Small float added to variance to avoid dividing by zero")
+      .set_default(1e-5);
+    TVM_ATTR_FIELD(center)
+      .describe("If True, add offset of beta to normalized tensor. If False, beta is ignored")
+      .set_default(true);
+    TVM_ATTR_FIELD(scale)
+      .describe("If True, multiply by gamma. If False, gamma is not used. "
+                "When the next layer is piecewise linear (also, e.g., nn.relu), "
+                "this can be disabled since the scaling will be done by the next layer.")
+      .set_default(true);
+  }
+};  // struct BatchNormAttrs
+
+
+/*! \brief Attributes for LRN operator */
+struct LRNAttrs : public tvm::AttrsNode<LRNAttrs> {
+  int size;
+  int axis;
+  double bias;
+  double alpha;
+  double beta;
+
+  TVM_DECLARE_ATTRS(LRNAttrs, "relay.attrs.LRNAttrs") {
+    TVM_ATTR_FIELD(size).set_default(5)
+      .describe("The size of the local region to be considered for normalization.");
+    TVM_ATTR_FIELD(axis).set_default(1)
+      .describe("Axis of input data layout channel.");
+    TVM_ATTR_FIELD(bias).set_default(2)
+      .describe("The offset parameter to avoid division by 0.");
+    TVM_ATTR_FIELD(alpha).set_default(0.0001)
+      .describe("The scaling parameter.");
+    TVM_ATTR_FIELD(beta).set_default(0.75)
+      .describe("The exponent parameter.");
+  }
+};
+
+
+/*! \brief Attributes for L2Normalize operator */
+struct L2NormalizeAttrs : public tvm::AttrsNode<L2NormalizeAttrs> {
+  double eps;
+  Array<Integer> axis;
+
+  TVM_DECLARE_ATTRS(L2NormalizeAttrs, "relay.attrs.L2NormalizeAttrs") {
+    TVM_ATTR_FIELD(eps)
+      .describe("A lower bound value for the norm, to avoid division by 0.");
+    TVM_ATTR_FIELD(axis)
+      .describe("Axis over the normalization applied.");
+  }
+};
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_ATTRS_NN_H_
diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
new file mode 100644
index 000000000000..7e614a8cafd4
--- /dev/null
+++ b/include/tvm/relay/attrs/transform.h
@@ -0,0 +1,182 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/attrs/transform.h
+ * \brief Transform operators.
+ */
+#ifndef TVM_RELAY_ATTRS_TRANSFORM_H_
+#define TVM_RELAY_ATTRS_TRANSFORM_H_
+
+#include <tvm/attrs.h>
+#include <string>
+
+namespace tvm {
+namespace relay {
+
+/*! \brief data type cast */
+struct CastAttrs : public tvm::AttrsNode<CastAttrs> {
+  DataType dtype;
+
+  TVM_DECLARE_ATTRS(CastAttrs, "relay.attrs.CastAttrs") {
+    TVM_ATTR_FIELD(dtype)
+        .describe("Target data type");
+  }
+};  // struct CastAttrs.
+
+/*! \brief Attributes used in expand_dims operators */
+struct ExpandDimsAttrs : public tvm::AttrsNode<ExpandDimsAttrs> {
+  int axis;
+  int num_newaxis;
+
+  TVM_DECLARE_ATTRS(ExpandDimsAttrs, "relay.attrs.ExpandDimsAttrs") {
+    TVM_ATTR_FIELD(axis)
+        .describe("The axis at which the input array is expanded."
+                  "Should lie in range `[-data.ndim - 1, data.ndim]`."
+                  "If `axis < 0`, it is the first axis inserted;"
+                  "If `axis >= 0`, it is the last axis inserted in Python's negative indexing.");
+    TVM_ATTR_FIELD(num_newaxis)
+        .describe("Number of axises to be inserted. Should be >= 0.")
+        .set_lower_bound(0)
+        .set_default(1);
+  }
+};  // struct ExpandDimsAttrs
+
+/*! \brief Attributes used in concatenate operators */
+struct ConcatenateAttrs : public tvm::AttrsNode<ConcatenateAttrs> {
+  int axis;
+  TVM_DECLARE_ATTRS(ConcatenateAttrs, "relay.attrs.ConcatenateAttrs") {
+    TVM_ATTR_FIELD(axis)
+        .describe("The axis at which the input arrays are concatenated."
+                  "Should lie in range `[-ndim, ndim)`.")
+        .set_default(0);
+  }
+};  // struct ConcatenateAttrs
+
+/*! \brief Attributes used in transpose operators */
+struct TransposeAttrs : public tvm::AttrsNode<TransposeAttrs> {
+  Array<Integer> axes;
+  TVM_DECLARE_ATTRS(TransposeAttrs, "relay.attrs.TransposeAttrs") {
+    TVM_ATTR_FIELD(axes)
+        .describe("The target axes order, reverse order if not specified.");
+  }
+};  // struct TransposeAttrs
+
+/*! \brief Attributes used in reshape operators */
+struct ReshapeAttrs : public tvm::AttrsNode<ReshapeAttrs> {
+  Array<Integer> newshape;
+  TVM_DECLARE_ATTRS(ReshapeAttrs, "relay.attrs.ReshapeAttrs") {
+    TVM_ATTR_FIELD(newshape)
+        .describe("The new shape. Should be compatible with the original shape.");
+  }
+};  // struct ReshapeAttrs
+
+struct TakeAttrs : public tvm::AttrsNode<TakeAttrs> {
+  Integer axis;
+
+  TVM_DECLARE_ATTRS(TakeAttrs, "relay.attrs.TakeAttrs") {
+    TVM_ATTR_FIELD(axis).set_default(NullValue<Integer>())
+        .describe("The axis over which to select values.");
+  }
+};
+
+/*! \brief Attributes that specify a tensor */
+struct InitOpAttrs : public tvm::AttrsNode<InitOpAttrs> {
+  Array<IndexExpr> shape;
+  DataType dtype;
+
+  TVM_DECLARE_ATTRS(InitOpAttrs, "relay.attrs.InitOpAttrs") {
+    TVM_ATTR_FIELD(shape)
+      .describe("Target shape.");
+    TVM_ATTR_FIELD(dtype)
+      .describe("Target data type.")
+      .set_default(NullValue<DataType>());
+  }
+};  // struct InitOpAttrs
+
+/*! \brief Attributes used in squeeze operators */
+struct SqueezeAttrs : public tvm::AttrsNode<SqueezeAttrs> {
+  // use axis to make the name numpy compatible.
+  Array<Integer> axis;
+
+  TVM_DECLARE_ATTRS(SqueezeAttrs, "relay.attrs.SqueezeAttrs") {
+    TVM_ATTR_FIELD(axis)
+        .describe("The axis to squeeze in the input tensor."
+                  "If `axis = None`, all axis of dimension 1 get squeezed;"
+                  "Else, the dimension in axes get squeezed."
+                  "It is an error if an axis does not has dimension 1.")
+        .set_default(NullValue<Array<Integer> >());
+  }
+};  // struct SqueezeAttrs
+
+struct SplitAttrs : public tvm::AttrsNode<SplitAttrs> {
+  NodeRef indices_or_sections;
+  int axis;
+
+  TVM_DECLARE_ATTRS(SplitAttrs, "relay.attrs.SplitAttrs") {
+    TVM_ATTR_FIELD(indices_or_sections)
+        .describe("Indices or sections to split into. Accepts an int or a tuple"
+                  "If indices_or_sections is an integer, the input will be divided equally"
+                  "along given axis. If such a split is not possible, an error is raised."
+                  "If indices_or_sections is a tuple of sorted integers,"
+                  "the entries indicate where along axis the array is split.");
+    TVM_ATTR_FIELD(axis).set_default(0)
+        .describe("the axis to be splitted.");
+  }
+};
+
+/*! \brief Attributes for StridedSlice operator */
+struct StridedSliceAttrs : public tvm::AttrsNode<StridedSliceAttrs> {
+  Array<Integer> begin;
+  Array<Integer> end;
+  Array<Integer> strides;
+
+  TVM_DECLARE_ATTRS(StridedSliceAttrs, "relay.attrs.StridedSliceAttrs") {
+    TVM_ATTR_FIELD(begin)
+        .describe("Indices for begin of slice, begin index is also inclusive");
+    TVM_ATTR_FIELD(end)
+        .describe("Indices for end of slice, end index is exclusive");
+    TVM_ATTR_FIELD(strides).set_default(Array<Integer>({}))
+        .describe("Stride values of the slice");
+  }
+};
+
+
+struct SliceLikeAttrs : public tvm::AttrsNode<SliceLikeAttrs> {
+  Array<Integer> axes;
+
+  TVM_DECLARE_ATTRS(SliceLikeAttrs, "relay.attrs.SliceLikeAttrs") {
+    TVM_ATTR_FIELD(axes)
+        .describe("List of axes on which input data will be sliced according to the "
+                  "corresponding size of the second input. By default will slice "
+                  "on all axes. Negative axes mean counting in reverse.");
+  }
+};
+
+// Clip
+struct ClipAttrs : public tvm::AttrsNode<ClipAttrs> {
+  double a_min;
+  double a_max;
+
+  TVM_DECLARE_ATTRS(ClipAttrs, "relay.attrs.ClipAttrs") {
+  TVM_ATTR_FIELD(a_min)
+    .describe("The minimum clip value.");
+  TVM_ATTR_FIELD(a_max)
+    .describe("The maximum clip value.");
+  }
+};
+
+
+struct LayoutTransformAttrs : public tvm::AttrsNode<LayoutTransformAttrs> {
+  std::string src_layout;
+  std::string dst_layout;
+
+  TVM_DECLARE_ATTRS(LayoutTransformAttrs, "relay.attrs.LayoutTransformAttrs") {
+    TVM_ATTR_FIELD(src_layout)
+        .describe("The source layout of the tensor. (e.g. NCHW)");
+    TVM_ATTR_FIELD(dst_layout)
+        .describe("The destination layout of the tensor. (e.g. NCHW16c)");
+  }
+};
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_ATTRS_TRANSFORM_H_
diff --git a/include/tvm/relay/attrs/vision.h b/include/tvm/relay/attrs/vision.h
new file mode 100644
index 000000000000..b736bd9c06a0
--- /dev/null
+++ b/include/tvm/relay/attrs/vision.h
@@ -0,0 +1,79 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/attrs/vision.h
+ * \brief Auxiliary attributes for vision operators.
+ */
+#ifndef TVM_RELAY_ATTRS_VISION_H_
+#define TVM_RELAY_ATTRS_VISION_H_
+
+#include <tvm/attrs.h>
+#include <string>
+
+namespace tvm {
+namespace relay {
+
+/*! \brief Attributes used in multibox_prior operators */
+struct MultiBoxPriorAttrs : public tvm::AttrsNode<MultiBoxPriorAttrs> {
+  Array<IndexExpr> sizes;
+  Array<IndexExpr> ratios;
+  Array<IndexExpr> steps;
+  Array<IndexExpr> offsets;
+  bool clip;
+
+  TVM_DECLARE_ATTRS(MultiBoxPriorAttrs, "relay.attrs.MultiBoxPriorAttrs") {
+    TVM_ATTR_FIELD(sizes)
+      .set_default(Array<IndexExpr>({static_cast<float>(1.0)}))
+      .describe("List of sizes of generated MultiBoxPriores.");
+    TVM_ATTR_FIELD(ratios)
+      .set_default(Array<IndexExpr>({static_cast<float>(1.0)}))
+      .describe("List of aspect ratios of generated MultiBoxPriores.");
+    TVM_ATTR_FIELD(steps)
+      .set_default(Array<IndexExpr>({static_cast<float>(-1.0),
+                                     static_cast<float>(-1.0)}))
+      .describe("Priorbox step across y and x, -1 for auto calculation.");
+    TVM_ATTR_FIELD(offsets)
+      .set_default(Array<IndexExpr>({static_cast<float>(0.5),
+                                     static_cast<float>(0.5)}))
+      .describe("Priorbox center offsets, y and x respectively.");
+    TVM_ATTR_FIELD(clip).set_default(false)
+      .describe("Whether to clip out-of-boundary boxes.");
+  }
+};
+
+struct MultiBoxTransformLocAttrs
+    : public tvm::AttrsNode<MultiBoxTransformLocAttrs> {
+  bool clip;
+  double threshold;
+  Array<IndexExpr> variances;
+
+  TVM_DECLARE_ATTRS(MultiBoxTransformLocAttrs,
+                    "relay.attrs.MultiBoxTransformLocAttrs") {
+    TVM_ATTR_FIELD(clip).set_default(true)
+      .describe("Clip out-of-boundary boxes.");
+    TVM_ATTR_FIELD(threshold).set_default(0.01)
+      .describe("Threshold to be a positive prediction.");
+    TVM_ATTR_FIELD(variances)
+      .set_default(Array<IndexExpr>({0.1f, 0.1f , 0.2f, 0.2f}))
+      .describe("Variances to be decoded from box regression output.");
+  }
+};
+
+/*! \brief Attributes used in non_maximum_suppression operators */
+struct NMSAttrs : public tvm::AttrsNode<NMSAttrs>{
+  double overlap_threshold;
+  bool force_suppress;
+  int topk;
+
+  TVM_DECLARE_ATTRS(NMSAttrs, "relay.attrs.NMSAttrs") {
+      TVM_ATTR_FIELD(overlap_threshold).set_default(0.5)
+        .describe("Non-maximum suppression threshold.");
+      TVM_ATTR_FIELD(force_suppress).set_default(false)
+        .describe("Suppress all detections regardless of class_id.");
+      TVM_ATTR_FIELD(topk).set_default(-1)
+        .describe("Keep maximum top k detections before nms, -1 for no limit.");
+  }
+};
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_ATTRS_VISION_H_
diff --git a/include/tvm/relay/base.h b/include/tvm/relay/base.h
new file mode 100644
index 000000000000..f72f557a9765
--- /dev/null
+++ b/include/tvm/relay/base.h
@@ -0,0 +1,201 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/base.h
+ * \brief Base classes for the Relay IR.
+ */
+#ifndef TVM_RELAY_BASE_H_
+#define TVM_RELAY_BASE_H_
+
+#include <tvm/api_registry.h>
+#include <tvm/ir.h>
+#include <tvm/node/node.h>
+#include <string>
+#include <vector>
+
+namespace tvm {
+/*!
+ * \brief Relay: a high level functional IR for TVM.
+ *
+ * This namespace contains the abstract syntax tree, and other
+ * essential data structures for the Relay IR.
+ *
+ * You can find more about Relay by reading the language reference.
+ */
+namespace relay {
+
+#define RELAY_DEBUG(...) \
+{ auto fdebug = runtime::Registry::Get("relay.debug"); \
+  CHECK(fdebug) << "Could not find Relay Python debugger function."; \
+  (*fdebug)("RELAY_DEBUG", __FILE__, __LINE__, __VA_ARGS__); \
+}
+
+/*!
+ * \brief We always used NodeRef for referencing nodes.
+ *
+ *  By default, NodeRef is a std::shared_ptr of node
+ */
+using NodeRef = tvm::NodeRef;
+
+/*!
+ * \brief Content data type.
+ */
+using DataType = ::tvm::Type;
+
+/*!
+ * \brief Symbolic expression for tensor shape.
+ */
+using IndexExpr = ::tvm::Expr;
+
+/*!
+ * \brief Hash function for nodes.
+ * e.g. std::unordered_map<Expr, Value, NodeHash, NodeEqual>
+ */
+using NodeHash = ::tvm::NodeHash;
+/*!
+ * \brief Equality check function for nodes.
+ */
+using NodeEqual = ::tvm::NodeEqual;
+
+/*!
+ * \brief Macro to make it easy to define node ref type given node
+ * \param TypeName The name of the reference type.
+ * \param NodeName The internal container name.
+ * \param NodeRefBase The base type.
+ */
+#define RELAY_DEFINE_NODE_REF(TypeName, NodeName, NodeRefBase)          \
+  class TypeName : public NodeRefBase {                                 \
+   public:                                                              \
+    TypeName() {}                                                        \
+    explicit TypeName(::tvm::NodePtr<::tvm::Node> n) : NodeRefBase(n) {} \
+    const NodeName* operator->() const {                                \
+      return static_cast<const NodeName*>(node_.get());                 \
+    }                                                                   \
+    operator bool() { return this->defined(); }                         \
+    using ContainerType = NodeName;                                     \
+  };
+
+/*!
+ * \brief The source name in the Span
+ * \sa SourceNameNode, Span
+ */
+class SourceName;
+/*!
+ * \brief The name of a source fragment.
+ */
+class SourceNameNode : public Node {
+ public:
+  /*! \brief The source name. */
+  std::string name;
+  // override attr visitor
+  void VisitAttrs(AttrVisitor* v) final { v->Visit("name", &name); }
+
+  static constexpr const char* _type_key = "relay.SourceName";
+  TVM_DECLARE_NODE_TYPE_INFO(SourceNameNode, Node);
+};
+
+/*!
+ * \brief The source name of a file span.
+ * \sa SourceNameNode, Span
+ */
+class SourceName : public NodeRef {
+ public:
+  /*! \brief default constructor  */
+  SourceName() {}
+
+  /*! \brief constructor from node pointer */
+  explicit SourceName(NodePtr<Node> n) : NodeRef(n) {}
+  /*!
+   * \brief access the internal node container
+   * \return the pointer to the internal node container
+   */
+  inline const SourceNameNode* operator->() const;
+
+  /*!
+   * \brief Get an SourceName for a given operator name.
+   *  Will raise an error if the source name has not been registered.
+   * \param name Name of the operator.
+   * \return SourceName valid throughout program lifetime.
+   */
+  TVM_DLL static SourceName Get(const std::string& name);
+
+  /*! \brief specify container node */
+  using ContainerType = SourceNameNode;
+};
+
+/*!
+ * \brief Span information for debugging purposes
+ */
+class Span;
+/*!
+ * \brief Stores locations in frontend source that generated a node.
+ */
+class SpanNode : public Node {
+ public:
+  /*! \brief The source name */
+  SourceName source;
+  /*! \brief Line number */
+  int lineno;
+  /*! \brief column offset */
+  int col_offset;
+  // override attr visitor
+  void VisitAttrs(AttrVisitor* v) final {
+    v->Visit("source", &source);
+    v->Visit("lineno", &lineno);
+    v->Visit("col_offset", &col_offset);
+  }
+
+  TVM_DLL static Span make(SourceName source, int lineno, int col_offset);
+
+  static constexpr const char* _type_key = "relay.Span";
+  TVM_DECLARE_NODE_TYPE_INFO(SpanNode, Node);
+};
+
+RELAY_DEFINE_NODE_REF(Span, SpanNode, NodeRef);
+
+/*!
+ * \brief This is the base node container of all relay structures.
+ */
+class RelayNode : public Node {
+ public:
+  /*! \brief The location of the program in a SourceFragment can be null,
+   * check with span.defined() */
+  mutable Span span;
+
+  static constexpr const char* _type_key = "relay.Node";
+  TVM_DECLARE_BASE_NODE_INFO(RelayNode, Node);
+};
+
+/*!
+ * \brief The unique identifier of variables.
+ *
+ * Id is like name to the variables,
+ * except that id is unique for each Var.
+ *
+ * \note Do not create Id directly, they are created in Var.
+ */
+class IdNode : public Node {
+ public:
+  /*!
+   * \brief The name of the variable,
+   *  this only acts as a hint to the user,
+   *  and is not used for equality.
+   */
+  std::string name_hint;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("name_hint", &name_hint);
+  }
+
+  static constexpr const char* _type_key = "relay.Id";
+  TVM_DECLARE_NODE_TYPE_INFO(IdNode, Node);
+};
+
+RELAY_DEFINE_NODE_REF(Id, IdNode, NodeRef);
+
+
+struct Module;
+
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_BASE_H_
diff --git a/include/tvm/relay/error.h b/include/tvm/relay/error.h
new file mode 100644
index 000000000000..1c2b90611bbd
--- /dev/null
+++ b/include/tvm/relay/error.h
@@ -0,0 +1,34 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file error.h
+ * \brief The set of errors raised by Relay.
+ */
+#ifndef TVM_RELAY_ERROR_H_
+#define TVM_RELAY_ERROR_H_
+
+#include <string>
+#include "./base.h"
+
+namespace tvm {
+namespace relay {
+
+struct Error : public dmlc::Error {
+  explicit Error(const std::string &msg) : dmlc::Error(msg) {}
+};
+
+struct InternalError : public Error {
+  explicit InternalError(const std::string &msg) : Error(msg) {}
+};
+
+struct FatalTypeError : public Error {
+  explicit FatalTypeError(const std::string &s) : Error(s) {}
+};
+
+struct TypecheckerError : public Error {
+  explicit TypecheckerError(const std::string &msg) : Error(msg) {}
+};
+
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_ERROR_H_
diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h
new file mode 100644
index 000000000000..14b3cd91701c
--- /dev/null
+++ b/include/tvm/relay/expr.h
@@ -0,0 +1,491 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/expr.h
+ * \brief Relay expression language.
+ */
+#ifndef TVM_RELAY_EXPR_H_
+#define TVM_RELAY_EXPR_H_
+
+#include <tvm/attrs.h>
+#include <string>
+#include <functional>
+#include "./base.h"
+#include "./type.h"
+
+namespace tvm {
+namespace relay {
+
+/*!
+ * \brief A Relay expression.
+ */
+class Expr;
+/*!
+ * \brief Base type of the Relay expression hiearchy.
+ */
+class ExprNode : public RelayNode {
+ public:
+  /*!
+   * \brief Stores the result of type inference(type checking).
+   *
+   * \note This can be undefined before type inference.
+   *       This value is discarded during serialization.
+   */
+  mutable Type checked_type_ = Type(nullptr);
+  /*!
+   * \return The checked_type
+   */
+  const Type& checked_type() const {
+    CHECK(checked_type_.defined()) << "internal error: the type checker has "
+                                      "not populated the checked_type "
+                                      "field for this node";
+    return this->checked_type_;
+  }
+  /*!
+   * \brief Check if the inferred(checked) type of the Expr
+   *  is backed by a TTypeNode and return it.
+   *
+   * \note This function will thrown an error if the node type
+   *       of this Expr is not TTypeNode.
+   *
+   * \return The corresponding TTypeNode pointer.
+   * \tparam The specific TypeNode we look for.
+   */
+  template<typename TTypeNode>
+  inline const TTypeNode* type_as() const;
+
+  static constexpr const char* _type_key = "relay.Expr";
+  TVM_DECLARE_BASE_NODE_INFO(ExprNode, RelayNode);
+};
+
+RELAY_DEFINE_NODE_REF(Expr, ExprNode, NodeRef);
+
+/*!
+ * \brief Constant tensor, backed by an NDArray on the cpu(0) device.
+ *
+ * \note Scalar constants are represented by rank-0 const tensor.
+ *  Constant folding are handled uniformly via Tensor types.
+ */
+class Constant;
+/*!
+ * \brief Constant tensor type.
+ */
+class ConstantNode : public ExprNode {
+ public:
+  /*! \brief The data of the tensor */
+  runtime::NDArray data;
+
+  /*! \return The corresponding tensor type of the data */
+  TensorType tensor_type() const;
+
+  /*! \return Whether it is scalar(rank-0 tensor) */
+  bool is_scalar() const {
+    return data->ndim == 0;
+  }
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("data", &data);
+    v->Visit("span", &span);
+    v->Visit("_checked_type_", &checked_type_);
+  }
+
+  TVM_DLL static Constant make(runtime::NDArray data);
+
+  static constexpr const char* _type_key = "relay.Constant";
+  TVM_DECLARE_NODE_TYPE_INFO(ConstantNode, ExprNode);
+};
+
+RELAY_DEFINE_NODE_REF(Constant, ConstantNode, Expr);
+
+/*! \brief Tuple of multiple Exprs */
+class Tuple;
+/*! \brief Tuple container */
+class TupleNode : public ExprNode {
+ public:
+  /*! \brief the fields of the tuple */
+  tvm::Array<relay::Expr> fields;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("fields", &fields);
+    v->Visit("span", &span);
+    v->Visit("_checked_type_", &checked_type_);
+  }
+
+  TVM_DLL static Tuple make(tvm::Array<relay::Expr> fields);
+
+  static constexpr const char* _type_key = "relay.Tuple";
+  TVM_DECLARE_NODE_TYPE_INFO(TupleNode, ExprNode);
+};
+
+RELAY_DEFINE_NODE_REF(Tuple, TupleNode, Expr);
+
+/*!
+ * \brief Local variables used in the let expression.
+ *
+ * Its semantics are similar to tvm.Var node used in TVM's low level
+ * tensor expression language.
+ *
+ * \note Each Var is bind only once and is immutable.
+ */
+class Var;
+/*! \brief Container for Var */
+class VarNode : public ExprNode {
+ public:
+  /*!
+   * \brief The unique identifier of the Var.
+   *
+   * vid will be preserved for the same Var during type inference
+   * and other rewritings, while the VarNode might be recreated
+   * to attach additional information.
+   * This property can be used to keep track of parameter Var
+   * information across passes.
+   */
+  Id vid;
+  /*!
+   * \brief type annotaion of the variable.
+   * This field records user provided type annotation of the Var.
+   * This field is optional and can be None.
+   */
+  Type type_annotation;
+
+  /*! \return The name hint of the variable */
+  const std::string& name_hint() const {
+    return vid->name_hint;
+  }
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("vid", &vid);
+    v->Visit("type_annotation", &type_annotation);
+    v->Visit("span", &span);
+    v->Visit("_checked_type_", &checked_type_);
+  }
+
+  TVM_DLL static Var make(std::string name_hint,
+                          Type type_annotation);
+
+  TVM_DLL static Var make(Id vid,
+                          Type type_annotation);
+
+  static constexpr const char* _type_key = "relay.Var";
+  TVM_DECLARE_NODE_TYPE_INFO(VarNode, ExprNode);
+};
+
+RELAY_DEFINE_NODE_REF(Var, VarNode, Expr);
+
+/*!
+ * \brief Global variable that leaves in the top-level module.
+ * This is used to enable recursive calls between function.
+ *
+ * \note A GlobalVar may only point to functions.
+ */
+class GlobalVar;
+/*! \brief A GlobalId from the node's current type to target type. */
+class GlobalVarNode : public ExprNode {
+ public:
+  /*! \brief The name of the variable, this only acts as a hint. */
+  std::string name_hint;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("name_hint", &name_hint);
+    v->Visit("span", &span);
+    v->Visit("_checked_type_", &checked_type_);
+  }
+
+  TVM_DLL static GlobalVar make(std::string name_hint);
+
+  static constexpr const char* _type_key = "relay.GlobalVar";
+  TVM_DECLARE_NODE_TYPE_INFO(GlobalVarNode, ExprNode);
+};
+
+RELAY_DEFINE_NODE_REF(GlobalVar, GlobalVarNode, Expr);
+
+/*!
+ * \brief Function (subgraph in computational graph)
+ */
+class Function;
+/*! \brief Function container */
+class FunctionNode : public ExprNode {
+ public:
+  /*! \brief Function parameters */
+  tvm::Array<Var> params;
+  /*!
+   * \brief
+   * The expression which represents the computation of the function,
+   * the expression may reference the parameters, and the type of it
+   * or sub-expressions may reference the type variables.
+   */
+  Expr body;
+  /*! \brief User annotated return type of the function. */
+  Type ret_type;
+  /*!
+   * \brief Type parameters of the function.
+   *  Enables the function to vary its type based on these.
+   *  This corresponds to template paramaters in c++'s terminology.
+   *
+   * \note This can be usually empty for non-polymorphic functions.
+   */
+  tvm::Array<TypeVar> type_params;
+
+  /*!
+   * \brief The attributes which store metadata about functions.
+   */
+  tvm::Attrs attrs;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("params", &params);
+    v->Visit("body", &body);
+    v->Visit("ret_type", &ret_type);
+    v->Visit("type_params", &type_params);
+    v->Visit("span", &span);
+    v->Visit("attrs", &attrs);
+    v->Visit("_checked_type_", &checked_type_);
+  }
+
+  /*!
+   * \brief Return the derived function annotation of this expression.
+   *
+   * \return The function type annotation.
+   * \note The function type annotation can contain IncompleteType.
+   */
+  TVM_DLL FuncType func_type_annotation() const;
+
+  /*!
+   * \brief Check whether the function is a primitive function.
+   *
+   * \return Whether the function is primitive or not.
+   */
+  bool IsPrimitive() const;
+
+  TVM_DLL static Function make(tvm::Array<Var> params,
+                               Expr body,
+                               Type ret_type,
+                               tvm::Array<TypeVar> ty_params,
+                               tvm::Attrs attrs = Attrs());
+
+  static constexpr const char* _type_key = "relay.Function";
+  TVM_DECLARE_NODE_TYPE_INFO(FunctionNode, ExprNode);
+};
+
+RELAY_DEFINE_NODE_REF(Function, FunctionNode, Expr);
+
+
+TVM_DLL NodeRef FunctionGetAttr(const Function& func, const std::string& key);
+TVM_DLL Function FunctionSetAttr(const Function& func, const std::string& key, const NodeRef& data);
+
+
+/*!
+ * \brief Call corresponds to operator invocation.
+ *  Corresponds to the operator in computational graph terminology.
+ */
+class Call;
+/*! \brief Call container. */
+class CallNode : public ExprNode {
+ public:
+  /*!
+   * \brief The operator(function) being invoked
+   *
+   *  - It can be relay::Op which corresponds to the primitive operators.
+   *  - It can also be user defined functions (Function, GlobalVar, Var).
+   */
+  Expr op;
+
+  /*! \brief The arguments(inputs) of the call */
+  tvm::Array<relay::Expr> args;
+
+  /*! \brief The additional attributes */
+  Attrs attrs;
+
+  /*!
+   * \brief The type arguments passed to polymorphic(template) function.
+   *
+   * This is the advance feature that is only used when the function is
+   * polymorphic. It is safe to be ignored in most cases. For example, in the
+   * following code, the type_args of addone call is [int].
+   *
+   * \code
+   *
+   * template<typename T>
+   * T addone(T a) { return a + 1; }
+   *
+   * void main() {
+   *   int x = addone<int>(10);
+   * }
+   *
+   * \endcode
+   */
+  tvm::Array<Type> type_args;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("op", &op);
+    v->Visit("args", &args);
+    v->Visit("attrs", &attrs);
+    v->Visit("type_args", &type_args);
+    v->Visit("span", &span);
+    v->Visit("_checked_type_", &checked_type_);
+  }
+
+  TVM_DLL static Call make(Expr op,
+                           Array<Expr> args,
+                           Attrs attrs = Attrs(),
+                           Array<Type> type_args = Array<Type>());
+
+  static constexpr const char* _type_key = "relay.Call";
+  TVM_DECLARE_NODE_TYPE_INFO(CallNode, ExprNode);
+};
+
+RELAY_DEFINE_NODE_REF(Call, CallNode, Expr);
+
+/*!
+ * \brief Let binding that binds a local var and optionally a type annotation.
+ *
+ * \note Let is useful to transform the program to be A-normal form.
+ *  where each of the expression corresponds to a let binding.
+ *
+ *  For developers who are familar with the computational graph.
+ *  Each of the let can be viewed as a operator node in the computational graph.
+ *  Traversing the list of let bindings is similar to running
+ * PostDFS-order(topo-order) traversal on the computational graph.
+ */
+class Let;
+/*! \brief A binding of a sub-network. */
+class LetNode : public ExprNode {
+ public:
+  /*! \brief The variable we bind to */
+  Var var;
+  /*! \brief The value we bind var to */
+  Expr value;
+  /*! \brief The body of the let binding */
+  Expr body;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("var", &var);
+    v->Visit("value", &value);
+    v->Visit("body", &body);
+    v->Visit("span", &span);
+    v->Visit("_checked_type_", &checked_type_);
+  }
+
+  TVM_DLL static Let make(Var var, Expr value, Expr body);
+
+  static constexpr const char* _type_key = "relay.Let";
+  TVM_DECLARE_NODE_TYPE_INFO(LetNode, ExprNode);
+};
+
+RELAY_DEFINE_NODE_REF(Let, LetNode, Expr);
+
+/*!
+ * \brief Condition expression
+ *
+ * Unlike traditional statement `if`s, the if evalutes
+ * to the result of the branch taken.
+ *
+ * let x = if (true) { 1 } else { 0 }; // x is 1
+ * let y = if (false) { 1 } else { 0 }; // y is 0
+ *
+ * \note This is similar to C's ternary operator.
+ */
+class If;
+/*! \brief container of If */
+class IfNode : public ExprNode {
+ public:
+  /*! \brief The condition */
+  Expr cond;
+  /*! \brief The expression evaluated when condition is true. */
+  Expr true_branch;
+  /*! \brief The expression evaluated when condition is false */
+  Expr false_branch;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("cond", &cond);
+    v->Visit("true_branch", &true_branch);
+    v->Visit("false_branch", &false_branch);
+    v->Visit("span", &span);
+    v->Visit("_checked_type_", &checked_type_);
+  }
+
+  TVM_DLL static If make(Expr cond, Expr true_branch, Expr false_branch);
+
+  static constexpr const char* _type_key = "relay.If";
+  TVM_DECLARE_NODE_TYPE_INFO(IfNode, ExprNode);
+};
+
+RELAY_DEFINE_NODE_REF(If, IfNode, Expr);
+
+/*! \brief Get index-th field out of a tuple. */
+class TupleGetItem;
+class TupleGetItemNode : public ExprNode {
+ public:
+  /*! \brief The tuple Expression */
+  Expr tuple;
+  /*! \brief which value to get */
+  int index;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("tuple_value", &tuple);
+    v->Visit("index", &index);
+    v->Visit("span", &span);
+    v->Visit("_checked_type_", &checked_type_);
+  }
+
+  TVM_DLL static TupleGetItem make(Expr tuple, int index);
+
+  static constexpr const char * _type_key = "relay.TupleGetItem";
+  TVM_DECLARE_NODE_TYPE_INFO(TupleGetItemNode, ExprNode);
+};
+
+RELAY_DEFINE_NODE_REF(TupleGetItem, TupleGetItemNode, Expr);
+
+/*!
+ * \brief Base class of the temporary expression.
+ *
+ * TempExprs are pass specific expression that can be
+ * useful to define intermediate result in the
+ * rewriting pass such as layout or type transformation.
+ *
+ * Subclass TempExprNode allows us to pattern match on
+ * specific kind TempExpr and use them for expression rewriting.
+ *
+ * TempExpr should only be used within a pass,
+ */
+class TempExprNode : public ExprNode {
+ public:
+  /*!
+   * \brief Convert the expression to a normal(non-temp) Expr.
+   * \return The corresponding normal(non-temp) expression.
+   */
+  virtual Expr Realize() const = 0;
+
+  static constexpr const char* _type_key = "relay.TempExpr";
+  TVM_DECLARE_BASE_NODE_INFO(TempExprNode, ExprNode);
+};
+
+RELAY_DEFINE_NODE_REF(TempExpr, TempExprNode, Expr);
+
+// implementataions
+template<typename TTypeNode>
+inline const TTypeNode* ExprNode::type_as() const {
+  static_assert(std::is_base_of<TypeNode, TTypeNode>::value,
+                "TType must be a special case of type");
+  CHECK(checked_type_.defined())
+      << "Type inference for this Expr has not completed. Try to call infer_type pass.";
+  const TTypeNode* node = checked_type_.as<TTypeNode>();
+  CHECK(node != nullptr)
+      << "Expected type to be " << TTypeNode::_type_key
+      << ", but get " << checked_type_->type_key();
+  return node;
+}
+
+/*!
+ * \brief Print node as text format.
+ * \param node The node to be printed.
+ * \param show_meta_data Whether to print meta data section.
+ * \param annotate An optional callback function for attaching
+ *        additional comment block to an expr.
+ * \return The text representation.
+ */
+std::string RelayPrint(
+    const NodeRef& node,
+    bool show_meta_data = true,
+    runtime::TypedPackedFunc<std::string(Expr)> annotate = nullptr);
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_EXPR_H_
diff --git a/include/tvm/relay/expr_functor.h b/include/tvm/relay/expr_functor.h
new file mode 100644
index 000000000000..60b18218a313
--- /dev/null
+++ b/include/tvm/relay/expr_functor.h
@@ -0,0 +1,206 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/expr_functor.h
+ * \brief A more powerful visitor which enables defining arbitrary function
+ * signatures with type based dispatch on first argument.
+ */
+#ifndef TVM_RELAY_EXPR_FUNCTOR_H_
+#define TVM_RELAY_EXPR_FUNCTOR_H_
+
+#include <tvm/node/ir_functor.h>
+#include <string>
+#include "./expr.h"
+#include "./op.h"
+#include "./error.h"
+
+namespace tvm {
+namespace relay {
+
+/*!
+ * \brief A dynamical functor that dispatches on in the first Expr argument.
+ *  You can use this as a more powerful Visitor, since it allows you to
+ *  define function signatures of Visit Function.
+ *
+ * \sa tvm/ir_functor.h
+ *
+ * \tparam FType function signiture
+ *  This type is only defined for FType with function signature R(const Expr&,
+ * Args...)
+ */
+template <typename FType>
+class ExprFunctor;
+
+// functions to be overriden.
+#define EXPR_FUNCTOR_DEFAULT                                      \
+  { return VisitExprDefault_(op, std::forward<Args>(args)...); }
+
+#define RELAY_EXPR_FUNCTOR_DISPATCH(OP)                                \
+  vtable.template set_dispatch<OP>(                                    \
+      [](const NodeRef& n, TSelf* self, Args... args) {                \
+        return self->VisitExpr_(static_cast<const OP*>(n.node_.get()), \
+                                std::forward<Args>(args)...);          \
+      });
+
+template <typename R, typename... Args>
+class ExprFunctor<R(const Expr& n, Args...)> {
+ private:
+  using TSelf = ExprFunctor<R(const Expr& n, Args...)>;
+  using FType = tvm::IRFunctor<R(const NodeRef& n, TSelf* self, Args...)>;
+
+ public:
+  /*! \brief the result type of this functor */
+  using result_type = R;
+  /*! \brief virtual destructor */
+  virtual ~ExprFunctor() {}
+  /*!
+   * \brief Same as call.
+   * \param n The expression node.
+   * \param args Additional arguments.
+   * \return The result of the call
+   */
+  R operator()(const Expr& n, Args... args) {
+    return VisitExpr(n, std::forward<Args>(args)...);
+  }
+  /*!
+   * \brief The functor call.
+   * \param n The expression node.
+   * \param args Additional arguments.
+   * \return The result of the call
+   */
+  virtual R VisitExpr(const Expr& n, Args... args) {
+    static FType vtable = InitVTable();
+    return vtable(n, this, std::forward<Args>(args)...);
+  }
+  // Functions that can be overriden by subclass
+  virtual R VisitExpr_(const ConstantNode* op,
+                       Args... args) EXPR_FUNCTOR_DEFAULT;
+  virtual R VisitExpr_(const TupleNode* op,
+                       Args... args) EXPR_FUNCTOR_DEFAULT;
+  virtual R VisitExpr_(const VarNode* op,
+                       Args... args) EXPR_FUNCTOR_DEFAULT;
+  virtual R VisitExpr_(const GlobalVarNode* op,
+                       Args... args) EXPR_FUNCTOR_DEFAULT;
+  virtual R VisitExpr_(const FunctionNode* op,
+                       Args... args) EXPR_FUNCTOR_DEFAULT;
+  virtual R VisitExpr_(const CallNode* op, Args... args) EXPR_FUNCTOR_DEFAULT;
+  virtual R VisitExpr_(const LetNode* op, Args... args) EXPR_FUNCTOR_DEFAULT;
+  virtual R VisitExpr_(const IfNode* op,
+                       Args... args) EXPR_FUNCTOR_DEFAULT;
+  virtual R VisitExpr_(const OpNode* op,
+                       Args... args) EXPR_FUNCTOR_DEFAULT;
+  virtual R VisitExpr_(const TupleGetItemNode* op, Args... args) EXPR_FUNCTOR_DEFAULT;
+  virtual R VisitExprDefault_(const Node* op, Args...) {
+    throw Error(std::string("Do not have a default for ") + op->type_key());
+  }
+
+ private:
+  // initialize the vtable.
+  static FType InitVTable() {
+    FType vtable;
+    // Set dispatch
+    RELAY_EXPR_FUNCTOR_DISPATCH(ConstantNode);
+    RELAY_EXPR_FUNCTOR_DISPATCH(TupleNode);
+    RELAY_EXPR_FUNCTOR_DISPATCH(VarNode);
+    RELAY_EXPR_FUNCTOR_DISPATCH(GlobalVarNode);
+    RELAY_EXPR_FUNCTOR_DISPATCH(FunctionNode);
+    RELAY_EXPR_FUNCTOR_DISPATCH(CallNode);
+    RELAY_EXPR_FUNCTOR_DISPATCH(LetNode);
+    RELAY_EXPR_FUNCTOR_DISPATCH(IfNode);
+    RELAY_EXPR_FUNCTOR_DISPATCH(OpNode);
+    RELAY_EXPR_FUNCTOR_DISPATCH(TupleGetItemNode);
+    return vtable;
+  }
+};
+
+/*!
+ * \brief A simple visitor wrapper around ExprFunctor.
+ *  Recursively visit the content.
+ *
+ * ExprVisitor treats Expr as dataflow graph,
+ * and only visit each Expr node once.
+ */
+class ExprVisitor
+    : public ::tvm::relay::ExprFunctor<void(const Expr& n)> {
+ public:
+  void VisitExpr(const Expr& expr) override;
+  void VisitExpr_(const VarNode* op) override;
+  void VisitExpr_(const GlobalVarNode* op) override;
+  void VisitExpr_(const ConstantNode* op) override;
+  void VisitExpr_(const TupleNode* op) override;
+  void VisitExpr_(const FunctionNode* op) override;
+  void VisitExpr_(const CallNode* op) override;
+  void VisitExpr_(const LetNode* op) override;
+  void VisitExpr_(const IfNode* op) override;
+  void VisitExpr_(const OpNode* op) override;
+  void VisitExpr_(const TupleGetItemNode* op) override;
+  virtual void VisitType(const Type& t);
+
+ protected:
+  // Internal visiting counter
+  std::unordered_map<const Node*, size_t> visit_counter_;
+};
+
+/*!
+ * \brief A wrapper around ExprFunctor which functionally updates the AST.
+ *
+ * ExprMutator treats Expr as dataflow graph, and only Mutate each Expr once.
+ * The mutated results are memoized in a map and reused so that
+ * local transformation on the dataflow preserves the graph structure.
+ */
+class ExprMutator
+    : public ::tvm::relay::ExprFunctor<Expr(const Expr&)> {
+ public:
+  /*!
+   * \brief Mutate is alias for VisitExpr
+   * \return expr.
+   */
+  Expr Mutate(const Expr& expr) {
+    return this->VisitExpr(expr);
+  }
+  Expr VisitExpr(const Expr& expr) override;
+  Expr VisitExpr_(const VarNode* op) override;
+  Expr VisitExpr_(const ConstantNode* op) override;
+  Expr VisitExpr_(const GlobalVarNode* op) override;
+  Expr VisitExpr_(const OpNode* op) override;
+  Expr VisitExpr_(const TupleNode* op) override;
+  Expr VisitExpr_(const FunctionNode* op) override;
+  Expr VisitExpr_(const CallNode* call_node) override;
+  Expr VisitExpr_(const LetNode* op) override;
+  Expr VisitExpr_(const IfNode* op) override;
+  Expr VisitExpr_(const TupleGetItemNode* op) override;
+  /*!
+   * \brief Used to visit the types inside of expressions.
+   *
+   * Can be overloaded to transform the types in arbitrary
+   * ways, one way would be to define a sub-class of type
+   * visitor for types which transform them appropriately.
+   */
+  virtual Type VisitType(const Type& t);
+
+ protected:
+  /*! \brief Internal map used for memoization. */
+  std::unordered_map<Expr, Expr, NodeHash, NodeEqual> memo_;
+};
+
+/*!
+ * \brief recursively visit the ir in post DFS order node, apply fvisit
+ * Each node is guaranteed to be visited only once.
+ * \param node The ir to be visited.
+ * \param fvisit The visitor function to be applied.
+ */
+void PostOrderVisit(const NodeRef& node, std::function<void(const NodeRef&)> fvisit);
+
+/*
+ * \brief Bind function parameters or free variables.
+ *
+ * Parameter binding can only happen if expr is a Function.
+ * binds cannot change internal arguments of internal functions.
+ *
+ * \param expr The function to be binded.
+ * \param binds The map of arguments to
+ */
+Expr Bind(const Expr& expr, const tvm::Map<Var, Expr>& binds);
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_EXPR_FUNCTOR_H_
diff --git a/include/tvm/relay/interpreter.h b/include/tvm/relay/interpreter.h
new file mode 100644
index 000000000000..1099ef0f3cfd
--- /dev/null
+++ b/include/tvm/relay/interpreter.h
@@ -0,0 +1,146 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/interpreter.h
+ * \brief An interpreter for Relay.
+ *
+ * This file implements a simple reference interpreter for Relay programs.
+ * Given a Relay module, and a Relay expression it produces a value.
+ *
+ * The interpreter's values are a naive representation of the values that
+ * can be produced by a Relay program and are exposed via tvm::Node's
+ * system to Python for introspection and debugging.
+ *
+ * The interpreter's intent is to serve as a reference semantics for the Relay IR,
+ * as well as for debugging and testing.
+ */
+#ifndef TVM_RELAY_INTERPRETER_H_
+#define TVM_RELAY_INTERPRETER_H_
+
+#include <tvm/build_module.h>
+#include <tvm/relay/module.h>
+#include <tvm/relay/expr.h>
+
+namespace tvm {
+namespace relay {
+
+/*!
+ * \brief A Relay value.
+ */
+class Value;
+
+/*!
+ *\brief Create a Interpreter function that can
+ *  evaluate an expression and produce a value.
+ *
+ * The resulting value can be passed to Python, making it easy to use
+ * for testing and debugging.
+ *
+ * The interpreter interprets the program fragments not supported by the
+ * TVM runtime, although the interpreter is naively implemented it uses
+ * TVM operators for evaluating all operators.
+ *
+ * Our intent is that this will never be the most efficient implementation of
+ * Relay's semantics, but a readable and clear one.
+ *
+ * \param mod The function module.
+ * \param context The primary context that the interepreter runs on.
+ * \param target Compiler target flag to compile the functions on the context.
+ * \return A function that takes in an expression and returns a value.
+ */
+runtime::TypedPackedFunc<Value(Expr)>
+CreateInterpreter(Module mod, DLContext context, Target target);
+
+/*! \brief The base container type of Relay values. */
+class ValueNode : public RelayNode {
+ public:
+  static constexpr const char* _type_key = "relay.Value";
+  TVM_DECLARE_BASE_NODE_INFO(ValueNode, RelayNode);
+};
+
+class Value : public NodeRef {
+ public:
+  Value() {}
+  explicit Value(NodePtr<Node> n) : NodeRef(n) {}
+  const ValueNode* operator->() const {
+    return static_cast<const ValueNode*>(node_.get());
+  }
+
+  using ContainerType = ValueNode;
+};
+
+/*! \brief A Relay closure, i.e a scope and a function. */
+class Closure;
+
+/*! \brief The container type of Closures. */
+class ClosureNode : public ValueNode {
+ public:
+  /*! \brief The set of free variables in the closure.
+   *
+   * These are the captured variables which are required for
+   * evaluation when we call the closure.
+   */
+  tvm::Map<Var, Value> env;
+  /*! \brief The function which implements the closure.
+   *
+   * \note May reference the variables contained in the env.
+   */
+  Function func;
+
+  ClosureNode() {}
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("env", &env);
+    v->Visit("func", &func);
+  }
+
+  TVM_DLL static Closure make(tvm::Map<Var, Value> env, Function func);
+
+  static constexpr const char* _type_key = "relay.Closure";
+  TVM_DECLARE_NODE_TYPE_INFO(ClosureNode, ValueNode);
+};
+
+RELAY_DEFINE_NODE_REF(Closure, ClosureNode, Value);
+
+/*! \brief A tuple value. */
+class TupleValue;
+
+/*! \brief Tuple (x, ... y). */
+struct TupleValueNode : ValueNode {
+  tvm::Array<Value> fields;
+
+  TupleValueNode() {}
+
+  void VisitAttrs(tvm::AttrVisitor* v) final { v->Visit("fields", &fields); }
+
+  TVM_DLL static TupleValue make(tvm::Array<Value> value);
+
+  static constexpr const char* _type_key = "relay.TupleValue";
+  TVM_DECLARE_NODE_TYPE_INFO(TupleValueNode, ValueNode);
+};
+
+RELAY_DEFINE_NODE_REF(TupleValue, TupleValueNode, Value);
+
+/*! \brief A tensor value. */
+class TensorValue;
+
+/*! \brief The tensor value container, wrapping an NDArray. */
+struct TensorValueNode : ValueNode {
+  runtime::NDArray data;
+
+  TensorValueNode() {}
+
+  void VisitAttrs(tvm::AttrVisitor* v) final { v->Visit("data", &data); }
+
+  /*! \brief Build a value from an NDArray. */
+  TVM_DLL static TensorValue make(runtime::NDArray data);
+
+  static constexpr const char* _type_key = "relay.TensorValue";
+  TVM_DECLARE_NODE_TYPE_INFO(TensorValueNode, ValueNode);
+};
+
+RELAY_DEFINE_NODE_REF(TensorValue, TensorValueNode, Value);
+
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_INTERPRETER_H_
diff --git a/include/tvm/relay/logging.h b/include/tvm/relay/logging.h
new file mode 100644
index 000000000000..c53cd15ee72e
--- /dev/null
+++ b/include/tvm/relay/logging.h
@@ -0,0 +1,33 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/logging.h
+ * \brief A wrapper around dmlc-core/logging.h which adds the ability
+ * to toggle logging via an environment variable.
+ */
+
+#ifndef TVM_RELAY_LOGGING_H_
+#define TVM_RELAY_LOGGING_H_
+
+#include <dmlc/logging.h>
+#include <string>
+#include <cstdlib>
+#include <iostream>
+
+namespace tvm {
+namespace relay {
+
+static bool logging_enabled() {
+  if (auto var = std::getenv("RELAY_LOG")) {
+    std::string is_on(var);
+    return is_on == "1";
+  } else {
+      return false;
+  }
+}
+
+#define RELAY_LOG(severity) LOG_IF(severity, logging_enabled())
+
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_LOGGING_H_
diff --git a/include/tvm/relay/module.h b/include/tvm/relay/module.h
new file mode 100644
index 000000000000..b04d6fec20c5
--- /dev/null
+++ b/include/tvm/relay/module.h
@@ -0,0 +1,129 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/module.h
+ * \brief The global environment: contains information needed to
+ * compile & optimize Relay programs.
+ */
+#ifndef TVM_RELAY_MODULE_H_
+#define TVM_RELAY_MODULE_H_
+
+#include <tvm/relay/error.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/op.h>
+#include <tvm/relay/type.h>
+#include <string>
+#include <vector>
+
+namespace tvm {
+namespace relay {
+
+struct Module;
+
+/*! \brief The global environment of Relay programs.
+ *
+ *  The global environment contains the global
+ *  information needed to compile a Relay program.
+ *
+ *  It contains all global functions, and configuration
+ *  options.
+ *
+ *  Many operations require access to the global
+ *  Module. We pass the Module by value
+ *  in a functional style as an explicit argument,
+ *  but we mutate the Module while optimizing
+ *  Relay programs.
+ *
+ *  The functional style allows users to construct custom
+ *  environments easily, for example each thread can store
+ *  an Module while auto-tuning.
+ * */
+
+class ModuleNode : public RelayNode {
+ public:
+  /*! \brief A map from ids to all global functions. */
+  tvm::Map<GlobalVar, Function> functions;
+
+  ModuleNode() {}
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("functions", &functions);
+    v->Visit("global_var_map_", &global_var_map_);
+  }
+
+  TVM_DLL static Module make(tvm::Map<GlobalVar, Function> global_funcs);
+
+  /*!
+   * \brief Add a function to the global environment.
+   * \param var The name of the global function.
+   * \param func The function.
+   * \param update Controls whether you can replace a definition in the
+   * environment.
+   */
+  void Add(const GlobalVar& var, const Function& func, bool update = false);
+
+  /*!
+   * \brief Update a function in the global environment.
+   * \param var The name of the global function to update.
+   * \param func The new function.
+   */
+  void Update(const GlobalVar& var, const Function& func);
+
+  /*!
+   * \brief Remove a function from the global environment.
+   * \param var The name of the global function to update.
+   */
+  void Remove(const GlobalVar& var);
+
+  /*!
+   * \brief Lookup a global function by its variable.
+   * \param str The unique string specifying the global variable.
+   * \returns The global variable.
+   */
+  GlobalVar GetGlobalVar(const std::string& str);
+
+  /*!
+   * \brief Lookup a global function by its variable.
+   * \param var The global var to lookup.
+   * \returns The function named by the variable argument.
+   */
+  Function Lookup(const GlobalVar& var);
+
+  /*!
+   * \brief Lookup a global function by its string name
+   * \param name The name of the function.
+   * \returns The function named by the argument.
+   */
+  Function Lookup(const std::string& name);
+
+  /*!
+   * \brief Update the functions inside this environment by
+   *        functions in another environment.
+   * \param other The other environment.
+   */
+  void Update(const Module& other);
+
+  static constexpr const char* _type_key = "relay.Module";
+  TVM_DECLARE_NODE_TYPE_INFO(ModuleNode, Node);
+
+ private:
+  /*! \brief A map from string names to global variables that
+   * ensures global uniqueness.
+   */
+  tvm::Map<std::string, GlobalVar> global_var_map_;
+};
+
+struct Module : public NodeRef {
+  Module() {}
+  explicit Module(NodePtr<tvm::Node> p) : NodeRef(p) {}
+
+  inline ModuleNode* operator->() const {
+    return static_cast<ModuleNode*>(node_.get());
+  }
+
+  using ContainerType = ModuleNode;
+};
+
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_MODULE_H_
diff --git a/include/tvm/relay/op.h b/include/tvm/relay/op.h
new file mode 100644
index 000000000000..d3c5edd31461
--- /dev/null
+++ b/include/tvm/relay/op.h
@@ -0,0 +1,571 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/op.h
+ * \brief Primitive operator definition.
+ */
+#ifndef TVM_RELAY_OP_H_
+#define TVM_RELAY_OP_H_
+
+#include <functional>
+#include <limits>
+#include <string>
+#include <typeinfo>
+#include <utility>
+#include <vector>
+
+#include "base.h"
+#include "expr.h"
+#include "type.h"
+
+namespace tvm {
+namespace relay {
+
+// forward declare name.
+template <typename ValueType>
+class OpMap;
+class GenericOpMap;
+class OpRegistry;
+
+/*!
+ * \brief Node container of operator structure.
+ */
+class OpNode : public relay::ExprNode {
+ public:
+  /*! \brief name of the operator */
+  std::string name;
+  /*! \brief the type of the operator */
+  mutable FuncType op_type;
+  /*!
+   * \brief detailed description of the operator
+   *  This can be used to generate docstring automatically for the operator.
+   */
+  std::string description;
+  /* \brief Information of input arguments to the operator */
+  Array<AttrFieldInfo> arguments;
+  /*!
+   * \brief The type key of the attribute field
+   *  This can be empty, in which case it defaults to anything.
+   */
+  std::string attrs_type_key;
+  /*!
+   * \brief attribute type index,
+   * this field varies in each run and is not exposed to frontend.
+   */
+  uint32_t attrs_type_index{0};
+  /*!
+   * \brief number of input arguments to the operator,
+   * -1 means it is variable length
+   */
+  int32_t num_inputs = -1;
+  /*!
+   * \brief support level of the operator,
+   *  The lower the more priority it contains.
+   *  This is in analogies to BLAS levels.
+   */
+  int32_t support_level = 10;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("name", &name);
+    v->Visit("op_type", &op_type);
+    v->Visit("description", &description);
+    v->Visit("arguments", &arguments);
+    v->Visit("attrs_type_key", &attrs_type_key);
+    v->Visit("num_inputs", &num_inputs);
+    v->Visit("support_level", &support_level);
+  }
+
+  /*!
+   * \brief Check that if current op is a "primtive operator".
+   * That is the arguments are all type variables, and there is a single
+   * type relation applied to the input and output types.
+   */
+  bool IsPrimitiveOp() const {
+    if (is_primitive_ != -1) return is_primitive_ != 0;
+    is_primitive_ = this->IsPrimitiveOp_() ? 1 : 0;
+    return is_primitive_ != 0;
+  }
+
+  static constexpr const char* _type_key = "relay.Op";
+  TVM_DECLARE_NODE_TYPE_INFO(OpNode, ExprNode);
+
+ private:
+  // friend class
+  friend class GenericOpMap;
+  friend class OpRegistry;
+  friend bool IsPrimitiveOp(const Expr&);
+  // Program internal unique index of operator.
+  // Used to help index the program.
+  uint32_t index_{0};
+  // whether this is a primitive op. -1 means unknown.
+  mutable int is_primitive_{-1};
+  // Internal function to compute if it is primitive op
+  bool IsPrimitiveOp_() const {
+    const auto& fn_ty = this->op_type;
+    if (fn_ty->type_constraints.size() != 1) return false;
+    const TypeRelationNode* rel = fn_ty->type_constraints[0].as<TypeRelationNode>();
+    if (rel == nullptr) return false;
+    // validate if the type parameter matches up
+    for (size_t i = 0; i < fn_ty->type_params.size(); ++i) {
+      if (!fn_ty->type_params[i].same_as(rel->args[i])) return false;
+    }
+    return true;
+  }
+};
+
+/*!
+ * \brief Operator reference class.
+ */
+class Op : public relay::Expr {
+ public:
+  /*! \brief default constructor  */
+  Op() {}
+  /*! \brief constructor from node pointer */
+  explicit Op(NodePtr<Node> n) : Expr(n) {}
+  /*!
+   * \brief access the internal node container
+   * \return the pointer to the internal node container
+   */
+  inline const OpNode* operator->() const;
+  /*!
+   * \brief Get additional registered attribute about operators.
+   *  If nothing has been registered, an empty OpMap will be returned.
+   * \param attr_name The name of the attribute.
+   * \return An OpMap of specified attr_name.
+   * \tparam ValueType The type of the attribute.
+   */
+  template <typename ValueType>
+  inline static OpMap<ValueType> GetAttr(const std::string& attr_name);
+  /*!
+   * \brief Get an Op for a given operator name.
+   *  Will raise an error if the op has not been registered.
+   * \param op_name Name of the operator.
+   * \return Pointer to a Op, valid throughout program lifetime.
+   */
+  TVM_DLL static const Op& Get(const std::string& op_name);
+
+  /*! \brief specify container node */
+  using ContainerType = OpNode;
+
+ private:
+  /*!
+   * \brief Get generic attrmap given attr name
+   * \param key The attribute key
+   * \return reference to GenericOpMap
+   */
+  TVM_DLL static const GenericOpMap& GetGenericAttr(const std::string& key);
+};
+
+/*! \brief Helper structure to register operators */
+class OpRegistry {
+ public:
+  /*! \return the operator */
+  const Op& op() const { return op_; }
+  /*!
+   * \brief setter function during registration
+   *  Set the description of operator
+   * \param descr the description string.
+   * \return reference to self.
+   */
+  inline OpRegistry& describe(const std::string& descr);  // NOLINT(*)
+  /*!
+   * \brief Add argument information to the function.
+   * \param name Name of the argument.
+   * \param type Type of the argument.
+   * \param description Description of the argument.
+   * \return reference to self.
+   */
+  inline OpRegistry& add_argument(const std::string& name,
+                                  const std::string& type,
+                                  const std::string& description);
+  /*!
+   * \brief Attach the type function corresponding to the return type.
+   * \param rel_name The type relation name to register.
+   * \param type_rel_func The backing relation function which can solve an arbitrary
+   * relation on variables.
+   * \return reference to self.
+   */
+  inline OpRegistry& add_type_rel(
+      const std::string& rel_name,
+      runtime::TypedPackedFunc<bool(const Array<Type>&,
+                                    int,
+                                    const Attrs&,
+                                    const TypeReporter&)> type_rel_func);
+  /*!
+   * \brief Set the type key of attributes.
+   * \param type_key The type of of the attrs field.
+   * \return reference to self.
+   */
+  inline OpRegistry& set_attrs_type_key(const std::string& type_key);
+  /*!
+   * \brief Set the num_inputs
+   * \param n The number of inputs to be set.
+   * \return reference to self.
+   */
+  inline OpRegistry& set_num_inputs(int32_t n);  // NOLINT(*)
+  /*!
+   * \brief Set the support level of op.
+   * \param level The support level.
+   * \return reference to self.
+   */
+  inline OpRegistry& set_support_level(int32_t level);  // NOLINT(*)
+  /*!
+   * \brief Register additional attributes to operator.
+   * \param attr_name The name of the attribute.
+   * \param value The value to be set.
+   * \param plevel The priority level of this set,
+   *  an higher priority level attribute
+   *  will replace lower priority level attribute.
+   *  Must be bigger than 0.
+   *
+   *  Cannot set with same plevel twice in the code.
+   *
+   * \tparam ValueType The type of the value to be set.
+   */
+  template <typename ValueType>
+  inline OpRegistry& set_attr(const std::string& attr_name,  // NOLINT(*)
+                              const ValueType& value, int plevel = 10);
+
+  // set the name of the op to be the same as registry
+  inline OpRegistry& set_name() {  // NOLINT(*)
+    if (get()->name.length() == 0) {
+      get()->name = name;
+    }
+    return *this;
+  }
+  /*! \return The global single registry */
+  TVM_DLL static ::dmlc::Registry<OpRegistry>* Registry();
+
+ private:
+  friend class ::dmlc::Registry<OpRegistry>;
+  // the name
+  std::string name;
+  /*! \brief The operator */
+  Op op_;
+  // private constructor
+  OpRegistry();
+  // return internal pointer to op.
+  inline OpNode* get();
+  // update the attribute OpMap
+  TVM_DLL void UpdateAttr(const std::string& key, TVMRetValue value,
+                          int plevel);
+};
+
+/*!
+ * \brief Generic map to store additional information of Op.
+ */
+class GenericOpMap {
+ public:
+  /*!
+   * \brief Check if the map has op as key.
+   * \param op The key to the map
+   * \return 1 if op is contained in map, 0 otherwise.
+   */
+  inline int count(const Op& op) const;
+  /*!
+   * \brief get the corresponding value element at op
+   * \param op The key to the map
+   * \return the const reference to the content value.
+   */
+  inline const TVMRetValue& operator[](const Op& op) const;
+  /*!
+   * \brief get the corresponding value element at op with default value.
+   * \param op The key to the map
+   * \param def_value The default value when the key does not exist.
+   * \return the const reference to the content value.
+   * \tparam ValueType The content value type.
+   */
+  template <typename ValueType>
+  inline ValueType get(const Op& op, ValueType def_value) const;
+  /*!
+   * \brief get the corresponding value element at op with default value.
+   * \param expr The key to the map
+   * \param def_value The default value when the key does not exist
+   *         or if expr is not an Op.
+   * \return the const reference to the content value.
+   * \tparam ValueType The content value type.
+   */
+  template <typename ValueType>
+  inline ValueType get(const Expr& expr, ValueType def_value) const;
+
+ private:
+  friend class OpRegistry;
+  // the attribute field.
+  std::string attr_name_;
+  // internal data
+  std::vector<std::pair<TVMRetValue, int> > data_;
+  // The value
+  GenericOpMap() = default;
+};
+
+/*!
+ * \brief Map<Op,ValueType> used to store meta-information about Op.
+ * \tparam ValueType The type of the value stored in map.
+ */
+template <typename ValueType>
+class OpMap {
+ public:
+  /*!
+   * \brief Check if the map has op as key.
+   * \param op The key to the map
+   * \return 1 if op is contained in map, 0 otherwise.
+   */
+  inline int count(const Op& op) const;
+  /*!
+   * \brief get the corresponding value element at op
+   * \param op The key to the map
+   * \return the const reference to the content value.
+   */
+  inline ValueType operator[](const Op& op) const;
+  /*!
+   * \brief get the corresponding value element at op with default value.
+   * \param op The key to the map
+   * \param def_value The default value when the key does not exist.
+   * \return the const reference to the content value.
+   */
+  inline ValueType get(const Op& op, ValueType def_value) const;
+  /*!
+   * \brief get the corresponding value element at op with default value.
+   * \param expr The key to the map
+   * \param def_value The default value when the key does not exist
+   *         or if expr is not an Op.
+   * \return the const reference to the content value.
+   */
+  inline ValueType get(const Expr& expr, ValueType def_value) const;
+
+ private:
+  friend class Op;
+  // constructor
+  explicit OpMap(const GenericOpMap& map) : map_(map) {}
+  /*! \brief The internal map field */
+  const GenericOpMap& map_;
+};
+
+// internal macros to make
+#define RELAY_REGISTER_VAR_DEF \
+  static DMLC_ATTRIBUTE_UNUSED ::tvm::relay::OpRegistry& __make_##RelayOp
+
+/*!
+ * \def RELAY_REGISTER_OP
+ * \brief Register a new operator, or set attribute of the corresponding op.
+ *
+ * \param OpName The name of registry
+ *
+ * \code
+ *
+ *  RELAY_REGISTER_OP("add")
+ *  .describe("add two inputs together")
+ *  .set_num_inputs(2)
+ *  .set_attr<OpKernel>("gpu_kernel", AddKernel);
+ *
+ * \endcode
+ */
+#define RELAY_REGISTER_OP(OpName)                        \
+  DMLC_STR_CONCAT(RELAY_REGISTER_VAR_DEF, __COUNTER__) = \
+      ::tvm::relay::OpRegistry::Registry()               \
+          ->__REGISTER_OR_GET__(OpName)                  \
+          .set_name()
+
+// implementations
+inline const OpNode* Op::operator->() const {
+  return static_cast<const OpNode*>(node_.get());
+}
+
+template <typename ValueType>
+inline OpMap<ValueType> Op::GetAttr(const std::string& key) {
+  return OpMap<ValueType>(Op::GetGenericAttr(key));
+}
+
+inline OpNode* OpRegistry::get() {
+  return const_cast<OpNode*>(op_.operator->());
+}
+
+inline OpRegistry& OpRegistry::describe(
+    const std::string& descr) {  // NOLINT(*)
+  get()->description = descr;
+  return *this;
+}
+
+inline OpRegistry& OpRegistry::add_argument(const std::string& name,
+                                            const std::string& type,
+                                            const std::string& description) {
+  auto n = make_node<AttrFieldInfoNode>();
+  n->name = name;
+  n->type_info = type;
+  n->description = description;
+  get()->arguments.push_back(AttrFieldInfo(n));
+  return *this;
+}
+
+inline OpRegistry& OpRegistry::add_type_rel(
+    const std::string& rel_name,
+    runtime::TypedPackedFunc<bool(const Array<Type>&,
+                                  int,
+                                  const Attrs&,
+                                  const TypeReporter&)> type_rel_func) {
+  auto func_name = std::string("tvm.relay.type_relation.") + rel_name;
+  TypeRelationFn env_type_rel_func;
+
+  if (runtime::Registry::Get(func_name)) {
+    auto env_func = EnvFunc::Get(func_name);
+    env_type_rel_func = env_func;
+  } else {
+    runtime::Registry::Register(func_name)
+        .set_body(type_rel_func.packed());
+    auto env_func = EnvFunc::Get(func_name);
+    env_type_rel_func = env_func;
+  }
+
+  Array<TypeVar> type_params;
+  Array<Type> arg_types;
+
+  // Add inputs.
+  std::string input_name_prefix = "in";
+  for (int i = 0; i < get()->num_inputs; i++) {
+    auto name = input_name_prefix + std::to_string(i);
+    auto param = TypeVarNode::make(name, TypeVarNode::Kind::kType);
+    type_params.push_back(param);
+    arg_types.push_back(param);
+  }
+
+  Array<Type> ty_call_args = arg_types;
+
+  // Add output type.
+  auto out_param = TypeVarNode::make("out", TypeVarNode::Kind::kType);
+  type_params.push_back(out_param);
+  // this will trigger copy on write.
+  ty_call_args.push_back(out_param);
+
+  // The attributes of primitive op is nullptr
+  //
+  // The attributes of primitive operator can vary at the call site.
+  // The type of sum is also dependent on Attrs being passed.
+  // So puting nullptr in the Attrs means that the operator is polymorphic on Attrs.
+  //
+  // A common example is sum(x, axis), where the choice of axis
+  // can affect the type of the function.
+  TypeConstraint type_rel =
+      TypeRelationNode::make(env_type_rel_func,
+                             ty_call_args,
+                             arg_types.size(),
+                             Attrs());
+
+  auto func_type =
+      FuncTypeNode::make(arg_types, out_param, type_params, {type_rel});
+
+  get()->op_type = func_type;
+
+  return *this;
+}
+
+inline OpRegistry& OpRegistry::set_num_inputs(int32_t n) {  // NOLINT(*)
+  get()->num_inputs = n;
+  return *this;
+}
+
+inline OpRegistry& OpRegistry::set_attrs_type_key(  // NOLINT(*)
+    const std::string& type_key) {
+  get()->attrs_type_key = type_key;
+  get()->attrs_type_index = Node::TypeKey2Index(type_key.c_str());
+  return *this;
+}
+
+inline OpRegistry& OpRegistry::set_support_level(int32_t n) {  // NOLINT(*)
+  get()->support_level = n;
+  return *this;
+}
+
+template <typename ValueType>
+inline OpRegistry& OpRegistry::set_attr(  // NOLINT(*)
+    const std::string& attr_name, const ValueType& value, int plevel) {
+  CHECK_GT(plevel, 0) << "plevel in set_attr must be greater than 0";
+  TVMRetValue rv;
+  rv = value;
+  UpdateAttr(attr_name, rv, plevel);
+  return *this;
+}
+
+// member functions of OpMap
+inline int GenericOpMap::count(const Op& op) const {
+  if (op.defined()) {
+    const uint32_t idx = op->index_;
+    return idx < data_.size() ? (data_[idx].second != 0) : 0;
+  } else {
+    return 0;
+  }
+}
+
+inline const TVMRetValue& GenericOpMap::operator[](const Op& op) const {
+  CHECK(op.defined());
+  const uint32_t idx = op->index_;
+  CHECK(idx < data_.size() && data_[idx].second != 0)
+      << "Attribute " << attr_name_ << " has not been registered for Operator "
+      << op->name;
+  return data_[idx].first;
+}
+
+template <typename ValueType>
+inline ValueType GenericOpMap::get(const Op& op, ValueType value) const {
+  CHECK(op.defined());
+  const uint32_t idx = op->index_;
+  if (idx < data_.size() && data_[idx].second != 0) {
+    return data_[idx].first;
+  } else {
+    return value;
+  }
+}
+
+template <typename ValueType>
+inline ValueType GenericOpMap::get(const Expr& expr, ValueType value) const {
+  CHECK(expr.defined());
+  if (const OpNode* op = expr.as<OpNode>()) {
+    const uint32_t idx = op->index_;
+    if (idx < data_.size() && data_[idx].second != 0) {
+      return data_[idx].first;
+    } else {
+      return value;
+    }
+  } else {
+    return value;
+  }
+}
+
+template <typename ValueType>
+inline int OpMap<ValueType>::count(const Op& op) const {
+  return map_.count(op);
+}
+
+template <typename ValueType>
+inline ValueType OpMap<ValueType>::operator[](const Op& op) const {
+  return map_[op];
+}
+
+template <typename ValueType>
+inline ValueType OpMap<ValueType>::get(const Op& op,
+                                       ValueType def_value) const {
+  return map_.get<ValueType>(op, def_value);
+}
+
+template <typename ValueType>
+inline ValueType OpMap<ValueType>::get(const Expr& expr,
+                                       ValueType def_value) const {
+  return map_.get<ValueType>(expr, def_value);
+}
+
+/*!
+ * \brief Check that an expression is a "primtive operator".
+ *
+ * Will return true if the expression is an operator which
+ * matches the form of primtive operators registered directly
+ * by the Relay codebase.
+ *
+ * That is the arguments are all type variables, and there is a single
+ * type relation applied to the input and output types.
+ */
+inline bool IsPrimitiveOp(const Expr& expr) {
+  const auto* op = expr.as<OpNode>();
+  return op != nullptr && op->IsPrimitiveOp();
+}
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_OP_H_
diff --git a/include/tvm/relay/op_attr_types.h b/include/tvm/relay/op_attr_types.h
new file mode 100644
index 000000000000..c2839a471d20
--- /dev/null
+++ b/include/tvm/relay/op_attr_types.h
@@ -0,0 +1,129 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file nnvm/compiler/op_attr_types.h
+ * \brief The Expr and related elements in DataFlow construction.
+ */
+#ifndef TVM_RELAY_OP_ATTR_TYPES_H_
+#define TVM_RELAY_OP_ATTR_TYPES_H_
+
+#include <tvm/tensor.h>
+#include <tvm/schedule.h>
+#include <tvm/build_module.h>
+#include <tvm/relay/type.h>
+#include <tvm/relay/expr.h>
+
+namespace tvm {
+namespace relay {
+
+/*! \brief operator pattern used in graph fusion */
+enum OpPatternKind {
+  // Elementwise operation
+  kElemWise = 0,
+  // Broadcasting operator, can always map output axis to the input in order.
+  // for example :code:`out[i, ax1, j, ax2] = input[i, j]`.
+  // Note that the axis need to be in order so transpose is not a bcast operator.
+  kBroadcast = 1,
+  // Injective operator, can always injectively map output axis to a single input axis.
+  // All injective operator can still be safely fused to injective and reduction.
+  kInjective = 2,
+  // Communicative reduction operator.
+  kCommReduce = 3,
+  // Complex operation, can still fuse elemwise operations into its output.
+  // but cannot chain another complex op
+  kOutEWiseFusable = 4,
+  // Opaque operation, cannot fuse anything.
+  kOpaque = 8
+};
+
+/*! \brief the operator pattern */
+using TOpPattern = int;
+
+/*!
+ * \brief Whether operator is stateful or contain internal state.
+ *
+ * All the primitive ops we registered so far are pure.
+ * This attribute is left for potential future compatible reasons.
+ * We can always work around the stateful ops by adding an additional
+ * handle argument and return it.
+ */
+using TOpIsStateful = bool;
+
+/*!
+ * \brief Mark the operator as non-computational.
+ */
+using TNonComputational = bool;
+
+/*!
+ * \brief Computation description interface.
+ *
+ * \note This function have a special convention
+ *  for functions with tuple input/output.
+ *
+ *  So far we restrict tuple support to the following case:
+ *  - Function which takes a single tuple as input.
+ *  - Function which outputs a single tuple.
+ *
+ *  In both cases, the tuple is flattened as array.
+ *
+ * \param attrs The attribute of the primitive
+ * \param inputs The input tensors.
+ * \param out_type The output type information
+ &                 these are always placeholders.
+ * \return The output compute description of the operator.
+ */
+using FTVMCompute = runtime::TypedPackedFunc<
+  Array<Tensor>(const Attrs& attrs,
+                const Array<Tensor>& inputs,
+                const Type& out_type,
+                const Target& target)>;
+
+/*!
+ * \brief Build the computation schedule for
+ *  op whose root is at current op.
+ *
+ * \param attrs The attribute of the node.
+ * \param outs The output tensors.
+ * \param target The build target.
+ * \return schedule The computation schedule.
+ */
+using FTVMSchedule = runtime::TypedPackedFunc<
+  Schedule(const Attrs& attrs,
+           const Array<Tensor>& outs,
+           const Target& target)>;
+
+/*!
+ * \brief Alternate the layout of operators or replace the
+ *  operator with other expressions. This function will be invoked
+ *  in AlterOpLayout pass.
+ * \param attrs The attribute of the original node.
+ * \param inputs The input symbols of the original node.
+ * \param tinfos An array of placeholders, use for getting the inferred shape
+ *               and dtype of the inputs.
+ * \return new_expr The modified expression.
+ */
+using FTVMAlterOpLayout = runtime::TypedPackedFunc<
+  Expr(const Attrs& attrs,
+       const Array<Expr>& args,
+       const Array<Tensor>& tinfos)>;
+
+/*!
+ * \brief Forward rewriting rule for a specific op.
+ *
+ * \param ref_call The reference old call type to be rewritten.
+ *                 We can make use of the op and type information.
+ * \param new_args The new arguments (some of them could be TempExpr).
+ * \param ctx  Optional context information about ref_call.
+ * \return The rewriten result call, can also return nullptr,
+ *         which indicate the rewriter should use the default fallback
+ *         rule that realizes all its input and compose the call.
+ *
+ * \note When we register the function, we can register
+ *       a different signature with ctx to be a specific node type.
+ */
+using FForwardRewrite = runtime::TypedPackedFunc<
+  Expr(const Call& ref_call,
+       const Array<Expr>& new_args,
+       const NodeRef& ctx)>;
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_OP_ATTR_TYPES_H_
diff --git a/include/tvm/relay/pass.h b/include/tvm/relay/pass.h
new file mode 100644
index 000000000000..8fff7016a827
--- /dev/null
+++ b/include/tvm/relay/pass.h
@@ -0,0 +1,218 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/pass.h
+ * \brief The set of Relay passes written in C++.
+ */
+#ifndef TVM_RELAY_PASS_H_
+#define TVM_RELAY_PASS_H_
+
+#include <tvm/relay/module.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/op_attr_types.h>
+#include <string>
+
+namespace tvm {
+namespace relay {
+
+/*!
+ * \brief Infer the type of an expression.
+ *
+ * The result of type checking is a new expression with unambigous
+ * type information filled in, as well as it's checked type field
+ * populated with the result type.
+ *
+ * \param expr The expression to type check.
+ * \param mod The module used for referencing global functions, can be
+ * None.
+ *
+ * \return A type checked expression with its checked_type field populated.
+ */
+Expr InferType(const Expr& expr, const Module& mod);
+/*!
+ * \brief Infer the type of a function as if it is mapped to var in the mod.
+ *
+ * \param f the function.
+ * \param mod The module used for referencing global functions.
+ * \param var The global variable corresponding to the function.
+ *
+ * \return A type checked Function with its checked_type field populated.
+ * \note this function mutates mod and is not thread-safe.
+ */
+Function InferType(const Function& f, const Module& mod,
+                   const GlobalVar& var);
+
+/*!
+ * \brief Check that types are well kinded by applying "kinding rules".
+ *
+ * This pass ensures we do not do things that violate the design of the
+ * type system when writing down types.
+ *
+ * For example tensors are not allowed to contain functions in Relay.
+ *
+ * We check this by ensuring the `dtype` field of a Tensor always contains
+ * a data type such as `int`, `float`, `uint`.
+ *
+ * \param t The type to check.
+ * \param mod The global module.
+ *
+ * \return true if the rules are satisified otherwise false
+ */
+bool KindCheck(const Type& t, const Module& mod);
+
+/*! \brief Compare two expressions for structural equivalence.
+ *
+ * This comparison operator respects scoping and compares
+ * expressions without regard to variable choice.
+ *
+ * For example: `let x = 1 in x` is equal to `let y = 1 in y`.
+ *
+ *   See https://en.wikipedia.org/wiki/Lambda_calculus#Alpha_equivalence
+ *   for more details.
+ *
+ *   \param e1 The left hand expression.
+ *   \param e2 The right hand expression.
+ *
+ *   \return true if equal, otherwise false
+ */
+bool AlphaEqual(const Expr& e1, const Expr& e2);
+
+/*! \brief Compare two types for structural equivalence.
+ *
+ * This comparison operator respects scoping and compares
+ * expressions without regard to variable choice.
+ *
+ * For example: `forall s, Tensor[f32, s]` is equal to
+ * `forall w, Tensor[f32, w]`.
+ *
+ * See https://en.wikipedia.org/wiki/Lambda_calculus#Alpha_equivalence
+ * for more details.
+ *
+ * \param t1 The left hand type.
+ * \param t2 The right hand type.
+ *
+ * \return true if equal, otherwise false
+ */
+bool AlphaEqual(const Type& t1, const Type& t2);
+
+/*! \brief Check that each Var is only bound once.
+ *
+ * For example, the expression `let x = 1 in let x = 2 in 3` bound x twice.
+ *
+ * `let f = (\x -> x) in let g = (\x -> x + 1) in f(g(2))` also bound x twice,
+ * although x is not shadowed.
+ *
+  * \param expr the expression to check.
+ *
+  * \return true iff all Var in expr is bound at most once.
+ */
+bool WellFormed(const Expr& expr);
+
+/*! \brief Get free type parameters from expression expr.
+ *
+ * Free variables are variables that are not bound by a
+ * let or a function parameter in the context.
+ *
+ * \param expr the expression.
+ *
+ * \return List of free vars, in the PostDFS order in the expression.
+ */
+tvm::Array<Var> FreeVars(const Expr& expr);
+
+/*! \brief Get free TypeVars from expression expr.
+ *
+ * Free type parameters are type parameters that are not bound by a function
+ * type in the context.
+ *
+ * \param expr the expression.
+ *
+ * \return List of free vars, in the PostDFS order visited by expr.
+ */
+tvm::Array<TypeVar> FreeTypeVars(const Expr& expr);
+
+/*! \brief Remove expressions which does not effect the program result.
+ *
+ * It will remove let bindings which are not referenced, and branches that will
+ * not be entered.
+ *
+ * For example, this pass should turn `let a = 1 in 2` into `2`, as the value of
+ * the expression does not depend on a. Another example is `if (true) then 1
+ * else 2` will be optimized into 1.
+ *
+ * \param e the expression to optimize.
+ *
+ * \return the optimized expression.
+ */
+Expr DeadCodeElimination(const Expr& e);
+
+/*!
+ * \brief Fold constant expressions.
+ * \param expr the expression to be optimized.
+ * \return The optimized expression.
+ */
+Expr FoldConstant(const Expr& expr);
+
+/*!
+ * \brief Fuse operations into expr into seperate functions.
+ * \param expr The expression.
+ * \param fuse_opt_level Optimization level.
+ * \return The optimized expression.
+ */
+Expr FuseOps(const Expr& expr, int fuse_opt_level);
+
+/*!
+ * \brief Apply rewrite rules to rewrite the expr in post DFS order.
+ * \param expr The expression.
+ * \param rewrite_map_attr_name The Op's attr name which corresponds to the rewrite
+ *                              rule function.
+ * \param fcontext Additional callback to provide context argument for each call node.
+ * \param fmulti_ref_trigger Transformation function to be called when
+ *                           an Expr consumed by multiple callers.
+ * \return The rewritten expression.
+ */
+Expr ForwardRewrite(const Expr& expr,
+                    const std::string& rewrite_map_attr_name,
+                    std::function<NodeRef(const Call&)> fcontext = nullptr,
+                    std::function<Expr(const Expr&)> fmulti_ref_trigger = nullptr);
+
+/*!
+ * \brief Apply rewrite rules to rewrite the expr in post DFS order.
+ * \param expr The expression.
+ * \param rewrite_func The rewrite func that will apply to all operators.
+ * \param fcontext Additional callback to provide context argument for each call node.
+ * \param fmulti_ref_trigger Transformation function to be called when
+ *                           an Expr consumed by multiple callers.
+ * \return The rewritten expression.
+ */
+Expr ForwardRewrite(const Expr& expr,
+                    const FForwardRewrite& rewrite_func,
+                    std::function<NodeRef(const Call&)> fcontext = nullptr,
+                    std::function<Expr(const Expr&)> fmulti_ref_trigger = nullptr);
+
+
+/*! \brief A hashing structure in the style of std::hash. */
+struct StructuralHash {
+  /*! \brief Hash a Relay type.
+   *
+   * Implements structural hashing of a Relay type.
+   *
+   *  \param type the type to hash.
+   *
+   *  \return the hash value.
+   */
+  size_t operator()(const Type& type) const;
+
+  /*! \brief Hash a Relay expression.
+   *
+   * Implements structural hashing of a Relay expression.
+   *
+   * \param expr the expression to hash.
+   *
+   * \return the hash value.
+   */
+  size_t operator()(const Expr& expr) const;
+};
+
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_PASS_H_
diff --git a/include/tvm/relay/type.h b/include/tvm/relay/type.h
new file mode 100644
index 000000000000..69a8a4fb0bd7
--- /dev/null
+++ b/include/tvm/relay/type.h
@@ -0,0 +1,398 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/type.h
+ * \brief Relay typed AST nodes.
+ */
+#ifndef TVM_RELAY_TYPE_H_
+#define TVM_RELAY_TYPE_H_
+
+#include <tvm/api_registry.h>
+#include <tvm/ir.h>
+#include <tvm/node/node.h>
+#include <string>
+
+#include "base.h"
+#include "../attrs.h"
+
+namespace tvm {
+namespace relay {
+
+/*! \brief Base type of the Relay type hiearchy. */
+class TypeNode : public RelayNode {
+ public:
+  static constexpr const char* _type_key = "relay.Type";
+  TVM_DECLARE_BASE_NODE_INFO(TypeNode, Node);
+};
+
+/*!
+ * \brief Type is the base type of relay type hiearchy.
+ *
+ * Relay's type system contains following two key concepts:
+ *
+ * - TensorType: type of certain Tensor values in the expression.
+ * - FunctionType: the type of the function.
+ *
+ * There are also advanced types to support generic(polymorphic types),
+ * which can be ignored when first reading the code base.
+ */
+class Type : public NodeRef {
+ public:
+  Type() {}
+  explicit Type(NodePtr<tvm::Node> p) : NodeRef(p) {}
+
+  using ContainerType = TypeNode;
+};
+
+/*!
+ * \brief Base of all Tensor types
+ *  This container can hold TensorType or GenericTensorType.
+ */
+class BaseTensorTypeNode : public TypeNode {
+ public:
+  static constexpr const char* _type_key = "relay.BaseTensorType";
+  TVM_DECLARE_BASE_NODE_INFO(BaseTensorTypeNode, TypeNode);
+};
+
+RELAY_DEFINE_NODE_REF(BaseTensorType, BaseTensorTypeNode, Type);
+
+/*!
+ * \brief This is the most commonly used type in relay.
+ *  TensorType have a fixed dimension, data type.
+ *
+ *  The elements of shape can be either IntImm(constant integer),
+ *  or any symbolic integer expression.
+ *  The symbolic integer allows generic shape inference in certain cases.
+ * \sa TensorTypeNode The container class of TensorType.
+ */
+class TensorType;
+/*! \brief TensorType container node */
+class TensorTypeNode : public BaseTensorTypeNode {
+ public:
+  /*!
+   * \brief The shape of the tensor,
+   *  represented by IndexExpr(tvm::Expr).
+   */
+  Array<IndexExpr> shape;
+  /*! \brief The content data type */
+  DataType dtype;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("shape", &shape);
+    v->Visit("dtype", &dtype);
+    v->Visit("span", &span);
+  }
+
+  /*! \brief Return product of elements in the shape.
+   *  \return (d1 * d_2 ... * d_n) if shape is (d_1, d_2, ..., d_n) and 1 if shape size is zero.
+   */
+  TVM_DLL IndexExpr Size() const;
+
+  TVM_DLL static TensorType make(Array<IndexExpr> shape, DataType dtype);
+
+  /*! \brief Construct an scalar containing elements of dtype.  */
+  TVM_DLL static TensorType Scalar(DataType dtype);
+
+  static constexpr const char* _type_key = "relay.TensorType";
+  TVM_DECLARE_NODE_TYPE_INFO(TensorTypeNode, BaseTensorTypeNode);
+};
+
+RELAY_DEFINE_NODE_REF(TensorType, TensorTypeNode, Type);
+
+/*!
+ * \brief Type parameter in the function.
+ *  This can be viewed as template parameter in c++ template function.
+ *
+ * For example, in the following pesudo code,
+ * the TypeVar of f is TypeVar(kind=kShapeVar, var=n).
+ * This function can take in a Tensor with shape=(3, 3) and
+ * returns a Tensor with shape=(9,)
+ *
+ * \code
+ *
+ *  template<i32 n>
+ *  f(x : Tensor[i32, (n, n)]) -> Tensor[i32, (n * n)]
+ *
+ * \endcode
+ * \sa TypeVarNode The actual container class of TypeVar
+ */
+class TypeVar;
+/*! \brief TypeVar container node */
+class TypeVarNode : public TypeNode {
+ public:
+  /*! \brief possible kinds of TypeVar */
+  enum Kind : int {
+    /*! \brief template variable in shape expression */
+    kType = 0,
+    kShapeVar = 1,
+    kBaseType = 2,
+    kShape = 3
+  };
+  /*!
+   * \brief The variable itself is only meaningful when
+   *  kind is ShapeVar, otherwise, we only use the name.
+   */
+  tvm::Var var;
+  /*! \brief The kind of type parameter */
+  Kind kind;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("var", &var);
+    v->Visit("kind", &kind);
+    v->Visit("span", &span);
+  }
+
+  TVM_DLL static TypeVar make(std::string name, Kind kind);
+
+  static constexpr const char* _type_key = "relay.TypeVar";
+  TVM_DECLARE_NODE_TYPE_INFO(TypeVarNode, TypeNode);
+};
+
+RELAY_DEFINE_NODE_REF(TypeVar, TypeVarNode, Type);
+
+/*!
+ * \brief IncompleteType.
+ * This is intermediate values that is used during type inference.
+ *
+ * If we view the type relations as "computational graph of types",
+ * then IncompleteType represents intermediate values of the graph,
+ * TypeVar represents the input to the graph.
+ */
+class IncompleteType;
+
+/*! \brief IncompleteType container node */
+class IncompleteTypeNode : public TypeNode {
+ public:
+  TypeVarNode::Kind kind;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("kind", &kind);
+    v->Visit("span", &span);
+  }
+
+  TVM_DLL static IncompleteType make(TypeVarNode::Kind kind);
+
+  static constexpr const char* _type_key = "relay.IncompleteType";
+  TVM_DECLARE_NODE_TYPE_INFO(IncompleteTypeNode, TypeNode);
+};
+
+RELAY_DEFINE_NODE_REF(IncompleteType, IncompleteTypeNode, Type);
+
+/*!
+ * \brief Potential Constraints in the type.
+ * \note This is reserved for future use.
+ */
+class TypeConstraint;
+/*! \brief TypeConstraint container node. */
+class TypeConstraintNode : public TypeNode {
+ public:
+  static constexpr const char* _type_key = "relay.TypeConstraint";
+  TVM_DECLARE_BASE_NODE_INFO(TypeConstraintNode, TypeNode);
+};
+
+RELAY_DEFINE_NODE_REF(TypeConstraint, TypeConstraintNode, Type);
+
+class FuncType;
+/*!
+ * \brief Function type in Relay.
+ *
+ * Relay support polymorphic function type.
+ * This can be roughly viewed as template function in C++.
+ *
+ * \sa TypeVar, TypeConstraint
+ */
+class FuncTypeNode : public TypeNode {
+ public:
+  /*! \brief type type of arguments */
+  tvm::Array<Type> arg_types;
+  /*! \brief The type of return value. */
+  Type ret_type;
+  // The following fields are used in polymorphic(template) functions
+  // For normal functions, the following two fields will be empty.
+  /*! \brief The type parameters of the function */
+  tvm::Array<TypeVar> type_params;
+  /*!
+   * \brief potential constraint the type need to obey
+   * \note this field is reserved for futher purposes.
+   */
+  tvm::Array<TypeConstraint> type_constraints;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("arg_types", &arg_types);
+    v->Visit("ret_type", &ret_type);
+    v->Visit("type_params", &type_params);
+    v->Visit("type_constraints", &type_constraints);
+    v->Visit("span", &span);
+  }
+
+  TVM_DLL static FuncType make(tvm::Array<Type> arg_types,
+                               Type ret_type,
+                               tvm::Array<TypeVar> type_params,
+                               tvm::Array<TypeConstraint> type_constraints);
+
+  static constexpr const char* _type_key = "relay.FuncType";
+  TVM_DECLARE_NODE_TYPE_INFO(FuncTypeNode, TypeNode);
+};
+
+RELAY_DEFINE_NODE_REF(FuncType, FuncTypeNode, Type);
+
+/*!
+ * \brief The type of tuple values.
+ */
+class TupleType;
+/*!
+ * \brief TupleType container.
+ */
+class TupleTypeNode : public TypeNode {
+ public:
+  /*! \brief The type of each field in the tuple. */
+  tvm::Array<Type> fields;
+
+  TupleTypeNode() {}
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("fields", &fields);
+    v->Visit("span", &span);
+  }
+
+  TVM_DLL static TupleType make(tvm::Array<Type> fields);
+
+  static constexpr const char* _type_key = "relay.TupleType";
+  TVM_DECLARE_NODE_TYPE_INFO(TupleTypeNode, TypeNode);
+};
+
+RELAY_DEFINE_NODE_REF(TupleType, TupleTypeNode, Type);
+
+class TypeReporter;
+
+/*!
+ * \brief reporter that reports back to the
+ *  type resolution information.
+ */
+class TypeReporterNode : public Node {
+ public:
+  /*!
+   * \brief Create a type equality constraint.
+   *
+   *  The "assign direction" acts as a hint to the solver
+   *  showing that it is more likely to resolve dst by src.
+   *  But it is possible for the solver to resolve src by dst as well.
+   */
+  TVM_DLL virtual void Assign(const Type& dst, const Type& src) = 0;
+  /*!
+   * \brief assert shape expression comparison.
+   * \note Use assert only if any of the condition input is symbolic.
+   * \param cond The condition of operation.
+   * \return false if assertation can be proven to have failed
+   *      true if solver can still proceed.
+   */
+  TVM_DLL virtual bool Assert(const IndexExpr& cond)= 0;
+  /*!
+   * \brief assert shape expression equals each other.
+   * \param lhs The left operand.
+   * \param rhs The right operand.
+   * \return false if assertation can be proven to have failed
+   *      true if solver can still proceed.
+   */
+  TVM_DLL virtual bool AssertEQ(const IndexExpr& lhs, const IndexExpr& rhs) = 0;
+
+  // solver is not serializable.
+  void VisitAttrs(tvm::AttrVisitor* v) final {}
+
+  static constexpr const char* _type_key = "relay.TypeReporter";
+  TVM_DECLARE_NODE_TYPE_INFO(TypeReporterNode, Node);
+};
+
+/*!
+ * \brief Container class of TypeReporter.
+ * \sa TypeReporterNode
+ */
+class TypeReporter : public NodeRef {
+ public:
+  TypeReporter() {}
+  explicit TypeReporter(::tvm::NodePtr<::tvm::Node> n) : NodeRef(n) {
+  }
+  TypeReporterNode* operator->() const {
+    return static_cast<TypeReporterNode*>(node_.get());
+  }
+  using ContainerType = TypeReporterNode;
+};
+
+/*!
+ * \brief User defined type constraint function.
+ *
+ * If the input type information can be used to fully decide
+ * the IncompleteTypes, then the function should call
+ * reporter.Assign to report the new types, and return true.
+ * Otherwise, the function should return false.
+ *
+ * \param args The arguments to the relation.
+ *   The types are stored in the form of
+ *   [input_type_0, input_type_1, ... input_type_n,
+ *    output_type_0, output_type_1, ... output_type_m]
+ *
+ * \param num_inputs Number of input types in the args.
+ * \param attrs The additional attributes of the operator.
+ * \param reporter The reporter to report solution to.
+ * \return false if This relation cannot be resolved.
+ *   true if this relation has been resolved.
+ */
+using TypeRelationFn =
+    TypedEnvFunc<bool(const Array<Type>& args,
+                      int num_inputs,
+                      const Attrs& attrs,
+                      const TypeReporter& reporter)>;
+
+/*!
+ * \brief User defined type relation, is an input-output relation on types.
+ */
+class TypeRelation;
+/*!
+ * \brief TypeRelation container.
+ * \note This node is not directly serializable.
+ * The type function need to be lookedup in the module.
+ */
+class TypeRelationNode : public TypeConstraintNode {
+ public:
+  /*!
+   * \brief The function on input and output variables which
+   *  this is not directly serializable,
+   *  need to be looked-up in the module.
+   */
+  TypeRelationFn func;
+  /*! \brief The type arguments to the type function. */
+  tvm::Array<Type> args;
+  /*! \brief Number of inputs arguments */
+  int num_inputs;
+  /*! \brief Attributes to the relation function */
+  Attrs attrs;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("func", &func);
+    v->Visit("args", &args);
+    v->Visit("num_inputs", &num_inputs);
+    v->Visit("attrs", &attrs);
+    v->Visit("span", &span);
+  }
+
+  TVM_DLL static TypeRelation make(TypeRelationFn func,
+                                   Array<Type> args,
+                                   int num_args,
+                                   Attrs attrs);
+
+  static constexpr const char* _type_key = "relay.TypeRelation";
+  TVM_DECLARE_NODE_TYPE_INFO(TypeRelationNode, TypeConstraintNode);
+};
+
+RELAY_DEFINE_NODE_REF(TypeRelation, TypeRelationNode, TypeConstraint);
+
+// The following fields contains advanced typing
+// Only keep the class name and reserved for future usage.
+class GenericTensorType;
+// stores a DataType.
+class GenericDataType;
+// stores a DataType.
+class GenericShape;
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_TYPE_H_
diff --git a/include/tvm/runtime/c_backend_api.h b/include/tvm/runtime/c_backend_api.h
index 60e284610494..4adc9e2790fe 100644
--- a/include/tvm/runtime/c_backend_api.h
+++ b/include/tvm/runtime/c_backend_api.h
@@ -10,7 +10,7 @@
 #ifndef TVM_RUNTIME_C_BACKEND_API_H_
 #define TVM_RUNTIME_C_BACKEND_API_H_
 
-#include "./c_runtime_api.h"
+#include "c_runtime_api.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -118,7 +118,7 @@ TVM_DLL int TVMBackendParallelBarrier(int task_id, TVMParallelGroupEnv* penv);
 
 
 /*!
- * \brief Simple static initialization fucntion.
+ * \brief Simple static initialization function.
  *  Run f once and set handle to be not null.
  *  This function is mainly used for test purpose.
  *
diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h
index 17d00bf479aa..75e936d8f502 100644
--- a/include/tvm/runtime/c_runtime_api.h
+++ b/include/tvm/runtime/c_runtime_api.h
@@ -43,7 +43,7 @@
 #endif
 
 // TVM version
-#define TVM_VERSION "0.4.0"
+#define TVM_VERSION "0.5.dev"
 
 
 // TVM Runtime is DLPack compatible.
@@ -62,11 +62,7 @@ typedef int64_t tvm_index_t;
 typedef enum {
   kDLAOCL = 5,
   kDLSDAccel = 6,
-  kDLVulkan = 7,
   kOpenGL = 11,
-  // Extension DRAM type, used for quickly test extension device
-  // The device api can differ depending on the xpu driver registered.
-  kExtDev = 12,
   // AddExtraTVMType which is not in DLPack here
 } TVMDeviceExtType;
 
@@ -445,6 +441,32 @@ TVM_DLL int TVMArrayCopyFromTo(TVMArrayHandle from,
                                TVMArrayHandle to,
                                TVMStreamHandle stream);
 
+/*!
+ * \brief Produce an array from the DLManagedTensor that shares data memory
+ * with the DLManagedTensor.
+ * \param from The source DLManagedTensor.
+ * \param out The output array handle.
+ * \return 0 when success, -1 when failure happens
+ */
+TVM_DLL int TVMArrayFromDLPack(DLManagedTensor* from,
+                               TVMArrayHandle* out);
+
+/*!
+ * \brief Produce a DLMangedTensor from the array that shares data memory with
+ * the array.
+ * \param from The source array.
+ * \param out The DLManagedTensor handle.
+ * \return 0 when success, -1 when failure happens
+ */
+TVM_DLL int TVMArrayToDLPack(TVMArrayHandle from,
+                             DLManagedTensor** out);
+
+/*!
+ * \brief Delete (free) a DLManagedTensor's data.
+ * \param dltensor Pointer to the DLManagedTensor.
+ */
+TVM_DLL void TVMDLManagedTensorCallDeleter(DLManagedTensor* dltensor);
+
 /*!
  * \brief Create a new runtime stream.
  *
diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h
index 3458c143e662..2a5ea83a4d2d 100644
--- a/include/tvm/runtime/device_api.h
+++ b/include/tvm/runtime/device_api.h
@@ -7,8 +7,8 @@
 #define TVM_RUNTIME_DEVICE_API_H_
 
 #include <string>
-#include "./packed_func.h"
-#include "./c_runtime_api.h"
+#include "packed_func.h"
+#include "c_runtime_api.h"
 
 namespace tvm {
 namespace runtime {
@@ -178,6 +178,40 @@ class DeviceAPI {
 
 /*! \brief The device type bigger than this is RPC device */
 constexpr int kRPCSessMask = 128;
+
+/*!
+ * \brief The name of Device API factory.
+ * \param type The device type.
+ * \return the device name.
+ */
+inline const char* DeviceName(int type) {
+  switch (type) {
+    case kDLCPU: return "cpu";
+    case kDLGPU: return "gpu";
+    case kDLOpenCL: return "opencl";
+    case kDLSDAccel: return "sdaccel";
+    case kDLAOCL: return "aocl";
+    case kDLVulkan: return "vulkan";
+    case kDLMetal: return "metal";
+    case kDLVPI: return "vpi";
+    case kDLROCM: return "rocm";
+    case kOpenGL: return "opengl";
+    case kDLExtDev: return "ext_dev";
+    default: LOG(FATAL) << "unknown type =" << type; return "Unknown";
+  }
+}
+
+#ifndef _LIBCPP_SGX_NO_IOSTREAMS
+inline std::ostream& operator<<(std::ostream& os, DLContext ctx) {  // NOLINT(*)
+  int device_type = static_cast<int>(ctx.device_type);
+  if (device_type > kRPCSessMask) {
+    os << "remote[" << (device_type / kRPCSessMask) << "]-";
+    device_type = device_type % kRPCSessMask;
+  }
+  os << runtime::DeviceName(device_type) << "(" << ctx.device_id << ")";
+  return os;
+}
+#endif
 }  // namespace runtime
 }  // namespace tvm
 #endif  // TVM_RUNTIME_DEVICE_API_H_
diff --git a/include/tvm/runtime/module.h b/include/tvm/runtime/module.h
index de0b02500b6d..675dd8728675 100644
--- a/include/tvm/runtime/module.h
+++ b/include/tvm/runtime/module.h
@@ -13,7 +13,7 @@
 #include <vector>
 #include <string>
 #include <unordered_map>
-#include "./c_runtime_api.h"
+#include "c_runtime_api.h"
 
 namespace tvm {
 namespace runtime {
@@ -103,8 +103,8 @@ class ModuleNode {
    * \param file_name The file to be saved to.
    * \param format The format of the file.
    */
-  virtual void SaveToFile(const std::string& file_name,
-                          const std::string& format);
+  TVM_DLL virtual void SaveToFile(const std::string& file_name,
+                                  const std::string& format);
   /*!
    * \brief Save the module to binary stream.
    * \param stream The binary stream to save to.
@@ -173,5 +173,5 @@ inline const ModuleNode* Module::operator->() const {
 }  // namespace runtime
 }  // namespace tvm
 
-#include "./packed_func.h"
+#include "packed_func.h"
 #endif  // TVM_RUNTIME_MODULE_H_
diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h
index 2b51b2e0fcfe..0fc8e42b8bcb 100644
--- a/include/tvm/runtime/ndarray.h
+++ b/include/tvm/runtime/ndarray.h
@@ -9,8 +9,8 @@
 #include <atomic>
 #include <vector>
 #include <utility>
-#include "./c_runtime_api.h"
-#include "./serializer.h"
+#include "c_runtime_api.h"
+#include "serializer.h"
 
 namespace tvm {
 namespace runtime {
@@ -30,8 +30,11 @@ class NDArray {
    */
   explicit inline NDArray(Container* data);
   /*!
-   * \brief copy constructor
-   * \param other The value to be copied
+   * \brief copy constructor.
+   *
+   * It does not make a copy, but the reference count of the input NDArray is incremented
+   *
+   * \param other NDArray that shares internal data with the input NDArray.
    */
   inline NDArray(const NDArray& other);  // NOLINT(*)
   /*!
@@ -155,7 +158,7 @@ class NDArray {
    * that is DLPack compatible.
    *
    * The memory is retained until the NDArray went out of scope.
-   *
+   * \param tensor The DLPack tensor to copy from.
    * \return The created NDArray view.
    */
   TVM_DLL static NDArray FromDLPack(DLManagedTensor* tensor);
@@ -246,6 +249,7 @@ struct NDArray::Container {
 
  private:
   friend class NDArray;
+  friend class RPCWrappedFunc;
   /*!
    * \brief The shape container,
    *  can be used used for shape data.
@@ -259,12 +263,16 @@ struct NDArray::Container {
 // the usages of functions are documented in place.
 inline NDArray::NDArray(Container* data)
   : data_(data) {
-  data_->IncRef();
+  if (data != nullptr) {
+    data_->IncRef();
+  }
 }
 
 inline NDArray::NDArray(const NDArray& other)
   : data_(other.data_) {
-  data_->IncRef();
+  if (data_ != nullptr) {
+    data_->IncRef();
+  }
 }
 
 inline void NDArray::reset() {
@@ -274,6 +282,21 @@ inline void NDArray::reset() {
   }
 }
 
+/*! \brief return the size of data the DLTensor hold, in term of number of bytes
+ *
+ *  \param arr the input DLTensor
+ *
+ *  \return number of  bytes of data in the DLTensor.
+ */
+inline size_t GetDataSize(const DLTensor& arr) {
+  size_t size = 1;
+  for (tvm_index_t i = 0; i < arr.ndim; ++i) {
+    size *= static_cast<size_t>(arr.shape[i]);
+  }
+  size *= (arr.dtype.bits * arr.dtype.lanes + 7) / 8;
+  return size;
+}
+
 inline void NDArray::CopyFrom(DLTensor* other) {
   CHECK(data_ != nullptr);
   CopyFromTo(other, &(data_->dl_tensor));
diff --git a/include/tvm/runtime/node_base.h b/include/tvm/runtime/node_base.h
new file mode 100644
index 000000000000..bc62ac460cff
--- /dev/null
+++ b/include/tvm/runtime/node_base.h
@@ -0,0 +1,241 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/runtime/node_base.h
+ * \brief Base data structure for Node.
+ *
+ * \note Node is not a runtime feature.
+ *  This file only exposes the signature of NodePtr for PackedFunc.
+ */
+#ifndef TVM_RUNTIME_NODE_BASE_H_
+#define TVM_RUNTIME_NODE_BASE_H_
+
+#include <utility>
+#include <atomic>
+
+namespace tvm {
+
+// forward declarations
+template<typename T>
+class NodePtr;
+class Node;
+class NodeRef;
+
+/*!
+ * \brief Base class of Node for runtime destructor purposes.
+ *
+ * Node is a reference counted object which is used to construct AST.
+ * Each node is backed by a custom deleter, which deletes the object.
+ * Do not call create raw Node pointer, always use tvm::make_node.
+ *
+ * \note In most cases, please inheritate tvm::Node.
+ * \sa Node, NodePtr, make_node
+ */
+class NodeBase {
+ public:
+  /*!
+   * \brief type of NodeBase deleter
+   * \param self pointer to the NodeBase.
+   */
+  typedef void (*FDeleter)(NodeBase* self);
+
+ protected:
+  // default constructor and copy constructor
+  NodeBase() {}
+  // override the copy and assign constructors to do nothing.
+  // This is to make sure only contents, but not deleter and ref_counter
+  // are copied when a child class copies itself.
+  NodeBase(const NodeBase& other) {  // NOLINT(*)
+  }
+  NodeBase(NodeBase&& other) {  // NOLINT(*)
+  }
+  NodeBase& operator=(const NodeBase& other) {  //NOLINT(*)
+    return *this;
+  }
+  NodeBase& operator=(NodeBase&& other) {  //NOLINT(*)
+    return *this;
+  }
+
+ private:
+  /*! \brief Internal reference counter */
+  std::atomic<int> ref_counter_{0};
+  /*!
+   * \brief deleter of this object to enable customized allocation.
+   * If the deleter is nullptr, no deletion will be performed.
+   * The creator of the Node must always set the deleter field properly.
+   */
+  FDeleter deleter_ = nullptr;
+  // reference counting functions
+  void IncRef() {
+    ref_counter_.fetch_add(1, std::memory_order_relaxed);
+  }
+  void DecRef() {
+    if (ref_counter_.fetch_sub(1, std::memory_order_release) == 1) {
+      std::atomic_thread_fence(std::memory_order_acquire);
+      if (this->deleter_ != nullptr) {
+        (*this->deleter_)(this);
+      }
+    }
+  }
+  int use_count() const {
+    return ref_counter_.load(std::memory_order_relaxed);
+  }
+  // friend declaration
+  template<typename>
+  friend class NodePtr;
+  template<typename Y, typename... Args>
+  friend NodePtr<Y> make_node(Args&&...);
+};
+
+/*!
+ * \brief Smart pointer for Node containers,
+ *  must be subclass of NodeBase
+ * \tparam T the content data type.
+ */
+template<typename T>
+class NodePtr {
+ public:
+  /*! \brief default constructor */
+  NodePtr() {}
+  /*! \brief default constructor */
+  NodePtr(std::nullptr_t) {}  // NOLINT(*)
+  /*!
+   * \brief copy constructor
+   * \param other The value to be moved
+   */
+  NodePtr(const NodePtr<T>& other)  // NOLINT(*)
+      : NodePtr(other.data_) {
+  }
+  /*!
+   * \brief copy constructor
+   * \param other The value to be moved
+   */
+  template<typename Y>
+  NodePtr(const NodePtr<Y>& other)  // NOLINT(*)
+      : NodePtr(other.data_) {
+    static_assert(std::is_base_of<T, Y>::value,
+                  "can only assign of child class NodePtr to parent");
+  }
+  /*!
+   * \brief move constructor
+   * \param other The value to be moved
+   */
+  NodePtr(NodePtr<T>&& other) // NOLINT(*)
+      : data_(other.data_) {
+    other.data_ = nullptr;
+  }
+  /*!
+   * \brief move constructor
+   * \param other The value to be moved
+   */
+  template<typename Y>
+  NodePtr(NodePtr<Y>&& other)  // NOLINT(*)
+      : data_(other.data_) {
+    static_assert(std::is_base_of<T, Y>::value,
+                  "can only assign of child class NodePtr to parent");
+    other.data_ = nullptr;
+  }
+  /*! \brief destructor */
+  ~NodePtr() {
+    this->reset();
+  }
+  /*!
+   * \brief Swap this array with another NDArray
+   * \param other The other NDArray
+   */
+  void swap(NodePtr<T>& other) {  // NOLINT(*)
+    std::swap(data_, other.data_);
+  }
+  /*!
+   * \return Get the content of the pointer
+   */
+  T* get() const {
+    return static_cast<T*>(data_);
+  }
+  /*!
+   * \return The pointer
+   */
+  T* operator->() const {
+    return get();
+  }
+  /*!
+   * \return The reference
+   */
+  T& operator*() const { // NOLINT(*)
+    return *get();
+  }
+  /*!
+   * \brief copy assignmemt
+   * \param other The value to be assigned.
+   * \return reference to self.
+   */
+  NodePtr<T>& operator=(const NodePtr<T>& other) {  // NOLINT(*)
+    // takes in plane operator to enable copy elison.
+    // copy-and-swap idiom
+    NodePtr(other).swap(*this);  // NOLINT(*)
+    return *this;
+  }
+  /*!
+   * \brief move assignmemt
+   * \param other The value to be assigned.
+   * \return reference to self.
+   */
+  NodePtr<T>& operator=(NodePtr<T>&& other) {  // NOLINT(*)
+    // copy-and-swap idiom
+    NodePtr(std::move(other)).swap(*this); // NOLINT(*)
+    return *this;
+  }
+  /*! \brief reset the content of ptr to be nullptr */
+  void reset() {
+    if (data_ != nullptr) {
+      data_->DecRef();
+      data_ = nullptr;
+    }
+  }
+  /*! \return The use count of the ptr, for debug purposes */
+  int use_count() const {
+    return data_ != nullptr ? data_->use_count() : 0;
+  }
+  /*! \return whether the reference is unique */
+  bool unique() const {
+    return data_ != nullptr && data_->use_count() == 1;
+  }
+  /*! \return Whether two NodePtr do not equals each other */
+  bool operator==(const NodePtr<T>& other) const {
+    return data_ == other.data_;
+  }
+  /*! \return Whether two NodePtr equals each other */
+  bool operator!=(const NodePtr<T>& other) const {
+    return data_ != other.data_;
+  }
+  /*! \return Whether the pointer is nullptr */
+  bool operator==(std::nullptr_t null) const {
+    return data_ == nullptr;
+  }
+  /*! \return Whether the pointer is not nullptr */
+  bool operator!=(std::nullptr_t null) const {
+    return data_ != nullptr;
+  }
+
+ private:
+  /*! \brief internal pointer field */
+  NodeBase* data_{nullptr};
+  /*!
+   * \brief constructor from NodeBase
+   * \param data The node base pointer
+   */
+  explicit NodePtr(NodeBase* data)
+      : data_(data) {
+    if (data != nullptr) {
+      data_->IncRef();
+    }
+  }
+  // friend declaration
+  friend class Node;
+  template<typename>
+  friend class NodePtr;
+  template<typename Y, typename... Args>
+  friend NodePtr<Y> make_node(Args&&...);
+};
+}  // namespace tvm
+
+#endif  // TVM_RUNTIME_NODE_BASE_H_
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index 6d8df4a5e3d6..1e5265c07959 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -14,9 +14,10 @@
 #include <limits>
 #include <memory>
 #include <type_traits>
-#include "./c_runtime_api.h"
-#include "./module.h"
-#include "./ndarray.h"
+#include "c_runtime_api.h"
+#include "module.h"
+#include "ndarray.h"
+#include "node_base.h"
 
 namespace HalideIR {
 // Forward declare type for extensions
@@ -31,11 +32,8 @@ struct Expr;
 #endif
 
 namespace tvm {
-// Forward declare NodeRef and Node for extensions.
-// This header works fine without depend on NodeRef
-// as long as it is not used.
-class Node;
-class NodeRef;
+// forward declarations
+class Integer;
 
 namespace runtime {
 // forward declarations
@@ -75,6 +73,8 @@ class PackedFunc {
   using FType = std::function<void (TVMArgs args, TVMRetValue* rv)>;
   /*! \brief default constructor */
   PackedFunc() {}
+  /*! \brief constructor from null */
+  PackedFunc(std::nullptr_t null) {}  // NOLINT(*)
   /*!
    * \brief constructing a packed function from a std::function.
    * \param body the internal container of packed function.
@@ -118,6 +118,181 @@ class PackedFunc {
   FType body_;
 };
 
+/*!
+ * \brief Please refer to \ref TypedPackedFuncAnchor "TypedPackedFunc<R(Args..)>"
+ */
+template<typename FType>
+class TypedPackedFunc;
+
+/*!
+ * \anchor TypedPackedFuncAnchor
+ * \brief A PackedFunc wrapper to provide typed function signature.
+ * It is backed by a PackedFunc internally.
+ *
+ * TypedPackedFunc enables compile time type checking.
+ * TypedPackedFunc works with the runtime system:
+ * - It can be passed as an argument of PackedFunc.
+ * - It can be assigned to TVMRetValue.
+ * - It can be directly converted to a type-erased PackedFunc.
+ *
+ * Developers should prefer TypedPackedFunc over PackedFunc in C++ code
+ * as it enables compile time checking.
+ * We can construct a TypedPackedFunc from a lambda function
+ * with the same signature.
+ *
+ * \code
+ *  // user defined lambda function.
+ *  auto addone = [](int x)->int {
+ *    return x + 1;
+ *  };
+ *  // We can directly convert
+ *  // lambda function to TypedPackedFunc
+ *  TypedPackedFunc<int(int)> ftyped(addone);
+ *  // invoke the function.
+ *  int y = ftyped(1);
+ *  // Can be directly converted to PackedFunc
+ *  PackedFunc packed = ftype;
+ * \endcode
+ * \tparam R The return value of the function.
+ * \tparam Args The argument signature of the function.
+ */
+template<typename R, typename ...Args>
+class TypedPackedFunc<R(Args...)> {
+ public:
+  /*! \brief short hand for this function type */
+  using TSelf = TypedPackedFunc<R(Args...)>;
+  /*! \brief default constructor */
+  TypedPackedFunc() {}
+  /*! \brief constructor from null */
+  TypedPackedFunc(std::nullptr_t null) {}  // NOLINT(*)
+  /*!
+   * \brief construct by wrap a PackedFunc
+   *
+   * Example usage:
+   * \code
+   * PackedFunc packed([](TVMArgs args, TVMRetValue *rv) {
+   *   int x = args[0];
+   *   *rv = x + 1;
+   *  });
+   * // construct from packed function
+   * TypedPackedFunc<int(int)> ftyped(packed);
+   * // call the typed version.
+   * CHECK_EQ(ftyped(1), 2);
+   * \endcode
+   *
+   * \param packed The packed function
+   */
+  inline TypedPackedFunc(PackedFunc packed);  // NOLINT(*)
+  /*!
+   * \brief constructor from TVMRetValue
+   * \param value The TVMRetValue
+   */
+  inline TypedPackedFunc(const TVMRetValue& value);  // NOLINT(*)
+  /*!
+   * \brief constructor from TVMArgValue
+   * \param value The TVMArgValue
+   */
+  inline TypedPackedFunc(const TVMArgValue& value);  // NOLINT(*)
+  /*!
+   * \brief construct from a lambda function with the same signature.
+   *
+   * Example usage:
+   * \code
+   * auto typed_lambda = [](int x)->int { return x + 1; }
+   * // construct from packed function
+   * TypedPackedFunc<int(int)> ftyped(typed_lambda);
+   * // call the typed version.
+   * CHECK_EQ(ftyped(1), 2);
+   * \endcode
+   *
+   * \param typed_lambda typed lambda function.
+   * \tparam FLambda the type of the lambda function.
+   */
+  template<typename FLambda,
+           typename = typename std::enable_if<
+             std::is_convertible<FLambda,
+                                 std::function<R(Args...)>
+                                 >::value>::type>
+  TypedPackedFunc(const FLambda& typed_lambda) {  // NOLINT(*)
+    this->AssignTypedLambda(typed_lambda);
+  }
+  /*!
+   * \brief copy assignment operator from typed lambda
+   *
+   * Example usage:
+   * \code
+   * // construct from packed function
+   * TypedPackedFunc<int(int)> ftyped;
+   * ftyped = [](int x) { return x + 1; }
+   * // call the typed version.
+   * CHECK_EQ(ftyped(1), 2);
+   * \endcode
+   *
+   * \param typed_lambda typed lambda function.
+   * \tparam FLambda the type of the lambda function.
+   * \returns reference to self.
+   */
+  template<typename FLambda,
+           typename = typename std::enable_if<
+             std::is_convertible<FLambda,
+                                 std::function<R(Args...)>
+                                 >::value>::type>
+  TSelf& operator=(FLambda typed_lambda) {  // NOLINT(*)
+    this->AssignTypedLambda(typed_lambda);
+    return *this;
+  }
+  /*!
+   * \brief copy assignment operator from PackedFunc.
+   * \param packed The packed function.
+   * \returns reference to self.
+   */
+  TSelf& operator=(PackedFunc packed) {
+    packed_ = packed;
+    return *this;
+  }
+  /*!
+   * \brief Invoke the operator.
+   * \param args The arguments
+   * \returns The return value.
+   */
+  inline R operator()(Args ...args) const;
+  /*!
+   * \brief convert to PackedFunc
+   * \return the internal PackedFunc
+   */
+  operator PackedFunc() const {
+    return packed();
+  }
+  /*!
+   * \return reference the internal PackedFunc
+   */
+  const PackedFunc& packed() const {
+    return packed_;
+  }
+  /*! \return Whether the packed function is nullptr */
+  bool operator==(std::nullptr_t null) const {
+    return packed_ == nullptr;
+  }
+  /*! \return Whether the packed function is not nullptr */
+  bool operator!=(std::nullptr_t null) const {
+    return packed_ != nullptr;
+  }
+
+ private:
+  friend class TVMRetValue;
+  /*! \brief The internal packed function */
+  PackedFunc packed_;
+  /*!
+   * \brief Assign the packed field using a typed lambda function.
+   *
+   * \param flambda The lambda function.
+   * \tparam FLambda The lambda function type.
+   * \note We capture the lambda when possible for maximum efficiency.
+   */
+  template<typename FLambda>
+  inline void AssignTypedLambda(FLambda flambda);
+};
+
 /*! \brief Arguments into TVM functions. */
 class TVMArgs {
  public:
@@ -223,6 +398,12 @@ class ExtTypeVTable {
 class TVMPODValue_ {
  public:
   operator double() const {
+    // Allow automatic conversion from int to float
+    // This avoids errors when user pass in int from
+    // the frontend while the API expects a float.
+    if (type_code_ == kDLInt) {
+      return static_cast<double>(value_.v_int64);
+    }
     TVM_CHECK_TYPE_CODE(type_code_, kDLFloat);
     return value_.v_float64;
   }
@@ -310,6 +491,8 @@ class TVMPODValue_ {
  */
 class TVMArgValue : public TVMPODValue_ {
  public:
+  /*! \brief default constructor */
+  TVMArgValue() {}
   /*!
    * \brief constructor
    * \param value of the function
@@ -345,6 +528,12 @@ class TVMArgValue : public TVMPODValue_ {
     if (type_code_ == kStr) {
       return String2TVMType(operator std::string());
     }
+    // None type
+    if (type_code_ == kNull) {
+      TVMType t;
+      t.code = kHandle; t.bits = 0; t.lanes = 0;
+      return t;
+    }
     TVM_CHECK_TYPE_CODE(type_code_, kTVMType);
     return value_.v_type;
   }
@@ -353,6 +542,10 @@ class TVMArgValue : public TVMPODValue_ {
     TVM_CHECK_TYPE_CODE(type_code_, kFuncHandle);
     return *ptr<PackedFunc>();
   }
+  template<typename FType>
+  operator TypedPackedFunc<FType>() const {
+    return TypedPackedFunc<FType>(operator PackedFunc());
+  }
   operator Module() const {
     TVM_CHECK_TYPE_CODE(type_code_, kModuleHandle);
     return *ptr<Module>();
@@ -373,8 +566,9 @@ class TVMArgValue : public TVMPODValue_ {
   inline bool IsNodeType() const;
   inline operator HalideIR::Type() const;
   inline operator HalideIR::Expr() const;
+  inline operator tvm::Integer() const;
   // get internal node ptr, if it is node
-  inline std::shared_ptr<Node>& node_sptr();
+  inline NodePtr<Node>& node_sptr();
 };
 
 /*!
@@ -413,7 +607,7 @@ class TVMRetValue : public TVMPODValue_ {
   using TVMPODValue_::operator TVMContext;
   using TVMPODValue_::operator NDArray;
   // Disable copy and assign from another value, but allow move.
-  TVMRetValue(const TVMRetValue& other) {
+  TVMRetValue(const TVMRetValue& other) : TVMPODValue_() {
     this->Assign(other);
   }
   // conversion operators
@@ -438,6 +632,10 @@ class TVMRetValue : public TVMPODValue_ {
     TVM_CHECK_TYPE_CODE(type_code_, kFuncHandle);
     return *ptr<PackedFunc>();
   }
+  template<typename FType>
+  operator TypedPackedFunc<FType>() const {
+    return TypedPackedFunc<FType>(operator PackedFunc());
+  }
   operator Module() const {
     TVM_CHECK_TYPE_CODE(type_code_, kModuleHandle);
     return *ptr<Module>();
@@ -475,6 +673,11 @@ class TVMRetValue : public TVMPODValue_ {
     value_.v_int64 = value;
     return *this;
   }
+  TVMRetValue& operator=(TVMContext value) {
+    this->SwitchToPOD(kTVMContext);
+    value_.v_ctx = value;
+    return *this;
+  }
   TVMRetValue& operator=(TVMType t) {
     this->SwitchToPOD(kTVMType);
     value_.v_type = t;
@@ -504,6 +707,10 @@ class TVMRetValue : public TVMPODValue_ {
     this->SwitchToClass(kFuncHandle, f);
     return *this;
   }
+  template<typename FType>
+  TVMRetValue& operator=(const TypedPackedFunc<FType>& f) {
+    return operator=(f.packed());
+  }
   TVMRetValue& operator=(Module m) {
     this->SwitchToClass(kModuleHandle, m);
     return *this;
@@ -557,7 +764,7 @@ class TVMRetValue : public TVMPODValue_ {
   template<typename TNodeRef>
   inline TNodeRef AsNodeRef() const;
   inline TVMRetValue& operator=(const NodeRef& other);
-  inline TVMRetValue& operator=(const std::shared_ptr<Node>& other);
+  inline TVMRetValue& operator=(const NodePtr<Node>& other);
   // type related
   inline operator HalideIR::Type() const;
   inline TVMRetValue& operator=(const HalideIR::Type& other);
@@ -587,8 +794,8 @@ class TVMRetValue : public TVMPODValue_ {
         break;
       }
       case kNodeHandle: {
-        SwitchToClass<std::shared_ptr<Node> >(
-            kNodeHandle, *other.template ptr<std::shared_ptr<Node> >());
+        SwitchToClass<NodePtr<Node> >(
+            kNodeHandle, *other.template ptr<NodePtr<Node> >());
         break;
       }
       default: {
@@ -633,7 +840,7 @@ class TVMRetValue : public TVMPODValue_ {
       case kStr: delete ptr<std::string>(); break;
       case kFuncHandle: delete ptr<PackedFunc>(); break;
       case kModuleHandle: delete ptr<Module>(); break;
-      case kNodeHandle: delete ptr<std::shared_ptr<Node> >(); break;
+      case kNodeHandle: delete ptr<NodePtr<Node> >(); break;
       case kNDArrayContainer: {
         static_cast<NDArray::Container*>(value_.v_handle)->DecRef();
         break;
@@ -674,6 +881,9 @@ inline const char* TypeCode2Str(int type_code) {
 
 #ifndef _LIBCPP_SGX_NO_IOSTREAMS
 inline std::ostream& operator<<(std::ostream& os, TVMType t) {  // NOLINT(*)
+  if (t.bits == 1 && t.lanes == 1 && t.code == kDLUInt) {
+    os << "bool"; return os;
+  }
   os << TypeCode2Str(t.code);
   if (t.code == kHandle) return os;
   os << static_cast<int>(t.bits);
@@ -682,15 +892,19 @@ inline std::ostream& operator<<(std::ostream& os, TVMType t) {  // NOLINT(*)
   }
   return os;
 }
+
 #endif
 
 inline std::string TVMType2String(TVMType t) {
+  if (t.bits == 0) return "";
 #ifndef _LIBCPP_SGX_NO_IOSTREAMS
   std::ostringstream os;
   os << t;
   return os.str();
 #else
-  std::string repr = "";
+  if (t.bits == 1 && t.lanes == 1 && t.code == kDLUInt) {
+    return "bool";
+  }
   repr += TypeCode2Str(t.code);
   if (t.code == kHandle) return repr;
   repr += std::to_string(static_cast<int>(t.bits));
@@ -703,6 +917,11 @@ inline std::string TVMType2String(TVMType t) {
 
 inline TVMType String2TVMType(std::string s) {
   TVMType t;
+  // handle None type
+  if (s.length() == 0) {
+    t.bits = 0; t.lanes = 0; t.code = kHandle;
+    return t;
+  }
   t.bits = 32; t.lanes = 1;
   const char* scan;
   if (s.substr(0, 3) == "int") {
@@ -715,6 +934,11 @@ inline TVMType String2TVMType(std::string s) {
     t.code = kHandle;
     t.bits = 64;  // handle uses 64 bit by default.
     scan = s.c_str() + 6;
+  } else if (s == "bool") {
+    t.code = kDLUInt;
+    t.bits = 1;
+    t.lanes = 1;
+    return t;
   } else {
     scan = s.c_str();
     LOG(FATAL) << "unknown type " << s;
@@ -722,9 +946,11 @@ inline TVMType String2TVMType(std::string s) {
   char* xdelim;  // emulate sscanf("%ux%u", bits, lanes)
   uint8_t bits = static_cast<uint8_t>(strtoul(scan, &xdelim, 10));
   if (bits != 0) t.bits = bits;
+  char* endpt = xdelim;
   if (*xdelim == 'x') {
-    t.lanes = static_cast<uint16_t>(strtoul(xdelim + 1, nullptr, 10));
+    t.lanes = static_cast<uint16_t>(strtoul(xdelim + 1, &endpt, 10));
   }
+  CHECK(endpt == s.c_str() + s.length()) << "unknown type " << s;
   return t;
 }
 
@@ -748,6 +974,8 @@ inline PackedFunc::FType PackedFunc::body() const {
   return body_;
 }
 
+
+
 // internal namespace
 namespace detail {
 
@@ -839,6 +1067,10 @@ class TVMArgsSetter {
     values_[i].v_handle = const_cast<PackedFunc*>(&value);
     type_codes_[i] = kFuncHandle;
   }
+  template<typename FType>
+  void operator()(size_t i, const TypedPackedFunc<FType>& value) const {  // NOLINT(*)
+    operator()(i, value.packed());
+  }
   void operator()(size_t i, const Module& value) const {  // NOLINT(*)
     values_[i].v_handle = const_cast<Module*>(&value);
     type_codes_[i] = kModuleHandle;
@@ -886,6 +1118,96 @@ inline TVMRetValue PackedFunc::operator()(Args&& ...args) const {
   return rv;
 }
 
+namespace detail {
+template<typename R, int nleft, int index, typename F>
+struct unpack_call_dispatcher {
+  template<typename ...Args>
+  static void run(const F& f,
+                  const TVMArgs& args_pack,
+                  TVMRetValue* rv,
+                  Args&&... unpacked_args) {
+    unpack_call_dispatcher<R, nleft - 1, index + 1, F>
+        ::run(f, args_pack, rv,
+              std::forward<Args>(unpacked_args)...,
+              args_pack[index]);
+  }
+};
+
+template<typename R, int index, typename F>
+struct unpack_call_dispatcher<R, 0, index, F> {
+  template<typename ...Args>
+  static void run(const F& f,
+                  const TVMArgs& args_pack,
+                  TVMRetValue* rv,
+                  Args&&... unpacked_args) {
+    *rv = R(f(std::forward<Args>(unpacked_args)...));
+  }
+};
+
+template<int index, typename F>
+struct unpack_call_dispatcher<void, 0, index, F> {
+  template<typename ...Args>
+  static void run(const F& f,
+                  const TVMArgs& args_pack,
+                  TVMRetValue* rv,
+                  Args&&... unpacked_args) {
+    f(std::forward<Args>(unpacked_args)...);
+  }
+};
+
+template<typename R, int nargs, typename F>
+inline void unpack_call(const F& f, const TVMArgs& args, TVMRetValue* rv) {
+  unpack_call_dispatcher<R, nargs, 0, F>::run(f, args, rv);
+}
+
+template<typename R, typename ...Args>
+inline R call_packed(const PackedFunc& pf, Args&& ...args) {
+  return R(pf(std::forward<Args>(args)...));
+}
+
+template<typename R>
+struct typed_packed_call_dispatcher {
+  template<typename ...Args>
+  static inline R run(const PackedFunc& pf, Args&& ...args) {
+    return pf(std::forward<Args>(args)...);
+  }
+};
+
+template<>
+struct typed_packed_call_dispatcher<void> {
+  template<typename ...Args>
+  static inline void run(const PackedFunc& pf, Args&& ...args) {
+    pf(std::forward<Args>(args)...);
+  }
+};
+}  // namespace detail
+
+template<typename R, typename ...Args>
+TypedPackedFunc<R(Args...)>::TypedPackedFunc(PackedFunc packed)
+  : packed_(packed) {}
+
+template<typename R, typename ...Args>
+TypedPackedFunc<R(Args...)>::TypedPackedFunc(const TVMRetValue& value)
+    : packed_(value.operator PackedFunc()) {}
+
+template<typename R, typename ...Args>
+TypedPackedFunc<R(Args...)>::TypedPackedFunc(const TVMArgValue& value)
+    : packed_(value.operator PackedFunc()) {}
+
+template<typename R, typename ...Args>
+template<typename FType>
+inline void TypedPackedFunc<R(Args...)>::AssignTypedLambda(FType flambda) {
+  packed_ = PackedFunc([flambda](const TVMArgs& args, TVMRetValue* rv) {
+      detail::unpack_call<R, sizeof...(Args)>(flambda, args, rv);
+    });
+}
+
+template<typename R, typename ...Args>
+inline R TypedPackedFunc<R(Args...)>::operator()(Args... args) const {
+  return detail::typed_packed_call_dispatcher<R>
+      ::run(packed_, std::forward<Args>(args)...);
+}
+
 // extension and node type handling
 namespace detail {
 template<typename T, typename TSrc, bool is_ext>
diff --git a/include/tvm/runtime/registry.h b/include/tvm/runtime/registry.h
index 2a328c8086e0..9466056a1282 100644
--- a/include/tvm/runtime/registry.h
+++ b/include/tvm/runtime/registry.h
@@ -27,7 +27,7 @@
 
 #include <string>
 #include <vector>
-#include "./packed_func.h"
+#include "packed_func.h"
 
 namespace tvm {
 namespace runtime {
@@ -47,6 +47,24 @@ class Registry {
   Registry& set_body(PackedFunc::FType f) {  // NOLINT(*)
     return set_body(PackedFunc(f));
   }
+  /*!
+   * \brief set the body of the function to be TypedPackedFunc.
+   *
+   * \code
+   *
+   * TVM_REGISTER_API("addone")
+   * .set_body_typed<int(int)>([](int x) { return x + 1; });
+   *
+   * \endcode
+   *
+   * \param f The body of the function.
+   * \tparam FType the signature of the function.
+   * \tparam FLambda The type of f.
+   */
+  template<typename FType, typename FLambda>
+  Registry& set_body_typed(FLambda f) {
+    return set_body(TypedPackedFunc<FType>(f).packed());
+  }
   /*!
    * \brief Register a function with given name
    * \param name The name of the function.
diff --git a/include/tvm/runtime/serializer.h b/include/tvm/runtime/serializer.h
index b2ab5483a22d..e9a7d1db50ec 100644
--- a/include/tvm/runtime/serializer.h
+++ b/include/tvm/runtime/serializer.h
@@ -9,8 +9,8 @@
 
 #include <dmlc/io.h>
 #include <dmlc/serializer.h>
-#include "./c_runtime_api.h"
-#include "./ndarray.h"
+#include "c_runtime_api.h"
+#include "ndarray.h"
 
 namespace dmlc {
 namespace serializer {
diff --git a/include/tvm/runtime/util.h b/include/tvm/runtime/util.h
index 160642ffcc85..6ec168a250b6 100644
--- a/include/tvm/runtime/util.h
+++ b/include/tvm/runtime/util.h
@@ -6,7 +6,7 @@
 #ifndef TVM_RUNTIME_UTIL_H_
 #define TVM_RUNTIME_UTIL_H_
 
-#include "./c_runtime_api.h"
+#include "c_runtime_api.h"
 
 namespace tvm {
 namespace runtime {
@@ -16,12 +16,38 @@ namespace runtime {
  * \param t The type
  * \param code The type code.
  * \param bits The number of bits to be matched.
- * \param lanes The number of lanes sin the type.
+ * \param lanes The number of lanes in the type.
  */
 inline bool TypeMatch(TVMType t, int code, int bits, int lanes = 1) {
   return t.code == code && t.bits == bits && t.lanes == lanes;
 }
-
 }  // namespace runtime
 }  // namespace tvm
+// Forward declare the intrinsic id we need
+// in structure fetch to enable stackvm in runtime
+namespace tvm {
+namespace ir {
+namespace intrinsic {
+/*! \brief The kind of structure field info used in intrinsic */
+enum TVMStructFieldKind : int {
+  // array head address
+  kArrAddr,
+  kArrData,
+  kArrShape,
+  kArrStrides,
+  kArrNDim,
+  kArrTypeCode,
+  kArrTypeBits,
+  kArrTypeLanes,
+  kArrByteOffset,
+  kArrDeviceId,
+  kArrDeviceType,
+  kArrKindBound_,
+  // TVMValue field
+  kTVMValueContent,
+  kTVMValueKindBound_
+};
+}  // namespace intrinsic
+}  // namespace ir
+}  // namespace tvm
 #endif  // TVM_RUNTIME_UTIL_H_
diff --git a/include/tvm/schedule.h b/include/tvm/schedule.h
index deaf74ccf222..af72f3153291 100644
--- a/include/tvm/schedule.h
+++ b/include/tvm/schedule.h
@@ -7,10 +7,10 @@
 #define TVM_SCHEDULE_H_
 
 #include <string>
-#include "./base.h"
-#include "./expr.h"
-#include "./tensor.h"
-#include "./tensor_intrin.h"
+#include "base.h"
+#include "expr.h"
+#include "tensor.h"
+#include "tensor_intrin.h"
 
 namespace tvm {
 
@@ -36,7 +36,7 @@ enum AttachType : int {
 class Stage : public NodeRef {
  public:
   Stage() {}
-  explicit Stage(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit Stage(NodePtr<Node> n) : NodeRef(n) {}
   /*!
    * \brief create a new schedule for op.
    * \param op The operator in the schedule
@@ -260,7 +260,7 @@ class Stage : public NodeRef {
 class Schedule : public NodeRef {
  public:
   Schedule() {}
-  explicit Schedule(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit Schedule(NodePtr<Node> n) : NodeRef(n) {}
   /*!
    * \brief Get a copy of current schedule.
    * \return The copied schedule.
@@ -383,7 +383,7 @@ class Schedule : public NodeRef {
 class IterVarRelation : public NodeRef {
  public:
   IterVarRelation() {}
-  explicit IterVarRelation(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit IterVarRelation(NodePtr<Node> n) : NodeRef(n) {}
   /*!
    * \brief access the internal node container
    * \return the pointer to the internal node container
@@ -397,7 +397,7 @@ class IterVarRelation : public NodeRef {
 class IterVarAttr : public NodeRef {
  public:
   IterVarAttr() {}
-  explicit IterVarAttr(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit IterVarAttr(NodePtr<Node> n) : NodeRef(n) {}
   /*!
    * \brief access the internal node container
    * \return the pointer to the internal node container
diff --git a/include/tvm/schedule_pass.h b/include/tvm/schedule_pass.h
index cd248f8b9b96..e2b4462b8d73 100644
--- a/include/tvm/schedule_pass.h
+++ b/include/tvm/schedule_pass.h
@@ -10,8 +10,8 @@
 #ifndef TVM_SCHEDULE_PASS_H_
 #define TVM_SCHEDULE_PASS_H_
 
-#include "./base.h"
-#include "./schedule.h"
+#include "base.h"
+#include "schedule.h"
 
 namespace tvm {
 namespace schedule {
diff --git a/include/tvm/target_info.h b/include/tvm/target_info.h
index 8569f188a4ab..338749cf832e 100644
--- a/include/tvm/target_info.h
+++ b/include/tvm/target_info.h
@@ -7,8 +7,8 @@
 #define TVM_TARGET_INFO_H_
 
 #include <string>
-#include "./base.h"
-#include "./expr.h"
+#include "base.h"
+#include "expr.h"
 
 namespace tvm {
 
diff --git a/include/tvm/tensor.h b/include/tvm/tensor.h
index 1a6338d9058c..16f7363a9e73 100644
--- a/include/tvm/tensor.h
+++ b/include/tvm/tensor.h
@@ -6,15 +6,16 @@
 #ifndef TVM_TENSOR_H_
 #define TVM_TENSOR_H_
 
-#include <tvm/container.h>
 #include <ir/FunctionBase.h>
+#include <tvm/node/container.h>
 #include <string>
 #include <vector>
 #include <type_traits>
 
-#include "./base.h"
-#include "./expr.h"
-#include "./arithmetic.h"
+#include "base.h"
+#include "expr.h"
+#include "ir_operator.h"
+#include "arithmetic.h"
 
 namespace tvm {
 
@@ -33,7 +34,7 @@ class Tensor : public NodeRef {
  public:
   /*! \brief default constructor, used internally */
   Tensor() {}
-  explicit Tensor(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit Tensor(NodePtr<Node> n) : NodeRef(n) {}
   /*!
    * \brief access the internal node container
    * \return the pointer to the internal node container
@@ -45,6 +46,12 @@ class Tensor : public NodeRef {
    * \return whether the two tensors equals each other.
    */
   inline bool operator==(const Tensor& other) const;
+  /*!
+   * \brief check if two tensors are different.
+   * \param other tensor to be checked.
+   * \return whether the two tensors are different.
+   */
+  inline bool operator!=(const Tensor& other) const;
   /*! \return The dimension of the tensor */
   inline size_t ndim() const;
   /*!
@@ -118,7 +125,7 @@ class Operation : public FunctionRef {
  public:
   /*! \brief default constructor  */
   Operation() {}
-  explicit Operation(std::shared_ptr<Node> n) : FunctionRef(n) {}
+  explicit Operation(NodePtr<Node> n) : FunctionRef(n) {}
   /*!
    * \brief access the internal node container
    * \return the pointer to the internal node container
@@ -184,6 +191,10 @@ inline bool Tensor::operator==(const Tensor& other) const {
   }
 }
 
+inline bool Tensor::operator!=(const Tensor& other) const {
+  return !(*this == other);
+}
+
 // macro to turn every operation of slice to expression
 #define DEFINE_OVERLOAD_SLICE_UNARY_OP(Op)                              \
   inline Expr operator Op (const Tensor::Slice& a) {                    \
diff --git a/include/tvm/tensor_intrin.h b/include/tvm/tensor_intrin.h
index bd3fd11021b4..6cffc931d42a 100644
--- a/include/tvm/tensor_intrin.h
+++ b/include/tvm/tensor_intrin.h
@@ -7,8 +7,8 @@
 #define TVM_TENSOR_INTRIN_H_
 
 #include <string>
-#include "./tensor.h"
-#include "./buffer.h"
+#include "tensor.h"
+#include "buffer.h"
 
 namespace tvm {
 
@@ -19,7 +19,7 @@ class TensorIntrinNode;
 class TensorIntrin : public NodeRef {
  public:
   TensorIntrin() {}
-  explicit TensorIntrin(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit TensorIntrin(NodePtr<Node> n) : NodeRef(n) {}
   /*!
    * \brief access the internal node container
    * \return the pointer to the internal node container
@@ -74,13 +74,13 @@ class TensorIntrinNode : public Node {
     v->Visit("reduce_update", &reduce_update);
   }
 
-  static TensorIntrin make(std::string name,
-                           Operation op,
-                           Array<Tensor> inputs,
-                           Array<Buffer> buffers,
-                           Stmt body,
-                           Stmt reduce_init,
-                           Stmt reduce_update);
+  TVM_DLL static TensorIntrin make(std::string name,
+                                   Operation op,
+                                   Array<Tensor> inputs,
+                                   Array<Buffer> buffers,
+                                   Stmt body,
+                                   Stmt reduce_init,
+                                   Stmt reduce_update);
 
   static constexpr const char* _type_key = "TensorIntrin";
   TVM_DECLARE_NODE_TYPE_INFO(TensorIntrinNode, Node);
@@ -89,5 +89,57 @@ class TensorIntrinNode : public Node {
 inline const TensorIntrinNode* TensorIntrin::operator->() const {
   return static_cast<const TensorIntrinNode*>(node_.get());
 }
+
+// Internal node container of tensor intrinsic calling.
+class TensorIntrinCallNode;
+
+/*! \brief Tensor intrinsic calling node. */
+class TensorIntrinCall : public NodeRef {
+ public:
+  TensorIntrinCall() {}
+  explicit TensorIntrinCall(NodePtr<Node> n) : NodeRef(n) {}
+  /*!
+   * \brief access the internal node container
+   * \return the pointer to the internal node container
+   */
+  inline const TensorIntrinCallNode* operator->() const;
+
+  /*! \brief specify container node */
+  using ContainerType = TensorIntrinCallNode;
+};
+
+class TensorIntrinCallNode : public Node {
+ public:
+  /*! \brief the tensor intrinsic */
+  TensorIntrin intrin;
+  /*! \brief input tensors of the intrinsic */
+  Array<Tensor> tensors;
+  /*! \brief regions of input tensors */
+  Array<Region> regions;
+  /*!
+   * \brief IterVar on each reduction axis, if the
+   * intrin will use the reduce axis
+   */
+  Array<IterVar> reduce_axis;
+
+  void VisitAttrs(AttrVisitor* v) final {
+    v->Visit("intrin", &intrin);
+    v->Visit("tensors", &tensors);
+    v->Visit("regions", &regions);
+    v->Visit("reduce_axis", &reduce_axis);
+  }
+  static TensorIntrinCall make(TensorIntrin intrin,
+                               Array<Tensor> tensors,
+                               Array<Region> regions,
+                               Array<IterVar> reduce_axis);
+
+  static constexpr const char* _type_key = "TensorIntrinCall";
+  TVM_DECLARE_NODE_TYPE_INFO(TensorIntrinCallNode, Node);
+};
+
+inline const TensorIntrinCallNode* TensorIntrinCall::operator->() const {
+  return static_cast<const TensorIntrinCallNode*>(node_.get());
+}
+
 }  // namespace tvm
 #endif  // TVM_TENSOR_INTRIN_H_
diff --git a/include/tvm/tvm.h b/include/tvm/tvm.h
index 7e9c4305ffbb..645c68357f13 100644
--- a/include/tvm/tvm.h
+++ b/include/tvm/tvm.h
@@ -6,11 +6,11 @@
 #ifndef TVM_TVM_H_
 #define TVM_TVM_H_
 
-#include "./base.h"
-#include "./expr.h"
-#include "./ir_operator.h"
-#include "./tensor.h"
-#include "./operation.h"
-#include "./packed_func_ext.h"
+#include "base.h"
+#include "expr.h"
+#include "ir_operator.h"
+#include "tensor.h"
+#include "operation.h"
+#include "packed_func_ext.h"
 
 #endif  // TVM_TVM_H_
diff --git a/jvm/core/src/main/java/ml/dmlc/tvm/TVMContext.java b/jvm/core/src/main/java/ml/dmlc/tvm/TVMContext.java
index 0d108e0a2943..d9051f0d9d4d 100644
--- a/jvm/core/src/main/java/ml/dmlc/tvm/TVMContext.java
+++ b/jvm/core/src/main/java/ml/dmlc/tvm/TVMContext.java
@@ -30,6 +30,7 @@ public class TVMContext {
     MASK2STR.put(1, "cpu");
     MASK2STR.put(2, "gpu");
     MASK2STR.put(4, "opencl");
+    MASK2STR.put(7, "vulkan");
     MASK2STR.put(8, "metal");
     MASK2STR.put(9, "vpi");
 
@@ -38,6 +39,7 @@ public class TVMContext {
     STR2MASK.put("cuda", 2);
     STR2MASK.put("cl", 4);
     STR2MASK.put("opencl", 4);
+    STR2MASK.put("vulkan", 7);
     STR2MASK.put("metal", 8);
     STR2MASK.put("vpi", 9);
   }
@@ -81,6 +83,19 @@ public static TVMContext opencl() {
     return opencl(0);
   }
 
+  /**
+   * Construct a Vulkan device.
+   * @param devId The device id
+   * @return The created context
+   */
+  public static TVMContext vulkan(int devId) {
+    return new TVMContext(7, devId);
+  }
+
+  public static TVMContext vulkan() {
+    return vulkan(0);
+  }
+
   /**
    * Construct a metal device.
    * @param devId The device id
diff --git a/jvm/core/src/main/java/ml/dmlc/tvm/rpc/RPCSession.java b/jvm/core/src/main/java/ml/dmlc/tvm/rpc/RPCSession.java
index 0eec9224a40c..8ebf188b0667 100644
--- a/jvm/core/src/main/java/ml/dmlc/tvm/rpc/RPCSession.java
+++ b/jvm/core/src/main/java/ml/dmlc/tvm/rpc/RPCSession.java
@@ -143,6 +143,24 @@ public TVMContext cl() {
     return cl(0);
   }
 
+  /**
+   * Construct remote OpenCL device.
+   * @param devId device id.
+   * @return Remote OpenCL context.
+   */
+  public TVMContext vulkan(int devId) {
+    return context(7, devId);
+  }
+
+  /**
+   * Construct remote OpenCL device.
+   * @return Remote OpenCL context.
+   */
+  public TVMContext vulkan() {
+    return vulkan(0);
+  }
+
+
   /**
    * Construct remote Metal device.
    * @param devId device id.
diff --git a/nnvm/Makefile b/nnvm/Makefile
index adbae329e144..8392aadc3f2d 100644
--- a/nnvm/Makefile
+++ b/nnvm/Makefile
@@ -13,12 +13,12 @@ TVMPATH = ..
 
 export LDFLAGS = -pthread -lm
 export CFLAGS = -std=c++11 -Wall -O2 -Iinclude -fPIC
-CFLAGS += -I$(TVMPATH)/include -I$(TVMPATH)/dlpack/include -I$(TVMPATH)/HalideIR/src -I$(TVMPATH)/topi/include
+CFLAGS += -I$(TVMPATH)/include -I$(TVMPATH)/3rdparty/dlpack/include -I$(TVMPATH)/3rdparty/HalideIR/src -I$(TVMPATH)/topi/include
 
 ifdef DMLC_CORE_PATH
   CFLAGS += -I$(DMLC_CORE_PATH)/include
 else
-  CFLAGS += -I$(ROOTDIR)/../dmlc-core/include
+  CFLAGS += -I$(TVMPATH)/3rdparty/dmlc-core/include
 endif
 
 ifneq ($(ADD_CFLAGS), NONE)
diff --git a/nnvm/amalgamation/Makefile b/nnvm/amalgamation/Makefile
index 1f286f055237..4305339e0075 100644
--- a/nnvm/amalgamation/Makefile
+++ b/nnvm/amalgamation/Makefile
@@ -4,7 +4,7 @@ export CFLAGS = -std=c++11 -Wall -O2 -Iinclude -fPIC
 ifdef DMLC_CORE_PATH
   CFLAGS += -I$(DMLC_CORE_PATH)/include
 else
-  CFLAGS += -I$(CURDIR)/../dmlc-core/include
+  CFLAGS += -I$(CURDIR)/../3rdparty/dmlc-core/include
 endif
 
 .PHONY: all clean
diff --git a/nnvm/include/nnvm/base.h b/nnvm/include/nnvm/base.h
index 449bd2f4626e..39ff70093bed 100644
--- a/nnvm/include/nnvm/base.h
+++ b/nnvm/include/nnvm/base.h
@@ -25,6 +25,9 @@ using dmlc::array_view;
 /*!\brief getter function of any type */
 using dmlc::get;
 
+/*!\brief "unsafe" getter function of any type */
+using dmlc::unsafe_get;
+
 }  // namespace nnvm
 
 // describe op registration point
diff --git a/nnvm/include/nnvm/compiler/util.h b/nnvm/include/nnvm/compiler/util.h
index 5d5bc4478530..0f7fb2a5c875 100644
--- a/nnvm/include/nnvm/compiler/util.h
+++ b/nnvm/include/nnvm/compiler/util.h
@@ -28,6 +28,17 @@ inline tvm::Array<tvm::Expr> ShapeToArray(TShape shape) {
   return result;
 }
 
+/*
+ * \brief Helper function to convert TShape to TVM array. Useful for
+ * passing data from NNVM param structures to TOPI ops.
+ *
+ * \param shape The shape to convert
+ *
+ * \return An Array of Expr, where each element is a constant int32
+ */
+inline tvm::Array<tvm::Integer> ShapeToIntArray(TShape shape) {
+  return tvm::Array<tvm::Integer>(ShapeToArray(shape).node_);
+}
 }  // namespace compiler
 }  // namespace nnvm
 #endif  // NNVM_COMPILER_UTIL_H_
diff --git a/nnvm/include/nnvm/graph.h b/nnvm/include/nnvm/graph.h
index 1d3b662ff0b8..93612ccb9ece 100644
--- a/nnvm/include/nnvm/graph.h
+++ b/nnvm/include/nnvm/graph.h
@@ -12,9 +12,9 @@
 #include <algorithm>
 #include <unordered_map>
 #include <unordered_set>
-#include "./base.h"
-#include "./node.h"
-#include "./symbolic.h"
+#include "base.h"
+#include "node.h"
+#include "symbolic.h"
 
 namespace nnvm {
 
@@ -229,7 +229,7 @@ inline const T& Graph::GetAttr(const std::string& attr_name) const {
   auto it = attrs.find(attr_name);
   CHECK(it != attrs.end())
       << "Cannot find attribute " << attr_name << " in the graph";
-  return nnvm::get<T>(*it->second);
+  return nnvm::unsafe_get<T>(*it->second);
 }
 
 inline bool Graph::HasAttr(const std::string& attr_name) const {
diff --git a/nnvm/include/nnvm/graph_attr_types.h b/nnvm/include/nnvm/graph_attr_types.h
index 2bd998fedfbb..2fe82c9a7de0 100644
--- a/nnvm/include/nnvm/graph_attr_types.h
+++ b/nnvm/include/nnvm/graph_attr_types.h
@@ -8,8 +8,8 @@
 
 #include <vector>
 #include <string>
-#include "./tuple.h"
-#include "./layout.h"
+#include "tuple.h"
+#include "layout.h"
 
 namespace nnvm {
 
diff --git a/nnvm/include/nnvm/node.h b/nnvm/include/nnvm/node.h
index 57afb0c5587a..ae782f04965e 100644
--- a/nnvm/include/nnvm/node.h
+++ b/nnvm/include/nnvm/node.h
@@ -10,9 +10,9 @@
 #include <string>
 #include <vector>
 #include <unordered_map>
-#include "./base.h"
-#include "./op.h"
-#include "./c_api.h"
+#include "base.h"
+#include "op.h"
+#include "c_api.h"
 
 namespace nnvm {
 
diff --git a/nnvm/include/nnvm/op.h b/nnvm/include/nnvm/op.h
index 5bdfcaca169d..9d171bbdb2bc 100644
--- a/nnvm/include/nnvm/op.h
+++ b/nnvm/include/nnvm/op.h
@@ -13,8 +13,8 @@
 #include <typeinfo>
 #include <limits>
 #include <functional>
-#include "./base.h"
-#include "./c_api.h"
+#include "base.h"
+#include "c_api.h"
 
 namespace nnvm {
 
diff --git a/nnvm/include/nnvm/op_attr_types.h b/nnvm/include/nnvm/op_attr_types.h
index b7f6be408a16..abed19f9bc7d 100644
--- a/nnvm/include/nnvm/op_attr_types.h
+++ b/nnvm/include/nnvm/op_attr_types.h
@@ -10,10 +10,10 @@
 #include <string>
 #include <utility>
 #include <functional>
-#include "./base.h"
-#include "./node.h"
-#include "./tuple.h"
-#include "./layout.h"
+#include "base.h"
+#include "node.h"
+#include "tuple.h"
+#include "layout.h"
 
 namespace nnvm {
 
diff --git a/nnvm/include/nnvm/pass.h b/nnvm/include/nnvm/pass.h
index 016d5ee2a763..2e8db6111887 100644
--- a/nnvm/include/nnvm/pass.h
+++ b/nnvm/include/nnvm/pass.h
@@ -8,8 +8,8 @@
 
 #include <vector>
 #include <functional>
-#include "./base.h"
-#include "./graph.h"
+#include "base.h"
+#include "graph.h"
 
 namespace nnvm {
 
diff --git a/nnvm/include/nnvm/pass_functions.h b/nnvm/include/nnvm/pass_functions.h
index 4c29e09d813a..5a98dd456fb2 100644
--- a/nnvm/include/nnvm/pass_functions.h
+++ b/nnvm/include/nnvm/pass_functions.h
@@ -13,9 +13,9 @@
 #include <string>
 #include <memory>
 #include <vector>
-#include "./base.h"
-#include "./pass.h"
-#include "./graph_attr_types.h"
+#include "base.h"
+#include "pass.h"
+#include "graph_attr_types.h"
 
 namespace nnvm {
 namespace pass {
diff --git a/nnvm/include/nnvm/symbolic.h b/nnvm/include/nnvm/symbolic.h
index ebb2ab5d30d0..42cf5dd775c2 100644
--- a/nnvm/include/nnvm/symbolic.h
+++ b/nnvm/include/nnvm/symbolic.h
@@ -15,8 +15,8 @@
 #include <tuple>
 #include <utility>
 
-#include "./base.h"
-#include "./node.h"
+#include "base.h"
+#include "node.h"
 
 namespace nnvm {
 /*!
diff --git a/nnvm/include/nnvm/top/nn.h b/nnvm/include/nnvm/top/nn.h
index 865024733494..143a9548f18a 100644
--- a/nnvm/include/nnvm/top/nn.h
+++ b/nnvm/include/nnvm/top/nn.h
@@ -11,7 +11,7 @@
 #include <nnvm/tuple.h>
 #include <nnvm/layout.h>
 #include <string>
-#include "./tensor.h"
+#include "tensor.h"
 
 namespace nnvm {
 namespace top {
diff --git a/nnvm/include/nnvm/top/tensor.h b/nnvm/include/nnvm/top/tensor.h
index 22ee9d7118e6..bed1b05984da 100644
--- a/nnvm/include/nnvm/top/tensor.h
+++ b/nnvm/include/nnvm/top/tensor.h
@@ -16,7 +16,7 @@ namespace top {
 struct ConcatenateParam : public dmlc::Parameter<ConcatenateParam> {
   int axis;
   DMLC_DECLARE_PARAMETER(ConcatenateParam) {
-    DMLC_DECLARE_FIELD(axis).set_lower_bound(0).set_default(1)
+    DMLC_DECLARE_FIELD(axis).set_default(1)
     .describe("the axis to be concated.");
   }
 };
@@ -43,7 +43,7 @@ struct SplitParam : public dmlc::Parameter<SplitParam> {
   DMLC_DECLARE_PARAMETER(SplitParam) {
     DMLC_DECLARE_FIELD(indices_or_sections)
         .describe("Number of outputs to be splitted");
-    DMLC_DECLARE_FIELD(axis).set_lower_bound(0).set_default(1)
+    DMLC_DECLARE_FIELD(axis).set_default(1)
         .describe("the axis to be splitted.");
   }
 };
@@ -205,6 +205,7 @@ struct ReduceParam : public dmlc::Parameter<ReduceParam> {
   TShape axis;
   bool keepdims;
   bool exclude;
+  int dtype;
 
   DMLC_DECLARE_PARAMETER(ReduceParam) {
     DMLC_DECLARE_FIELD(axis).set_default(TShape())
@@ -226,6 +227,8 @@ struct ReduceParam : public dmlc::Parameter<ReduceParam> {
                 "in the result as dimension with size one.");
     DMLC_DECLARE_FIELD(exclude).set_default(false)
       .describe("Whether to perform reduction on axis that are NOT in axis instead.");
+    DMLC_DECLARE_DTYPE_FIELD(dtype).set_default(kInt32)
+      .describe("Target data type.");
   }
 };
 
diff --git a/nnvm/include/nnvm/tuple.h b/nnvm/include/nnvm/tuple.h
index 7e83aecc11f0..36b8ef13c74a 100644
--- a/nnvm/include/nnvm/tuple.h
+++ b/nnvm/include/nnvm/tuple.h
@@ -12,7 +12,7 @@
 #include <utility>
 #include <iostream>
 #include <string>
-#include "./base.h"
+#include "base.h"
 
 namespace nnvm {
 
diff --git a/nnvm/python/nnvm/compiler/build_module.py b/nnvm/python/nnvm/compiler/build_module.py
index 217598c9d79a..b04d49478830 100644
--- a/nnvm/python/nnvm/compiler/build_module.py
+++ b/nnvm/python/nnvm/compiler/build_module.py
@@ -239,8 +239,9 @@ def build(graph, target=None, shape=None, dtype="float32",
         raise ValueError("Target is not set in env or passed as argument.")
     target = tvm.target.create(target)
 
-    # if not inside an autotvm config dispatch context, load pre-tuned parameters from TopHub
-    if autotvm.task.DispatchContext.current is None:
+    # If current dispatch context is fallback context (the default root context),
+    # then load pre-tuned parameters from TopHub
+    if isinstance(autotvm.DispatchContext.current, autotvm.FallbackContext):
         tophub_context = autotvm.tophub.context(target)
     else:
         tophub_context = autotvm.util.EmptyContext()
@@ -250,8 +251,8 @@ def build(graph, target=None, shape=None, dtype="float32",
         if not isinstance(shape, dict):
             raise TypeError("require shape to be dict")
         for value in shape.values():
-            if not all(isinstance(x, int) for x in value):
-                raise TypeError("shape value must be int iterator")
+            if not all(isinstance(x, tvm._ffi.base.integer_types) for x in value):
+                raise TypeError("shape value must be Integer types iterator")
 
         cfg = BuildConfig.current
         graph = graph if isinstance(graph, _graph.Graph) else _graph.create(graph)
diff --git a/nnvm/python/nnvm/frontend/__init__.py b/nnvm/python/nnvm/frontend/__init__.py
index 80f66c0d35e3..49f53df1174f 100644
--- a/nnvm/python/nnvm/frontend/__init__.py
+++ b/nnvm/python/nnvm/frontend/__init__.py
@@ -6,3 +6,4 @@
 from .keras import from_keras
 from .darknet import from_darknet
 from .tensorflow import from_tensorflow
+from .caffe2 import from_caffe2
diff --git a/nnvm/python/nnvm/frontend/caffe2.py b/nnvm/python/nnvm/frontend/caffe2.py
new file mode 100755
index 000000000000..2450af628a90
--- /dev/null
+++ b/nnvm/python/nnvm/frontend/caffe2.py
@@ -0,0 +1,458 @@
+# pylint: disable=import-self, invalid-name, line-too-long, unused-argument
+"""Caffe2 frontend"""
+from __future__ import absolute_import as _abs
+import tvm
+from nnvm import symbol as _sym
+from nnvm.frontend.common import get_nnvm_op, Renamer, AttrConverter as AttrCvt
+from .onnx_caffe2_utils import dimension_picker, dimension_constraint, infer_channels, revert_caffe2_pad
+from . import onnx
+
+__all__ = ['from_caffe2']
+
+
+def _clean_up_pool_args(args):
+    """ A helper function to clean up common arguments in conv and pooling ops.
+    """
+    assert isinstance(args, dict)
+
+    if 'stride_h' in args and 'stride_w' in args:
+        assert 'stride' not in args and 'strides' not in args
+        args['strides'] = [args['stride_h'], args['stride_w']]
+        args.pop('stride_h')
+        args.pop('stride_w')
+    elif 'stride' in args:
+        args['strides'] = [args['stride'], args['stride']]
+        args.pop('stride')
+
+    # rename 'kernel', 'kernels', to 'kernel_shape'
+    if 'kernel_h' in args and 'kernel_w' in args:
+        assert 'kernel' not in args and 'kernels' not in args
+        args['kernel_shape'] = [args['kernel_h'], args['kernel_w']]
+        args.pop('kernel_h')
+        args.pop('kernel_w')
+    elif 'kernel' in args:
+        args['kernel_shape'] = [args['kernel'], args['kernel']]
+        args.pop('kernel')
+    elif 'kernels' in args:
+        args['kernel_shape'] = args['kernels']
+        args.pop('kernels')
+
+    if 'pad_t' in args and 'pad_l' in args and 'pad_b' in args and 'pad_r' in args:
+        assert 'pad' not in args and 'pads' not in args
+        args['pads'] = [
+            args['pad_t'], args['pad_l'], args['pad_b'], args['pad_r']
+        ]
+        for pad in ['pad_t', 'pad_l', 'pad_b', 'pad_r']:
+            args.pop(pad)
+    elif 'pad' in args:
+        args['pads'] = [args['pad'], args['pad']]
+        args.pop('pad')
+
+    if 'dilation_h' in args and 'dilation_w' in args:
+        assert 'dilation' not in args and 'dilations' not in args
+        args['dilations'] = [args['dilation_h'], args['dilation_w']]
+        args.pop('dilation_h')
+        args.pop('dilation_w')
+    elif 'dilation' in args:
+        args['dilations'] = [args['dilation'], args['dilation']]
+        args.pop('dilation')
+
+    return args
+
+
+class Caffe2OpConverter(object):
+    """ A helper class for holding Caffe2 op converters.
+    """
+
+    @classmethod
+    def get_converter(cls):
+        """ Get converter.
+
+        :return: converter, which should be `_impl`.
+        """
+
+        if hasattr(cls, '_impl'):
+            return getattr(cls, '_impl')
+        else:
+            raise NotImplementedError('{} not implemented'.format(
+                cls.__name__))
+
+
+_caffe2_internal_args = {
+    # nnpack args
+    'algo',
+    'convolution_transform_strategy',
+    'float16_compute',
+    'shared_buffer',
+
+    # training args
+    'init_params',
+    'cudnn_exhaustive_search',
+    'exhaustive_search',
+
+    # training args
+    'adj',
+    'hwgq',
+
+    # args that we don't care
+    'legacy_pad',
+}
+
+
+class Pool(Caffe2OpConverter):
+    """ A helper class for pool op converters.
+    """
+
+    name = ''
+
+    @classmethod
+    def _impl(cls, inputs, args, params):
+        _clean_up_pool_args(args)
+        if 'global_pooling' in args and args['global_pooling'] == 1:
+            op_name = dimension_picker('global_' + cls.name)
+            return get_nnvm_op(op_name(args))(*inputs)
+
+        return AttrCvt(
+            op_name=dimension_picker(cls.name),
+            transforms={
+                'kernel_shape': 'pool_size',
+                'pads': ('padding', (0, 0), revert_caffe2_pad),
+                'strides': 'strides',
+            },
+            excludes={
+                # TVM poolop does not support dilation
+                'dilations',
+            },
+            ignores=_caffe2_internal_args | {'global_pooling', 'order'},
+            custom_check=dimension_constraint())(inputs, args, params)
+
+
+class AveragePool(Pool):
+    name = 'avg_pool'
+
+
+class MaxPool(Pool):
+    name = 'max_pool'
+
+
+class Conv(Caffe2OpConverter):
+    """ Operator converter for Conv.
+    """
+
+    @classmethod
+    def _impl(cls, inputs, args, params):
+        # get number of channels
+        channels = infer_channels(inputs[1], params)
+        args['channels'] = channels
+        _clean_up_pool_args(args)
+        return AttrCvt(
+            op_name=dimension_picker('conv'),
+            transforms={
+                'group': ('groups', 1),
+                'kernel_shape':
+                'kernel_size',
+                'pads': ('padding', (0, 0), revert_caffe2_pad),
+                'strides':
+                'strides',
+                'dilations': ('dilation', (1, 1)),
+                'order':
+                ('layout', ("NCHW"),
+                 lambda x: x if isinstance(x, str) else x.decode('UTF-8')),
+            },
+            excludes={},
+            ignores=_caffe2_internal_args,
+            extras={'use_bias': len(inputs) == 3},
+            custom_check=dimension_constraint())(inputs, args, params)
+
+
+class Concat(Caffe2OpConverter):
+    """ Operator converter for Concat.
+    """
+
+    @classmethod
+    def _impl(cls, inputs, args, params):
+        def _get_axis_from_order_str(order):
+            order = order if isinstance(order, str) else order.decode('UTF-8')
+            if order == 'NCHW':
+                return 1
+            elif order == 'NHWC':
+                return 3
+            else:
+                raise RuntimeError(
+                    "Unsupported storage order: {} in caffe2".format(order))
+
+        return AttrCvt(
+            op_name='concatenate',
+            transforms={
+                'order': ('axis', (1), _get_axis_from_order_str),
+            },
+            excludes={
+                'add_axis',
+            })(inputs, args, params)
+
+
+class NormalizePlanarYUV(Caffe2OpConverter):
+    """ Operator converter for NormalizePlanarYUV.
+    caffe2 definition: https://github.com/pytorch/pytorch/blob/master/caffe2/operators/norm_planar_yuv_op.cc
+    """
+
+    @classmethod
+    def _impl(cls, inputs, args, params):
+        assert len(inputs) == 3
+        mean = _sym.expand_dims(inputs[1], axis=2, num_newaxis=2)
+        std = _sym.expand_dims(inputs[2], axis=2, num_newaxis=2)
+
+        return _sym.broadcast_div(_sym.broadcast_sub(inputs[0], mean), std)
+
+
+class ResizeNearest(Caffe2OpConverter):
+    """ Operator converter for Upsample (nearest mode).
+    """
+
+    @classmethod
+    def _impl(cls, inputs, args, params):
+        width_scale = args['width_scale'] if 'width_scale' in args else 1
+        height_scale = args['height_scale'] if 'height_scale' in args else 1
+        assert width_scale == height_scale
+
+        return _sym.upsampling(
+            inputs[0], scale=int(width_scale), method="NEAREST_NEIGHBOR")
+
+
+class FC(Caffe2OpConverter):
+    """ Operator converter for FC.
+    """
+
+    @classmethod
+    def _impl(cls, inputs, args, params):
+        inputs[0] = _sym.flatten(inputs[0])
+        args['units'] = infer_channels(inputs[1], params)
+        return AttrCvt(
+            'dense',
+            ignores=['axis', 'axis_w'],
+            extras={'use_bias': len(inputs) == 3},
+        )(inputs, args, params)
+
+
+class SpatialBN(Caffe2OpConverter):
+    """ Operator converter for SpatialBN.
+    """
+
+    @classmethod
+    def _impl(cls, inputs, args, params):
+        return AttrCvt(
+            op_name='batch_norm',
+            disables=['momentum'],
+            ignores=[
+                'order', 'spatial', 'is_test', 'consumed_inputs', 'num_batches'
+            ])(inputs, args, params)
+
+
+# compatible operators that do NOT require any conversion.
+_identity_list = []
+
+# _convert_map defines maps of name to converter functor(callable)
+# for 1 to 1 mapping, use Renamer if nothing but name is different
+# use AttrCvt if attributes need to be converted
+# for 1 to N mapping(composed), use custom callable functions
+# for N to 1 mapping, currently not supported(?)
+
+# Minimal set of ops for squeezenet and resnet50
+def _get_convert_map():
+    return {
+        # caffe2/onnx common operators
+        'Add': onnx.Add.get_converter(opset=1),
+        'Sum': onnx.Sum.get_converter(opset=1),
+        'Softmax': onnx.Softmax.get_converter(opset=1),
+
+        # nn
+        'AveragePool': AveragePool.get_converter(),
+        'MaxPool': MaxPool.get_converter(),
+        'Conv': Conv.get_converter(),
+        'Concat': Concat.get_converter(),
+        'FC': FC.get_converter(),
+        'SpatialBN': SpatialBN.get_converter(),
+        'ResizeNearest': ResizeNearest.get_converter(),
+        'Relu': AttrCvt('relu', {}, ignores=['order']),
+        'Sigmoid': Renamer('sigmoid'),
+        'Dropout': AttrCvt('dropout', {'ratio': 'rate'}, ignores=['is_test']),
+
+        # c2 image preprocessing ops
+        'NormalizePlanarYUV': NormalizePlanarYUV.get_converter(),
+    }
+
+
+class Caffe2NetDef(object):
+    """A helper class for handling nnvm graph copying from pb2.GraphProto.
+    Definition: https://github.com/pytorch/pytorch/blob/master/caffe2/proto/caffe2.proto
+    """
+
+    def __init__(self):
+        self._nodes = {}
+        self._params = {}
+        self._visited_nodes = set()
+        self._ops = {}
+
+    def from_caffe2(self, init_net, predict_net):
+        """Construct nnvm nodes from caffe2 graph.
+
+        Parameters
+        ----------
+        workspace : Caffe2 workspace
+        predict_net : protobuf object
+
+        Returns
+        -------
+        sym : nnvm.sym.Symbol
+            The returned nnvm symbol
+        params : dict
+            A dict of name: tvm.nd.array pairs, used as pretrained weights
+        """
+        from caffe2.python import workspace
+        workspace.RunNetOnce(init_net)
+
+        # Input
+        input_name = predict_net.op[0].input[0]
+
+        # Params
+        self._params = {}
+        used_blobs = set()
+        for c2_op in predict_net.op:
+            for i in c2_op.input:
+                used_blobs.add(i)
+        for blob in workspace.Blobs():
+            if blob in used_blobs and blob != input_name:
+                self._params[blob] = tvm.nd.array(workspace.FetchBlob(blob))
+
+        # Variables
+        self._nodes = {}
+        for blob in predict_net.external_input:
+            self._nodes[blob] = _sym.Variable(name=blob)
+
+        # Ops
+        for c2_op in predict_net.op:
+            for blob in c2_op.output:
+                self._ops[blob] = c2_op
+        for c2_op in predict_net.op:
+            self._process_op(c2_op)
+
+        # Outputs
+        out = []
+        for blob in predict_net.external_output:
+            out.append(self._nodes[blob])
+
+        if len(out) > 1:
+            sym = _sym.Group(out)
+        else:
+            sym = out[0]
+
+        return sym, self._params
+
+    def _get_node(self, blob):
+        """Get the nnvm Symbol of blob and detect cyclic dependency in the graph."""
+        if blob in self._nodes:
+            return self._nodes[blob]
+
+        assert blob not in self._visited_nodes, 'Cyclic dependency in the graph (in {})'.format(
+            blob)
+        self._visited_nodes.add(blob)
+
+        self._process_op(self._ops[blob])
+        return self._nodes[blob]
+
+    def _process_op(self, c2_op):
+        op_type = c2_op.type
+        args = self._parse_arg(c2_op.arg)
+        inputs = [self._get_node(i) for i in c2_op.input]
+        tvm_op = self._convert_operator(op_type, inputs, args)
+        # Ignore all outputs except the first one
+        self._nodes[c2_op.output[0]] = tvm_op[0]
+
+    def _parse_arg(self, arg):
+        """Convert a list of Argument to a dict, with names as keys."""
+        args = {}
+        for a in arg:
+            for f in ['f', 'i', 's']:
+                if a.HasField(f):
+                    args[a.name] = getattr(a, f)
+            for f in ['floats', 'ints', 'strings']:
+                if list(getattr(a, f)):
+                    assert a.name not in args, "Only one type of attr is allowed"
+                    args[a.name] = tuple(getattr(a, f))
+            for f in ['n']:
+                if a.HasField(f):
+                    raise NotImplementedError(
+                        "Field {} is not supported in nnvm.".format(f))
+            for f in ['nets']:
+                if list(getattr(a, f)):
+                    raise NotImplementedError(
+                        "Field {} is not supported in nnvm.".format(f))
+            if a.name not in args:
+                raise ValueError("Cannot parse attribute: \n{}\n.".format(a))
+        return args
+
+    def _convert_operator(self,
+                          op_type,
+                          inputs,
+                          args,
+                          identity_list=None,
+                          convert_map=None):
+        """Convert from Caffe2 operator to nnvm operator.
+        The converter must specify conversions explicity for incompatible name, and
+        apply handlers to operator attributes.
+
+        Parameters
+        ----------
+        op_type : str
+            Operator name, such as Convolution, FullyConnected
+        inputs : list of nnvm.Symbol
+            List of input symbols.
+        args : dict
+            Dict of operator attributes
+        identity_list : list
+            List of operators that don't require conversion
+        convert_map : dict
+            Dict of name : callable, where name is the op's name that
+            require conversion to nnvm, callable are functions which
+            take args and return (new_op_type, new_args)
+
+        Returns
+        -------
+        sym : nnvm.Symbol
+            Converted nnvm Symbol
+        """
+        identity_list = identity_list if identity_list else _identity_list
+        convert_map = convert_map if convert_map else _get_convert_map()
+        if op_type in identity_list:
+            sym = get_nnvm_op(op_type)(*inputs, **args)
+        elif op_type in convert_map:
+            # Add a sanitizing step to convert all byte strings in args to strings
+            sym = convert_map[op_type](inputs, args, self._params)
+        else:
+            raise NotImplementedError(
+                "Operator {} not implemented.".format(op_type))
+        return sym
+
+
+def from_caffe2(init_net, predict_net):
+    """Load caffe2 graph which contains init_net and predict_net into nnvm graph.
+
+    Parameters
+    ----------
+    init_net : protobuf object
+        Caffe2 NetDef containing the weights
+
+    predict_net : protobuf object
+        Caffe2 NetDef containing the graph
+
+    Returns
+    -------
+    sym : nnvm.Symbol
+        Compatible nnvm symbol
+
+    params : dict of str to tvm.ndarray
+        Dict of converted parameters stored in tvm.ndarray format
+    """
+
+    caffe2 = Caffe2NetDef()
+    return caffe2.from_caffe2(init_net, predict_net)
diff --git a/nnvm/python/nnvm/frontend/coreml.py b/nnvm/python/nnvm/frontend/coreml.py
index e80cfe23f220..7dfd54317b55 100644
--- a/nnvm/python/nnvm/frontend/coreml.py
+++ b/nnvm/python/nnvm/frontend/coreml.py
@@ -217,6 +217,16 @@ def AddLayerParams(op, insyms, symtab):
         ret = _sym.__add_scalar__(ret, scalar=op.alpha)
     return ret
 
+def MultiplyLayerParams(op, insyms, symtab):
+    if not isinstance(insyms, list):
+        insyms = [insyms]
+    ret = insyms[0]
+    for i in range(1, len(insyms)):
+        ret = _sym.elemwise_mul(ret, insyms[i])
+    if op.alpha != 1:
+        ret = _sym.__mul_scalar__(ret, scalar=op.alpha)
+    return ret
+
 def ConcatLayerParams(op, insyms, symtab):
     if not isinstance(insyms, list):
         insyms = [insyms]
@@ -249,6 +259,49 @@ def PermuteLayerParams(op, insym, symtab):
     axes = tuple(op.axis)
     return _sym.transpose(insym, axes=axes)
 
+def UpsampleLayerParams(op, insym, symtab):
+    if op.scalingFactor[0] != op.scalingFactor[1]:
+        raise NotImplementedError("Upsampling only supported with same \
+            height and width scaling factor.")
+    interpolationMode = 'NEAREST_NEIGHBOR' if op.mode == 0 else 'BILINEAR'
+    return _sym.upsampling(insym, scale=op.scalingFactor[0], method=interpolationMode)
+
+def L2NormalizeLayerParams(op, insym, symtab):
+    return _sym.l2_normalize(insym, eps=op.epsilon, axis=1)
+
+def LRNLayerParams(op, insym, symtab):
+    par = {}
+    par['size'] = op.localSize
+    par['bias'] = op.k
+    par['alpha'] = op.alpha
+    par['beta'] = op.beta
+    par['axis'] = 1 #default layout is nchw
+    return _sym.lrn(data=insym, **par)
+
+def AverageLayerParams(op, insyms, symtab):
+    if not isinstance(insyms, list) or len(insyms) < 2:
+        raise ValueError("Expect minimum 2 inputs")
+    count = len(insyms)
+    _sum = insyms[0]
+    for i in range(1, count):
+        _sum = _sym.broadcast_add(_sum, insyms[i])
+    return _sum / count
+
+def MaxLayerParams(op, insyms, symtab):
+    if not isinstance(insyms, list) or len(insyms) < 2:
+        raise ValueError("Expect minimum 2 inputs")
+    _max = insyms[0]
+    for i in range(1, len(insyms)):
+        _max = _sym.broadcast_max(_max, insyms[i])
+    return _max
+
+def MinLayerParams(op, insyms, symtab):
+    if not isinstance(insyms, list) or len(insyms) < 2:
+        raise ValueError("Expect minimum 2 inputs")
+    _min = insyms[0]
+    for i in range(1, len(insyms)):
+        _min = _sym.broadcast_min(_min, insyms[i])
+    return _min
 
 _convert_map = {
     'NeuralNetworkMeanImage': NeuralNetworkMeanImage,
@@ -261,10 +314,17 @@ def PermuteLayerParams(op, insym, symtab):
     'SoftmaxLayerParams':SoftmaxLayerParams,
     'InnerProductLayerParams':InnerProductLayerParams,
     'AddLayerParams':AddLayerParams,
+    'MultiplyLayerParams':MultiplyLayerParams,
     'FlattenLayerParams':FlattenLayerParams,
     'ConcatLayerParams':ConcatLayerParams,
     'PaddingLayerParams':PaddingLayerParams,
     'PermuteLayerParams':PermuteLayerParams,
+    'UpsampleLayerParams':UpsampleLayerParams,
+    'L2NormalizeLayerParams':L2NormalizeLayerParams,
+    'LRNLayerParams':LRNLayerParams,
+    'AverageLayerParams':AverageLayerParams,
+    'MaxLayerParams':MaxLayerParams,
+    'MinLayerParams':MinLayerParams,
 }
 
 def coreml_op_to_nnvm(op, inname, outname, symtab):
diff --git a/nnvm/python/nnvm/frontend/darknet.py b/nnvm/python/nnvm/frontend/darknet.py
index 3a197a416219..18d07d07ac6b 100644
--- a/nnvm/python/nnvm/frontend/darknet.py
+++ b/nnvm/python/nnvm/frontend/darknet.py
@@ -32,8 +32,12 @@ class LAYERTYPE(object):
     NETWORK = 20
     XNOR = 21
     REGION = 22
-    REORG = 23
-    BLANK = 24
+    YOLO = 23
+    REORG = 24
+    UPSAMPLE = 25
+    LOGXENT = 26
+    L2NORM = 27
+    BLANK = 28
 
 class ACTIVATION(object):
     """Darknet ACTIVATION Class constant."""
@@ -257,6 +261,19 @@ def _darknet_reshape(inputs, attrs):
     new_attrs['shape'] = _darknet_required_attr(attrs, 'shape')
     return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None
 
+def _darknet_upsampling(inputs, attrs):
+    """Process the upsampling operation."""
+    op_name, new_attrs = 'upsampling', {}
+    new_attrs['scale'] = attrs.get('scale', 1)
+    return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None
+
+def _darknet_l2normalize(inputs, attrs):
+    """Process the l2 normalization operation."""
+    op_name, new_attrs = 'l2_normalize', {}
+    new_attrs['eps'] = attrs.get('eps', 0)
+    new_attrs['axis'] = attrs.get('axis', 1)
+    return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None
+
 def _darknet_softmax_output(inputs, attrs):
     """Process the softmax operation."""
     temperature = attrs.get('temperature', 1)
@@ -298,6 +315,22 @@ def _darknet_region(inputs, attrs):
         new_attrs['softmax'] = attrs.get('softmax', 0)
     return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None
 
+def _darknet_yolo(inputs, attrs):
+    """Process the yolo operation."""
+    num = attrs.get('n', 1)
+    classes = attrs.get('classes', 1)
+    input_shape = attrs.get('shape')
+    split_size = classes + 5
+    intermediate_shape = (input_shape[0], num, split_size, input_shape[2], input_shape[3])
+    data_block = _sym.reshape(inputs[0], shape=intermediate_shape)
+    split_indices = (2, 4)
+    split_res = _sym.split(data_block, indices_or_sections=split_indices, axis=2)
+    split_res0 = _sym.sigmoid(split_res[0])
+    split_res2 = _sym.sigmoid(split_res[2])
+    concat_list = [split_res0, split_res[1], split_res2]
+    out = _sym.concatenate(*concat_list, axis=2)
+    return _sym.reshape(out, shape=input_shape), None
+
 def _darknet_activations(inputs, attrs):
     """Process the activation function."""
     act = _darknet_required_attr(attrs, 'activation')
@@ -350,6 +383,9 @@ def _darknet_op_not_support(inputs, attrs):
     LAYERTYPE.REORG           : _darknet_reorg,
     LAYERTYPE.REGION          : _darknet_region,
     LAYERTYPE.SHORTCUT        : _darknet_shortcut,
+    LAYERTYPE.UPSAMPLE        : _darknet_upsampling,
+    LAYERTYPE.L2NORM          : _darknet_l2normalize,
+    LAYERTYPE.YOLO            : _darknet_yolo,
     LAYERTYPE.DETECTION       : _darknet_op_not_support,
     LAYERTYPE.CROP            : _darknet_op_not_support,
     LAYERTYPE.COST            : _darknet_op_not_support,
@@ -412,13 +448,20 @@ def __init__(self, net, dtype='float32'):
         self._sym_array = {}
         self._tvmparams = {}
         self._outs = []
-        self._rnn_state_ctr = 0
-
-    def _read_memory_buffer(self, shape, data):
+        self._state_ctr = {}
+        self._state_ctr['rnn'] = 0
+        self._state_ctr['crnn'] = 0
+        self._state_ctr['lstm'] = 0
+        self._state_ctr['cell_state'] = 0
+        self._state_ctr['gru'] = 0
+
+    def _read_memory_buffer(self, shape, data, dtype=None):
+        if dtype is None:
+            dtype = self.dtype
         length = 1
         for x in shape:
             length *= x
-        data_np = np.zeros(length, dtype=self.dtype)
+        data_np = np.zeros(length, dtype=dtype)
         for i in range(length):
             data_np[i] = data[i]
         return data_np.reshape(shape)
@@ -467,6 +510,31 @@ def _get_connected_weights(self, layer, opname):
             k = self._get_tvm_params_name(opname[0], 'bias')
             self._tvmparams[k] = tvm.nd.array(biases)
 
+    def _get_region_weights(self, layer, opname):
+        """Parse the biases for region layer."""
+        biases = self._read_memory_buffer((layer.n*2, ), layer.biases)
+        attributes = np.array([layer.n, layer.out_c, layer.out_h, layer.out_w,
+                               layer.classes, layer.coords, layer.background],
+                              dtype=np.int32)
+        k = self._get_tvm_params_name(opname, 'bias')
+        self._tvmparams[k] = tvm.nd.array(biases)
+        k = self._get_tvm_params_name(opname, 'attr')
+        self._tvmparams[k] = tvm.nd.array(attributes)
+
+    def _get_yolo_weights(self, layer, opname):
+        """Parse the biases and mask for yolo layer."""
+        biases = self._read_memory_buffer((layer.total*2, ), layer.biases)
+        mask = self._read_memory_buffer((layer.n, ), layer.mask, dtype='int32')
+        attributes = np.array([layer.n, layer.out_c, layer.out_h, layer.out_w,
+                               layer.classes, layer.total],
+                              dtype=np.int32)
+        k = self._get_tvm_params_name(opname, 'bias')
+        self._tvmparams[k] = tvm.nd.array(biases)
+        k = self._get_tvm_params_name(opname, 'mask')
+        self._tvmparams[k] = tvm.nd.array(mask)
+        k = self._get_tvm_params_name(opname, 'attr')
+        self._tvmparams[k] = tvm.nd.array(attributes)
+
     def _get_batchnorm_weights(self, layer, opname, size):
         """Parse the weights for batchnorm, which includes, scales, moving mean
         and moving variances."""
@@ -570,6 +638,18 @@ def _get_darknet_attrs(self, layer, layer_num):
             attr.update({'coords' : layer.coords})
             attr.update({'background' : layer.background})
             attr.update({'softmax' : layer.softmax})
+
+        elif LAYERTYPE.YOLO == layer.type:
+            attr.update({'n' : layer.n})
+            attr.update({'classes' : layer.classes})
+            attr.update({'shape' : (1, layer.c, layer.h, layer.w)})
+
+        elif LAYERTYPE.UPSAMPLE == layer.type:
+            attr.update({'scale' : layer.stride})
+
+        elif LAYERTYPE.L2NORM == layer.type:
+            pass
+
         else:
             err = "Darknet layer type {} is not supported in nnvm.".format(layer.type)
             raise NotImplementedError(err)
@@ -588,6 +668,11 @@ def _get_darknet_params(self, layer, opname):
         elif LAYERTYPE.CONNECTED == layer.type:
             self._get_connected_weights(layer, opname)
 
+        elif LAYERTYPE.REGION == layer.type:
+            self._get_region_weights(layer, opname)
+
+        elif LAYERTYPE.YOLO == layer.type:
+            self._get_yolo_weights(layer, opname)
     def _preproc_layer(self, layer, layer_num):
         """To preprocess each darknet layer, some layer doesnt need processing."""
         if layer_num == 0:
@@ -623,16 +708,16 @@ def _get_opname(self, layer):
         """Returs the layer name."""
         return layer.type
 
-    def _new_rnn_state_sym(self, state=None):
+    def _new_rnn_state_sym(self, state=None, name='rnn'):
         """Returs a symbol for state"""
-        name = "rnn%d_state" % (self._rnn_state_ctr)
-        self._rnn_state_ctr += 1
-        return _sym.Variable(name=name, init=state)
+        sym_name = name + "%d_state" % self._state_ctr[name]
+        self._state_ctr[name] += 1
+        return _sym.Variable(name=sym_name, init=state)
 
-    def _get_rnn_state_buffer(self, layer):
+    def _get_rnn_state_buffer(self, layer, name):
         """Get the state buffer for rnn."""
         buffer = np.zeros((1, layer.outputs), self.dtype)
-        return self._new_rnn_state_sym(buffer)
+        return self._new_rnn_state_sym(buffer, name)
 
     def _get_darknet_rnn_attrs(self, layer, sym):
         """Get the rnn converted symbol from attributes."""
@@ -653,7 +738,7 @@ def _handle_darknet_rnn_layers(self, layer_num, sym):
             attr.update({'batch' : layer.batch})
             attr.update({'num_hidden' : str(layer.outputs)})
 
-            state = self._get_rnn_state_buffer(layer)
+            state = self._get_rnn_state_buffer(layer, 'rnn')
 
             for _ in range(layer.steps):
                 input_layer = layer.input_layer
@@ -678,7 +763,7 @@ def _handle_darknet_rnn_layers(self, layer_num, sym):
             attr.update({'batch' : layer.batch})
             attr.update({'num_hidden' : str(layer.outputs)})
 
-            state = self._get_rnn_state_buffer(layer)
+            state = self._get_rnn_state_buffer(layer, 'crnn')
 
             for _ in range(layer.steps):
                 input_layer = layer.input_layer
@@ -698,8 +783,146 @@ def _handle_darknet_rnn_layers(self, layer_num, sym):
             self._sym_array[layer_num] = sym
             processed = True
 
+        elif LAYERTYPE.LSTM == layer.type:
+            if layer.steps > 1:
+                raise NotImplementedError("Currently support only single step GRU")
+
+            op_name_add = 'elemwise_add'
+            op_name_mul = 'elemwise_mul'
+            attrs = {}
+            act_attr = {}
+
+            h_state = self._get_rnn_state_buffer(layer, 'lstm')
+            c_state = self._get_rnn_state_buffer(layer, 'cell_state')
+            for _ in range(layer.steps):
+                sym_wf = self._get_darknet_rnn_attrs(layer.wf, h_state)
+                sym_wi = self._get_darknet_rnn_attrs(layer.wi, h_state)
+                sym_wg = self._get_darknet_rnn_attrs(layer.wg, h_state)
+                sym_wo = self._get_darknet_rnn_attrs(layer.wo, h_state)
+
+                input_sym = sym
+                sym_uf = self._get_darknet_rnn_attrs(layer.uf, input_sym)
+                sym_ui = self._get_darknet_rnn_attrs(layer.ui, input_sym)
+                sym_ug = self._get_darknet_rnn_attrs(layer.ug, input_sym)
+                sym_uo = self._get_darknet_rnn_attrs(layer.uo, input_sym)
+
+                new_inputs = _as_list([sym_wf, sym_uf])
+                add_f = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs)
+
+                new_inputs = _as_list([sym_wi, sym_ui])
+                add_i = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs)
+
+                new_inputs = _as_list([sym_wg, sym_ug])
+                add_g = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs)
+
+                new_inputs = _as_list([sym_wo, sym_uo])
+                add_o = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs)
+
+                act_attr['activation'] = ACTIVATION.LOGISTIC
+                act_f, _ = _darknet_activations(_as_list(add_f), act_attr)
+
+                act_attr['activation'] = ACTIVATION.LOGISTIC
+                act_i, _ = _darknet_activations(_as_list(add_i), act_attr)
+
+                act_attr['activation'] = ACTIVATION.TANH
+                act_g, _ = _darknet_activations(_as_list(add_g), act_attr)
+
+                act_attr['activation'] = ACTIVATION.LOGISTIC
+                act_o, _ = _darknet_activations(_as_list(add_o), act_attr)
+
+                new_inputs = _as_list([act_i, act_g])
+                mul_t = _darknet_get_nnvm_op(op_name_mul)(*new_inputs, **attrs)
+
+                new_inputs = _as_list([act_f, c_state])
+                c_state = _darknet_get_nnvm_op(op_name_mul)(*new_inputs, **attrs)
+
+                new_inputs = _as_list([mul_t, c_state])
+                c_state = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs)
+
+                act_attr['activation'] = ACTIVATION.TANH
+                h_state, _ = _darknet_activations(_as_list(c_state), act_attr)
+
+                new_inputs = _as_list([act_o, h_state])
+                h_state = _darknet_get_nnvm_op(op_name_mul)(*new_inputs, **attrs)
+                self._outs = self._outs + [c_state, h_state]
+                sym = h_state
+            self._sym_array[layer_num] = sym
+            processed = True
+
+        elif LAYERTYPE.GRU == layer.type:
+            if layer.steps > 1:
+                raise NotImplementedError("Currently support only single step GRU")
+
+            op_name_add = 'elemwise_add'
+            op_name_mul = 'elemwise_mul'
+            attrs = {}
+            act_attr = {}
+
+            state = self._get_rnn_state_buffer(layer, "gru")
+            for _ in range(layer.steps):
+                sym_wz = self._get_darknet_rnn_attrs(layer.wz, state)
+                sym_wr = self._get_darknet_rnn_attrs(layer.wr, state)
+
+                input_sym = sym
+                sym_uz = self._get_darknet_rnn_attrs(layer.uz, input_sym)
+                sym_ur = self._get_darknet_rnn_attrs(layer.ur, input_sym)
+                sym_uh = self._get_darknet_rnn_attrs(layer.uh, input_sym)
+
+                new_inputs = _as_list([sym_uz, sym_wz])
+                add_z = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs)
+
+                new_inputs = _as_list([sym_ur, sym_wr])
+                add_r = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs)
+
+                act_attr['activation'] = ACTIVATION.LOGISTIC
+                act_z, _ = _darknet_activations(_as_list(add_z), act_attr)
+
+                act_attr['activation'] = ACTIVATION.LOGISTIC
+                act_r, _ = _darknet_activations(_as_list(add_r), act_attr)
+
+                new_inputs = _as_list([act_r, state])
+                forgot = _darknet_get_nnvm_op(op_name_mul)(*new_inputs, **attrs)
+
+                sym_wh = self._get_darknet_rnn_attrs(layer.wh, forgot)
+
+                new_inputs = _as_list([sym_uh, sym_wh])
+                h_state = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs)
+
+                if layer.tanh == 1:
+                    act_attr['activation'] = ACTIVATION.TANH
+                else:
+                    act_attr['activation'] = ACTIVATION.LOGISTIC
+                h_state, _ = _darknet_activations(_as_list(h_state), act_attr)
+
+                sym = act_z * state + (1 - act_z) * h_state
+
+                self._outs = self._outs + [sym]
+            self._sym_array[layer_num] = sym
+            processed = True
+
         return processed, sym
 
+    def _make_outlist(self, sym, op_name, layer, layer_num):
+        if layer.type == LAYERTYPE.REGION:
+            k = self._get_tvm_params_name(op_name, 'attr')
+            self._outs.insert(0, _sym.Variable(name=k, init=self._tvmparams[k].asnumpy()))
+            k = self._get_tvm_params_name(op_name, 'bias')
+            self._outs.insert(0, _sym.Variable(name=k, init=self._tvmparams[k].asnumpy()))
+            if layer_num != self.net.n-1:
+                self._outs.insert(0, sym)
+
+        elif layer.type == LAYERTYPE.YOLO:
+            k = self._get_tvm_params_name(op_name, 'attr')
+            self._outs.insert(0, _sym.Variable(name=k, init=self._tvmparams[k].asnumpy()))
+            k = self._get_tvm_params_name(op_name, 'bias')
+            self._outs.insert(0, _sym.Variable(name=k, init=self._tvmparams[k].asnumpy()))
+            k = self._get_tvm_params_name(op_name, 'mask')
+            self._outs.insert(0, _sym.Variable(name=k, init=self._tvmparams[k].asnumpy()))
+            if layer_num != self.net.n-1:
+                self._outs.insert(0, sym)
+
+        return
+
     def from_darknet(self):
         """To convert the darknet symbol to nnvm symbols."""
         for i in range(self.net.n):
@@ -717,6 +940,8 @@ def from_darknet(self):
             layer_name, sym = _darknet_convert_symbol(op_name, _as_list(sym), attr)
             self._get_darknet_params(self.net.layers[i], layer_name)
             self._sym_array[i] = sym
+            self._make_outlist(sym, layer_name, layer, i)
+
         self._outs = _as_list(sym) + self._outs
         if isinstance(self._outs, list):
             sym = _sym.Group(self._outs)
diff --git a/nnvm/python/nnvm/frontend/keras.py b/nnvm/python/nnvm/frontend/keras.py
index 15493d18e7bb..9dabebc14b90 100644
--- a/nnvm/python/nnvm/frontend/keras.py
+++ b/nnvm/python/nnvm/frontend/keras.py
@@ -28,6 +28,10 @@ def _get_elu(insym, alpha):
     """
     return -alpha * _sym.relu(1 - _sym.exp(insym)) + _sym.relu(insym)
 
+def _convert_recurrent_activation(insym, keras_layer):
+    act_type = keras_layer.recurrent_activation.__name__
+    return _convert_activation(insym, act_type, None)
+
 def _convert_activation(insym, keras_layer, _):
     if isinstance(keras_layer, str):
         act_type = keras_layer
@@ -58,8 +62,10 @@ def _convert_activation(insym, keras_layer, _):
         return _get_elu(insym, alpha)
     elif act_type == 'selu':
         # Alpha, Gamma values, obtained from  https://arxiv.org/abs/1706.02515
-        alpha = keras_layer.alpha if hasattr(keras_layer, "alpha") else 1.6732
-        gamma = keras_layer.gamma if hasattr(keras_layer, "gamma") else 1.0507
+        alpha = keras_layer.alpha if hasattr(keras_layer, "alpha") \
+            else 1.6732632423543772848170429916717
+        gamma = keras_layer.gamma if hasattr(keras_layer, "gamma") \
+            else 1.0507009873554804934193349852946
         return gamma * _get_elu(insym, alpha)
     elif act_type == 'relu6':
         return _sym.clip(insym, a_min=0, a_max=6)
@@ -75,6 +81,8 @@ def _convert_activation(insym, keras_layer, _):
 def _convert_advanced_activation(insym, keras_layer, symtab):
     act_type = type(keras_layer).__name__
     if act_type == 'ReLU':
+        if keras_layer.max_value:
+            return _sym.clip(insym, a_min=0, a_max=keras_layer.max_value)
         return _sym.relu(insym)
     elif act_type == 'LeakyReLU':
         return _sym.leaky_relu(insym, alpha=keras_layer.alpha)
@@ -123,6 +131,14 @@ def _convert_dense(insym, keras_layer, symtab):
     if keras_layer.use_bias:
         params['use_bias'] = True
         params['bias'] = symtab.new_const(weightList[1])
+    input_shape = keras_layer.input_shape
+    input_dim = len(input_shape)
+    # In case of RNN dense, input shape will be (1, 1, n)
+    if input_dim > 2:
+        input_shape = tuple(dim if dim else 1 for dim in _as_list(input_shape)[0])
+        if input_dim != 3 or input_shape[0] != 1 or input_shape[1] != 1:
+            raise ValueError("Cannot flatten the inputs with shape.", input_shape, " for dense.")
+        insym = _sym.squeeze(insym, axis=0)
     out = _sym.dense(data=insym, **params)
     # defuse activation
     if sys.version_info.major < 3:
@@ -131,6 +147,8 @@ def _convert_dense(insym, keras_layer, symtab):
         act_type = keras_layer.activation.__name__
     if act_type != 'linear':
         out = _convert_activation(out, act_type, symtab)
+    if input_dim > 2:
+        out = _sym.expand_dims(out, axis=0)
     return out
 
 
@@ -153,8 +171,8 @@ def _convert_convolution(insym, keras_layer, symtab):
         dilation = [keras_layer.dilation_rate[0], keras_layer.dilation_rate[1]]
     else:
         dilation = [keras_layer.dilation_rate, keras_layer.dilation_rate]
-    kernel_h = (kernel_h - 1) * dilation[0] + 1
-    kernel_w = (kernel_w - 1) * dilation[1] + 1
+    dilated_kernel_h = (kernel_h - 1) * dilation[0] + 1
+    dilated_kernel_w = (kernel_w - 1) * dilation[1] + 1
     stride_h, stride_w = keras_layer.strides
     params = {'weight': symtab.new_const(weight),
               'kernel_size': [kernel_h, kernel_w],
@@ -176,9 +194,12 @@ def _convert_convolution(insym, keras_layer, symtab):
     elif keras_layer.padding == 'same':
         in_h = keras_layer.input_shape[1]
         in_w = keras_layer.input_shape[2]
-        pad_t, pad_b = _get_pad_pair(in_h, kernel_h, stride_h)
-        pad_l, pad_r = _get_pad_pair(in_w, kernel_w, stride_w)
-        insym = _sym.pad(data=insym, pad_width=((0, 0), (0, 0), (pad_t, pad_b), (pad_l, pad_r)))
+        pad_t, pad_b = _get_pad_pair(in_h, dilated_kernel_h, stride_h)
+        pad_l, pad_r = _get_pad_pair(in_w, dilated_kernel_w, stride_w)
+        if pad_t == pad_b and pad_l == pad_r:
+            params['padding'] = (pad_t, pad_l)
+        else:
+            insym = _sym.pad(data=insym, pad_width=((0, 0), (0, 0), (pad_t, pad_b), (pad_l, pad_r)))
     else:
         raise TypeError("Unsupported padding type : {}".format(keras_layer.padding))
     if is_deconv:
@@ -269,14 +290,12 @@ def _convert_pooling(insym, keras_layer, symtab):
                   'padding': [0, 0]}
         if keras_layer.padding == 'valid':
             pass
-        # we insert a separate pad operator
         elif keras_layer.padding == 'same':
             in_h = keras_layer.input_shape[1]
             in_w = keras_layer.input_shape[2]
             pad_t, pad_b = _get_pad_pair(in_h, pool_h, stride_h)
             pad_l, pad_r = _get_pad_pair(in_w, pool_w, stride_w)
-            insym = _sym.pad(data=insym, pad_width=(
-                (0, 0), (0, 0), (pad_t, pad_b), (pad_l, pad_r)))
+            params['padding'] = [pad_t, pad_l, pad_b, pad_r]
         else:
             raise TypeError("Unsupported padding type : {}".format(keras_layer.padding))
         if pool_type == 'MaxPooling2D':
@@ -311,6 +330,21 @@ def _convert_upsample(insym, keras_layer, _):
     return _sym.upsampling(insym, **params)
 
 
+def _convert_cropping(insym, keras_layer, _):
+    _check_data_format(keras_layer)
+    crop_type = type(keras_layer).__name__
+    if crop_type == "Cropping1D":
+        raise NotImplementedError("Cropping1D not implemented")
+    elif crop_type == "Cropping2D":
+        (_, in_h, in_w, _) = keras_layer.input_shape
+        ((crop_t, crop_b), (crop_l, crop_r)) = keras_layer.cropping
+    else:
+        raise TypeError("Unrecognized cropping type : {}".format(crop_type))
+    int32_max = np.iinfo(np.int32).max
+    return _sym.strided_slice(insym, begin=[0, 0, crop_t, crop_l],
+                              end=[int32_max, int32_max, in_h-crop_b, in_w-crop_r])
+
+
 def _convert_batchnorm(insym, keras_layer, symtab):
     params = {'scale': False,
               'center': False,
@@ -375,6 +409,125 @@ def _convert_reshape(insym, keras_layer, _):
     shape = (-1, ch) + keras_layer.target_shape[:-1]
     return _sym.reshape(insym, shape=shape)
 
+def _convert_lstm(insym, keras_layer, symtab):
+    _check_data_format(keras_layer)
+    if not isinstance(insym, list):
+        buffer = np.zeros((1, keras_layer.units), 'float32')
+        c_sym = symtab.new_const(buffer)
+        h_sym = symtab.new_const(buffer)
+        insym = [insym, h_sym, c_sym]
+
+    in_data = insym[0]
+    next_h = insym[1]
+    next_c = insym[2]
+
+    weightList = keras_layer.get_weights()
+    inp_shape = tuple(dim if dim else 1 for dim in _as_list(keras_layer.input_shape)[0])
+
+    kernel_wt = symtab.new_const(weightList[0].transpose([1, 0]))
+    recurrent_wt = symtab.new_const(weightList[1].transpose([1, 0]))
+    in_bias = symtab.new_const(weightList[2])
+
+    units = list(weightList[0].shape)[1]
+
+    time_steps = inp_shape[1]
+    in_data = _sym.squeeze(in_data, axis=0)
+    in_data = _sym.split(in_data, indices_or_sections=time_steps, axis=0)
+    #loop for the number of time_steps
+    for data in in_data:
+        ixh1 = _sym.dense(data, kernel_wt, use_bias=False, units=units)
+        ixh2 = _sym.dense(next_h, recurrent_wt, in_bias, use_bias=True, units=units)
+        gate = ixh1 + ixh2
+        gates = _sym.split(gate, indices_or_sections=4, axis=1)
+        in_gate = _convert_recurrent_activation(gates[0], keras_layer)
+        in_transform = _convert_recurrent_activation(gates[1], keras_layer)
+        next_c = in_transform * next_c + in_gate * _convert_activation(gates[2], keras_layer, None)
+        out_gate = _convert_recurrent_activation(gates[3], keras_layer)
+        next_h = out_gate * _convert_activation(next_c, keras_layer, None)
+
+    out_shape = tuple(dim if dim else 1 for dim in _as_list(keras_layer.output_shape)[0])
+    out = _sym.reshape(next_h, shape=out_shape)
+    return [out, next_h, next_c]
+
+def _convert_simple_rnn(insym, keras_layer, symtab):
+    _check_data_format(keras_layer)
+    if not isinstance(insym, list):
+        buffer = np.zeros((1, keras_layer.units), 'float32')
+        prev_sym = symtab.new_const(buffer)
+        insym = [insym, prev_sym]
+    in_data = insym[0]
+    prev_sym = insym[1]
+
+    weightList = keras_layer.get_weights()
+    kernel_wt = symtab.new_const(weightList[0].transpose([1, 0]))
+    recurrent_wt = symtab.new_const(weightList[1].transpose([1, 0]))
+    in_bias = symtab.new_const(weightList[2])
+    units = list(weightList[0].shape)[1]
+
+    in_data = _sym.flatten(in_data)
+    ixh = _sym.dense(in_data, kernel_wt, in_bias, use_bias=True, units=units)
+    prev_sym = _sym.flatten(prev_sym)
+    ixh2 = _sym.dense(prev_sym, recurrent_wt, use_bias=False, units=units)
+    output = ixh + ixh2
+    output = _convert_activation(output, keras_layer, None)
+
+    out_shape = tuple(dim if dim else 1 for dim in _as_list(keras_layer.output_shape)[0])
+    output = _sym.reshape(output, shape=out_shape)
+
+    return [output, output]
+
+def _convert_gru(insym, keras_layer, symtab):
+    _check_data_format(keras_layer)
+    if not isinstance(insym, list):
+        buffer = np.zeros((1, keras_layer.units), 'float32')
+        h_tm1 = symtab.new_const(buffer)
+        insym = [insym, h_tm1]
+    in_data = insym[0]
+    h_tm1_sym = insym[1]
+
+    weightList = keras_layer.get_weights()
+    kernel_wt = symtab.new_const(weightList[0].transpose([1, 0]))
+    recurrent_wt = symtab.new_const(weightList[1].transpose([1, 0]))
+    in_bias = symtab.new_const(weightList[2])
+
+    units = list(weightList[0].shape)[1]
+
+    in_data = _sym.flatten(in_data)
+    matrix_x = _sym.dense(in_data, kernel_wt, in_bias, use_bias=True, units=units)
+
+    # inputs projected by all gate matrices at once
+    split_indices = [keras_layer.units, 2 * keras_layer.units]
+    gates = _sym.split(matrix_x, indices_or_sections=split_indices, axis=1)
+    x_z = gates[0]
+    x_r = gates[1]
+    x_h = gates[2]
+
+    # hidden state projected separately for update/reset and new
+    units = 2 * keras_layer.units
+    split_indices = [units]
+    rec_wts = _sym.split(recurrent_wt, indices_or_sections=split_indices, axis=0)
+
+    h_tm1_sym = _sym.flatten(h_tm1_sym)
+    matrix_inner = _sym.dense(h_tm1_sym, rec_wts[0], use_bias=False, units=units)
+
+    split_indices = [keras_layer.units]
+    recurrent = _sym.split(matrix_inner, indices_or_sections=split_indices, axis=1)
+    recurrent_z = recurrent[0]
+    recurrent_r = recurrent[1]
+
+    rec_act_z = _convert_recurrent_activation(x_z + recurrent_z, keras_layer)
+    rec_act_r = _convert_recurrent_activation(x_r + recurrent_r, keras_layer)
+
+    units = keras_layer.units
+    recurrent_h = _sym.dense(rec_act_r * h_tm1_sym, rec_wts[1], use_bias=False, units=units)
+    act_hh = _convert_activation(x_h + recurrent_h, keras_layer, None)
+
+    # previous and candidate state mixed by update gate
+    output = rec_act_z * h_tm1_sym + (1 - rec_act_z) * act_hh
+
+    out_shape = tuple(dim if dim else 1 for dim in _as_list(keras_layer.output_shape)[0])
+    output = _sym.reshape(output, shape=out_shape)
+    return [output, output]
 
 def _default_skip(insym, keras_layer, _): # pylint: disable=unused-argument
     """Layers that can be skipped because they are train time only."""
@@ -409,6 +562,7 @@ def _default_skip(insym, keras_layer, _): # pylint: disable=unused-argument
     'Multiply'                 : _convert_merge,
     'ZeroPadding2D'            : _convert_padding,
     'UpSampling2D'             : _convert_upsample,
+    'Cropping2D'               : _convert_cropping,
 
     # 'ZeroPadding1D'          : _convert_padding,
     # 'AveragePooling1D'       : _convert_pooling,
@@ -416,14 +570,13 @@ def _default_skip(insym, keras_layer, _): # pylint: disable=unused-argument
     # 'GlobalAveragePooling1D' : _convert_pooling,
     # 'GlobalMaxPooling1D'     : _convert_pooling,
     # 'Cropping1D'             : _convert_cropping,
-    # 'Cropping2D'             : _convert_cropping,
     # 'UpSampling1D'           : _convert_upsample,
     # 'UpSampling3D'           : _convert_upsample,
     # 'Conv1D'                 : _convert_convolution1d,
 
-    # 'GRU'                    : _convert_gru,
-    # 'LSTM'                   : _convert_lstm,
-    # 'SimpleRNN'              : _convert_simple_rnn,
+    'SimpleRNN'                : _convert_simple_rnn,
+    'LSTM'                     : _convert_lstm,
+    'GRU'                      : _convert_gru,
     # 'Bidirectional'          : _convert_bidirectional,
     # 'TimeDistributed'        : _default_skip,
 
@@ -446,6 +599,11 @@ def _check_unsupported_layers(model):
         if type(layer).__name__ not in _convert_map:
             raise ValueError("Keras layer {} not supported.".format(type(layer).__name__))
 
+def _as_list(arr):
+    """Force being a list, ignore if already is."""
+    if isinstance(arr, list):
+        return arr
+    return [arr]
 
 def keras_op_to_nnvm(insym, keras_layer, outname, symtab):
     """Convert keras layer to nnvm symbol, and update symtab.
@@ -466,9 +624,12 @@ def keras_op_to_nnvm(insym, keras_layer, outname, symtab):
     """
     if type(keras_layer).__name__ not in _convert_map:
         raise NotImplementedError("{} is not supported".format((type(keras_layer).__name__)))
-    ret = _convert_map[type(keras_layer).__name__](insym, keras_layer, symtab)
-    symtab.set_var(outname, ret)
+    outs = _convert_map[type(keras_layer).__name__](insym, keras_layer, symtab)
+    outs = _as_list(outs)
 
+    for t_idx, out in enumerate(outs):
+        name = outname + ":" + str(t_idx)
+        symtab.set_var(name, out)
 
 def from_keras(model):
     """Convert keras model to NNVM format.
@@ -509,7 +670,13 @@ def from_keras(model):
             if inbound_nodes is None:
                 raise TypeError("Unknown layer type or unsupported Keras version : {}"
                                 .format(keras_layer))
-            for my_idx, node in enumerate(inbound_nodes):
+            for node_idx, node in enumerate(inbound_nodes):
+                # If some nodes in imported model is not relevant to the current model,
+                # skip such layers. model._network_nodes contains keys of all nodes relevant
+                # to the current model.
+                if not model._node_key(keras_layer, node_idx) in model._network_nodes:
+                    continue
+
                 insym = []
 
                 # Since Keras allows creating multiple layers from the same name instance,
@@ -517,17 +684,25 @@ def from_keras(model):
                 # The one exception is InputLayer.  Changing input variable names after conversion
                 # would confuse users, so we should keep them as far as possible.  Fortunately,
                 # they are named uniquely to input_1, input_2, input_3 ... by default.
-                for pred_idx, pred in zip(node.node_indices, node.inbound_layers):
-                    if isinstance(pred, keras.engine.InputLayer):
-                        _sym = symtab.get_var(pred.name, must_contain=True)
+                zip_node = zip(node.node_indices, node.tensor_indices, node.inbound_layers)
+                for n_idx, t_idx, layer in zip_node:
+                    if isinstance(layer, keras.engine.InputLayer):
+                        sym = symtab.get_var(layer.name, must_contain=True)
                     else:
-                        _sym = symtab.get_var(pred.name + ':' + str(pred_idx), must_contain=True)
-                    insym.append(_sym)
+                        sym_name = layer.name + ':' + str(n_idx) + ':' + str(t_idx)
+                        sym = symtab.get_var(sym_name, must_contain=True)
+                    insym.append(sym)
 
                 if len(insym) == 1:
                     insym = insym[0]
-                keras_op_to_nnvm(insym, keras_layer, keras_layer.name + ':' + str(my_idx), symtab)
+                keras_op_to_nnvm(insym, keras_layer, keras_layer.name + ':' + str(node_idx), symtab)
+
+    #model._output_coordinates contains out_node(oc[0]), node_index(oc[1]) and tensor index(oc[2])
+    #Get all output nodes in symtab using the name made from above values. The out symbols
+    #were added to symtab in keras_op_to_nnvm using this name. For multiple outputs, make a list
+    #with these output symbols and Group them.
+    outsym = [symtab.get_var(oc[0].name + ":" + str(oc[1]) + ":" + str(oc[2]))
+              for oc in model._output_coordinates]
 
-    outsym = symtab.get_var(model._output_layers[0].name + ':0')
     tvmparams = {k:tvm.nd.array(np.array(v, dtype=np.float32)) for k, v in symtab.params.items()}
-    return outsym, tvmparams
+    return _sym.Group(outsym), tvmparams
diff --git a/nnvm/python/nnvm/frontend/mxnet.py b/nnvm/python/nnvm/frontend/mxnet.py
index f0217fc1ec85..03ba879aa5cf 100644
--- a/nnvm/python/nnvm/frontend/mxnet.py
+++ b/nnvm/python/nnvm/frontend/mxnet.py
@@ -259,12 +259,12 @@ def _crop_like(inputs, attrs):
 
 
 def _expand_dims(inputs, attrs):
-    op_name, new_attrs = "expand_dims", {}
+    op_name, new_attrs = 'expand_dims', {}
     new_attrs['axis'] = _required_attr(attrs, 'axis')
     return _get_nnvm_op(op_name)(*inputs, **new_attrs)
 
 def _lrn(inputs, attrs):
-    op_name, new_attrs = "lrn", {}
+    op_name, new_attrs = 'lrn', {}
     new_attrs['alpha'] = attrs.get('alpha', 0.0001)
     new_attrs['beta'] = attrs.get('beta', 0.75)
     new_attrs['bias'] = attrs.get('knorm', 2)
@@ -273,6 +273,28 @@ def _lrn(inputs, attrs):
     new_attrs['size'] = _required_attr(attrs, 'nsize')
     return _get_nnvm_op(op_name)(*inputs, **new_attrs)
 
+def _ones(_, attrs):
+    op_name = 'ones'
+    return _get_nnvm_op(op_name)(**attrs)
+
+def _zeros(_, attrs):
+    op_name = 'zeros'
+    return _get_nnvm_op(op_name)(**attrs)
+
+def _argmax(inputs, attrs):
+    op_name, new_attrs = 'argmax', {}
+    new_attrs['dtype'] = 'float32'
+    new_attrs['axis'] = attrs.get('axis', 0)
+    new_attrs['keepdims'] = _parse_bool_str(attrs, 'keepdims', default="False")
+    return _get_nnvm_op(op_name)(*inputs, **new_attrs)
+
+def _argmin(inputs, attrs):
+    op_name, new_attrs = 'argmin', {}
+    new_attrs['dtype'] = 'float32'
+    new_attrs['axis'] = attrs.get('axis', 0)
+    new_attrs['keepdims'] = _parse_bool_str(attrs, 'keepdims', default="False")
+    return _get_nnvm_op(op_name)(*inputs, **new_attrs)
+
 _identity_list = ['__add_scalar__', '__add_symbol__', '__div_scalar__',
                   '__div_symbol__', '__mul_scalar__', '__mul_symbol__',
                   '__pow_scalar__', '__rdiv_scalar__', '__rpow_scalar__',
@@ -281,8 +303,9 @@ def _lrn(inputs, attrs):
                   'broadcast_sub', 'broadcast_to', 'cast', 'elemwise_add',
                   'elemwise_div', 'elemwise_mul', 'elemwise_sub', 'exp',
                   'flatten', 'log', 'log_softmax', 'max', 'min', 'negative',
-                  'relu', 'sigmoid', 'slice_like', 'softmax', 'sum', 'tanh',
-                  'transpose']
+                  'ones_like', 'relu', 'sigmoid', 'slice_like', 'softmax',
+                  'sum', 'tanh', 'transpose', 'zeros_like', 'gather_nd',
+                  'reshape_like']
 
 _convert_map = {
     '_copy'         : _rename('copy'),
@@ -294,6 +317,10 @@ def _lrn(inputs, attrs):
     '_rminus_scalar': _rename('__rsub_scalar__'),
     '_contrib_MultiBoxPrior' : _rename('multibox_prior'),
     '_contrib_MultiBoxDetection' : _contrib_multibox_detection,
+    '_ones'         : _ones,
+    '_zeros'        : _zeros,
+    'argmax'        : _argmax,
+    'argmin'        : _argmin,
     'Activation'    : _activations,
     'BatchNorm'     : _batch_norm,
     'BatchNorm_v1'  : _batch_norm,
@@ -371,6 +398,55 @@ def _as_list(arr):
         return arr
     return [arr]
 
+def _topo_sort(symbol):
+    """Sort all symbols in the mxnet graph in topological order.
+
+    Parameters
+    ----------
+    symbol : mxnet.sym.Symbol
+
+    Returns:
+    -------
+    list
+        List of mxnet symbol
+    """
+    queue = []
+    symbol_map = {}
+    deps = {}
+    dep_cnts = {}
+    for s in symbol:
+        symbol_map[s.attr('name')] = s
+        queue.append(s)
+    while queue:
+        sym = queue.pop(0)
+        name = sym.attr('name')
+        childs = sym.get_children()
+        if childs is None:
+            dep_cnts[name] = 0
+        else:
+            dep_cnts[name] = len(set([c.attr('name') for c in childs]))
+            for child in childs:
+                child_name = child.attr('name')
+                if child_name not in deps:
+                    deps[child_name] = set()
+                deps[child_name].add(name)
+                if child_name not in symbol_map:
+                    symbol_map[child_name] = child
+                    queue.append(child)
+    order = []
+    while dep_cnts:
+        remove = []
+        for name in dep_cnts:
+            if dep_cnts[name] == 0:
+                order.append(symbol_map[name])
+                remove.append(name)
+                if name in deps:
+                    for other in deps[name]:
+                        dep_cnts[other] -= 1
+        for name in remove:
+            del dep_cnts[name]
+    return order
+
 def _from_mxnet_impl(symbol, graph):
     """Convert mxnet symbol to nnvm implementation.
     Reconstruct a nnvm symbol by traversing the mxnet symbol.
@@ -388,27 +464,37 @@ def _from_mxnet_impl(symbol, graph):
     nnvm.sym.Symbol
         Converted symbol
     """
-    if len(symbol.list_outputs()) > 1:
-        return [_from_mxnet_impl(s, graph) for s in symbol]
-
-    name = symbol.attr('name')
-    output_index = json.loads(symbol.tojson())['heads'][0][1]
-    node = graph.get(name, None)
-    if node:
-        return node[output_index]
-    attr = symbol.list_attr()
-    # op_name = symbol.attr('op_name')
-    childs = symbol.get_children()
-    if childs is not None:
-        op_name = symbol.attr('op_name')
-        childs = [_from_mxnet_impl(childs[i], graph) for i in range(len(childs.list_outputs()))]
-        childs = [x for y in childs for x in _as_list(y)]  # expand group symbol
-        node = _convert_symbol(op_name, childs, attr)
-    else:
-        op_name = json.loads(symbol.tojson())['nodes'][0]['op']
-        node = _sym.Variable(name=name, **attr)
-    graph[name] = node
-    return node[output_index]
+    def get_node(sym):
+        name = sym.attr('name')
+        if name not in graph:
+            return None
+        output_index = json.loads(sym.tojson())['heads'][0][1]
+        return graph[name][output_index]
+
+    assert symbol is not None
+    # Traverse all symbols in topological order
+    for sym in _topo_sort(symbol):
+        name = sym.attr('name')
+        attr = sym.list_attr()
+        op_name = sym.attr('op_name')
+        childs = sym.get_children()
+        if childs is not None:
+            childs = [get_node(child) for child in childs]
+            childs = [x for y in childs for x in _as_list(y)]
+            node = _convert_symbol(op_name, childs, attr)
+        elif op_name != 'null':
+            node = _convert_symbol(op_name, [], attr)
+        else:
+            node = _sym.Variable(name=name, **attr)
+        graph[name] = node
+    nodes = []
+    for sym in symbol:
+        node = get_node(sym)
+        assert node is not None
+        nodes.append(node)
+    if len(nodes) > 1:
+        return _sym.Group(nodes)
+    return nodes[0]
 
 def from_mxnet(symbol, arg_params=None, aux_params=None):
     """Convert from MXNet's model into compatible NNVM format.
diff --git a/nnvm/python/nnvm/frontend/onnx.py b/nnvm/python/nnvm/frontend/onnx.py
index fa26648b293a..ad0acc31a521 100644
--- a/nnvm/python/nnvm/frontend/onnx.py
+++ b/nnvm/python/nnvm/frontend/onnx.py
@@ -1,12 +1,12 @@
-# pylint: disable=import-self, invalid-name, unused-argument
+# pylint: disable=import-self, invalid-name, unused-argument, too-many-lines
 """ONNX: Open Neural Network Exchange frontend."""
 from __future__ import absolute_import as _abs
 import numpy as np
 import tvm
 from .. import symbol as _sym
-from .. import graph as _graph
-from ..compiler import graph_util
 from .common import get_nnvm_op, Renamer, SymbolTable, AttrConverter as AttrCvt
+from .onnx_caffe2_utils import dimension_picker, dimension_constraint, \
+    infer_channels, revert_caffe2_pad
 
 __all__ = ['from_onnx']
 
@@ -31,10 +31,9 @@ def get_converter(cls, opset):
             max([i for i, v in enumerate(versions) if v == opset]) - 1]
         if hasattr(cls, '_impl_v{}'.format(version)):
             return getattr(cls, '_impl_v{}'.format(version))
-        else:
-            raise NotImplementedError(
-                'opset version {} of {} not implemented'.format(
-                    version, cls.__name__))
+        raise NotImplementedError(
+            'opset version {} of {} not implemented'.format(
+                version, cls.__name__))
 
 
 class Elemwise(OnnxOpConverter):
@@ -75,16 +74,16 @@ class Pool(OnnxOpConverter):
     @classmethod
     def _impl_v1(cls, inputs, attr, params):
         return AttrCvt(
-            op_name=_dimension_picker(cls.name),
+            op_name=dimension_picker(cls.name),
             transforms={
                 'kernel_shape': 'pool_size',
-                'pads': ('padding', (0, 0), _revert_caffe2_pad)
+                'pads': ('padding', (0, 0), revert_caffe2_pad)
             },
             # very weird attributes here in onnx, force check
             ignores=['dilations'],
             # TODO(zhreshold): make sure ceil_mode in onnx, and layout?
             extras={'ceil_mode': False},
-            custom_check=_dimension_constraint())(inputs, attr, params)
+            custom_check=dimension_constraint())(inputs, attr, params)
 
 
 class Absolute(OnnxOpConverter):
@@ -119,18 +118,18 @@ class Conv(OnnxOpConverter):
     @classmethod
     def _impl_v1(cls, inputs, attr, params):
         # get number of channels
-        channels = _infer_channels(inputs[1], params)
+        channels = infer_channels(inputs[1], params)
         attr['channels'] = channels
         return AttrCvt(
-            op_name=_dimension_picker('conv'),
+            op_name=dimension_picker('conv'),
             transforms={
                 'kernel_shape': 'kernel_size',
                 'dilations': ('dilation', (0, 0)),
-                'pads': ('padding', (0, 0), _revert_caffe2_pad),
+                'pads': ('padding', (0, 0), revert_caffe2_pad),
                 'group': ('groups', 1)
             },
             extras={'use_bias': len(inputs) == 3},
-            custom_check=_dimension_constraint())(inputs, attr, params)
+            custom_check=dimension_constraint())(inputs, attr, params)
 
 
 class ConvTranspose(OnnxOpConverter):
@@ -138,20 +137,20 @@ class ConvTranspose(OnnxOpConverter):
     @classmethod
     def _impl_v1(cls, inputs, attr, params):
         # get number of channels
-        channels = _infer_channels(inputs[1], params, True)
+        channels = infer_channels(inputs[1], params, True)
         attr['channels'] = channels
         groups = attr.pop('group')
         attr['groups'] = groups
         return AttrCvt(
-            op_name=_dimension_picker('conv', '_transpose'),
+            op_name=dimension_picker('conv', '_transpose'),
             transforms={
                 'kernel_shape': 'kernel_size',
                 'dilations': ('dilation', (0, 0)),
-                'pads': ('padding', (0, 0), _revert_caffe2_pad)
+                'pads': ('padding', (0, 0), revert_caffe2_pad)
             },
             disables=['output_shape'],
             extras={'use_bias': len(inputs) == 3},
-            custom_check=_dimension_constraint())(inputs, attr, params)
+            custom_check=dimension_constraint())(inputs, attr, params)
 
 
 class Div(Elemwise):
@@ -181,7 +180,7 @@ def _impl_v1(cls, inputs, attr, params):
         transA = int(attr.get('transA', 0))
         transB = int(attr.get('transB', 0))
         # get number of channels
-        channels = _infer_channels(inputs[1], params, not transB)
+        channels = infer_channels(inputs[1], params, not transB)
         if transA:
             inputs[0] = _sym.transpose(inputs[0], axes=(1, 0))
         if not transB:
@@ -200,22 +199,44 @@ class Mul(Elemwise):
 
 
 class Pad(OnnxOpConverter):
+    """ Operator converter for Pad.
+    """
 
     @classmethod
     def _impl_v1(cls, inputs, attr, params):
-        # get number of channels
-        channels = _infer_channels(inputs[1], params, True)
-        attr['channels'] = channels
-        groups = attr.pop('group')
-        attr['groups'] = groups
+        pad_width = []
+        pads = attr.pop('paddings')
+        dims = int(len(pads) / 2)
+        for i in range(dims):
+            pad_width.append((pads[i], pads[i+dims]))
+        attr['pad_width'] = pad_width
+
         return AttrCvt(
             op_name='pad',
             transforms={
                 'value': 'pad_value',
-                'pads': 'pad_width'
             },
-            custom_check=lambda attrs: attrs.get('mode') == 'constant')(
-                inputs, attr, params)
+            ignores=['mode'],
+            custom_check=(lambda attrs: attrs.get('mode', 'constant').decode("utf-8") == 'constant',
+                          'split mode != constant'))(inputs, attr, params)
+
+    @classmethod
+    def _impl_v2(cls, inputs, attr, params):
+        pad_width = []
+        pads = attr.pop('pads')
+        dims = int(len(pads) / 2)
+        for i in range(dims):
+            pad_width.append((pads[i], pads[i+dims]))
+        attr['pad_width'] = pad_width
+
+        return AttrCvt(
+            op_name='pad',
+            transforms={
+                'value': 'pad_value',
+            },
+            ignores=['mode'],
+            custom_check=(lambda attrs: attrs.get('mode', 'constant').decode("utf-8") == 'constant',
+                          'split mode != constant'))(inputs, attr, params)
 
 
 class ParametricSoftPlus(OnnxOpConverter):
@@ -233,7 +254,7 @@ class Prelu(OnnxOpConverter):
     def _impl_v1(cls, inputs, attr, params):
         assert len(inputs) == 2, "Prelu need 2 inputs, {} given".format(
             len(inputs))
-        channels = _infer_channels(inputs[1], params, False)
+        channels = infer_channels(inputs[1], params, False)
         if channels == 1:
             return inputs[0] * inputs[1]
         return _sym.broadcast_mul(inputs[0], inputs[1])
@@ -325,9 +346,9 @@ class ThresholdedRelu(OnnxOpConverter):
 
     @classmethod
     def _impl_v1(cls, inputs, attr, params):
-        alpha = float(attr.get('alpha', 0.0))
-        return _sym.relu(inputs[0] - alpha)
-
+        alpha = float(attr.get('alpha', 1.0))
+        alpha_tensor = _sym.full_like(inputs[0], fill_value=float(alpha))
+        return _sym.elemwise_mul(inputs[0], _sym.greater(inputs[0], alpha_tensor))
 
 class ImageScaler(OnnxOpConverter):
 
@@ -341,17 +362,6 @@ def _impl_v1(cls, inputs, attr, params):
         return ret
 
 
-def _revert_caffe2_pad(attr):
-    """Caffe2 require two times the normal padding."""
-    if len(attr) == 4:
-        attr = attr[:2]
-    elif len(attr) == 2:
-        pass
-    else:
-        raise ValueError("Invalid caffe2 type padding: {}".format(attr))
-    return attr
-
-
 def _broadcast_constraint():
 
     def _broadcast_check(attrs):
@@ -362,50 +372,35 @@ def _broadcast_check(attrs):
     return _broadcast_check, "Specifying broadcast axis not allowed."
 
 
-def _dimension_picker(prefix, surfix=''):
-
-    def _impl(attr):
-        kernel = attr['kernel_shape']
-        if len(kernel) == 2:
-            return prefix + '2d' + surfix
-        else:
-            raise NotImplementedError("Only 2d kernel supported.")
-
-    return _impl
-
-
-def _dimension_constraint():
-
-    def _dim_check(attrs):
-        if len(attrs['kernel_shape']) == 2:
-            return True
-        return False
-
-    return _dim_check, "Only 2d kernel supported."
-
-
-def _infer_channels(inputs, params, transpose=False):
-    """A hack for getting 'channles' or 'units' since onnx don't provide
-    these attributes. We check the shape of weights provided to get the number.
-    """
-    g = _graph.create(inputs)
-    shape_dict = {k: v.shape for k, v in params.items()}
-    _, out_shapes = graph_util.infer_shape(g, **shape_dict)
-    channels = out_shapes[0][0] if not transpose else out_shapes[0][1]
-    return channels
-
-
 def _fully_connected(opset):
 
     def _impl(inputs, attr, params):
         # get number of channels
-        channels = _infer_channels(inputs[1], params)
+        channels = infer_channels(inputs[1], params)
         attr['units'] = channels
         return AttrCvt('dense', ignores=['axis', 'axis_w'])(inputs, attr)
 
     return _impl
 
 
+class Upsample(OnnxOpConverter):
+    """ Operator converter for Upsample (nearest mode).
+    """
+
+    @classmethod
+    def _impl_v7(cls, inputs, attr, params):
+        scales = attr.get('scales')
+        assert len(scales) == 4 and scales[0] == 1.0 and scales[1] == 1.0 and scales[2] == scales[3]
+        mode = attr.get('mode')
+        if mode == b'nearest':
+            method = "NEAREST_NEIGHBOR"
+        elif mode == b'linear':
+            method = "BILINEAR"
+        else:
+            raise ValueError("Invalid ONNX upsample mode: {}".format(mode))
+        return _sym.upsampling(inputs[0], scale=int(scales[-1]), method=method, layout='NCHW')
+
+
 class Shape(OnnxOpConverter):
     """ Operator converter for Shape.
     """
@@ -446,6 +441,23 @@ def _impl_v1(cls, inputs, attr, params):
             inputs[0] = _sym.expand_dims(inputs[0], axis=axes, num_newaxis=1)
         return inputs[0]
 
+
+class Split(OnnxOpConverter):
+    """ Operator converter for Split.
+    """
+
+    @classmethod
+    def _impl_v1(cls, inputs, attr, params):
+        attr['indices_or_sections'] = []
+        index = 0
+        for i in attr['split'][:-1]:
+            index += i
+            attr['indices_or_sections'].append(index)
+        return AttrCvt(
+            op_name='split',
+            ignores=['split'])(inputs, attr, params)
+
+
 class Slice(OnnxOpConverter):
     """ Operator converter for Slice.
     """
@@ -511,6 +523,126 @@ def _impl_v1(cls, inputs, attr, params):
         return _sym.lrn(inputs[0], size=nsize, axis=axis,
                         alpha=alpha, beta=beta, bias=bias)
 
+class Maximum(OnnxOpConverter):
+    """ Operator converter for Maximum.
+    """
+    @classmethod
+    def _impl_v1(cls, inputs, attr, params):
+        if not isinstance(inputs, list) or len(inputs) < 2:
+            raise ValueError("Expect minimum 2 inputs")
+        _max = inputs[0]
+        for i in range(1, len(inputs)):
+            _max = AttrCvt(op_name='broadcast_max')([_max, inputs[i]], {})
+        return _max
+
+class Minimum(OnnxOpConverter):
+    """ Operator converter for Minimum.
+    """
+    @classmethod
+    def _impl_v1(cls, inputs, attr, params):
+        if not isinstance(inputs, list) or len(inputs) < 2:
+            raise ValueError("Expect minimum 2 inputs")
+        _min = inputs[0]
+        for i in range(1, len(inputs)):
+            _min = AttrCvt(op_name='broadcast_min')([_min, inputs[i]], {})
+        return _min
+
+class Mean(OnnxOpConverter):
+    """ Operator converter for Mean.
+    """
+    @classmethod
+    def _impl_v1(cls, inputs, attr, params):
+        if not isinstance(inputs, list) or len(inputs) < 2:
+            raise ValueError("Expect minimum 2 inputs")
+        count = len(inputs)
+        _sum = inputs[0]
+        for i in range(1, count):
+            _sum = AttrCvt(op_name='broadcast_add')([_sum, inputs[i]], {})
+        return _sum / count
+
+class HardSigmoid(OnnxOpConverter):
+    """ Operator converter for HardSigmoid.
+    """
+    @classmethod
+    def _impl_v1(cls, inputs, attr, params):
+        alpha = attr.get('alpha', 0.2)
+        beta = attr.get('beta', 0.5)
+        transformX = (inputs[0] * alpha) + beta
+        attr = {'a_min':0, 'a_max':1}
+        return AttrCvt(op_name='clip')([transformX], attr)
+
+class ArgMax(OnnxOpConverter):
+    """ Operator converter for ArgMax.
+    """
+    @classmethod
+    def _impl_v1(cls, inputs, attr, params):
+        axis = attr.get('axis', 0)
+        keepdims = attr.get('keepdims', True)
+        attr = {'axis':axis, 'keepdims':keepdims}
+        return AttrCvt(op_name='argmax')(inputs, attr)
+
+class ArgMin(OnnxOpConverter):
+    """ Operator converter for ArgMin.
+    """
+    @classmethod
+    def _impl_v1(cls, inputs, attr, params):
+        axis = attr.get('axis', 0)
+        keepdims = attr.get('keepdims', True)
+        attr = {'axis':axis, 'keepdims':keepdims}
+        return AttrCvt(op_name='argmin')(inputs, attr)
+
+class Softmax(OnnxOpConverter):
+    """ Operator converter for Softmax.
+    """
+    @classmethod
+    def _impl_v1(cls, inputs, attr, params):
+        # set default value when axis is not set in the model
+        if 'axis' not in attr:
+            attr['axis'] = 1
+        return AttrCvt(
+            op_name='softmax',
+            transforms={
+                'axis': ('axis', 1),
+            })(inputs, attr, params)
+
+class ConstantFill(OnnxOpConverter):
+    """ Operator converter for ConstantFill.
+    """
+    @classmethod
+    def _impl_v1(cls, inputs, attr, params):
+        is_full = True
+        num_inputs = len(inputs)
+        if 'shape' in attr:
+            if num_inputs > 0:
+                raise ImportError(
+                    "Can't set shape and input tensor at a time")
+            shape = attr.pop('shape')
+        else:
+            if num_inputs == 0:
+                raise ImportError(
+                    "Either shape attribute or input should be set")
+            if 'input_as_shape' in attr and attr['input_as_shape']:
+                shape = params[inputs[0].list_output_names()[0]].asnumpy()
+            else:
+                is_full = False
+
+        if not is_full:
+            if 'extra_shape' in attr:
+                raise ImportError(
+                    "Extra Shape not supported with fill_like")
+
+            out = AttrCvt(
+                op_name='full_like',
+                transforms={'value': 'fill_value'},
+                ignores=['dtype'])(inputs, attr)
+            return _sym.cast(out, dtype=attr['dtype'].decode("utf-8"))
+        if 'extra_shape' in attr:
+            shape = shape + attr.pop('extra_shape')
+
+        return AttrCvt(
+            op_name='full',
+            transforms={'value': 'fill_value'},
+            extras={'shape':shape})(inputs, attr)
 
 # compatible operators that do NOT require any conversion.
 _identity_list = []
@@ -529,7 +661,7 @@ def _get_convert_map(opset):
         'ThresholdedRelu': ThresholdedRelu.get_converter(opset),
         'ScaledTanh': ScaledTanh.get_converter(opset),
         'ParametricSoftplus': ParametricSoftPlus.get_converter(opset),
-        # 'ConstantFill'
+        'ConstantFill': ConstantFill.get_converter(opset),
         # 'GivenTensorFill'
         'FC': AttrCvt('dense', ignores=['axis', 'axis_w']),
         'Scale': Scale.get_converter(opset),
@@ -539,7 +671,7 @@ def _get_convert_map(opset):
         # 'MeanVarianceNormalization'
         # 'Crop'
         # 'Embedding'
-        # 'Upsample'
+        'Upsample' : Upsample.get_converter(opset),
         'SpatialBN': BatchNorm.get_converter(opset),
 
         # defs/generator
@@ -572,14 +704,14 @@ def _get_convert_map(opset):
         'Pow': Renamer('broadcast_pow'),
         'PRelu': Prelu.get_converter(opset),
         'Sigmoid': Renamer('sigmoid'),
-        # 'HardSigmoid'
-        # 'Max' : this is the elemwise maximum
-        # 'Min' : this is the elemwise minimum
+        'HardSigmoid': HardSigmoid.get_converter(opset),
+        'Max': Maximum.get_converter(opset),
+        'Min': Minimum.get_converter(opset),
         'Sum': Sum.get_converter(opset),
-        # 'Mean'
+        'Mean': Mean.get_converter(opset),
         'Clip': AttrCvt('clip', transforms={'min': 'a_min', 'max': 'a_max'}),
         # softmax default axis is different in onnx
-        'Softmax': AttrCvt('softmax', {'axis': ('axis', 1)}),
+        'Softmax': Softmax.get_converter(opset),
         'LogSoftmax': AttrCvt('log_softmax', {'axis': ('axis', 1)}),
         # 'Hardmax'
         'Softsign': Softsign.get_converter(opset),
@@ -602,24 +734,24 @@ def _get_convert_map(opset):
         'LRN': LRN.get_converter(opset),
 
         # defs/reduction
-        'ReduceMax': AttrCvt('max', {'axes', 'axis'}),
-        'ReduceMin': AttrCvt('min', {'axes', 'axis'}),
-        'ReduceSum': AttrCvt('sum', {'axes', 'axis'}),
-        # 'ReduceMean'
+        'ReduceMax': AttrCvt('max', {'axes': 'axis'}),
+        'ReduceMin': AttrCvt('min', {'axes': 'axis'}),
+        'ReduceSum': AttrCvt('sum', {'axes': 'axis'}),
+        'ReduceMean': AttrCvt('mean', {'axes': 'axis'}),
         # 'ReduceProd'
         # 'ReduceLogSumExp'
-        # 'ArgMax'
-        # 'ArgMin'
+        'ArgMax': ArgMax.get_converter(opset),
+        'ArgMin': ArgMin.get_converter(opset),
 
         # defs/tensor
         'Cast': Cast.get_converter(opset),
         'Reshape': Reshape.get_converter(opset),
         'Concat': Renamer('concatenate'),
-        'Split': AttrCvt('split', {'split': 'indices_or_sections'}),
+        'Split': Split.get_converter(opset),
         'Slice': Slice.get_converter(opset),
         'Transpose': AttrCvt('transpose', {'perm': 'axes'}),
         'Gather': Gather.get_converter(opset),
-        'Squeeze': Renamer('squeeze'),
+        'Squeeze': AttrCvt('squeeze', {'axes': 'axis'}),
         'Unsqueeze': Unsqueeze.get_converter(opset),
         'Pad': Pad.get_converter(opset),
         'Shape': Shape.get_converter(opset),
diff --git a/nnvm/python/nnvm/frontend/onnx_caffe2_utils.py b/nnvm/python/nnvm/frontend/onnx_caffe2_utils.py
new file mode 100644
index 000000000000..4dfc366d0b6f
--- /dev/null
+++ b/nnvm/python/nnvm/frontend/onnx_caffe2_utils.py
@@ -0,0 +1,46 @@
+"""Util functions shared by the ONNX and Caffe2 frontends."""
+from __future__ import absolute_import as _abs
+from nnvm import graph as _graph
+from nnvm.compiler import graph_util
+
+
+def dimension_picker(prefix, surfix=''):
+    def _impl(attr):
+        kernel = attr['kernel_shape']
+        if len(kernel) == 2:
+            return prefix + '2d' + surfix
+        else:
+            raise NotImplementedError("Only 2d kernel supported.")
+
+    return _impl
+
+
+def dimension_constraint():
+    def _dim_check(attrs):
+        if len(attrs['kernel_shape']) == 2:
+            return True
+        return False
+
+    return _dim_check, "Only 2d kernel supported."
+
+
+def infer_channels(inputs, params, transpose=False):
+    """A hack for getting 'channels' or 'units' since caffe2 don't provide
+    these attributes. We check the shape of weights provided to get the number.
+    """
+    g = _graph.create(inputs)
+    shape_dict = {k: v.shape for k, v in params.items()}
+    _, out_shapes = graph_util.infer_shape(g, **shape_dict)
+    channels = out_shapes[0][0] if not transpose else out_shapes[0][1]
+    return channels
+
+
+def revert_caffe2_pad(pads):
+    """Caffe2 require two times the normal padding."""
+    if len(pads) == 4:
+        pads = pads[:2]
+    elif len(pads) == 2:
+        pass
+    else:
+        raise ValueError("Invalid caffe2 type padding: {}".format(pads))
+    return pads
diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py
index d761e34c7c59..47aca3816e6f 100644
--- a/nnvm/python/nnvm/frontend/tensorflow.py
+++ b/nnvm/python/nnvm/frontend/tensorflow.py
@@ -9,7 +9,7 @@
 import tvm
 from .. import symbol as _sym
 from .. import graph as _graph
-from .. compiler import graph_util
+from .. compiler import graph_util, build_module
 from .common import get_nnvm_op, AttrConverter as AttrConvert
 
 __all__ = ['from_tensorflow']
@@ -35,6 +35,8 @@ def __call__(self, inputs, attrs, *args):
         self._ignores.append('use_cudnn_on_gpu')
         self._ignores.append('_node_name')
         self._ignores.append('is_training')
+        self._ignores.append('_target_layout')
+        self._ignores.append('_input_0d_mismatch')
         # Retain the names
         try:
             attrs['name'] = attrs['_node_name']
@@ -109,11 +111,6 @@ def _elemwise(name):
     def _impl(inputs, attr, *args):
         assert len(inputs) == 2, "Math op take 2 inputs, {} given".format(len(inputs))
         op_name = _math_name_picker(name)(attr)
-        axis = int(attr.get('axis', 0))
-        conv_ops = ["conv2d", "conv2d_transpose"]
-        if op_name == 'broadcast_add' and inputs[0].attr('op_name') in conv_ops:
-            # TODO: remove hard coded infershape
-            inputs[1] = _sym.expand_dims(inputs[1], axis=axis, num_newaxis=2)
         return get_nnvm_op(op_name)(*inputs)
     return _impl
 
@@ -121,19 +118,27 @@ def _pooling(name):
     def _impl(inputs, attr, params):
 
         attr['data_format'] = attr['data_format'].decode("utf-8")
+        flip_layout = False
+
+        input_shape = attr['_input_shapes'][inputs[0]][0]
 
         if attr['data_format'] == 'NHWC':
             attr['kernel_shape'] = (attr['ksize'][1], attr['ksize'][2])
+            attr['strides'] = (attr['strides'][1], attr['strides'][2])
         elif attr['data_format'] == 'NCHW':
             attr['kernel_shape'] = (attr['ksize'][2], attr['ksize'][3])
+            attr['strides'] = (attr['strides'][2], attr['strides'][3])
         else:
             raise TypeError("Unsupported data_format type : {}".format(attr['data_format']))
 
-        # Fix strides
-        attr['strides'] = (attr['strides'][1], attr['strides'][2])
+        if attr['_target_layout'] == "NCHW" and attr['data_format'] == "NHWC":
+            tmp_shape = attr['_input_shapes'][inputs[0]][0]
+            input_shape = [tmp_shape[ii] for ii in (0, 3, 1, 2)]
+            inputs[0] = _sym.transpose(inputs[0], axes=(0, 3, 1, 2))
+            attr['data_format'] = "NCHW"
+            flip_layout = True
 
         # Fix padding
-        input_shapes = attr['_input_shapes'][inputs[0]]
         attr['padding'] = attr['padding'].decode("utf-8")
 
         if attr['padding'] == 'VALID':
@@ -142,11 +147,11 @@ def _impl(inputs, attr, params):
             stride_h, stride_w = attr['strides']
             kernel_h, kernel_w = attr['kernel_shape']
             if attr['data_format'] == 'NHWC':
-                in_h = input_shapes[0][1]
-                in_w = input_shapes[0][2]
+                in_h = input_shape[1]
+                in_w = input_shape[2]
             else:
-                in_h = input_shapes[0][2]
-                in_w = input_shapes[0][3]
+                in_h = input_shape[2]
+                in_w = input_shape[3]
 
             pad_v = _get_pad_pair(in_h, kernel_h, stride_h)
             pad_h = _get_pad_pair(in_w, kernel_w, stride_w)
@@ -158,7 +163,7 @@ def _impl(inputs, attr, params):
         if name == "avg_pool":
             attr['count_include_pad'] = False
 
-        return AttrCvt(
+        out = AttrCvt(
             op_name=_dimension_picker(name),
             transforms={
                 'kernel_shape':'pool_size',
@@ -166,36 +171,66 @@ def _impl(inputs, attr, params):
             ignores=['ksize'],
             extras={'ceil_mode': False},
             custom_check=_dimension_constraint())(inputs, attr)
+
+        if flip_layout:
+            out = _sym.transpose(out, axes=(0, 2, 3, 1))
+
+        return out
     return _impl
 
 def _conv(opname):
     def _impl(inputs, attr, params):
         attr['data_format'] = attr['data_format'].decode("utf-8")
-        input_shapes = attr['_input_shapes'][inputs[0]]
+        flip_layout = False
+
+        # NCHW Layout require weights transpose
+        if attr['data_format'] == 'NCHW':
+            tmp_shape = attr['_input_shapes'][inputs[1]][0]
+            tmp_shape = [tmp_shape[ii] for ii in (3, 2, 0, 1)]
+            inputs[1] = _sym.transpose(inputs[1], axes=(3, 2, 0, 1))
+            attr['_input_shapes'][inputs[1]] = [tmp_shape]
+
+        input_shape = attr['_input_shapes'][inputs[0]][0]
+        weights_shape = attr['_input_shapes'][inputs[1]][0]
+
+        if attr['_target_layout'] == "NCHW" and attr['data_format'] == "NHWC":
+            input_shape = [input_shape[ii] for ii in (0, 3, 1, 2)]
+            inputs[0] = _sym.transpose(inputs[0], axes=(0, 3, 1, 2))
+            if opname == 'conv':
+                weights_shape = [weights_shape[ii] for ii in (3, 2, 0, 1)]
+                inputs[1] = _sym.transpose(inputs[1], axes=(3, 2, 0, 1))
+            else:
+                weights_shape = [weights_shape[ii] for ii in (2, 3, 0, 1)]
+                inputs[1] = _sym.transpose(inputs[1], axes=(2, 3, 0, 1))
 
-        # Extract kernel shape from params
-        conv_param_weights = params[inputs[1].list_output_names()[0]]
+            attr['data_format'] = "NCHW"
+            attr['strides'] = [attr['strides'][ii] for ii in (0, 3, 1, 2)]
+            flip_layout = True
 
         if attr['data_format'] == 'NHWC':
-            kernel_h, kernel_w, _, depth_mult = conv_param_weights.shape
-            attr['kernel_shape'] = (conv_param_weights.shape[0], conv_param_weights.shape[1])
+            kernel_h, kernel_w, _, depth_mult = weights_shape
+            attr['kernel_shape'] = (weights_shape[0], weights_shape[1])
             if opname == 'conv':
-                attr['channels'] = conv_param_weights.shape[3]
+                attr['channels'] = weights_shape[3]
             else:
-                attr['channels'] = input_shapes[0][3] * depth_mult
+                attr['channels'] = input_shape[3] * depth_mult
 
             if 'dilations' in attr:
-                attr['dilations'] = (attr['dilations'][0], attr['dilations'][1])
+                attr['dilations'] = (attr['dilations'][1], attr['dilations'][2])
+            attr['strides'] = (attr['strides'][1], attr['strides'][2])
         elif attr['data_format'] == 'NCHW':
-            depth_mult, _, kernel_h, kernel_w = conv_param_weights.shape
-            attr['kernel_shape'] = (conv_param_weights.shape[2], conv_param_weights.shape[3])
+            depth_mult, _, kernel_h, kernel_w = weights_shape
+            attr['kernel_shape'] = (weights_shape[2], weights_shape[3])
             if opname == 'conv':
-                attr['channels'] = conv_param_weights.shape[1]
+                attr['channels'] = weights_shape[0]
             else:
-                attr['channels'] = input_shapes[0][1] * depth_mult
+                attr['channels'] = input_shape[0] * depth_mult
+                if attr['channels'] < 0:
+                    attr['channels'] *= -1
 
             if 'dilations' in attr:
                 attr['dilations'] = (attr['dilations'][2], attr['dilations'][3])
+            attr['strides'] = (attr['strides'][2], attr['strides'][3])
         else:
             raise TypeError("Unsupported data format type : {}".format(attr['data_format']))
 
@@ -203,9 +238,6 @@ def _impl(inputs, attr, params):
         if opname == 'depthwise':
             attr['groups'] = attr['channels']
 
-        # Fix strides
-        attr['strides'] = (attr['strides'][1], attr['strides'][2])
-
         # Fix padding
         attr['padding'] = attr['padding'].decode("utf-8")
 
@@ -215,14 +247,18 @@ def _impl(inputs, attr, params):
             stride_h, stride_w = attr['strides']
             kernel_h, kernel_w = attr['kernel_shape']
             if attr['data_format'] == 'NHWC':
-                in_h = input_shapes[0][1]
-                in_w = input_shapes[0][2]
+                in_h = input_shape[1]
+                in_w = input_shape[2]
             else:
-                in_h = input_shapes[0][2]
-                in_w = input_shapes[0][3]
+                in_h = input_shape[2]
+                in_w = input_shape[3]
 
-            pad_v = _get_pad_pair(in_h, kernel_h, stride_h)
-            pad_h = _get_pad_pair(in_w, kernel_w, stride_w)
+            dilation_h = attr['dilations'][0]
+            dilation_w = attr['dilations'][1]
+            dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
+            dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
+            pad_v = _get_pad_pair(in_h, dilated_kernel_h, stride_h)
+            pad_h = _get_pad_pair(in_w, dilated_kernel_w, stride_w)
 
             if attr['data_format'] == 'NHWC':
                 inputs[0] = _sym.pad(data=inputs[0],
@@ -248,7 +284,7 @@ def _impl(inputs, attr, params):
             else:
                 attr['kernel_layout'] = 'HWOI' if attr['data_format'] == 'NHWC' else 'OIHW'
 
-        return AttrCvt(
+        out = AttrCvt(
             op_name=_dimension_picker('conv'),
             transforms={
                 'kernel_shape': 'kernel_size',
@@ -257,6 +293,11 @@ def _impl(inputs, attr, params):
                 'group': ('groups', 1)},
             extras={'use_bias': len(inputs) == 3},
             custom_check=_dimension_constraint())(inputs, attr)
+
+        if flip_layout:
+            out = _sym.transpose(out, axes=(0, 2, 3, 1))
+
+        return out
     return _impl
 
 def _decode_image():
@@ -270,7 +311,8 @@ def _cast():
     def _impl(inputs, attr, params):
         # Convert from tensorflow Dtype to str
         attr['DstT'] = attr['DstT'].name
-        return AttrCvt(op_name='cast', transforms={'DstT': 'dtype'}, ignores=['SrcT'])(inputs, attr)
+        return AttrCvt(op_name='cast', transforms={'DstT': 'dtype'},
+                       ignores=['SrcT', 'Truncate'])(inputs, attr)
     return _impl
 
 def _expand_dims():
@@ -278,8 +320,7 @@ def _impl(inputs, attr, params):
         dim_input = inputs.pop(1)
         axis = params[dim_input.list_output_names()[0]]
         params.pop(dim_input.list_output_names()[0])
-        return AttrCvt(op_name="expand_dims", ignores=['Tdim'],
-                       extras={'axis': axis.asnumpy()[0]})(inputs, attr)
+        return _expand_dims_0d_aware(inputs[0], attr, axis=axis.asnumpy()[0])
     return _impl
 
 def _resize_bilinear():
@@ -305,7 +346,7 @@ def _matmul():
     def _impl(inputs, attr, params):
         channels = _infer_channels(inputs[1], params, not attr['transpose_b'])
         if attr['transpose_a']:
-            inputs[0] = _sym.transpose(inputs[0], axis(1, 0))
+            inputs[0] = _sym.transpose(inputs[0], axes=(1, 0))
         if not attr['transpose_b']:
             inputs[1] = _sym.transpose(inputs[1], axes=(1, 0))
         return AttrCvt(op_name="dense",
@@ -339,6 +380,14 @@ def _impl(inputs, attr, params):
             extras={'axis': axis.asnumpy()[0]})(inputs, attr)
     return _impl
 
+def _pack():
+    def _impl(inputs, attr, params):
+        axis = int(attr["axis"])
+        inputs_reshaped = [_expand_dims_0d_aware(i, attr, axis=axis, num_newaxis=1) for i in inputs]
+        return _sym.concatenate(*inputs_reshaped, axis=axis, name=attr["_node_name"])
+
+    return _impl
+
 def _reshape():
     def _impl(inputs, attr, params):
         try:
@@ -351,9 +400,19 @@ def _impl(inputs, attr, params):
                 extras={'shape':tuple(shape_arg.asnumpy())},
                 ignores=['Tshape'])(inputs, attr)
         except KeyError:
-            return AttrCvt(
-                op_name="reshape_like",
-                ignores=['Tshape'])(inputs, attr)
+            # Shape operator is already pruned, hence
+            # try to infer shape by precompute prune if possible.
+            if all(in_node in params for in_node in inputs[1].list_input_names()):
+                graph = _graph.create(_sym.Group(inputs[1]))
+                params_pre = {k: params[k] for k in inputs[1].list_input_names()}
+                params_new = build_module._run_graph(graph, params_pre)
+                inputs.pop(1)
+                return AttrCvt(
+                    op_name="reshape",
+                    extras={'shape':tuple(params_new[0].asnumpy().flatten())},
+                    ignores=['Tshape'])(inputs, attr)
+            else:
+                raise RuntimeError("Reshape with dynamic shape input not supported yet.")
     return _impl
 
 def _bias_add():
@@ -373,12 +432,27 @@ def _fused_batch_norm():
     def _impl(inputs, attr, params):
         # Tensorflow: (data, gamma, beta, moving_mean, moving_variance)
         # NNVM:       (data, gamma, beta, moving_mean, moving_varience)
-        return AttrCvt(
-            op_name='batch_norm',
-            transforms={'scale_after_normalization':'scale', 'variance_epsilon':'epsilon'},
-            extras={'axis': 3}, # Fix axis
-            ignores=['data_format'],
-            disables=['momentum'])(inputs, attr)
+        axis = 3
+        need_cast = False
+
+        if 'data_format' in attr:
+            attr['data_format'] = attr['data_format'].decode("utf-8")
+            if attr['data_format'] == 'NCHW':
+                axis = 1
+        if 'U' in attr:
+            need_cast = True
+            inputs[0] = _sym.cast(inputs[0], dtype=attr['U'].name)
+
+        out = AttrCvt(op_name='batch_norm',
+                      transforms={'scale_after_normalization':'scale',
+                                  'variance_epsilon':'epsilon'},
+                      extras={'axis': axis},
+                      ignores=['data_format', 'U'],
+                      disables=['momentum'])(inputs, attr)
+
+        if need_cast:
+            out = _sym.cast(out, dtype=attr['T'].name)
+        return out
     return _impl
 
 def _batch_norm():
@@ -389,10 +463,16 @@ def _impl(inputs, attr, params):
         # (data, gamma, beta, moving_mean, moving_var)
         new_inputs = [inputs[0], inputs[4], inputs[3], inputs[1], inputs[2]]
 
+        axis = 3
+        if 'data_format' in attr:
+            attr['data_format'] = attr['data_format'].decode("utf-8")
+            if attr['data_format'] == 'NCHW':
+                axis = 1
+
         return AttrCvt(
             op_name='batch_norm',
             transforms={'scale_after_normalization':'scale', 'variance_epsilon':'epsilon'},
-            extras={'axis': 3}, # Fix axis
+            extras={'axis': axis},
             ignores=['data_format'],
             disables=['momentum'])(new_inputs, attr)
     return _impl
@@ -404,9 +484,7 @@ def _impl(inputs, attr, params):
 
 def _shape():
     def _impl(inputs, attr, params):
-        # Result of this operator is prominently used by reshape operator.
-        # Just pass the input as it is so that reshape_like can be used there.
-        return inputs[0]
+        return np.array(attr['_input_shapes'][inputs[0]][0], dtype='int32')
     return _impl
 
 def _fill():
@@ -422,7 +500,6 @@ def _impl(inputs, attr, params):
 
 def _lrn():
     def _impl(inputs, attr, params):
-        new_inputs = []
         attr_new = {}
         depth_radius = attr.get('depth_radius', 5)
         size = (depth_radius * 2) + 1
@@ -431,12 +508,14 @@ def _impl(inputs, attr, params):
         attr_new['bias'] = attr.get('bias', 1)
         attr_new['alpha'] = attr.get('alpha', 1) * size
         attr_new['beta'] = attr.get('beta', 0.5)
-        return AttrCvt(op_name='lrn')(new_inputs, attr_new)
+        return AttrCvt(op_name='lrn')(inputs, attr_new)
     return _impl
 
 def _sum():
     def _impl(inputs, attr, params):
         axis = params.pop(inputs[1].list_output_names()[0]).asnumpy()
+        # convert to tuple for preventing invalid parameter format error
+        axis = tuple(axis)
         return AttrCvt(
             op_name='sum',
             extras={'axis': axis},
@@ -494,6 +573,7 @@ def _transform_mask(stride_dim, ellipsis_mask):
             m_begin = [0] * data_dim
             m_end = [0] * data_dim
             m_stride = [0] * data_dim
+            fshape_indices = []
             #Count new axis after ellipsis_mask, consider while applying ellipsis_mask.
             ellipsis_seen = False
             new_axes_after_ellipsis = 0
@@ -518,7 +598,10 @@ def _transform_mask(stride_dim, ellipsis_mask):
                         m_begin[final_index] = 0
                         m_end[final_index] = data_shape[0][final_index]
                         m_stride[final_index] = 1
+                        fshape_indices.append(final_index)
                         final_index += 1
+                elif mask &new_axis_mask:
+                    fshape_indices.append(-1)
                 elif not mask & new_axis_mask:
                     if final_index == len(m_begin):
                         break
@@ -539,28 +622,33 @@ def _transform_mask(stride_dim, ellipsis_mask):
                                                  if begin[index] < 0 else begin[index]
                         m_end[final_index] = begin[index] + 1
                         m_stride[final_index] = 1
+                        fshape_indices.append(-2)
+                    else:
+                        fshape_indices.append(final_index)
+
                     final_index += 1
-            return m_begin, m_end, m_stride
+            return m_begin, m_end, m_stride, fshape_indices
 
+        fshape_indices = None
         if begin_mask or end_mask or ellipsis_mask or new_axis_mask or shrink_axis_mask:
-            begin, end, stride = _transform_mask(stride_dim, ellipsis_mask)
+            begin, end, stride, fshape_indices = _transform_mask(stride_dim, ellipsis_mask)
         out = _sym.strided_slice(inputs[0], begin=begin, end=end, stride=stride)
         out_shape = _infer_out_shapes(out, params)[0]
+        if not fshape_indices:
+            fshape_indices = range(len(out_shape))
 
         #Create final output shape.
         final_output = []
-        out_index = 0
-        index = 0
-        while out_index != len(out_shape):
-            #axis with shrink_axis_mask dimension=1 and it is ignored.
-            mask = 1 << index
-            if (new_axis_mask & mask) and not ellipsis_mask & mask:
+        for gather_index in fshape_indices:
+            if gather_index == -1:
                 final_output.append(1)
-            elif (not mask & shrink_axis_mask) or index >= stride_dim:
-                #Shrink is considered till stride_dim
-                final_output.append(out_shape[out_index])
-                out_index += 1
-            index += 1
+            elif gather_index == -2:
+                pass
+            else:
+                final_output.append(out_shape[gather_index])
+        # Prevent 0-dim tensors which are not accepted by nnvm
+        if not final_output:
+            final_output.append(1)
         return _sym.reshape(out, shape=tuple(final_output))
     return _impl
 
@@ -605,7 +693,7 @@ def _impl(inputs, in_state_c, in_state_h, attr, params):
         ixh = _sym.concatenate(*[in_data, in_state_h], axis=1)
         in_weight = _sym.transpose(in_weight)
         gates = _sym.dense(ixh, in_weight, in_bias, use_bias=True,
-                           units=num_hidden_layers, name="dense")
+                           units=num_hidden_layers)
         gate_list = _sym.split(gates, indices_or_sections=4, axis=1)
         in_gate = _sym.sigmoid(gate_list[0])
         in_transform = _sym.tanh(gate_list[1])
@@ -642,6 +730,124 @@ def _impl(inputs, attr, params):
     return _impl
 
 
+def _transpose():
+    def _impl(inputs, attr, params):
+        # If perm is not specified, axes is left empty,
+        # otherwise its value is get from params
+        param_name = inputs[1].list_output_names()[0]
+        axes = params.get(param_name, tvm.nd.array([])).asnumpy()
+        return _sym.transpose(inputs[0], axes=tuple(axes))
+    return _impl
+
+def _rank():
+    def _impl(inputs, attr, params):
+        input_shapes = attr['_input_shapes'][inputs[0]]
+        assert len(inputs) == 1
+
+        name = attr["_node_name"]
+        params[name] = tvm.nd.array([len(input_shapes[0])])
+        return _sym.Variable(name=name, shape=params[name].shape)
+    return _impl
+
+def _range():
+    def _impl(inputs, attr, params):
+        start = params.pop(inputs[0].list_output_names()[0]).asnumpy()[0]
+        limit = params.pop(inputs[1].list_output_names()[0]).asnumpy()[0]
+        delta = params.pop(inputs[2].list_output_names()[0]).asnumpy()[0]
+
+        name = attr["_node_name"]
+        params[name] = tvm.nd.array([start, limit, delta])
+        return _sym.Variable(name=name, shape=params[name].shape)
+    return _impl
+
+def _elu():
+    def _impl(inputs, attr, params):
+        alpha = 1.0
+        return -alpha * _sym.relu(1 - _sym.exp(inputs[0])) + _sym.relu(inputs[0])
+    return _impl
+
+def _selu():
+    def _impl(inputs, attr, params):
+        alpha = 1.6732632423543772848170429916717
+        gamma = 1.0507009873554804934193349852946
+        return gamma * (-alpha * _sym.relu(1 - _sym.exp(inputs[0])) + _sym.relu(inputs[0]))
+    return _impl
+
+def _mean():
+    def _impl(inputs, attr, params):
+        axis = params.pop(inputs[1].list_output_names()[0])
+        return AttrCvt(op_name="mean", ignores=['Tdim', 'Tidx'],
+                       transforms={'keep_dims': 'keepdims'},
+                       extras={'axis': tuple(axis.asnumpy())})(inputs[0], attr)
+    return _impl
+
+def _broadcast(name):
+    def _impl(inputs, attr, params):
+        op_name = _math_name_picker(name)(attr)
+        return AttrCvt(
+            op_name=op_name,
+            ignores=['name', 'Tidx']
+        )(inputs, attr)
+    return _impl
+
+def _split(has_size_vector):
+    # TF documentation https://www.tensorflow.org/api_docs/python/tf/split
+    def _impl(inputs, attr, params):
+        try:
+            # order and number of inputs are different:
+            # if has_size_vector:
+            #     https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/split-v
+            # else:
+            #     https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/split
+
+            # in addition, `axis` and `num_or_size_splits` can be tensors in TensorFlow,
+            # we can only support constants
+            if has_size_vector:
+                input_node_index = 0
+                input_axis_index = 2
+                size_splits_input_name = inputs[1].list_output_names()[0]
+                size_splits = params[size_splits_input_name].asnumpy()
+                section_beginnings = np.cumsum(size_splits)[:-1]
+                indices_or_sections = tuple(section_beginnings)
+            else:
+                input_node_index = 1
+                input_axis_index = 0
+                indices_or_sections = attr['num_split']
+            input_node = inputs[input_node_index]
+            axis_input_name = inputs[input_axis_index].list_output_names()[0]
+            axis_input_value = params[axis_input_name].asnumpy()[0]
+        except (IndexError, KeyError):
+            raise TypeError( \
+                "Unsupported argument for split: `axis` and `num_or_size_splits` " \
+                "should be constants")
+        return _sym.split(input_node,
+                          indices_or_sections=indices_or_sections,
+                          axis=axis_input_value)
+    return _impl
+
+def _unpack():
+    def _impl(inputs, attr, params):
+        input_node = inputs[0]
+        axis = attr['axis']
+        input_shape = attr['_input_shapes'][input_node][0]
+        axis_length = input_shape[axis]
+        if axis_length < 0:
+            raise TypeError("Unstack with unknown axis length")
+        splitted = _sym.split(input_node,
+                              indices_or_sections=axis_length,
+                              axis=axis,
+                              name=attr.get('_node_name', 'unstack'))
+
+        return _sym.Group([_sym.squeeze(split_item, axis=axis) for split_item in splitted])
+    return _impl
+
+def _expand_dims_0d_aware(data, attr, axis, num_newaxis=1):
+    if data in attr['_input_0d_mismatch']:
+        return data if num_newaxis == 1 else \
+            _sym.expand_dims(data, axis=axis, num_newaxis=num_newaxis-1)
+
+    return _sym.expand_dims(data, axis=axis, num_newaxis=num_newaxis)
+
 # compatible operators that do NOT require any conversion.
 _identity_list = []
 
@@ -657,29 +863,37 @@ def _impl(inputs, attr, params):
     'BatchNormWithGlobalNormalization'  : _batch_norm(),
     'BiasAdd'                           : _bias_add(),
     'Cast'                              : _cast(),
+    'Ceil'                              : AttrCvt('ceil'),
     'CheckNumerics'                     : _check_numerics(),
     'Concat'                            : _concat(),
     'ConcatV2'                          : _concatV2(),
     'Conv2D'                            : _conv('conv'),
     'DecodeJpeg'                        : _decode_image(),
+    'Elu'                               : _elu(),
     'ExpandDims'                        : _expand_dims(),
+    'Floor'                             : AttrCvt('floor'),
     'Identity'                          : _identity(),
     'MatMul'                            : _matmul(),
     'MaxPool'                           : _pooling('max_pool'),
     'Add'                               : _elemwise('add'),
     'Sub'                               : _elemwise('sub'),
     'Mul'                               : _elemwise('mul'),
+    'RealDiv'                           : _elemwise('div'),
     'Maximum'                           : _elemwise('max'),
     'Minimum'                           : _elemwise('min'),
     'Sum'                               : _sum(),
     'Square'                            : _square(),
+    'Pack'                              : _pack(),
+    'LeakyRelu'                         : AttrCvt('leaky_relu'),
     'Relu'                              : AttrCvt('relu'),
     'Reshape'                           : _reshape(),
     'ResizeBilinear'                    : _resize_bilinear(),
+    'Selu'                              : _selu(),
     'Softmax'                           : AttrCvt('softmax', {'axis': ('axis', 1)}),
     'Rsqrt'                             : _rsqrt(),
     'Squeeze'                           : _squeeze(),
     'FusedBatchNorm'                    : _fused_batch_norm(),
+    'FusedBatchNormV2'                  : _fused_batch_norm(),
     'Relu6'                             : _relu6(),
     'DepthwiseConv2dNative'             : _conv('depthwise'),
     'Shape'                             : _shape(),
@@ -690,6 +904,20 @@ def _impl(inputs, attr, params):
     'LRN'                               : _lrn(),
     'Pad'                               : _pad('Pad'),
     'PadV2'                             : _pad('PadV2'),
+    'Range'                             : _range(),
+    'Rank'                              : _rank(),
+    'Transpose'                         : _transpose(),
+    'Tanh'                              : AttrCvt('tanh'),
+    'Mean'                              : _mean(),
+    'Less'                              : _broadcast('less'),
+    'Greater'                           : _broadcast('greater'),
+    'LessEqual'                         : _broadcast('less_equal'),
+    'GreaterEqual'                      : _broadcast('greater_equal'),
+    'Equal'                             : _broadcast('equal'),
+    'NotEqual'                          : _broadcast('not_equal'),
+    'Split'                             : _split(False),
+    'SplitV'                            : _split(True),
+    'Unpack'                            : _unpack(),
 }
 
 # _convert_map_rnn defines maps of rnn operator name to
@@ -885,29 +1113,35 @@ def __init__(self):
         self._output_shapes = {}
         self._num_param = 0
         self._num_rnn_layer = False
+        self._outputs_are_0d = {}
 
-    def from_tensorflow(self, graph):
+    def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None):
         """Construct nnvm nodes from tensorflow  graph definition - GraphDef.
 
         Follow the tensorflow graph definition to parse and convert it to NNVM.
         Some of the assumptions listed below.
 
-            -> First Placeholder or Const node will be considered as graph input.
-            -> Rest all Const nodes are params.
+            -> All Placeholders are considered as graph input.
+            -> All Const nodes are params.
             -> Last node is assumed as graph output.
-            -> _output_shapes : Attribute should present in the tenserflow forzen graph.
+            -> _output_shapes : Graph should be frozen with add_shapes=True.
+                                Or user can pass input shape dictionaly optionally.
             -> DecodeJpeg, ResizeBilinear: These are dummy operators.
                                            Hence user should handle preprocessing outside.
             -> CheckNumerics: No implementation as of now for this.
                               Just copies input to output.
 
-        TODO: Change algorithm to stop treating first 'Const' in a special way.
-
         Parameters
         ----------
         graph : tensorflow graph definition object
             The loaded tensorflow GraphDef
 
+        layout : target layout to be used (Optional)
+            NCHW only supported now to enable NHWC models on GPU.
+
+        shape : Dictionary of input dimensions (Optional)
+            Graph level input shape dictionary.
+
         Returns
         -------
         sym : nnvm.sym.Symbol
@@ -928,13 +1162,14 @@ def from_tensorflow(self, graph):
             raise NotImplementedError( \
                 "The following operators are not implemented: {}".format(missing_operators))
 
+        final_op = None
         # Parse the nodes to re-create TF graph using Symbol API of NNVM
         for node in graph.node:
             # Tensorflow doesn't have seperate list for params extraction.
             # Operator name 'Const' is treated as a parameter to build NNVM params dict.
 
             input_shapes = {}
-
+            input_0d_mismatch = set()
             attr = self._parse_attr(node.attr)
 
             #Variable converted to Const will not have only value attr
@@ -945,17 +1180,23 @@ def from_tensorflow(self, graph):
                         tensor_value.tensor_shape)]
             elif '_output_shapes' in attr:
                 self._output_shapes[node.name] = \
-                    [tensor_util.TensorShapeProtoToList(shape) \
-                    for shape in attr['_output_shapes']]
+                    [tensor_util.TensorShapeProtoToList(tshape) \
+                    for tshape in attr['_output_shapes']]
+            elif shape:
+                # Keep the list indexable to avoid key error.
+                # Actual value will be filled after node creation.
+                self._output_shapes[node.name] = [None]
             else:
                 raise NotImplementedError( \
                     "Please freeze the graph with add_shapes=True")
+            self._outputs_are_0d[node.name] = [ \
+                not tshape if isinstance(tshape, list) else False \
+                for tshape in self._output_shapes[node.name]]
 
             if node.op == "Placeholder":
                 self._nodes[node.name] = _sym.Variable(name=node.name,
                                                        shape=self._output_shapes[node.name][0])
 
-                #input_shapes[self._nodes[node.name]] = self._output_shapes[node.name]
             elif node.op == "Const":
                 # All Const nodes are Param nodes, lets parse
                 self._num_param += 1
@@ -974,42 +1215,80 @@ def from_tensorflow(self, graph):
                 # Pass the node name too in attr
                 attr["_node_name"] = node.name
 
-                #ToDo: Some of the tensorflow operators internaly maintain
-                #execution layers and its output name will the layer number along with
-                #graph node name.eg: Node name:- 'Model/RNN/cell_0/RnnCell', but the
-                #output name will be 'Model/RNN/cell_0/RnnCell:0'. In this case,
-                #the digit has to be ignored.
-                if ":" in node.input[0]:
-                    in_name, _ = node.input[0].split(':')
-                    node.input[0] = in_name
+                # Pass the target layout
+                attr["_target_layout"] = layout
 
                 # Fill shapes for all inputs in a list
-                try:
-                    inputs = [self._nodes[i] for i in node.input]
-                    for i in node.input:
-                        input_shapes[self._nodes[i]] = self._output_shapes[i]
-                    attr['_input_shapes'] = input_shapes
-                except KeyError:
-                    # TODO: Need to find clean way to handle '^CheckNumerics'
-                    pass
+                inputs = []
+                for i in node.input:
+                    # Some TensorFlow operators internally maintain execution layers
+                    # and their output name includes the layer number along with
+                    # graph node name. E.g. the node name is 'Model/RNN/cell_0/RnnCell', but the
+                    # output tensor name is 'Model/RNN/cell_0/RnnCell:0'. In this case,
+                    # the number has to be ignored for single-output nodes.
+                    # On the other hand, for multi-output nodes the number is the output index,
+                    # and the lack of the number implies 0.
+                    tensor_name = i.split(':')
+                    node_name = tensor_name[0]
+                    if node_name in self._nodes:
+                        in_sym = self._nodes[node_name]
+                        if len(in_sym.list_output_names()) > 1:
+                            tensor_slot = int(tensor_name[1]) if len(tensor_name) > 1 else 0
+                            in_sym = in_sym[tensor_slot]
+                            input_shape = self._output_shapes[node_name][tensor_slot]
+                        else:
+                            tensor_slot = 0
+                            input_shape = self._output_shapes[node_name][0]
+                        inputs.append(in_sym)
+                        input_shapes[in_sym] = [input_shape]
+                        # This means the node is 1d in NNVM and 0d in TF.
+                        # See `_expand_dims_0d_aware`.
+                        if self._outputs_are_0d[node_name][tensor_slot] and input_shape:
+                            input_0d_mismatch.add(in_sym)
+                attr['_input_shapes'] = input_shapes
+                attr['_input_0d_mismatch'] = input_0d_mismatch
 
                 inputs = self._fix_extranodes(node.op, attr, inputs)
-
                 op = self._convert_operator(node.op, inputs, attr, graph)
+
+                # Check is op is converted to param
+                if isinstance(op, np.ndarray):
+                    self._params[node.name] = tvm.nd.array(op)
+                    op = _sym.Variable(name=node.name,
+                                       shape=self._params[node.name].shape)
+
                 # Assuming only one output.
                 self._nodes[node.name] = op
-                node_output = op
-
-        # Assume the final node is the output node
-        out = node_output
+                final_op = op
+
+            # Infer shapes if passed explicitely
+            node_output = self._nodes[node.name]
+            if shape:
+                g = _graph.create(node_output)
+                shape_dict = {k: v.shape for k, v in self._params.items()}
+                shape_dict.update(shape)
+                _, out_shapes = graph_util.infer_shape(g, **shape_dict)
+                self._output_shapes[node.name] = out_shapes
+
+        out = []
+        if outputs is None:
+            out.append(final_op)
+        else:
+            for out_name in outputs:
+                if ":" in out_name:
+                    out_name, out_num = out_name.split(":")
+                    out_num = int(out_num)
+                    out.append(self._nodes[out_name][out_num])
+                else:
+                    out.append(self._nodes[out_name])
 
         #Add the RNN outputs also with 'head' nodes of the nnvm graph
         if self._num_rnn_layer:
             out_rnn = _sym.concatenate(*self._out_rnn, axis=0)
-            out = [out, out_rnn]
+            out.append(out_rnn)
 
         if isinstance(out, list):
-            out = _sym.Group(out)
+            out = _sym.Group(out) if len(out) > 1 else out[0]
 
         return out, self._params
 
@@ -1094,9 +1373,9 @@ def _get_attr(self, buf):
             for f in fields:
                 if getattr(x.list, f):
                     if f == "type":
-                        ret = [dtypes.as_dtype(x) for x in list(getattr(x.list, f))]
+                        ret += [dtypes.as_dtype(x) for x in list(getattr(x.list, f))]
                     else:
-                        ret = list(getattr(x.list, f))
+                        ret += list(getattr(x.list, f))
         else:
             for f in fields:
                 if x.HasField(f):
@@ -1203,7 +1482,7 @@ def _fix_extranodes(self, op_name, attr, inputs):
 
         return inputs
 
-def from_tensorflow(graph):
+def from_tensorflow(graph, layout="NHWC", shape=None, outputs=None):
     """  Load tensorflow graph which is a python tensorflow graph object into nnvm graph.
     The companion parameters will be handled automatically.
 
@@ -1221,5 +1500,5 @@ def from_tensorflow(graph):
         Dict of converted parameters stored in tvm.ndarray format
     """
     g = GraphProto()
-    sym, params = g.from_tensorflow(graph)
+    sym, params = g.from_tensorflow(graph, layout, shape, outputs)
     return sym, params
diff --git a/nnvm/python/nnvm/testing/__init__.py b/nnvm/python/nnvm/testing/__init__.py
index bff828d68280..44b8529821d0 100644
--- a/nnvm/python/nnvm/testing/__init__.py
+++ b/nnvm/python/nnvm/testing/__init__.py
@@ -4,10 +4,14 @@
 from .config import ctx_list
 from .utils import create_workload
 from . import mobilenet
+from . import mobilenet_v2
 from . import mlp
 from . import resnet
 from . import vgg
+from . import densenet
 from . import squeezenet
+from . import inception_v3
 from . import dcgan
 from . import dqn
-from . import yolo2_detection
+from . import yolo_detection
+from . import check_computation
diff --git a/nnvm/python/nnvm/testing/check_computation.py b/nnvm/python/nnvm/testing/check_computation.py
new file mode 100644
index 000000000000..7ab4dc0d4c6c
--- /dev/null
+++ b/nnvm/python/nnvm/testing/check_computation.py
@@ -0,0 +1,538 @@
+# pylint: disable=cell-var-from-loop,no-else-return
+"""Helper utilities to check functions and their gradients."""
+from __future__ import absolute_import as _abs
+
+import logging
+import numpy as np
+
+import tvm
+from tvm.contrib import graph_runtime
+from tvm.testing import check_numerical_grads
+
+import nnvm
+from nnvm.compiler import graph_util
+from nnvm.compiler.graph_attr import TCODE_TO_DTYPE, DTYPE_TO_TCODE
+from .config import ctx_list
+
+def infer_shapes_dtypes(graph, shape=None, dtype=None, fallback_dtype=None):
+    """Runs dtype and shape inference passes on a graph and returns the resulting graph
+    along with the inferred information.
+
+    Parameters
+    ----------
+    graph : nnvm.graph.Graph
+        A graph we want to run inference on.
+
+    shape : Dict[str, Tuple[int]] or Tuple[int], optional
+        A dict mapping input variable names to shapes.
+        By default shapes will be inferred from variables' attributes.
+        Note that this parameter takes precedence over variables' attributes.
+
+    dtype : Dict[str, str] or str, optional
+        A dict mapping input variable names to dtypes, or just a single dtype.
+        By default dtypes will be inferred from variables' attributes.
+        Note that this parameter takes precedence over variables' attributes.
+
+    fallback_dtype : str, optional
+        A dtype that will be used for variables whose dtype can't be inferred from other
+        variables' dtypes.
+
+    Returns
+    -------
+    graph : nnvm.graph.Graph
+        The resulting graph with dtype and shape information on its nodes.
+
+    input_shapes : Dict[str, Tuple[int]]
+        The inferred shapes of input variables merged with the `shape` dictionary.
+
+    input_dtypes : Dict[str, str]
+        The inferred dtypes of input variables merged with the `dtype` dictionary.
+
+    output_shapes : List[Tuple[int]]
+        The inferred shapes of outputs.
+
+    output_dtypes : List[str]
+        The inferred dtypes of outputs.
+    """
+    # Preprocess input parameters
+    if shape is None:
+        provided_shapes = {}
+    elif isinstance(shape, dict):
+        provided_shapes = shape
+    else:
+        provided_shapes = {x: shape for x in graph.symbol.list_input_variables()}
+
+    if dtype is None:
+        provided_dtypes = {}
+    elif isinstance(dtype, dict):
+        provided_dtypes = dtype
+    else:
+        provided_dtypes = {x: dtype for x in graph.symbol.list_input_variables()}
+
+    provided_shapes = _dict_var_to_dict_str(provided_shapes)
+    provided_dtypes = _dict_var_to_dict_str(provided_dtypes)
+
+    # The graph may already contain shape and dtype info, so extract it and merge with
+    # the user-specified shapes and dtypes (use the user-specified one on contradiction)
+    preexisting_shapes = graph.json_attr('shape')
+    preexisting_dtypes = graph.json_attr('dtype')
+
+    if preexisting_shapes:
+        for x in graph.index.input_names:
+            if x not in provided_shapes:
+                x_shape = tuple(preexisting_shapes[graph.index.entry_id(x)])
+                provided_shapes[x] = x_shape
+
+    if preexisting_dtypes:
+        for x in graph.index.input_names:
+            if x not in provided_dtypes:
+                x_dtype = TCODE_TO_DTYPE[preexisting_dtypes[graph.index.entry_id(x)]]
+                provided_dtypes[x] = x_dtype
+
+    # Perform inference
+    nnvm.compiler.graph_attr.set_shape_inputs(graph, provided_shapes)
+    nnvm.compiler.graph_attr.set_dtype_inputs(graph, provided_dtypes)
+
+    graph = graph.apply('InferShape').apply('InferType')
+
+    inferred_shapes = graph.json_attr('shape')
+    inferred_dtypes = graph.json_attr('dtype')
+
+    index = graph.index
+
+    output_shapes = [tuple(inferred_shapes[index.entry_id(entry)])
+                     for entry in index.output_entries]
+    output_dtypes = [TCODE_TO_DTYPE[inferred_dtypes[index.entry_id(entry)]]
+                     for entry in index.output_entries]
+
+    # Postprocess the results
+    input_shapes = provided_shapes.copy()
+    input_dtypes = provided_dtypes.copy()
+
+    for x in graph.symbol.list_input_variables():
+        x_name = x.attr('name')
+        x_entry_id = graph.index.entry_id(x_name)
+        input_shapes[x_name] = tuple(inferred_shapes[x_entry_id])
+        input_dtypes[x_name] = TCODE_TO_DTYPE[inferred_dtypes[x_entry_id]]
+
+    # Merge the original user-specified shapes in case some of them are specified for non-existing
+    # variables
+    for x_name, x_shape in provided_shapes.items():
+        x_shape = tuple(x_shape)
+        if input_shapes.get(x_name, x_shape) != x_shape:
+            raise RuntimeError("Inferred shape differs from the provided shape.\n"
+                               "Provided shapes: {}\nInferred shapes: {}"
+                               .format(provided_shapes, input_shapes))
+        else:
+            input_shapes[x_name] = x_shape
+
+    # Merge the original user-specified dtypes
+    for x_name, x_dtype in provided_dtypes.items():
+        if not isinstance(x_dtype, str):
+            x_dtype = TCODE_TO_DTYPE[x_dtype]
+        if input_dtypes.get(x_name, x_dtype) != x_dtype:
+            raise RuntimeError("Inferred dtype differs from the provided dtype.\n"
+                               "Provided dtypes: {}\nInferred dtypes: {}"
+                               .format(provided_dtypes, input_dtypes))
+        else:
+            input_dtypes[x_name] = x_dtype
+
+    # If some dtypes weren't inferred and there is a fallback dtype, assign it to those varibles
+    # and repeat the inference
+    if fallback_dtype is not None and not all(input_dtypes.values()):
+        input_dtypes = {x: input_dtypes[x] if input_dtypes[x] else fallback_dtype
+                        for x in input_dtypes}
+        return infer_shapes_dtypes(graph, input_shapes, input_dtypes, fallback_dtype=None)
+
+    return graph, input_shapes, input_dtypes, output_shapes, output_dtypes
+
+def graph_to_function(graph, target, ctx, shape=None, dtype=None):
+    """Convert a graph to a function taking a keyword args and returning a list of results
+    (both args and results are numpy arrays).
+
+    Example::
+
+        fun = graph_to_function(graph, llvm, cpu(0))
+        [res1, res2] = fun(x=np.zeros((1,2)), y=np.zeros((1,)))
+
+    Parameters
+    ----------
+    graph : nnvm.graph.Graph
+        A graph we want to convert to a function.
+
+    target : str or :any:`tvm.target.Target`
+        The build target
+
+    ctx : TVMContext
+        The context to deploy the module.
+
+    shape : Dict[str, Tuple[int]], optional
+        A dict mapping input variable names to shapes.
+        By default shapes will be inferred from variables' attributes.
+        Note that this parameter takes precedence over variables' attributes.
+
+    dtype : Dict[str, str] or str, optional
+        A dict mapping input variable names to dtypes, or just a single dtype.
+        By default dtypes will be inferred from variables' attributes.
+        Note that this parameter takes precedence over variables' attributes.
+
+    Returns
+    -------
+    function : Callable[..., List[numpy.ndarray]]
+    """
+    # Infer missing shapes and dtypes
+    graph, shape, dtype, output_shapes, output_dtypes = \
+        infer_shapes_dtypes(graph, shape=shape, dtype=dtype)
+
+    if None in dtype.values():
+        raise ValueError("Input variables with no type: {}".format(dtype))
+
+    if not all(shape.values()):
+        raise ValueError("Input variables with no shape: {}".format(shape))
+
+    compute_graph, lib, params = nnvm.compiler.build(graph, target, shape=shape, dtype=dtype)
+    module = graph_runtime.create(compute_graph, lib, ctx)
+
+    if params:
+        module.set_inputs(**params)
+
+    def run(**kwargs):
+        module.run(**kwargs)
+        res = []
+        for i, (o_shape, o_dtype) in enumerate(zip(output_shapes, output_dtypes)):
+            res.append(module.get_output(i, tvm.nd.empty(o_shape, o_dtype)).asnumpy())
+        return res
+
+    return run
+
+def _dict_var_to_dict_str(dictionary):
+    """Convert a Dict[nnvm.Symbol, T] to Dict[str, T]"""
+    if isinstance(dictionary, dict):
+        return {s.attr('name') if isinstance(s, nnvm.symbol.Symbol) else s:
+                dictionary[s] for s in dictionary}
+    else:
+        return dictionary
+
+def check_function(symbol, forward=None, backward=None, grad_input_vars=None,
+                   shape=None, dtype=None, in_range=None, values=None,
+                   exclude_targets=None, only_targets=None,
+                   additional_params=None,
+                   numerical_grads=None, numerical_grads_params=None,
+                   atol=1e-5, rtol=1e-5, quiet=False):
+    """Compute the function and/or its gradients on a random input and raise
+    an exception if the result doesn't match the reference implementation.
+
+    Parameters
+    ----------
+    symbol : nnvm.Symbol
+        A symbol representing the output.
+
+    forward : Callable[..., List[numpy.ndarray]], optional
+        A reference implementation to compare with.
+
+    backward : Callable[..., List[numpy.ndarray] or Dict[str, numpy.ndarray]], optional
+        A reference implementation of gradients. Should also accept head_grads besides
+        normal inputs which is a list of gradients of some scalar wrt the outputs or just a
+        single gradient if there are multiple outputs.
+        Should return either a dict mapping input variable names to the respective
+        gradients or a list of gradients wrt variables from grad_input_vars in
+        exactly the same order (in alphabetical order by default).
+
+    grad_input_vars : List[nnvm.Symbol or str], optional
+        A list of variables with respect to which the gradients will be computed.
+        None (default) means that all input variables will be used in an alphabetical order.
+
+    shape : Dict[nnvm.Symbol or str, Tuple[int]] or Tuple[int], optional
+        A dict mapping input variable names to shapes, or just a single shape.
+        By default shapes will be inferred from variables' attributes (see the Examples).
+        Note that this parameter takes precedence over variables' attributes.
+
+    dtype : Dict[nnvm.Symbol or str, str] or str, optional
+        A dict mapping input variable names to dtypes, or just a single dtype.
+        By default dtypes will be inferred from variables' attributes (see the Examples).
+        If dtypes cannot be inferred for some variables then float32 will be used as a fallback.
+        Note that this parameter takes precedence over variables' attributes.
+
+    in_range : Dict[nnvm.Symbol or str, (float, float)] or (float, float), optional
+        A dict mapping input variable names to ranges or just a single range
+        (the same for all variables). Input values will be generated from
+        uniform distributions on these ranges. `head_grads` can also be
+        assigned a range this way.
+
+    values : Dict[nnvm.Symbol or str, numpy.ndarray], optional
+        A dict explicitly providing values for some variables instead of random generation.
+
+    exclude_targets : Set[str], optional
+        Skip compiling and running anything for these targets.
+
+    only_targets : Set[str], optional
+        Test only for those targets from `ctx_list()` that are also in this set.
+
+    additional_params : dict, optional
+        A dict of additional parameters which will be passed to forward and backward.
+
+    numerical_grads : bool or 'if_possible', optional
+        Whether to additionally check against numerically computed gradients. If 'if_possible' or
+        None is passed (which is the default) then it will try to create a gradient computation
+        graph and then check gradients numerically only if this graph can be created (i.e. if there
+        are some operations with unimplemented gradients, it will just issue a warning).
+        Checking against numerical gradients is done via the `check_numerical_grads` function.
+
+    numerical_grads_params : dict, optional
+        Additional parameters for `check_numerical_grads`.
+
+    atol : float, optional
+        Absolute tolerance for `tvm.testing.assert_allclose`. NOT used for numerical gradients.
+
+    rtol : float, optional
+        Relative tolerance for `tvm.testing.assert_allclose`. NOT used for numerical gradients.
+
+    quiet : bool, optional
+        Don't dump additional information to stdout on failure.
+
+    Examples
+    --------
+    .. code-block:: python
+
+        x = sym.Variable("x", shape=(1, 2))
+        y = sym.Variable("y", shape=(1, 2))
+
+        # check the function and its gradients both numerically and using a reference function
+        check_function(x + 2*y,
+                       lambda x, y: x + 2*y,
+                       lambda x, y, head_grads: {'x': head_grads, 'y': 2*head_grads})
+
+        # just check gradients numerically
+        check_function(x + 2*y, numerical_grads=True)
+
+        # just check the forward computation
+        check_function(x + 2*y, lambda x, y: x + 2*y, numerical_grads=False)
+
+        # specifying dtype
+        check_function(x + 2*y, lambda x, y: x + 2*y, dtype='float64')
+
+        # dtypes can also be specified during variable creation with dtype codes
+        x = sym.Variable("x", dtype=0)
+        check_function(x + 1, shape=(2, 2), numerical_grads=True)
+    """
+    # validate and preprocess the input params
+    if numerical_grads is None and forward is None and backward is None:
+        raise ValueError("No reference function was passed to check_function. If you only want to "
+                         "check gradients numerically, pass numerical_grads=True explicitly.")
+
+    if numerical_grads is None:
+        numerical_grads = 'if_possible'
+
+    if numerical_grads not in [False, True, 'if_possible']:
+        raise ValueError("numerical_grads must be a bool or 'if_possible', not {}"
+                         .format(numerical_grads))
+
+    if additional_params is None:
+        additional_params = {}
+
+    input_vars = symbol.list_input_variables()
+    input_dict = {x.attr('name'): x for x in input_vars}
+
+    if grad_input_vars is None:
+        grad_input_vars = sorted(input_vars, key=lambda x: x.attr('name'))
+    else:
+        grad_input_vars = [input_dict[x] if isinstance(x, str) else x for x in grad_input_vars]
+
+    in_range = _dict_var_to_dict_str(in_range)
+    values = _dict_var_to_dict_str(values)
+
+    out_len = len(symbol.list_output_names())
+
+    # Infer the output shapes and dtypes, and preprocess the shape and dtype params
+    forward_graph, shape, dtype, out_shapes, out_dtypes = \
+        infer_shapes_dtypes(nnvm.graph.create(symbol), shape=shape, dtype=dtype,
+                            fallback_dtype='float32')
+
+    if not all(out_shapes) or not all(out_dtypes):
+        if not quiet:
+            print(forward_graph.ir(join_node_attrs=['shape', 'dtype']))
+        raise ValueError("Could not infer shapes or dtypes for outputs.\n"
+                         "out_shapes = {}\nout_dtypes = {}".format(out_shapes, out_dtypes))
+
+    backward_graph = None
+
+    # If we want gradients, we have to recreate the graph, but now with gradient computations
+    # Note that here we need out_shapes for defining the shape of head grads, so we have to
+    # create the graph twice
+    if backward is not None or numerical_grads:
+        try:
+            head_grads_symbols = [nnvm.symbol.Variable("head_grads_" + str(i),
+                                                       shape=out_shapes[i],
+                                                       dtype=DTYPE_TO_TCODE[out_dtypes[i]])
+                                  for i in range(out_len)]
+            grad_symbols = graph_util.gradients([symbol], grad_input_vars,
+                                                grad_ys=head_grads_symbols)
+            # Sometimes grads do not depend on head_grads, so head_grads does not appear
+            # in the variable list; adding it manually prevents this, making things a bit easier
+            backward_graph = \
+                nnvm.graph.create(nnvm.symbol.Group([symbol] + grad_symbols + head_grads_symbols))
+
+            backward_graph, shape, dtype, out_shapes, out_dtypes = \
+                infer_shapes_dtypes(backward_graph, shape=shape, dtype=dtype,
+                                    fallback_dtype='float32')
+        except nnvm._base.NNVMError as err:
+            if backward is None and numerical_grads == "if_possible":
+                logging.warning("Won't check gradients because: %s", str(err).split('\n', 1)[0])
+                numerical_grads = False
+                backward_graph = None
+            else:
+                raise
+
+    main_graph = backward_graph if backward_graph is not None else forward_graph
+
+    # Generate random data for inputs (including head_grads)
+
+    np_inputs = {}
+
+    for x in main_graph.symbol.list_input_variables():
+        x_name = x.attr('name')
+        x_shape = shape[x_name]
+        x_dtype = dtype[x_name]
+
+        if values is not None and x_name in values:
+            np_inputs[x_name] = values[x_name].astype(x_dtype)
+            continue
+
+        low = -1.0
+        high = 1.0
+        if in_range is not None:
+            if isinstance(in_range, dict):
+                if x_name in in_range:
+                    low = in_range[x_name][0]
+                    high = in_range[x_name][1]
+            else:
+                low = in_range[0]
+                high = in_range[1]
+
+        np_inputs[x_name] = np.random.uniform(size=x_shape, low=low, high=high).astype(x_dtype)
+
+    np_inputs_without_head_grads = {k: np_inputs[k] for k in np_inputs
+                                    if not k.startswith('head_grads_')}
+
+    nothing_was_done = True
+
+    # Compute and compare the results
+    for target, ctx in ctx_list():
+        if exclude_targets is not None:
+            if target in exclude_targets or str(target) in exclude_targets:
+                logging.info("Skipping target = %s, ctx = %s", target, ctx)
+                continue
+        if only_targets is not None:
+            if target not in only_targets and str(target) not in only_targets:
+                logging.info("Skipping target = %s, ctx = %s", target, ctx)
+                continue
+
+        logging.info("Checking computation on target = %s, ctx = %s", target, ctx)
+
+        debug_stage = None
+
+        try:
+            nnvm_res = None
+
+            debug_stage = "compiling"
+            main_function = graph_to_function(main_graph, target, ctx)
+
+            # nnvm_res contains the output and gradients (if they are needed)
+            debug_stage = "running"
+            nnvm_res = main_function(**np_inputs)
+
+            if backward_graph is not None:
+                grad_var_names = [x.attr('name') for x in grad_input_vars]
+                nnvm_grads = {x: v for x, v in zip(grad_var_names, nnvm_res[out_len:])}
+
+            if forward is not None:
+                nothing_was_done = False
+                debug_stage = "checking forward computation"
+                logging.debug(debug_stage)
+
+                params = {}
+                params.update(np_inputs_without_head_grads)
+                params.update(additional_params)
+                numpy_res = forward(**params)
+
+                if isinstance(numpy_res, tuple):
+                    numpy_res = list(numpy_res)
+
+                if not isinstance(numpy_res, list):
+                    numpy_res = [numpy_res]
+
+                if len(numpy_res) != out_len:
+                    raise ValueError("Forward function returned {} values, but "
+                                     "the nnvm graph returns {} values"
+                                     .format(len(numpy_res), out_len))
+
+                for i in range(out_len):
+                    tvm.testing.assert_allclose(nnvm_res[i], numpy_res[i], atol=atol, rtol=rtol)
+
+            if backward is not None:
+                nothing_was_done = False
+                debug_stage = "checking gradients"
+                logging.debug(debug_stage)
+
+                np_head_grads = [np_inputs["head_grads_" + str(i)] for i in range(out_len)]
+
+                if out_len == 1:
+                    np_head_grads = np_head_grads[0]
+
+                params = {'head_grads': np_head_grads}
+                params.update(np_inputs_without_head_grads)
+                params.update(additional_params)
+                numpy_grads = backward(**params)
+
+                if not isinstance(numpy_grads, dict):
+                    if isinstance(numpy_grads, tuple):
+                        numpy_grads = list(numpy_grads)
+                    if not isinstance(numpy_grads, list):
+                        numpy_grads = [numpy_grads]
+                    numpy_grads = {x: v for x, v in zip(grad_var_names, numpy_grads)}
+                    if len(numpy_grads) != len(grad_var_names):
+                        raise ValueError("The backward function returns a list of gradients which "
+                                         "does not contain gradients for these variables: {}"
+                                         .format(set(grad_var_names) - set(numpy_grads)))
+
+                for x_name in numpy_grads:
+                    tvm.testing.assert_allclose(nnvm_grads[x_name], numpy_grads[x_name],
+                                                atol=atol, rtol=rtol)
+
+            if numerical_grads:
+                nothing_was_done = False
+                debug_stage = "checking gradients numerically"
+                logging.debug(debug_stage)
+
+                forward_function = graph_to_function(forward_graph, target, ctx)
+
+                # Since the result may be non-scalar, we have to put another operation on the top,
+                # so we just multiple by the randomly generated head_grads and then sum everything.
+                # This way we can reuse the gradient values which has been already computed.
+                def scalar_function(**kwargs):
+                    res = forward_function(**kwargs)
+                    return np.sum([np.dot(np_inputs['head_grads_' + str(i)].ravel(), res[i].ravel())
+                                   for i in range(out_len)])
+
+                if numerical_grads_params is None:
+                    numerical_grads_params = {}
+
+                check_numerical_grads(
+                    scalar_function,
+                    input_values=np_inputs_without_head_grads,
+                    grad_values=nnvm_grads,
+                    **numerical_grads_params)
+
+        except:
+            if not quiet:
+                print("\ncheck_function failed while {}, here is the main graph"
+                      .format(debug_stage))
+                print(main_graph.ir(join_node_attrs=['shape', 'dtype']))
+                if nnvm_res is not None:
+                    print("Generated inputs:")
+                    print(np_inputs)
+                    print()
+            raise
+
+    if nothing_was_done:
+        logging.warning("Nothing was done in check_function. Check ctx_list().")
diff --git a/nnvm/python/nnvm/testing/config.py b/nnvm/python/nnvm/testing/config.py
index 0eab3e6b3389..bf22ea7e3887 100644
--- a/nnvm/python/nnvm/testing/config.py
+++ b/nnvm/python/nnvm/testing/config.py
@@ -10,5 +10,5 @@ def ctx_list():
     device_list = (device_list.split(",") if device_list
                    else ["llvm", "cuda"])
     device_list = set(device_list)
-    res = [("llvm", tvm.cpu(0)), ("cuda", tvm.gpu(0))]
-    return [x for x in res if x[1].exist and x[0] in device_list]
+    res = [(device, tvm.context(device, 0)) for device in device_list]
+    return [x for x in res if x[1].exist]
diff --git a/nnvm/python/nnvm/testing/darknet.py b/nnvm/python/nnvm/testing/darknet.py
index 362fd3058954..328ad2ae6a10 100644
--- a/nnvm/python/nnvm/testing/darknet.py
+++ b/nnvm/python/nnvm/testing/darknet.py
@@ -55,10 +55,10 @@ def _letterbox_image(img, w_in, h_in):
     imc, imh, imw = img.shape
     if (w_in / imw) < (h_in / imh):
         new_w = w_in
-        new_h = imh * w_in / imw
+        new_h = imh * w_in // imw
     else:
         new_h = h_in
-        new_w = imw * h_in/imh
+        new_w = imw * h_in // imh
     resized = _resize_image(img, new_w, new_h)
     boxed = np.full((imc, h_in, w_in), 0.5, dtype=float)
     _, resizedh, resizedw = resized.shape
@@ -115,8 +115,12 @@ class LAYERTYPE(object):
     NETWORK = 20
     XNOR = 21
     REGION = 22
-    REORG = 23
-    BLANK = 24
+    YOLO = 23
+    REORG = 24
+    UPSAMPLE = 25
+    LOGXENT = 26
+    L2NORM = 27
+    BLANK = 28
 
 class ACTIVATION(object):
     """Darknet ACTIVATION Class constant."""
@@ -182,12 +186,16 @@ class ACTIVATION(object):
     NETWORK,
     XNOR,
     REGION,
+    YOLO,
     REORG,
+    UPSAMPLE,
+    LOGXENT,
+    L2NORM,
     BLANK
 } LAYERTYPE;
 
 typedef enum{
-    SSE, MASKED, LONE, SEG, SMOOTH
+    SSE, MASKED, L1, SEG, SMOOTH, WGAN
 } COSTTYPE;
 
 
@@ -241,18 +249,20 @@ class ACTIVATION(object):
     float shift;
     float ratio;
     float learning_rate_scale;
+    float clip;
     int softmax;
     int classes;
     int coords;
     int background;
     int rescore;
     int objectness;
-    int does_cost;
     int joint;
     int noadjust;
     int reorg;
     int log;
     int tanh;
+    int *mask;
+    int total;
 
     float alpha;
     float beta;
@@ -265,13 +275,17 @@ class ACTIVATION(object):
     float class_scale;
     int bias_match;
     int random;
+    float ignore_thresh;
+    float truth_thresh;
     float thresh;
+    float focus;
     int classfix;
     int absolute;
 
     int onlyforward;
     int stopbackward;
     int dontload;
+    int dontsave;
     int dontloadscales;
 
     float temperature;
@@ -309,6 +323,7 @@ class ACTIVATION(object):
 
     float * delta;
     float * output;
+    float * loss;
     float * squared;
     float * norms;
 
@@ -462,6 +477,7 @@ class ACTIVATION(object):
     int train;
     int index;
     float *cost;
+    float clip;
 } network;
 
 
@@ -491,6 +507,12 @@ class ACTIVATION(object):
 layer make_region_layer(int batch, int w, int h, int n, int classes, int coords);
 layer make_softmax_layer(int batch, int inputs, int groups);
 layer make_rnn_layer(int batch, int inputs, int outputs, int steps, ACTIVATION activation, int batch_normalize, int adam);
+layer make_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes);
+layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int output_filters, int steps, ACTIVATION activation, int batch_normalize);
+layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam);
+layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam);
+layer make_upsample_layer(int batch, int w, int h, int c, int stride);
+layer make_l2norm_layer(int batch, int inputs);
 void free_network(network *net);
 """
                    )
diff --git a/nnvm/python/nnvm/testing/densenet.py b/nnvm/python/nnvm/testing/densenet.py
new file mode 100644
index 000000000000..e97d306af933
--- /dev/null
+++ b/nnvm/python/nnvm/testing/densenet.py
@@ -0,0 +1,49 @@
+"""
+DenseNet, load model from gluon model zoo
+
+Reference:
+Huang, Gao, et al. "Densely Connected Convolutional Networks." CVPR 2017
+"""
+
+from .utils import create_workload
+from ..frontend.mxnet import _from_mxnet_impl
+
+def get_workload(batch_size, num_classes=1000, num_layers=121, dtype="float32"):
+    """Get benchmark workload for mobilenet
+
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+
+    num_classes : int, optional
+        Number of classes
+
+    num_layers : int, optional
+        The number of layers
+
+    dtype : str, optional
+        The data type
+
+    Returns
+    -------
+    net : nnvm.Symbol
+        The computational graph
+
+    params : dict of str to NDArray
+        The parameters.
+    """
+    import mxnet as mx
+    from mxnet.gluon.model_zoo.vision import get_model
+
+    image_shape = (1, 3, 224, 224)
+
+    block = get_model('densenet%d' % num_layers, classes=num_classes, pretrained=False)
+
+    data = mx.sym.Variable('data')
+    sym = block(data)
+    sym = mx.sym.SoftmaxOutput(sym)
+
+    net = _from_mxnet_impl(sym, {})
+
+    return create_workload(net, batch_size, image_shape[1:], dtype)
diff --git a/nnvm/python/nnvm/testing/inception_v3.py b/nnvm/python/nnvm/testing/inception_v3.py
new file mode 100644
index 000000000000..f14daa1ae656
--- /dev/null
+++ b/nnvm/python/nnvm/testing/inception_v3.py
@@ -0,0 +1,255 @@
+"""
+Inception V3, suitable for images with around 299 x 299
+
+Reference:
+Szegedy, Christian, et al. "Rethinking the Inception Architecture for Computer Vision."
+arXiv preprint arXiv:1512.00567 (2015).
+
+Adopted from https://github.com/apache/incubator-mxnet/blob/
+             master/example/image-classification/symbols/inception-v3.py
+"""
+# pylint: disable=invalid-name,missing-docstring,unused-argument
+from .. import symbol as sym
+from .utils import create_workload
+
+def Conv(data, num_filter, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name=None, suffix=''):
+    conv = sym.conv2d(data=data, channels=num_filter, kernel_size=kernel,
+                      strides=stride, padding=pad, use_bias=False,
+                      name='%s%s_conv2d' % (name, suffix))
+    bn = sym.batch_norm(data=conv, name='%s%s_batchnorm' % (name, suffix), epsilon=2e-5)
+    act = sym.relu(data=bn, name='%s%s_relu' % (name, suffix))
+    return act
+
+def Pooling(data, kernel, stride, pad, pool_type, name):
+    if pool_type == 'max':
+        return sym.max_pool2d(data=data, pool_size=kernel, strides=stride, padding=pad, name=name)
+    elif pool_type == 'avg':
+        return sym.avg_pool2d(data=data, pool_size=kernel, strides=stride, padding=pad, name=name,
+                              count_include_pad=True)
+    else:
+        raise ValueError("Invalid pooling type: " + pool_type)
+
+def Inception7A(data,
+                num_1x1,
+                num_3x3_red, num_3x3_1, num_3x3_2,
+                num_5x5_red, num_5x5,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data, num_1x1, name=('%s_conv' % name))
+    tower_5x5 = Conv(data, num_5x5_red, name=('%s_tower' % name), suffix='_conv')
+    tower_5x5 = Conv(tower_5x5, num_5x5, kernel=(5, 5), pad=(2, 2), name=('%s_tower' % name),
+                     suffix='_conv_1')
+    tower_3x3 = Conv(data, num_3x3_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_3x3 = Conv(tower_3x3, num_3x3_1, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name),
+                     suffix='_conv_1')
+    tower_3x3 = Conv(tower_3x3, num_3x3_2, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name),
+                     suffix='_conv_2')
+    pooling = Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool,
+                      name=('%s_pool_%s_pool' % (pool, name)))
+
+    cproj = Conv(pooling, proj, name=('%s_tower_2' % name), suffix='_conv')
+    concat = sym.concatenate(*[tower_1x1, tower_5x5, tower_3x3, cproj],
+                             name='ch_concat_%s_chconcat' % name)
+    return concat
+
+# First Downsample
+def Inception7B(data,
+                num_3x3,
+                num_d3x3_red, num_d3x3_1, num_d3x3_2,
+                pool,
+                name):
+    tower_3x3 = Conv(data, num_3x3, kernel=(3, 3), pad=(0, 0), stride=(2, 2),
+                     name=('%s_conv' % name))
+    tower_d3x3 = Conv(data, num_d3x3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d3x3 = Conv(tower_d3x3, num_d3x3_1, kernel=(3, 3), pad=(1, 1), stride=(1, 1),
+                      name=('%s_tower' % name), suffix='_conv_1')
+    tower_d3x3 = Conv(tower_d3x3, num_d3x3_2, kernel=(3, 3), pad=(0, 0), stride=(2, 2),
+                      name=('%s_tower' % name), suffix='_conv_2')
+    pooling = Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(0, 0), pool_type="max",
+                      name=('max_pool_%s_pool' % name))
+    concat = sym.concatenate(*[tower_3x3, tower_d3x3, pooling], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def Inception7C(data,
+                num_1x1,
+                num_d7_red, num_d7_1, num_d7_2,
+                num_q7_red, num_q7_1, num_q7_2, num_q7_3, num_q7_4,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name))
+    tower_d7 = Conv(data=data, num_filter=num_d7_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d7 = Conv(data=tower_d7, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3),
+                    name=('%s_tower' % name), suffix='_conv_1')
+    tower_d7 = Conv(data=tower_d7, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0),
+                    name=('%s_tower' % name), suffix='_conv_2')
+    tower_q7 = Conv(data=data, num_filter=num_q7_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_1, kernel=(7, 1), pad=(3, 0),
+                    name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_2, kernel=(1, 7), pad=(0, 3),
+                    name=('%s_tower_1' % name), suffix='_conv_2')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_3, kernel=(7, 1), pad=(3, 0),
+                    name=('%s_tower_1' % name), suffix='_conv_3')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_4, kernel=(1, 7), pad=(0, 3),
+                    name=('%s_tower_1' % name), suffix='_conv_4')
+    pooling = Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool,
+                      name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1),
+                 name=('%s_tower_2' % name), suffix='_conv')
+    # concat
+    concat = sym.concatenate(*[tower_1x1, tower_d7, tower_q7, cproj],
+                             name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def Inception7D(data,
+                num_3x3_red, num_3x3,
+                num_d7_3x3_red, num_d7_1, num_d7_2, num_d7_3x3,
+                pool,
+                name):
+    tower_3x3 = Conv(data=data, num_filter=num_3x3_red, name=('%s_tower' % name),
+                     suffix='_conv')
+    tower_3x3 = Conv(data=tower_3x3, num_filter=num_3x3, kernel=(3, 3), pad=(0, 0), stride=(2, 2),
+                     name=('%s_tower' % name), suffix='_conv_1')
+    tower_d7_3x3 = Conv(data=data, num_filter=num_d7_3x3_red, name=('%s_tower_1' % name),
+                        suffix='_conv')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3),
+                        name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0),
+                        name=('%s_tower_1' % name), suffix='_conv_2')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_3x3, kernel=(3, 3), stride=(2, 2),
+                        name=('%s_tower_1' % name), suffix='_conv_3')
+    pooling = Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type=pool, pad=(0, 0),
+                      name=('%s_pool_%s_pool' % (pool, name)))
+    # concat
+    concat = sym.concatenate(*[tower_3x3, tower_d7_3x3, pooling],
+                             name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def Inception7E(data,
+                num_1x1,
+                num_d3_red, num_d3_1, num_d3_2,
+                num_3x3_d3_red, num_3x3, num_3x3_d3_1, num_3x3_d3_2,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name))
+    tower_d3 = Conv(data=data, num_filter=num_d3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d3_a = Conv(data=tower_d3, num_filter=num_d3_1, kernel=(1, 3), pad=(0, 1),
+                      name=('%s_tower' % name), suffix='_mixed_conv')
+    tower_d3_b = Conv(data=tower_d3, num_filter=num_d3_2, kernel=(3, 1), pad=(1, 0),
+                      name=('%s_tower' % name), suffix='_mixed_conv_1')
+    tower_3x3_d3 = Conv(data=data, num_filter=num_3x3_d3_red, name=('%s_tower_1' % name),
+                        suffix='_conv')
+    tower_3x3_d3 = Conv(data=tower_3x3_d3, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1),
+                        name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_3x3_d3_a = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_1, kernel=(1, 3), pad=(0, 1),
+                          name=('%s_tower_1' % name), suffix='_mixed_conv')
+    tower_3x3_d3_b = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_2, kernel=(3, 1), pad=(1, 0),
+                          name=('%s_tower_1' % name), suffix='_mixed_conv_1')
+    pooling = Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool,
+                      name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' % name),
+                 suffix='_conv')
+    # concat
+    concat = sym.concatenate(
+        *[tower_1x1, tower_d3_a, tower_d3_b, tower_3x3_d3_a, tower_3x3_d3_b, cproj],
+        name='ch_concat_%s_chconcat' % name)
+    return concat
+
+
+def get_symbol(num_classes=1000, **kwargs):
+    data = sym.Variable(name="data")
+    # stage 1
+    conv = Conv(data, 32, kernel=(3, 3), stride=(2, 2), name="conv")
+    conv_1 = Conv(conv, 32, kernel=(3, 3), name="conv_1")
+    conv_2 = Conv(conv_1, 64, kernel=(3, 3), pad=(1, 1), name="conv_2")
+    pool = Pooling(data=conv_2, kernel=(3, 3), stride=(2, 2), pool_type="max", pad=(0, 0),
+                   name="pool")
+    # stage 2
+    conv_3 = Conv(pool, 80, kernel=(1, 1), name="conv_3")
+    conv_4 = Conv(conv_3, 192, kernel=(3, 3), name="conv_4")
+    pool1 = Pooling(data=conv_4, kernel=(3, 3), stride=(2, 2), pool_type="max", pad=(0, 0),
+                    name="pool1")
+
+    # stage 3
+    in3a = Inception7A(pool1, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 32, "mixed")
+    in3b = Inception7A(in3a, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 64, "mixed_1")
+    in3c = Inception7A(in3b, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 64, "mixed_2")
+    in3d = Inception7B(in3c, 384,
+                       64, 96, 96,
+                       "max", "mixed_3")
+    # stage 4
+    in4a = Inception7C(in3d, 192,
+                       128, 128, 192,
+                       128, 128, 128, 128, 192,
+                       "avg", 192, "mixed_4")
+    in4b = Inception7C(in4a, 192,
+                       160, 160, 192,
+                       160, 160, 160, 160, 192,
+                       "avg", 192, "mixed_5")
+    in4c = Inception7C(in4b, 192,
+                       160, 160, 192,
+                       160, 160, 160, 160, 192,
+                       "avg", 192, "mixed_6")
+    in4d = Inception7C(in4c, 192,
+                       192, 192, 192,
+                       192, 192, 192, 192, 192,
+                       "avg", 192, "mixed_7")
+    in4e = Inception7D(in4d, 192, 320,
+                       192, 192, 192, 192,
+                       "max", "mixed_8")
+    # stage 5
+    in5a = Inception7E(in4e, 320,
+                       384, 384, 384,
+                       448, 384, 384, 384,
+                       "avg", 192, "mixed_9")
+    in5b = Inception7E(in5a, 320,
+                       384, 384, 384,
+                       448, 384, 384, 384,
+                       "max", 192, "mixed_10")
+    # pool
+    pool = Pooling(data=in5b, kernel=(8, 8), stride=(1, 1), pool_type="avg", pad=(0, 0),
+                   name="global_pool")
+    flatten = sym.flatten(data=pool, name="flatten")
+    fc1 = sym.dense(data=flatten, units=num_classes, name='fc1')
+    softmax = sym.softmax(data=fc1, name='softmax')
+    return softmax
+
+def get_workload(batch_size=1, num_classes=1000,
+                 image_shape=(3, 299, 299), dtype="float32", **kwargs):
+    """Get benchmark workload for InceptionV3
+
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+
+    num_classes : int, optional
+        Number of classes
+
+    image_shape : tuple, optional
+        The input image shape
+
+    dtype : str, optional
+        The data type
+
+    kwargs : dict
+        Extra arguments
+
+    Returns
+    -------
+    net : nnvm.Symbol
+        The computational graph
+
+    params : dict of str to NDArray
+        The parameters.
+    """
+    net = get_symbol(num_classes=num_classes, **kwargs)
+    return create_workload(net, batch_size, image_shape, dtype)
diff --git a/nnvm/python/nnvm/testing/mobilenet_v2.py b/nnvm/python/nnvm/testing/mobilenet_v2.py
new file mode 100644
index 000000000000..dc3c7cd85660
--- /dev/null
+++ b/nnvm/python/nnvm/testing/mobilenet_v2.py
@@ -0,0 +1,51 @@
+"""
+MobileNetV2, load model from gluon model zoo
+
+Reference:
+Inverted Residuals and Linear Bottlenecks:
+Mobile Networks for Classification, Detection and Segmentation
+https://arxiv.org/abs/1801.04381
+"""
+
+from .utils import create_workload
+from ..frontend.mxnet import _from_mxnet_impl
+
+def get_workload(batch_size, num_classes=1000, multiplier=1.0, dtype="float32"):
+    """Get benchmark workload for mobilenet
+
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+
+    num_classes : int, optional
+        Number of classes
+
+    multiplier : tuple, optional
+        The input image shape
+
+    dtype : str, optional
+        The data type
+
+    Returns
+    -------
+    net : nnvm.Symbol
+        The computational graph
+
+    params : dict of str to NDArray
+        The parameters.
+    """
+    import mxnet as mx
+    from mxnet.gluon.model_zoo.vision.mobilenet import MobileNetV2
+
+    image_shape = (1, 3, 224, 224)
+
+    block = MobileNetV2(multiplier=multiplier, classes=num_classes)
+
+    data = mx.sym.Variable('data')
+    sym = block(data)
+    sym = mx.sym.SoftmaxOutput(sym)
+
+    net = _from_mxnet_impl(sym, {})
+
+    return create_workload(net, batch_size, image_shape[1:], dtype)
diff --git a/nnvm/python/nnvm/testing/resnet.py b/nnvm/python/nnvm/testing/resnet.py
index 6de0213679d1..e63ceff7c3f0 100644
--- a/nnvm/python/nnvm/testing/resnet.py
+++ b/nnvm/python/nnvm/testing/resnet.py
@@ -46,18 +46,16 @@ def residual_unit(data, num_filter, stride, dim_match, name, bottle_neck=True):
         Base name of the operators
     """
     if bottle_neck:
-        # the same as https://github.com/facebook/fb.resnet.torch#notes,
-        # a bit difference with origin paper
         bn1 = sym.batch_norm(data=data, epsilon=2e-5, name=name + '_bn1')
         act1 = sym.relu(data=bn1, name=name + '_relu1')
         conv1 = sym.conv2d(
             data=act1, channels=int(num_filter*0.25), kernel_size=(1, 1),
-            strides=(1, 1), padding=(0, 0), use_bias=False, name=name + '_conv1')
+            strides=stride, padding=(0, 0), use_bias=False, name=name + '_conv1')
         bn2 = sym.batch_norm(data=conv1, epsilon=2e-5, name=name + '_bn2')
         act2 = sym.relu(data=bn2, name=name + '_relu2')
         conv2 = sym.conv2d(
             data=act2, channels=int(num_filter*0.25), kernel_size=(3, 3),
-            strides=stride, padding=(1, 1), use_bias=False, name=name + '_conv2')
+            strides=(1, 1), padding=(1, 1), use_bias=False, name=name + '_conv2')
         bn3 = sym.batch_norm(data=conv2, epsilon=2e-5, name=name + '_bn3')
         act3 = sym.relu(data=bn3, name=name + '_relu3')
         conv3 = sym.conv2d(
diff --git a/nnvm/python/nnvm/testing/squeezenet.py b/nnvm/python/nnvm/testing/squeezenet.py
index a445e8cfb7da..eab2cf06fee6 100644
--- a/nnvm/python/nnvm/testing/squeezenet.py
+++ b/nnvm/python/nnvm/testing/squeezenet.py
@@ -98,7 +98,7 @@ def get_symbol(num_classes, version, **kwargs):
 
 def get_workload(batch_size=1, num_classes=1000, version='1.0',
                  image_shape=(3, 224, 224), dtype="float32", **kwargs):
-    """Get benchmark workload for resnet
+    """Get benchmark workload for SqueezeNet
 
     Parameters
     ----------
diff --git a/nnvm/python/nnvm/testing/tf.py b/nnvm/python/nnvm/testing/tf.py
index 0372d7450586..effe19808a59 100644
--- a/nnvm/python/nnvm/testing/tf.py
+++ b/nnvm/python/nnvm/testing/tf.py
@@ -13,6 +13,8 @@
 import tensorflow as tf
 from tensorflow.core.framework import graph_pb2
 
+from tvm.contrib import util
+
 ######################################################################
 # Some helper functions
 # ---------------------
@@ -43,6 +45,32 @@ def ProcessGraphDefParam(graph_def):
             raise TypeError('graph_def must be a GraphDef proto.')
     return graph_def
 
+
+def AddShapesToGraphDef(session, out_node):
+    """ Add shapes attribute to nodes of the graph.
+        Input graph here is the default graph in context.
+
+    Parameters
+    ----------
+    session : tf.Session
+        Tensorflow session
+    out_node : String
+        Final output node of the graph.
+
+    Returns
+    -------
+    graph_def : Obj
+        tensorflow graph definition with shapes attribute added to nodes.
+
+    """
+
+    graph_def = tf.graph_util.convert_variables_to_constants(
+        session,
+        session.graph.as_graph_def(add_shapes=True),
+        [out_node],
+        )
+    return graph_def
+
 class NodeLookup(object):
     """Converts integer node ID's to human readable labels."""
 
@@ -108,7 +136,45 @@ def id_to_string(self, node_id):
             return ''
         return self.node_lookup[node_id]
 
-def get_workload(model_path):
+def get_workload_official(model_url, model_sub_path, temp_dir):
+    """ Import workload from tensorflow official
+
+    Parameters
+    ----------
+    model_url: str
+        URL from where it will be downloaded.
+
+    model_sub_path:
+        Sub path in extracted tar for the ftozen protobuf file.
+
+    temp_dir: TempDirectory
+        The temporary directory object to download the content.
+
+    Returns
+    -------
+    graph_def: graphdef
+        graph_def is the tensorflow workload for mobilenet.
+
+    """
+
+    model_tar_name = os.path.basename(model_url)
+
+    from mxnet.gluon.utils import download
+    temp_path = temp_dir.relpath("./")
+    path_model = temp_path + model_tar_name
+
+    download(model_url, path_model)
+
+    import tarfile
+    if path_model.endswith("tgz") or path_model.endswith("gz"):
+        tar = tarfile.open(path_model)
+        tar.extractall(path=temp_path)
+        tar.close()
+    else:
+        raise RuntimeError('Could not decompress the file: ' + path_model)
+    return temp_path + model_sub_path
+
+def get_workload(model_path, model_sub_path=None):
     """ Import workload from frozen protobuf
 
     Parameters
@@ -116,6 +182,9 @@ def get_workload(model_path):
     model_path: str
         model_path on remote repository to download from.
 
+    model_sub_path: str
+        Model path in the compressed archive.
+
     Returns
     -------
     graph_def: graphdef
@@ -123,18 +192,24 @@ def get_workload(model_path):
 
     """
 
-    repo_base = 'https://github.com/dmlc/web-data/raw/master/tensorflow/models/'
-    model_name = os.path.basename(model_path)
-    model_url = os.path.join(repo_base, model_path)
+    temp = util.tempdir()
+    if model_sub_path:
+        path_model = get_workload_official(model_path, model_sub_path, temp)
+    else:
+        repo_base = 'https://github.com/dmlc/web-data/raw/master/tensorflow/models/'
+        model_name = os.path.basename(model_path)
+        model_url = os.path.join(repo_base, model_path)
 
-    from mxnet.gluon.utils import download
-    download(model_url, model_name)
+        from mxnet.gluon.utils import download
+        path_model = temp.relpath(model_name)
+        download(model_url, path_model)
 
     # Creates graph from saved graph_def.pb.
-    with tf.gfile.FastGFile(os.path.join("./", model_name), 'rb') as f:
+    with tf.gfile.FastGFile(path_model, 'rb') as f:
         graph_def = tf.GraphDef()
         graph_def.ParseFromString(f.read())
         graph = tf.import_graph_def(graph_def, name='')
+        temp.remove()
         return graph_def
 
 #######################################################################
diff --git a/nnvm/python/nnvm/testing/yolo2_detection.py b/nnvm/python/nnvm/testing/yolo_detection.py
similarity index 53%
rename from nnvm/python/nnvm/testing/yolo2_detection.py
rename to nnvm/python/nnvm/testing/yolo_detection.py
index b7744c45cff4..7c600d38db62 100644
--- a/nnvm/python/nnvm/testing/yolo2_detection.py
+++ b/nnvm/python/nnvm/testing/yolo_detection.py
@@ -9,30 +9,22 @@
 from __future__ import division
 import math
 from collections import namedtuple
+from functools import cmp_to_key
 import numpy as np
-from PIL import Image
-from PIL import ImageDraw
-from PIL import ImageFont
-
-def _entry_index(batch, w, h, outputs, classes, coords, location, entry):
-    n = int(location/(w*h))
-    loc = location%(w*h)
-    return batch*outputs + n*w*h*(coords+classes+1) + entry*w*h + loc
 
 Box = namedtuple('Box', ['x', 'y', 'w', 'h'])
-def _get_region_box(x, biases, n, index, i, j, w, h, stride):
-    b = Box(0, 0, 0, 0)
-    b = b._replace(x=(i + x[index + 0*stride]) / w)
-    b = b._replace(y=(j + x[index + 1*stride]) / h)
-    b = b._replace(w=np.exp(x[index + 2*stride]) * biases[2*n] / w)
-    b = b._replace(h=np.exp(x[index + 3*stride]) * biases[2*n+1] / h)
-    return b
-
-def _correct_region_boxes(boxes, n, w, h, netw, neth, relative):
-    new_w, new_h = (netw, (h*netw)/w) if (netw/w < neth/h) else ((w*neth/h), neth)
-    for i in range(n):
-        b = boxes[i]
-        b = boxes[i]
+
+def nms_comparator(a, b):
+    if 'sort_class' in b and b['sort_class'] >= 0:
+        diff = a['prob'][b['sort_class']] - b['prob'][b['sort_class']]
+    else:
+        diff = a['objectness'] - b['objectness']
+    return diff
+
+def _correct_boxes(dets, w, h, netw, neth, relative):
+    new_w, new_h = (netw, (h*netw)//w) if (netw/w < neth/h) else ((w*neth//h), neth)
+    for det in dets:
+        b = det['bbox']
         b = b._replace(x=(b.x - (netw - new_w)/2/netw) / (new_w/netw))
         b = b._replace(y=(b.y - (neth - new_h)/2/neth) / (new_h/neth))
         b = b._replace(w=b.w * netw/new_w)
@@ -42,7 +34,8 @@ def _correct_region_boxes(boxes, n, w, h, netw, neth, relative):
             b = b._replace(w=b.w * w)
             b = b._replace(y=b.y * h)
             b = b._replace(h=b.h * h)
-        boxes[i] = b
+        det['bbox'] = b
+    return dets
 
 def _overlap(x1, w1, x2, w2):
     l1 = x1 - w1/2
@@ -68,75 +61,106 @@ def _box_union(a, b):
 def _box_iou(a, b):
     return _box_intersection(a, b)/_box_union(a, b)
 
-def get_region_boxes(layer_in, imw, imh, netw, neth, thresh, probs,
-                     boxes, relative, tvm_out):
-    "To get the boxes for the image based on the prediction"
-    lw = layer_in.w
-    lh = layer_in.h
-    probs = [[0 for i in range(layer_in.classes + 1)] for y in range(lw*lh*layer_in.n)]
-    boxes = [Box(0, 0, 0, 0) for i in range(lw*lh*layer_in.n)]
-    for i in range(lw*lh):
-        row = int(i / lw)
-        col = int(i % lw)
-        for n in range(layer_in.n):
-            index = n*lw*lh + i
-            obj_index = _entry_index(0, lw, lh, layer_in.outputs, layer_in.classes,
-                                     layer_in.coords, n*lw*lh + i, layer_in.coords)
-            box_index = _entry_index(0, lw, lh, layer_in.outputs, layer_in.classes,
-                                     layer_in.coords, n*lw*lh + i, 0)
-            mask_index = _entry_index(0, lw, lh, layer_in.outputs, layer_in.classes,
-                                      layer_in.coords, n*lw*lh + i, 4)
-            scale = 1 if layer_in.background  else tvm_out[obj_index]
-            boxes[index] = _get_region_box(tvm_out, layer_in.biases, n, box_index, col,
-                                           row, lw, lh, lw*lh)
-            if not layer_in.softmax_tree:
-                max_element = 0
-                for j in range(layer_in.classes):
-                    class_index = _entry_index(0, lw, lh, layer_in.outputs, layer_in.classes,
-                                               layer_in.coords, n*lw*lh + i, layer_in.coords+1+j)
-                    prob = scale*tvm_out[class_index]
-                    probs[index][j] = prob if prob > thresh else 0
-                    max_element = max(max_element, prob)
-                probs[index][layer_in.classes] = max_element
-
-    _correct_region_boxes(boxes, lw*lh*layer_in.n, imw, imh, netw, neth, relative)
-    return boxes, probs
-
-
-def do_nms_sort(boxes, probs, total, classes, thresh):
-    "Does the sorting based on the threshold values"
-    SortableBbox = namedtuple('SortableBbox', ['index_var', 'class_var', 'probs'])
+def _get_box(data, biases, n, location, lw, lh, w, h):
+    bx = (location[2] + data[location[0]][0][location[1]][location[2]]) / lw
+    by = (location[1] + data[location[0]][1][location[1]][location[2]]) / lh
+    bw = np.exp(data[location[0]][2][location[1]][location[2]]) * biases[2*n] / w
+    bh = np.exp(data[location[0]][3][location[1]][location[2]]) * biases[2*n+1] / h
+    return Box(bx, by, bw, bh)
 
-    s = [SortableBbox(0, 0, []) for i in range(total)]
-    for i in range(total):
-        s[i] = s[i]._replace(index_var=i)
-        s[i] = s[i]._replace(class_var=0)
-        s[i] = s[i]._replace(probs=probs)
+def _get_yolo_detections(l, im_shape, net_shape, thresh, relative, dets):
+    data = l['output']
+    active_data_loc = np.asarray(np.where(data[:, 4, :, :] > thresh))
+    before_correct_dets = []
+    for i in range(active_data_loc.shape[1]):
+        location = [active_data_loc[0][i], active_data_loc[1][i], active_data_loc[2][i]]
+        box_b = _get_box(data, l['biases'], np.asarray(l['mask'])[location[0]], location,
+                         data.shape[2], data.shape[3], net_shape[0], net_shape[1])
+        objectness = data[location[0]][4][location[1]][location[2]]
+        classes = l['classes']
+        prob = objectness*data[location[0], 5:5 + 1 + classes, location[1], location[2]]
+        prob[prob < thresh] = 0
+        detection = {}
+        detection['bbox'] = box_b
+        detection['classes'] = classes
+        detection['prob'] = prob
+        detection['objectness'] = objectness
+        before_correct_dets.append(detection)
+    dets.extend(_correct_boxes(before_correct_dets, im_shape[0], im_shape[1],
+                               net_shape[0], net_shape[1], relative))
+    return
 
+def _get_region_detections(l, im_shape, net_shape, thresh, relative, dets):
+    data = l['output']
+    before_correct_dets = []
+    for row in range(data.shape[2]):
+        for col in range(data.shape[3]):
+            for n in range(data.shape[0]):
+                prob = [0]*l['classes']
+                scale = data[n, l['coords'], row, col] if not l['background'] else 1
+                location = [n, row, col]
+                box_b = _get_box(data, l['biases'], n, location,
+                                 data.shape[2], data.shape[3], data.shape[2], data.shape[3])
+                objectness = scale if scale > thresh else 0
+                if objectness:
+                    prob = scale * data[n, l['coords']+1: l['coords']+1+l['classes'],
+                                        row, col]
+                    prob[prob < thresh] = 0
+                detection = {}
+                detection['bbox'] = box_b
+                detection['prob'] = prob
+                detection['objectness'] = objectness
+                before_correct_dets.append(detection)
+    _correct_boxes(before_correct_dets, im_shape[0], im_shape[1],
+                   net_shape[0], net_shape[1], relative)
+    dets.extend(before_correct_dets)
+    return
+
+def fill_network_boxes(net_shape, im_shape,
+                       thresh, relative, tvm_out):
+    dets = []
+    for layer in tvm_out:
+        if layer['type'] == 'Yolo':
+            _get_yolo_detections(layer, im_shape, net_shape, thresh, relative, dets)
+        elif layer['type'] == 'Region':
+            _get_region_detections(layer, im_shape, net_shape, thresh, relative, dets)
+    return dets
+
+def do_nms_sort(dets, classes, thresh):
+    "Does the sorting based on the threshold values"
+    k = len(dets)-1
+    cnt = 0
+    while cnt < k:
+        if dets[cnt]['objectness'] == 0:
+            dets[k], dets[cnt] = dets[cnt], dets[k]
+            k = k - 1
+        else:
+            cnt = cnt + 1
+    total = k+1
     for k in range(classes):
         for i in range(total):
-            s[i] = s[i]._replace(class_var=k)
-        s = sorted(s, key=lambda x: x.probs[x.index_var][x.class_var], reverse=True)
+            dets[i]['sort_class'] = k
+        dets[0:total] = sorted(dets[0:total],
+                               key=cmp_to_key(nms_comparator), reverse=True)
         for i in range(total):
-            if probs[s[i].index_var][k] == 0:
+            if dets[i]['prob'][k] == 0:
                 continue
-            a = boxes[s[i].index_var]
+            a = dets[i]['bbox']
             for j in range(i+1, total):
-                b = boxes[s[j].index_var]
+                b = dets[j]['bbox']
                 if _box_iou(a, b) > thresh:
-                    probs[s[j].index_var][k] = 0
-    return boxes, probs
+                    dets[j]['prob'][k] = 0
 
-def draw_detections(im, num, thresh, boxes, probs, names, classes):
+def draw_detections(im, dets, thresh, names, classes):
     "Draw the markings around the detected region"
-    for i in range(num):
+    for det in dets:
         labelstr = []
         category = -1
         for j in range(classes):
-            if probs[i][j] > thresh:
+            if det['prob'][j] > thresh:
                 if category == -1:
                     category = j
-                labelstr.append(names[j])
+                labelstr.append(names[j] + " " + str(round(det['prob'][j], 4)))
         if category > -1:
             imc, imh, imw = im.shape
             width = int(imh * 0.006)
@@ -145,7 +169,7 @@ def draw_detections(im, num, thresh, boxes, probs, names, classes):
             green = _get_color(1, offset, classes)
             blue = _get_color(0, offset, classes)
             rgb = [red, green, blue]
-            b = boxes[i]
+            b = det['bbox']
             left = int((b.x-b.w/2.)*imw)
             right = int((b.x+b.w/2.)*imw)
             top = int((b.y-b.h/2.)*imh)
@@ -186,6 +210,10 @@ def _draw_label(im, r, c, label, rgb):
                         _set_pixel(im, i+c, j+r, k, val)#rgb[k] * val)
 
 def _get_label(labelstr, rgb):
+    from PIL import Image
+    from PIL import ImageDraw
+    from PIL import ImageFont
+
     text = labelstr
     colorText = "black"
     testDraw = ImageDraw.Draw(Image.new('RGB', (1, 1)))
diff --git a/nnvm/python/nnvm/to_relay.py b/nnvm/python/nnvm/to_relay.py
new file mode 100644
index 000000000000..a168f4fd88d2
--- /dev/null
+++ b/nnvm/python/nnvm/to_relay.py
@@ -0,0 +1,512 @@
+# pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name, unused-argument
+"""Convert an NNVM graph to Relay."""
+import json
+from tvm import relay, nd
+from tvm.relay import op, expr, var
+from tvm.relay.frontend.common import StrAttrsDict
+from tvm.relay.frontend.nnvm_common import _rename
+import numpy
+from .symbol import Symbol
+from .compiler import graph_attr
+from .graph import create as graph_create
+
+def _nn_batch_flatten(children, attrs, odtype='float32'):
+    assert len(children) == 1
+    return op.nn.batch_flatten(children[0])
+
+
+def _dense(children, attrs, odtype='float32'):
+    use_bias = attrs.get_bool('use_bias', True)
+    units = attrs.get_int('units')
+    dense = op.nn.dense(children[0], children[1], units=units)
+    if use_bias:
+        return op.nn.bias_add(dense, children[2])
+    else:
+        return dense
+
+def _nn_softmax(children, attrs, odtype='float32'):
+    assert len(children) == 1
+    axis = attrs.get_int('axis', 1)
+    return op.nn.softmax(children[0], axis)
+
+def _conv2d(children, attrs, odtype='float32'):
+    use_bias = attrs.get_bool('use_bias', False)
+
+    if use_bias:
+        data, weight, bias = children
+    else:
+        data, weight = children
+
+    strides = attrs.get_int_tuple('strides', (1, 1))
+    padding = attrs.get_int_tuple('padding', (0, 0))
+    dilation = attrs.get_int_tuple('dilation', (1, 1))
+    groups = attrs.get_int('groups', 1)
+    data_layout = attrs.get_str('layout', 'NCHW')
+    weight_layout = attrs.get_str('kernel_layout', 'OIHW')
+    out_layout = ''
+    out_dtype = attrs.get_str('out_dtype', '')
+
+    conv_out = op.nn.conv2d(
+        data,
+        weight,
+        strides=strides,
+        padding=padding,
+        dilation=dilation,
+        groups=groups,
+        data_layout=data_layout,
+        weight_layout=weight_layout,
+        out_layout=out_layout,
+        out_dtype=out_dtype)
+
+    if use_bias:
+        return op.nn.bias_add(conv_out, bias)
+    else:
+        return conv_out
+
+
+def _conv2d_transpose(children, attrs, odtype='float32'):
+    use_bias = attrs.get_bool('use_bias', False)
+
+    if use_bias:
+        data, weight, bias = children
+    else:
+        data, weight = children
+
+    strides = attrs.get_int_tuple('strides', (1, 1))
+    padding = attrs.get_int_tuple('padding', (0, 0))
+    dilation = attrs.get_int_tuple('dilation', (1, 1))
+    groups = attrs.get_int('groups', 1)
+    data_layout = attrs.get_str('layout', 'NCHW')
+    weight_layout = attrs.get_str('kernel_layout', 'OIHW')
+    out_dtype = attrs.get_str('out_dtype', '')
+
+    out_conv2d = op.nn.conv2d_transpose(
+        data,
+        weight,
+        strides=strides,
+        padding=padding,
+        dilation=dilation,
+        groups=groups,
+        data_layout=data_layout,
+        weight_layout=weight_layout,
+        out_dtype=out_dtype)
+
+    if use_bias:
+        return op.nn.bias_add(out_conv2d, bias)
+    else:
+        return out_conv2d
+
+
+def _batch_norm(children, attrs, odtype='float32'):
+    data, gamma, beta, moving_mean, moving_view = children
+    axis = attrs.get_int('axis', 1)
+    epsilon = attrs.get_float('epsilon', 1e-05)
+    center = attrs.get_bool('center', True)
+    scale = attrs.get_bool('scale', True)
+
+    return op.nn.batch_norm(
+        data,
+        gamma,
+        beta,
+        moving_mean,
+        moving_view,
+        axis=axis,
+        epsilon=epsilon,
+        center=center,
+        scale=scale)[0]
+
+
+def _max_pool2d(children, attrs, odtype='float32'):
+    assert len(children) == 1
+    data = children[0]
+    pool_size = attrs.get_int_tuple('pool_size', (1, 1))
+    strides = attrs.get_int_tuple('strides', (1, 1))
+    padding = attrs.get_int_tuple('padding', (0, 0))
+    layout = attrs.get_int_tuple('layout', 'NCHW')
+    ceil_mode = attrs.get_bool('ceil_mode', False)
+
+    return op.nn.max_pool2d(
+        data,
+        pool_size=pool_size,
+        strides=strides,
+        padding=padding,
+        layout=layout,
+        ceil_mode=ceil_mode)
+
+
+def _reshape(children, attrs, odtype='float32'):
+    data = children[0]
+    shape = attrs.get_int_list('shape')
+    return op.reshape(data, shape)
+
+
+def _transpose(children, attrs, odtype='float32'):
+    axes = attrs.get_int_list('axes', None)
+    return op.transpose(children[0], axes=axes)
+
+
+def _add(children, attrs, odtype='float32'):
+    if len(children) == 1:
+        left = children[0]
+        scalar = attrs.get_float('scalar')
+        right = relay.const(scalar, dtype=odtype)
+    else:
+        assert len(children) == 2
+        left = children[0]
+        right = children[1]
+
+    return op.add(left, right)
+
+
+def _subtract(children, attrs, odtype='float32'):
+    if len(children) == 1:
+        left = children[0]
+        scalar = attrs.get_float('scalar')
+        right = relay.const(scalar, dtype=odtype)
+    else:
+        assert len(children) == 2
+        left = children[0]
+        right = children[1]
+
+    return op.subtract(left, right)
+
+
+def _rsubtract(children, attrs, odtype='float32'):
+    if len(children) == 1:
+        left = children[0]
+        scalar = attrs.get_float('scalar')
+        right = relay.const(scalar, dtype=odtype)
+    else:
+        assert len(children) == 2
+        left = children[0]
+        right = children[1]
+
+    return op.subtract(right, left)
+
+
+def _multiply(children, attrs, odtype='float32'):
+    if len(children) == 1:
+        left = children[0]
+        scalar = attrs.get_float('scalar')
+        right = relay.const(scalar, dtype=odtype)
+    else:
+        assert len(children) == 2
+        left = children[0]
+        right = children[1]
+
+    return op.multiply(left, right)
+
+
+def _divide(children, attrs, odtype='float32'):
+    if len(children) == 1:
+        left = children[0]
+        scalar = attrs.get_float('scalar')
+        right = relay.const(scalar, dtype=odtype)
+    else:
+        assert len(children) == 2
+        left = children[0]
+        right = children[1]
+
+    return op.divide(left, right)
+
+
+def _rshift(children, attrs, odtype='float32'):
+    if len(children) == 1:
+        left = children[0]
+        scalar = attrs.get_float('scalar')
+        right = relay.const(scalar, dtype='int32')
+    else:
+        assert len(children) == 2
+        left = children[0]
+        right = children[1]
+
+    return op.right_shift(left, right)
+
+
+def _clip(children, attrs, odtype='float32'):
+    a_min = attrs.get_float('a_min')
+    a_max = attrs.get_float('a_max')
+    return op.clip(children[0], a_min, a_max)
+
+
+def _cast(children, attrs, odtype='float32'):
+    data = children[0]
+    dtype = attrs.get_str('dtype')
+    return data.astype(dtype)
+
+
+def _expand_dims(children, attrs, odtype='float32'):
+    data = children[0]
+    axis = attrs.get_int('axis')
+    num_newaxis = attrs.get_int('num_newaxis', 1)
+    return op.transform.expand_dims(data, axis, num_newaxis=num_newaxis)
+
+
+def broadcast_to(children, attrs, odtype='float32'):
+    # TODO(@jroesch) export broadcast to?
+    data = children[0]
+    shape = attrs.get_int_tuple('shape')
+    array = numpy.zeros(shape).astype(odtype)
+    rconst = relay.Constant(nd.array(array))
+    return op.broadcast_to_like(data, rconst)
+
+def _copy(children, attrs, odtype='float32'):
+    return op.copy(children[0])
+
+
+def _global_avg_pool2d(children, attrs, odtype='float32'):
+    data = children[0]
+    layout = attrs.get_str('layout', "NCHW")
+    return op.nn.global_avg_pool2d(data, layout)
+
+
+def _avg_pool2d(children, attrs, odtype='float32'):
+    data = children[0]
+    pool_size = attrs.get_int_tuple('pool_size', (1, 1))
+    strides = attrs.get_int_tuple('strides', (1, 1))
+    padding = attrs.get_int_tuple('padding', (0, 0))
+    layout = attrs.get_str('layout', "NCHW")
+    ceil_mode = attrs.get_bool('ceil_mode', False)
+    count_include_pad = attrs.get_bool('layout', False)
+    return op.nn.avg_pool2d(
+        data,
+        pool_size=pool_size,
+        strides=strides,
+        padding=padding,
+        layout=layout,
+        ceil_mode=ceil_mode,
+        count_include_pad=count_include_pad)
+
+
+def _upsampling(children, attrs, odtype='float32'):
+    scale = attrs.get_int('scale')
+    layout = attrs.get_str('layout', 'NCHW')
+    method = attrs.get_str('method', 'NEAREST_NEIGHBOR')
+    return op.nn.upsampling(
+        children[0],
+        scale=scale,
+        layout=layout,
+        method=method)
+
+
+def _pad(children, attrs, odtype='float32'):
+    pad_value = attrs.get_float('pad_value', 0.0)
+    pad_width = attrs.get_tuple_tuple_int('pad_width')
+    return op.nn.pad(children[0], pad_width, pad_value=pad_value)
+
+def _leaky_relu(children, attrs, odtype='float32'):
+    alpha = attrs.get_float('alpha')
+    return op.nn.leaky_relu(children[0], alpha)
+
+
+def _full_like(children, attrs, odtype='float32'):
+    fill_value = relay.const(attrs.get_float('fill_value'), dtype='float32')
+    return op.full_like(children[0], fill_value)
+
+
+def _greater(children, attrs, odtype='float32'):
+    out_type = attrs.get_str('out_type')
+    if out_type:
+        return op.greater(children[0], children[1]).astype(out_type)
+    else:
+        return op.greater(children[0], children[1])
+
+
+def _greater_equal(children, attrs, odtype='float32'):
+    out_type = attrs.get_str('out_type', None)
+    if out_type:
+        return op.greater_equal(children[0], children[1]).astype(out_type)
+    else:
+        return op.greater_equal(children[0], children[1])
+
+
+def _less(children, attrs, odtype='float32'):
+    out_type = attrs.get_str('out_type', None)
+    if out_type:
+        return op.less(children[0], children[1]).astype(out_type)
+    else:
+        return op.less(children[0], children[1])
+
+
+def _less_equal(children, attrs, odtype='float32'):
+    out_type = attrs.get_str('out_type', None)
+    if out_type:
+        return op.less_equal(children[0], children[1]).astype(out_type)
+    else:
+        return op.less_equal(children[0], children[1])
+
+
+def _strided_slice(children, attrs, odtype='float32'):
+    begin = attrs.get_int_list('begin')
+    end = attrs.get_int_list('end')
+    strides = attrs.get_int_list('strides', None)
+    return op.strided_slice(children[0], begin, end, strides=strides)
+
+
+def _split(children, attrs, odtype='float32'):
+    indices_or_sections = None
+    try:
+        indices_or_sections = attrs.get_int('indices_or_sections', None)
+    except ValueError:
+        indices_or_sections = indices_or_sections or attrs.get_int_tuple(
+            'indices_or_sections')
+
+    axis = attrs.get_int('axis', 0)
+
+    return op.split(children[0], indices_or_sections, axis)
+
+def _squeeze(children, attrs, odtype='float32'):
+    axis = None
+    try:
+        axis = [attrs.get_int('axis', None)]
+    except ValueError:
+        axis = axis or attrs.get_int_tuple('axis', None)
+
+    return op.squeeze(children[0], axis)
+
+def _concatenate(children, attrs, odtype='float32'):
+    axis = attrs.get_int('axis', None)
+    return op.concatenate(children, axis)
+
+
+NNVM_OP_2_RELAY_OP = {
+    'flatten': _nn_batch_flatten,
+    'dense': _dense,
+    'softmax': _nn_softmax,
+    'conv2d': _conv2d,
+    'batch_norm': _batch_norm,
+    'max_pool2d': _max_pool2d,
+    'reshape': _reshape,
+    'transpose': _transpose,
+    # Addition
+    '__add_scalar__': _add,
+    'broadcast_add': _add,
+    'elemwise_add': _add,
+    # Subtraction
+    '__sub_scalar__': _subtract,
+    '__rsub_scalar__': _rsubtract,
+    'broadcast_sub': _subtract,
+    'elemwise_sub': _subtract,
+    # Multiply
+    '__mul_scalar__': _multiply,
+    'broadcast_mul': _multiply,
+    'elemwise_mul': _multiply,
+    # Division
+    '__div_scalar__': _divide,
+    'broadcast_div': _divide,
+    'elemwise_div': _divide,
+    # Negative
+    'negative': _rename("negative"),
+
+    # Comparsion
+    'greater': _greater,
+    'greater_equal': _greater_equal,
+    'less': _less,
+    'less_equal': _less_equal,
+
+    # Activations
+    'sigmoid': _rename('sigmoid'),
+    'relu': _rename('nn.relu'),
+    'exp': _rename('exp'),
+    'log': _rename('log'),
+    'tanh': _rename('tanh'),
+    'leaky_relu': _leaky_relu,
+    'clip': _clip,
+    'round': _rename('round'),
+    'cast': _cast,
+    'expand_dims': _expand_dims,
+    'broadcast_to': broadcast_to,
+    '__rshift_scalar__': _rshift,
+    'copy': _copy,
+    'global_avg_pool2d': _global_avg_pool2d,
+    'avg_pool2d': _avg_pool2d,
+    'conv2d_transpose': _conv2d_transpose,
+    'upsampling': _upsampling,
+    'pad': _pad,
+    'full_like': _full_like,
+    'strided_slice': _strided_slice,
+    'split': _split,
+    'squeeze': _squeeze,
+    'concatenate': _concatenate,
+}
+
+
+def to_relay(graph, shape_dict, dtype_dict, params):
+    """Convert an NNVM graph into the corresponding Relay expression.
+
+    Parameters
+    ----------
+    graph : Graph
+       The input graph.
+
+    shape_dict : dict of str to shape
+       The input shape.
+
+    dtype_dict : dict of str to str/dtype
+       The input shape.
+
+    params : dict of str to array
+        The parameters.
+
+    Returns
+    -------
+    (expr, params) : Tuple[relay.Expr, dict of str to array]
+        The corresponding Relay expression and parameters.
+    """
+    if isinstance(graph, Symbol):
+        graph = graph_create(graph)
+
+    param_shapes = dict((k, params[k].shape) for k in params)
+    shape_dict = shape_dict.copy()
+    shape_dict.update(param_shapes)
+    graph = graph_attr.set_shape_inputs(graph, shape_dict)
+    graph = graph_attr.set_dtype_inputs(graph, dtype_dict)
+    graph = graph.apply(["InferShape", "InferType"])
+    shape = graph.json_attr("shape")
+    dtype = [graph_attr.TCODE_TO_DTYPE[di] for di in graph.json_attr("dtype")]
+    heads = [x[0] for x in json.loads(graph.json())['heads']]
+
+    gidx = graph.index
+    relay_map = {}
+    fn_params = []
+    output_ids = []
+
+    for nid, node in enumerate(gidx.nodes):
+        children = []
+        for i in node['inputs']:
+            child = relay_map[i[0]]
+            if isinstance(child, expr.TupleWrapper):
+                children.append(child[i[1]])
+            else:
+                children.append(child)
+
+        oshape = shape[gidx.entry_id(nid, 0)]
+        odtype = dtype[gidx.entry_id(nid, 0)]
+        attrs = node.get("attrs", {})
+        node_name = node["name"]
+        op_name = node["op"]
+
+        if op_name == "null":
+            v = var(node_name, shape=oshape, dtype=odtype)
+            fn_params.append(v)
+            relay_map[nid] = v
+        else:
+            if nid in heads:
+                output_ids.append(nid)
+
+            if op_name in NNVM_OP_2_RELAY_OP:
+                str_attrs = StrAttrsDict(attrs)
+                call = NNVM_OP_2_RELAY_OP[op_name](children, str_attrs, odtype)
+                relay_map[nid] = call
+            else:
+                raise Exception(
+                    "nnvm.to_relay: unsupported operator: {0}".format(op_name))
+
+    outputs = [relay_map[nid] for nid in output_ids]
+    if len(outputs) == 1:
+        body = outputs[0]
+    else:
+        body = expr.Tuple(outputs)
+
+    func = relay.Function(fn_params, body)
+    return func, params
diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py
index f9a2c2813a04..a37a5d7e071e 100644
--- a/nnvm/python/nnvm/top/nn.py
+++ b/nnvm/python/nnvm/top/nn.py
@@ -1,10 +1,10 @@
-# pylint: disable=invalid-name, unused-argument
+# pylint: disable=invalid-name, unused-argument, missing-docstring, no-else-return
 """Definition of nn ops"""
 from __future__ import absolute_import
 
 import tvm
 import topi
-from topi.util import get_const_int
+from topi.util import get_const_int, get_const_tuple
 from .tensor import _fschedule_broadcast, _fschedule_injective
 from . import registry as reg
 from .registry import OpPattern
@@ -90,37 +90,39 @@ def compute_conv2d(attrs, inputs, _):
     kernel_layout = attrs["kernel_layout"]
     out_dtype = attrs["out_dtype"]
     out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype
-    assert layout == "NCHW" or layout == "NHWC"
+    assert layout in ["NCHW", "NHWC", "NCHW4c"]
     (dilation_h, dilation_w) = dilation
     if dilation_h < 1 or dilation_w < 1:
         raise ValueError("dilation should be positive value")
-    elif dilation == (1, 1):
-        kernel = inputs[1]
-    elif layout == "NCHW":
-        kernel = topi.nn.dilate(inputs[1], [1, 1, dilation_h, dilation_w])
-    else: #layout == NHWC
-        kernel = topi.nn.dilate(inputs[1], [1, dilation_h, dilation_w, 1])
 
-    if groups == 1:
+    if groups == 1 and layout == 'NCHW4c' and inputs[0].dtype == 'int8':
+        # pylint: disable=assignment-from-no-return
+        out = topi.nn.conv2d(inputs[0], inputs[1], strides, padding,
+                             dilation, layout, out_dtype=out_dtype)
+        # pylint: enable=assignment-from-no-return
+    elif groups == 1:
         out = topi.nn.conv2d(
-            inputs[0], kernel, strides, padding, layout, out_dtype=out_dtype)
+            inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype=out_dtype)
     elif layout == "NCHW" and \
          groups == get_const_int(inputs[0].shape[1]) and \
          groups == channels:
         out = topi.nn.depthwise_conv2d_nchw(
-            inputs[0], kernel, strides, padding, out_dtype=out_dtype)
+            inputs[0], inputs[1], strides, padding, dilation, out_dtype=out_dtype)
+    elif layout in ["NCHW", "NCHW4c"]:
+        out = topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding, dilation, groups,
+                                        out_dtype=out_dtype)
     elif layout == "NHWC" and \
          kernel_layout == "HWOI" and \
          groups == get_const_int(inputs[0].shape[3]) and \
          groups == channels:
         out = topi.nn.depthwise_conv2d_nhwc(
-            inputs[0], kernel, strides, padding, out_dtype=out_dtype)
+            inputs[0], inputs[1], strides, padding, dilation, out_dtype=out_dtype)
     else:
         raise ValueError("not support arbitrary group number for now")
 
     if attrs.get_bool("use_bias"):
         bias = inputs[2]
-        expand_axis = 1 if layout == "NCHW" else 0
+        expand_axis = 1 if layout in ["NCHW", "NCHW4c"] else 0
         bias = topi.expand_dims(bias, axis=expand_axis, num_newaxis=2)
         out = topi.add(out, bias)
     return out
@@ -136,12 +138,16 @@ def schedule_conv2d(attrs, outs, target):
     with tvm.target.create(target):
         if groups == 1 and layout == "NCHW":
             return topi.generic.schedule_conv2d_nchw(outs)
+        elif groups == 1 and layout == "NCHW4c":
+            return topi.generic.schedule_conv2d_nchw(outs)
         elif groups == 1 and layout == "NHWC":
             return topi.generic.schedule_conv2d_nhwc(outs)
         elif groups == channels and layout == "NCHW":
             return topi.generic.schedule_depthwise_conv2d_nchw(outs)
         elif groups == channels and layout == "NHWC" and kernel_layout == "HWOI":
             return topi.generic.schedule_depthwise_conv2d_nhwc(outs)
+        elif layout in ["NCHW", "NCHW4c"]:
+            return topi.generic.schedule_group_conv2d_nchw(outs)
         else:
             raise ValueError("No compatible schedule")
 
@@ -158,16 +164,25 @@ def compute_contrib_conv2d_NCHWc(attrs, inputs, _):
     padding = attrs.get_int_tuple("padding")
     strides = attrs.get_int_tuple("strides")
     dilation = attrs.get_int_tuple("dilation")
-    kh, kw = attrs.get_int_tuple('kernel_size')
+    out_channel = attrs.get_int("channels")
     groups = attrs.get_int("groups")
-    channels = attrs.get_int("channels")
     layout = attrs.get_string("layout")
     out_layout = attrs.get_string("out_layout")
+    out_dtype = attrs.get_string("out_dtype")
+    out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype
+    if layout == "NCHW":
+        _, in_channel, _, _ = get_const_tuple(inputs[0].shape)
+    else:
+        _, in_channel_chunk, _, _, in_channel_block = get_const_tuple(inputs[0].shape)
+        in_channel = in_channel_chunk * in_channel_block
     assert dilation == (1, 1), "not support dilate now"
     if groups == 1:
         # pylint: disable=assignment-from-no-return
-        out = topi.nn.conv2d_NCHWc(inputs[0], inputs[1], channels, (kh, kw),
-                                   strides, padding, layout, out_layout)
+        out = topi.nn.conv2d_NCHWc(inputs[0], inputs[1], strides, padding, dilation,
+                                   layout, out_layout, out_dtype)
+    elif groups == in_channel and groups == out_channel:
+        out = topi.nn.depthwise_conv2d_NCHWc(inputs[0], inputs[1], strides, padding,
+                                             dilation, layout, out_layout, out_dtype)
         # pylint: enable=assignment-from-no-return
     else:
         raise ValueError("not support arbitrary group number > 1 for now")
@@ -181,16 +196,12 @@ def compute_contrib_conv2d_NCHWc(attrs, inputs, _):
 def schedule_contrib_conv2d_NCHWc(attrs, outs, target):
     """Schedule definition of conv2d NCHWc"""
     groups = attrs.get_int("groups")
-    kh, kw = attrs.get_int_tuple('kernel_size')
-    oc = attrs.get_int("channels")
-    padding = attrs.get_int_tuple("padding")
-    strides = attrs.get_int_tuple("strides")
-    layout = attrs.get_string("layout")
-    out_layout = attrs.get_string("out_layout")
+    out_channel = attrs.get_int("channels")
     with tvm.target.create(target):
         if groups == 1:
-            return topi.generic.schedule_conv2d_NCHWc(oc, (kh, kw), strides, padding,
-                                                      layout, out_layout, outs)
+            return topi.generic.schedule_conv2d_NCHWc(outs)
+        elif groups == out_channel:
+            return topi.generic.schedule_depthwise_conv2d_NCHWc(outs)
         else:
             raise ValueError("not support group number > 1 for now")
 
@@ -225,7 +236,7 @@ def compute_contrib_conv2d_winograd_without_weight_transform(attrs, inputs, _):
 
     # pylint: disable=assignment-from-no-return
     out = topi.nn.conv2d_winograd_without_weight_transform(
-        inputs[0], inputs[1], strides, padding, layout, out_dtype,
+        inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype,
         tile_size)
 
     if attrs.get_bool("use_bias"):
@@ -280,20 +291,22 @@ def schedule_conv2d_transpose(attrs, outs, target):
 
 # max_pool2d
 @reg.register_schedule("max_pool2d")
-def schedule_max_pool2d(_, outs, target):
+def schedule_max_pool2d(attrs, outs, target):
     """Schedule definition of max_pool2d"""
+    layout = attrs["layout"]
     with tvm.target.create(target):
-        return topi.generic.schedule_pool(outs)
+        return topi.generic.schedule_pool(outs, layout)
 
 reg.register_pattern("max_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE)
 
 
 # avg_pool2d
 @reg.register_schedule("avg_pool2d")
-def schedule_avg_pool2d(_, outs, target):
+def schedule_avg_pool2d(attrs, outs, target):
     """Schedule definition of avg_pool2d"""
+    layout = attrs["layout"]
     with tvm.target.create(target):
-        return topi.generic.schedule_pool(outs)
+        return topi.generic.schedule_pool(outs, layout)
 
 reg.register_pattern("avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE)
 
diff --git a/nnvm/python/nnvm/top/reduction.py b/nnvm/python/nnvm/top/reduction.py
index fd8e2f8df56e..aef6e1dcc4a8 100644
--- a/nnvm/python/nnvm/top/reduction.py
+++ b/nnvm/python/nnvm/top/reduction.py
@@ -49,3 +49,11 @@ def _compute(attrs, inputs, out_info):
 # argmin
 reg.register_pattern("argmin", OpPattern.COMM_REDUCE)
 reg.register_schedule("argmin", _fschedule_reduce)
+
+# mean
+reg.register_pattern("mean", OpPattern.COMM_REDUCE)
+reg.register_schedule("mean", _fschedule_reduce)
+
+# product
+reg.register_pattern("prod", OpPattern.COMM_REDUCE)
+reg.register_schedule("prod", _fschedule_reduce)
diff --git a/nnvm/python/nnvm/top/transform.py b/nnvm/python/nnvm/top/transform.py
index facb345c1abe..8fde9632a8af 100644
--- a/nnvm/python/nnvm/top/transform.py
+++ b/nnvm/python/nnvm/top/transform.py
@@ -2,6 +2,7 @@
 """Tensor transformation ops"""
 from __future__ import absolute_import
 
+import tvm
 import topi
 from .tensor import _fschedule_broadcast, _fschedule_injective
 from . import registry as reg
@@ -58,8 +59,13 @@ def compute_reshape_like(attrs, inputs, out_info):
 reg.register_schedule("squeeze", _fschedule_injective)
 
 # concatenate
+@reg.register_schedule("concatenate")
+def schedule_concatenate(_, outs, target):
+    """Schedule definition of concatenate"""
+    with tvm.target.create(target):
+        return topi.generic.schedule_concatenate(outs)
+
 reg.register_pattern("concatenate", OpPattern.INJECTIVE)
-reg.register_schedule("concatenate", _fschedule_injective)
 
 # split
 reg.register_pattern("split", OpPattern.INJECTIVE)
@@ -80,3 +86,7 @@ def compute_reshape_like(attrs, inputs, out_info):
 # where
 reg.register_pattern("where", OpPattern.INJECTIVE)
 reg.register_schedule("where", _fschedule_injective)
+
+# gather_nd
+reg.register_pattern("gather_nd", OpPattern.INJECTIVE)
+reg.register_schedule("gather_nd", _fschedule_injective)
diff --git a/nnvm/src/c_api/c_api_error.cc b/nnvm/src/c_api/c_api_error.cc
index 399268667ddd..fd91bfb8b306 100644
--- a/nnvm/src/c_api/c_api_error.cc
+++ b/nnvm/src/c_api/c_api_error.cc
@@ -4,7 +4,7 @@
  * \brief C error handling
  */
 #include <dmlc/thread_local.h>
-#include "./c_api_common.h"
+#include "c_api_common.h"
 
 struct ErrorEntry {
   std::string last_error;
diff --git a/nnvm/src/c_api/c_api_graph.cc b/nnvm/src/c_api/c_api_graph.cc
index 831aaec33e8c..a0e84aef4482 100644
--- a/nnvm/src/c_api/c_api_graph.cc
+++ b/nnvm/src/c_api/c_api_graph.cc
@@ -9,7 +9,7 @@
 #include <nnvm/graph.h>
 #include <nnvm/pass.h>
 #include <dmlc/json.h>
-#include "./c_api_common.h"
+#include "c_api_common.h"
 
 using namespace nnvm;
 
diff --git a/nnvm/src/c_api/c_api_symbolic.cc b/nnvm/src/c_api/c_api_symbolic.cc
index 9f62dbd80b0c..e175cfc7da25 100644
--- a/nnvm/src/c_api/c_api_symbolic.cc
+++ b/nnvm/src/c_api/c_api_symbolic.cc
@@ -6,7 +6,7 @@
 #include <nnvm/c_api.h>
 #include <nnvm/op.h>
 #include <nnvm/symbolic.h>
-#include "./c_api_common.h"
+#include "c_api_common.h"
 
 using namespace nnvm;
 
diff --git a/nnvm/src/compiler/alter_op_layout.cc b/nnvm/src/compiler/alter_op_layout.cc
index bf28df3d04f8..f62e39efd9eb 100644
--- a/nnvm/src/compiler/alter_op_layout.cc
+++ b/nnvm/src/compiler/alter_op_layout.cc
@@ -12,8 +12,8 @@
 #include <tvm/tvm.h>
 #include <algorithm>
 #include <functional>
-#include "./compile_engine.h"
-#include "./graph_transform.h"
+#include "compile_engine.h"
+#include "graph_transform.h"
 
 namespace nnvm {
 namespace compiler {
@@ -46,7 +46,7 @@ Graph AlterOpLayout(const Graph& src) {
 
   std::vector<std::vector<Layout> > in_layouts_of_node(idx_graph.num_nodes());
   std::vector<std::vector<Layout> > out_layouts_of_node(idx_graph.num_nodes());
-  std::unordered_map<const Node*, uint32_t> new_nodes;
+  std::unordered_map<const Node*, uint32_t> unchanged_nodes;
 
   if (src.HasAttr("layout")) {
     // record layouts so that LayoutTransform pass can fix layouts correctly,
@@ -56,10 +56,8 @@ Graph AlterOpLayout(const Graph& src) {
     const auto& layouts = src.GetAttr<std::vector<Layout> >("layout");
     for (uint32_t nid = 0; nid < idx_graph.num_nodes(); ++nid) {
       const auto &inode = idx_graph[nid];
-      if (falter_op_layout.count(inode.source->op())) {
-        // do not record input layouts of nodes that will be replaced.
-        continue;
-      }
+      // record input layouts for all nodes,
+      // while replaced nodes will ignore the records here and have undefined input layouts.
       std::vector<Layout> in_layout;
       for (const auto& e : inode.inputs) {
         in_layout.emplace_back(layouts[idx_graph.entry_id(e)]);
@@ -80,7 +78,8 @@ Graph AlterOpLayout(const Graph& src) {
     nnvm::compiler::FTVMAlterOpLayout fn_alter_op_layout =
       falter_op_layout.get(n->op(), nullptr);
     if (fn_alter_op_layout == nullptr) {
-      new_nodes[n.get()] = nid;
+      // will restore the original input layouts later.
+      unchanged_nodes[n.get()] = nid;
       return false;
     }
 
@@ -106,7 +105,13 @@ Graph AlterOpLayout(const Graph& src) {
     Symbol op;
     bool do_alter =
       fn_alter_op_layout(n->attrs, Symbol::CreateGroup(op_inputs), tensor_infos, &op);
-    if (do_alter) *ret = op.outputs;
+
+    if (do_alter) {
+      *ret = op.outputs;
+    } else {
+      // will restore the original input layouts later.
+      unchanged_nodes[n.get()] = nid;
+    }
     return do_alter;
   };
 
@@ -118,15 +123,15 @@ Graph AlterOpLayout(const Graph& src) {
     std::vector<Layout> ret_layouts(ret_idx.num_node_entries(), Layout::Undef());
     for (uint32_t nid = 0; nid < ret_idx.num_nodes(); ++nid) {
       const auto& inode = ret_idx[nid];
-      if (new_nodes.count(inode.source)) {
+      if (unchanged_nodes.count(inode.source)) {
         const std::vector<Layout>& in_layouts =
-          in_layouts_of_node[new_nodes[inode.source]];
+          in_layouts_of_node[unchanged_nodes[inode.source]];
         for (uint32_t i = 0; i < inode.inputs.size(); ++i) {
           const auto& e = inode.inputs[i];
           ret_layouts[ret_idx.entry_id(e)] = in_layouts[i];
         }
         const std::vector<Layout>& out_layouts =
-          out_layouts_of_node[new_nodes[inode.source]];
+          out_layouts_of_node[unchanged_nodes[inode.source]];
         for (uint32_t i = 0; i < inode.source->num_outputs(); ++i) {
           ret_layouts[ret_idx.entry_id(nid, i)] = out_layouts[i];
         }
diff --git a/nnvm/src/compiler/compile_engine.cc b/nnvm/src/compiler/compile_engine.cc
index a9d4aa2d016a..6df70b53ccae 100644
--- a/nnvm/src/compiler/compile_engine.cc
+++ b/nnvm/src/compiler/compile_engine.cc
@@ -11,8 +11,11 @@
 #include <nnvm/pass_functions.h>
 #include <nnvm/compiler/op_attr_types.h>
 #include <mutex>
-#include "./graph_hash.h"
-#include "./compile_engine.h"
+#include <tuple>
+#include <vector>
+#include <limits>
+#include "graph_hash.h"
+#include "compile_engine.h"
 
 namespace nnvm {
 namespace compiler {
@@ -91,7 +94,7 @@ class CompileEngine {
       return it->second->graph_func;
     }
     GraphFunc f = DoLower(key->graph, key->inputs, key->target, master_idx);
-    std::shared_ptr<GraphCacheEntryNode> n = std::make_shared<GraphCacheEntryNode>();
+    auto n = tvm::make_node<GraphCacheEntryNode>();
     n->graph_func = f;
     n->use_count = 1;
     n->master_idx = master_idx;
@@ -104,8 +107,7 @@ class CompileEngine {
     Array<NodeRef> items;
     for (auto& kv : cache_) {
       items.push_back(kv.first);
-      std::shared_ptr<GraphCacheEntryNode> n =
-          std::make_shared<GraphCacheEntryNode>(*(kv.second.operator->()));
+      auto n = tvm::make_node<GraphCacheEntryNode>(*(kv.second.operator->()));
       items.push_back(GraphCacheEntry(n));
     }
     return items;
@@ -123,7 +125,7 @@ class CompileEngine {
   // Set the given function on given graph key.
   void Set(const GraphKey& key, GraphFunc func) {
     std::lock_guard<std::mutex> lock(mutex_);
-    std::shared_ptr<GraphCacheEntryNode> n = std::make_shared<GraphCacheEntryNode>();
+    auto n = tvm::make_node<GraphCacheEntryNode>();
     n->graph_func = func;
     n->use_count = 1;
     cache_[key] = GraphCacheEntry(n);
@@ -262,7 +264,7 @@ class CompileEngine {
         graph, inputs, target, master_idx,
         &readable_name, &outputs);
 
-    std::shared_ptr<GraphFuncNode> gf = std::make_shared<GraphFuncNode>();
+    auto gf = tvm::make_node<GraphFuncNode>();
     gf->target = target;
     gf->func_name = GetUniqeName(readable_name);
     gf->inputs = inputs;
diff --git a/nnvm/src/compiler/compile_engine.h b/nnvm/src/compiler/compile_engine.h
index d84fe2facbd3..23e5e1d1a49c 100644
--- a/nnvm/src/compiler/compile_engine.h
+++ b/nnvm/src/compiler/compile_engine.h
@@ -18,7 +18,7 @@
 #include <tvm/lowered_func.h>
 #include <string>
 #include <utility>
-#include "./graph_hash.h"
+#include "graph_hash.h"
 
 namespace nnvm {
 namespace compiler {
@@ -71,7 +71,7 @@ struct GraphCacheEntryNode : public tvm::Node {
 class GraphCacheEntry : public ::tvm::NodeRef {
  public:
   GraphCacheEntry() {}
-  explicit GraphCacheEntry(std::shared_ptr<::tvm::Node> n) : NodeRef(n) {}
+  explicit GraphCacheEntry(::tvm::NodePtr<::tvm::Node> n) : NodeRef(n) {}
   GraphCacheEntryNode* operator->() {
     return static_cast<GraphCacheEntryNode*>(node_.get());
   }
diff --git a/nnvm/src/compiler/fold_scale_axis.cc b/nnvm/src/compiler/fold_scale_axis.cc
index e38082b69916..35e024efdc6a 100644
--- a/nnvm/src/compiler/fold_scale_axis.cc
+++ b/nnvm/src/compiler/fold_scale_axis.cc
@@ -9,8 +9,8 @@
 #include <nnvm/pass.h>
 #include <nnvm/compiler/op_attr_types.h>
 #include <nnvm/top/nn.h>
-#include "./pattern_util.h"
-#include "./graph_transform.h"
+#include "pattern_util.h"
+#include "graph_transform.h"
 
 namespace nnvm {
 namespace compiler {
@@ -493,8 +493,80 @@ bool Conv2DScaleAxisForward(
   if ((*in_info)[0].kind != kPending) return false;
   // only optimize for nchw for now
   if (param.kernel_layout == "OIHW" && (*in_info)[0].axis == 1) {
+    // Check whether it is depthwise conv2d
+    if (param.use_bias) {
+      CHECK_EQ(in_shape.size(), 3U) << "Input:[data, weight, bias]";
+    } else {
+      CHECK_EQ(in_shape.size(), 2U) << "Input:[data, weight]";
+    }
+
+    auto dshape = in_shape.at(0);
+    CHECK_EQ(dshape.ndim(), 4U) << "Input data shape should be 4D";
+
+    // TODO(FrozenGene): Currently, we don't support conv2d's groups != in channels.
+    if (param.groups > 1 && dshape[1] != param.groups) {
+      LOG(WARNING) << "FoldScaleAxis optimization doesn't support conv2d "
+                   << "with groups != in channels. We will skip FoldScaleAxis "
+                   << "optimization for this op.";
+      return false;
+    }
+
+
+    // input channel equals to groups, which means depthwise conv2d
+    bool is_depthwise_conv2d = (dshape[1] == param.groups);
+
+    // if it is depthwise convolution, the weight fold axis should along to axis 0.
+    // For example:
+    // data shape [1,54,63,127] weights shape [54,1,3,3], scale shape [54]
+    // depthwise convolution's weights shape means we have divided the data shape's channel
+    // to groups parties. Here, we divide 54 channels into 54 parties. Every part size is 1.
+    // weights shape's first dimision means how many parties we have divided (mapping to
+    // input shape's channel). So, in the depthwise convolution, we shouldn't do like
+    // traditional convolution(i.e. OIHW)
+
+    // Backgroud of this algorithm:
+
+    // Original Graph:
+    //    Graph(%x,
+    //          %in_scale,
+    //          %weight,
+    //          %bias,
+    //          %out_scale) {
+    //      %1 = __add_scalar__(%x, scalar='1')
+    //      %3 = expand_dims(%in_scale, num_newaxis='2', axis='1')
+    //      %4 = broadcast_mul(%1, %3)
+    //      %7 = conv2d(%4, %weight, %bias, padding='(1, 1)', kernel_size='(3, 3)', channels='2')
+    //      %8 = relu(%7)
+    //      %10 = expand_dims(%out_scale, num_newaxis='2', axis='1')
+    //      %11 = broadcast_mul(%8, %10)
+    //      ret %11
+    //    }
+
+    // Optimized Graph:
+    //    Graph(%x,
+    //          %weight,
+    //          %out_scale,
+    //          %in_scale,
+    //          %bias) {
+    //      %1 = __add_scalar__(%x, scalar='1')
+    //      %4 = expand_dims(%out_scale, num_newaxis='3', axis='1')
+    //      %5 = broadcast_mul(%weight, %4)
+    //      %7 = expand_dims(%in_scale, num_newaxis='2', axis='1')
+    //      %8 = broadcast_mul(%5, %7)
+    //      %10 = broadcast_mul(%bias, %out_scale)
+    //      %11 = conv2d(%1, %8, %10, padding='(1, 1)', kernel_size='(3, 3)', channels='2')
+    //      %12 = relu(%11)
+    //      ret %12
+    //    }
+
+    // Conv2DScaleAxisForward will need in_scale. Conv2DScaleAxisBackward will need out_scale.
+    // in_scale will apply into input data's channel (in_channel). out_scale will apply in
+    // conv2d's result, which will apply in weight's output channel.
+    // So, default Conv2DScaleAxisForward will fold axis 1 (weights' input channel).
+    // Conv2DScaleAxisBackward will fold axis 0 (weights' output channel).
+    // But depthwise convolution is another story as said previously.
     (*in_info)[1].kind = kMulConsumer;
-    (*in_info)[1].axis = 1;
+    (*in_info)[1].axis = is_depthwise_conv2d ? 0 : 1;
     (*in_info)[1].source = (*in_info)[0].source;
     return true;
   } else {
diff --git a/nnvm/src/compiler/graph_compile.cc b/nnvm/src/compiler/graph_compile.cc
index e51730c09d66..3316f3932e27 100644
--- a/nnvm/src/compiler/graph_compile.cc
+++ b/nnvm/src/compiler/graph_compile.cc
@@ -109,13 +109,14 @@ nnvm::Graph GraphCompile(const nnvm::Graph& g) {
       inputs.push_back(it->second);
     }
     // Find master idx in the subgraph.
-    int sub_master_idx = 0;
+    int sub_master_idx = -1;
     for (uint32_t i = 0; i < subidx.num_nodes(); i++) {
       if (subidx[i].source->op() == idx[master].source->op()) {
         sub_master_idx = i;
         break;
       }
     }
+    CHECK_NE(sub_master_idx, -1) << "A master node not found in the subgraph.";
     fe.compiled_func = GraphLower(fe.subgraph, inputs, target, sub_master_idx);
     for (LoweredFunc f : fe.compiled_func->funcs) {
       if (!func_set.count(f.get())) {
diff --git a/nnvm/src/compiler/graph_fuse.cc b/nnvm/src/compiler/graph_fuse.cc
index 52a8ae44f8ee..4d724ae66c35 100644
--- a/nnvm/src/compiler/graph_fuse.cc
+++ b/nnvm/src/compiler/graph_fuse.cc
@@ -14,10 +14,11 @@
 #include <nnvm/tuple.h>
 #include <tvm/lowered_func.h>
 #include <tvm/runtime/packed_func.h>
+#include <limits>
 
-#include "./graph_fuse.h"
-#include "./graph_runtime.h"
-#include "./pattern_util.h"
+#include "graph_fuse.h"
+#include "graph_runtime.h"
+#include "pattern_util.h"
 
 namespace nnvm {
 namespace compiler {
@@ -63,12 +64,16 @@ nnvm::Graph GraphFindFusibleGroups(nnvm::Graph g) {
       // Check if we can fuse to the master.
       int chosen_master = -1;
       bool ewise = inode.source->num_outputs() == 1;
+      bool mark_as_injective = false;
       for (const auto& e : inode.inputs) {
         if (fuse_vec[e.node_id] == FuseRule::kUknown) {
           TOpPattern ipt = pattern_vec[e.node_id];
           if (ipt != kElemWise) ewise = false;
-          if (ipt <= kInjective) {
+          if (ipt <= kBroadcast) {
+            fuse_vec[e.node_id] = FuseRule::kFuseToMaster;
+          } else if (ipt == kInjective) {
             fuse_vec[e.node_id] = FuseRule::kFuseToMaster;
+            mark_as_injective = true;
           } else if (ipt == kOutEWiseFusable &&
                      chosen_master == -1 &&
                      shape_vec[idx.entry_id(nid, 0)] == shape_vec[idx.entry_id(e)]) {
@@ -87,6 +92,8 @@ nnvm::Graph GraphFindFusibleGroups(nnvm::Graph g) {
       master_vec[nid] = chosen_master;
       if (chosen_master != -1) {
         pt = kOutEWiseFusable;
+      } else if (mark_as_injective) {
+        pt = kInjective;
       } else {
         pt = ewise ? kElemWise : kBroadcast;
       }
@@ -129,18 +136,55 @@ nnvm::Graph GraphFindFusibleGroups(nnvm::Graph g) {
 
   // Point to the group root id of each node.
   GroupVec group_vec(idx.num_nodes(), -1);
+  std::vector<std::vector<uint32_t> > node_ids_per_group(idx.num_nodes());
   for (uint32_t i = idx.num_nodes(); i != 0; --i) {
     uint32_t nid = i - 1;
     const auto& inode = idx[nid];
+    bool is_root = false;
     if (group_vec[nid] == -1) {
       group_vec[nid] = nid;
+      node_ids_per_group[nid].push_back(nid);
+      is_root = true;
     }
+
+    // Check if injective op and out_ewise_fusable op (e.g. conv2d) are in the same group.
+    bool parent_out_ewise = false;
+    bool parent_injective = false;
+    for (const auto& e : inode.inputs) {
+      if (fuse_vec[e.node_id] != FuseRule::kFuseToMaster) continue;
+      TOpPattern pt = pattern_vec[e.node_id];
+      if (pt == kOutEWiseFusable) {
+        parent_out_ewise = true;
+      } else if (pt == kInjective) {
+        parent_injective = true;
+      }
+    }
+    // Change the master node from out_ewise_fusable op to itself
+    if (parent_injective && parent_out_ewise) {
+      master_vec[nid] = nid;
+      if (!is_root) {
+        // Children nodes in the same group might be pointing to a master node in a different group.
+        for (uint32_t j : node_ids_per_group[group_vec[nid]]) {
+          master_vec[j] = nid;
+        }
+      }
+    }
+
     // Propagate the group id.
     for (const auto& e : inode.inputs) {
+      TOpPattern pt = pattern_vec[e.node_id];
+      if (parent_out_ewise && parent_injective) {
+        if (pt == kOutEWiseFusable) {
+          continue;  // Do not fuse out_ewise_fusable op
+        } else if (pt == kInjective) {
+          master_vec[e.node_id] = nid;
+        }
+      }
       if (fuse_vec[e.node_id] == FuseRule::kFuseToMaster) {
         CHECK(group_vec[e.node_id] == -1||
               group_vec[e.node_id] == group_vec[nid]);
         group_vec[e.node_id] = group_vec[nid];
+        node_ids_per_group[group_vec[nid]].push_back(e.node_id);
       }
     }
   }
@@ -192,12 +236,10 @@ nnvm::Graph GraphFindFusibleGroups(nnvm::Graph g) {
   */
   if (opt_level >= 1) {
     std::vector<std::vector<uint32_t> > children_group_ids(idx.num_nodes());
-    std::vector<std::vector<uint32_t> > node_ids_per_group(idx.num_nodes());
     for (uint32_t nid = idx.num_nodes() - 1; nid != 0; --nid) {
       const auto& inode = idx[nid];
       if (inode.source->is_variable()) continue;
       CHECK_NE(group_vec[nid], -1);
-      node_ids_per_group[group_vec[nid]].push_back(nid);
       if (inode.inputs.size() != 1) continue;
       const uint32_t parent_nid = inode.inputs[0].node_id;
       // if parent node has more than one child, record each child's group id.
diff --git a/nnvm/src/compiler/graph_hash.cc b/nnvm/src/compiler/graph_hash.cc
index d881130f72cc..f14a60e80d8c 100644
--- a/nnvm/src/compiler/graph_hash.cc
+++ b/nnvm/src/compiler/graph_hash.cc
@@ -10,8 +10,10 @@
 #include <tvm/ir.h>
 #include <tvm/runtime/packed_func.h>
 #include <functional>
-#include "./node_attr.h"
-#include "./graph_hash.h"
+#include <vector>
+#include <algorithm>
+#include "node_attr.h"
+#include "graph_hash.h"
 
 namespace nnvm {
 namespace compiler {
@@ -72,8 +74,7 @@ bool GraphKeyEqual::Equal(const GraphKey& a,
 GraphKey GraphKeyNode::make(Graph graph,
                             tvm::Array<Tensor> inputs,
                             std::string target) {
-  std::shared_ptr<GraphKeyNode> n
-      = std::make_shared<GraphKeyNode>();
+  auto n = tvm::make_node<GraphKeyNode>();
   n->graph = std::move(graph);
   n->inputs = inputs;
   n->target = std::move(target);
@@ -125,7 +126,7 @@ std::string GraphDeepCompare(const Graph& a,
   const IndexedGraph& idxb = b.indexed_graph();
   std::ostringstream err;
   if (idxa.num_nodes() != idxb.num_nodes()) {
-    err << "Number of nodes mismatch";
+    err << "Number of nodes mismatch (" <<  idxa.num_nodes() << " v.s " << idxb.num_nodes() << ")";
     return err.str();
   }
   if (idxa.num_node_entries() != idxb.num_node_entries()) {
diff --git a/nnvm/src/compiler/graph_runtime.cc b/nnvm/src/compiler/graph_runtime.cc
index 7301fd74117e..e4865df3f9f0 100644
--- a/nnvm/src/compiler/graph_runtime.cc
+++ b/nnvm/src/compiler/graph_runtime.cc
@@ -4,7 +4,7 @@
  * \brief Interface code with TVM graph runtime.
 */
 #include <dmlc/memory_io.h>
-#include "./graph_runtime.h"
+#include "graph_runtime.h"
 
 namespace nnvm {
 namespace compiler {
@@ -91,8 +91,7 @@ TVM_REGISTER_GLOBAL("nnvm.compiler._load_param_dict")
     for (size_t i = 0; i < size; ++i) {
       tvm::runtime::NDArray temp;
       temp.Load(strm);
-      std::shared_ptr<NDArrayWrapperNode> n
-          = std::make_shared<NDArrayWrapperNode>();
+      auto n = tvm::make_node<NDArrayWrapperNode>();
       n->name = std::move(names[i]);
       n->array = temp;
       ret.push_back(NDArrayWrapper(n));
@@ -100,6 +99,6 @@ TVM_REGISTER_GLOBAL("nnvm.compiler._load_param_dict")
     *rv = ret;
   });
 
-TVM_EXTERNAL_REGISTER_NODE_TYPE(NDArrayWrapperNode);
+TVM_REGISTER_NODE_TYPE(NDArrayWrapperNode);
 }  // namespace compiler
 }  // namespace nnvm
diff --git a/nnvm/src/compiler/graph_runtime.h b/nnvm/src/compiler/graph_runtime.h
index 272e2be7f251..e5ba3681d2bf 100644
--- a/nnvm/src/compiler/graph_runtime.h
+++ b/nnvm/src/compiler/graph_runtime.h
@@ -9,6 +9,7 @@
 #include <nnvm/graph.h>
 #include <tvm/base.h>
 #include <tvm/expr.h>
+#include <tvm/node/memory.h>
 #include <tvm/packed_func_ext.h>
 #include <tvm/runtime/ndarray.h>
 #include <vector>
diff --git a/nnvm/src/compiler/packed_func_ext.cc b/nnvm/src/compiler/packed_func_ext.cc
index d549f9e2004f..1a19feabfe8a 100644
--- a/nnvm/src/compiler/packed_func_ext.cc
+++ b/nnvm/src/compiler/packed_func_ext.cc
@@ -9,7 +9,7 @@
 #include <nnvm/compiler/packed_func_ext.h>
 #include <nnvm/compiler/op_attr_types.h>
 #include <tvm/runtime/c_runtime_api.h>
-#include "./node_attr.h"
+#include "node_attr.h"
 #include "compile_engine.h"
 
 namespace tvm {
@@ -96,7 +96,7 @@ TVM_REGISTER_GLOBAL("nnvm._register_compute")
                         const Array<Tensor>& out_info)
         -> Array<Tensor> {
       TVMRetValue ret = (*f)(GetAttrDict(attrs), inputs, out_info);
-      if ((*ret.ptr<std::shared_ptr<tvm::Node> >())->derived_from<tvm::TensorNode>()) {
+      if ((*ret.ptr<::tvm::NodePtr<tvm::Node> >())->derived_from<tvm::TensorNode>()) {
         return {ret.operator Tensor()};
       } else {
         return ret;
diff --git a/nnvm/src/compiler/simplify_inference.cc b/nnvm/src/compiler/simplify_inference.cc
index a0782222aa06..bf00bcb5a894 100644
--- a/nnvm/src/compiler/simplify_inference.cc
+++ b/nnvm/src/compiler/simplify_inference.cc
@@ -9,8 +9,8 @@
 #include <nnvm/pass.h>
 #include <nnvm/compiler/op_attr_types.h>
 #include <nnvm/top/nn.h>
-#include "./graph_transform.h"
-#include "./pattern_util.h"
+#include "graph_transform.h"
+#include "pattern_util.h"
 
 namespace nnvm {
 namespace compiler {
diff --git a/nnvm/src/pass/infer_shape_type.cc b/nnvm/src/pass/infer_shape_type.cc
index cc4916ce0b9f..0f322f12e9c4 100644
--- a/nnvm/src/pass/infer_shape_type.cc
+++ b/nnvm/src/pass/infer_shape_type.cc
@@ -215,7 +215,7 @@ NNVM_REGISTER_PASS(InferShape)
 .set_change_graph(false)
 .provide_graph_attr("shape");
 
-// inference fucntion for same type
+// inference function for same type
 inline bool SameType(const NodeAttrs& attrs,
                      std::vector<int> *iattr,
                      std::vector<int> *oattr) {
diff --git a/nnvm/src/pass/plan_memory.cc b/nnvm/src/pass/plan_memory.cc
index 51448bcf1065..e0788386e6ea 100644
--- a/nnvm/src/pass/plan_memory.cc
+++ b/nnvm/src/pass/plan_memory.cc
@@ -7,12 +7,37 @@
 #include <nnvm/pass.h>
 #include <nnvm/graph_attr_types.h>
 #include <nnvm/op_attr_types.h>
+#include <nnvm/top/tensor.h>
 #include <memory>
-#include "./graph_algorithm.h"
+#include "graph_algorithm.h"
 
 namespace nnvm {
 namespace pass {
 namespace {
+  using namespace nnvm::top;
+// Return bytes of data flag.
+static int GetDTypeSize(int type_flag) {
+  switch (type_flag) {
+    case kUint8:
+    case kInt8:
+      return 1;
+    case kFloat16:
+    case kInt16:
+    case kUint16:
+      return 2;
+    case kFloat32:
+    case kInt32:
+    case kUint32:
+      return 4;
+    case kFloat64:
+    case kInt64:
+    case kUint64:
+      return 8;
+    default:
+      LOG(FATAL) << "unknown type_flag=" << type_flag;
+      return -1;
+  }
+}
 
 // simple graph based allocator.
 class GraphAllocator {
@@ -199,7 +224,8 @@ size_t AllocMemory(const Graph& ret, const IndexedGraph& idx,
             ((storage_ref_count[sid_in] == 1 && !ignore_all_inputs) || identity[ipair]) &&
             entry_ref_count[eid_out] > 0 &&
             shape_vec[eid_out].Size() == shape_vec[eid_in].Size() &&
-            dtype_vec[eid_out] == dtype_vec[eid_in]) {
+             (dtype_vec[eid_out] == dtype_vec[eid_in] ||
+             GetDTypeSize(dtype_vec[eid_out]) == GetDTypeSize(dtype_vec[eid_in]))) {
           // inplace optimization
           taken[kv.first] = true;
           storage[eid_out] = sid_in;
diff --git a/nnvm/src/pass/saveload_json.cc b/nnvm/src/pass/saveload_json.cc
index 195d49bfb9b4..4a0706b6d501 100644
--- a/nnvm/src/pass/saveload_json.cc
+++ b/nnvm/src/pass/saveload_json.cc
@@ -209,16 +209,14 @@ std::shared_ptr<Symbol> JSONGraph2Symbol(const JSONGraph &jgraph, bool no_parse)
   for (const JSONNode &n : jgraph.nodes) {
     n.node->inputs.reserve(n.inputs.size());
     for (const JSONNode::Entry &e : n.inputs) {
+      CHECK(e.node_id < jgraph.nodes.size());
       n.node->inputs.emplace_back(NodeEntry{jgraph.nodes[e.node_id].node, e.index, e.version});
     }
     n.node->control_deps.reserve(n.control_deps.size());
     for (uint32_t nid : n.control_deps) {
+      CHECK(nid < jgraph.nodes.size());
       n.node->control_deps.push_back(jgraph.nodes[nid].node);
     }
-    // rebuild attribute parser
-    if (!no_parse && n.node->op() != nullptr && n.node->op()->attr_parser != nullptr) {
-      n.node->op()->attr_parser(&(n.node->attrs));
-    }
     for (const JSONGraph &subgraph : n.subgraphs) {
       // The "no_parse" option here, is to be compatible with
       // commit cfd3075e85807dcd8f9534c37e053583dee87524
@@ -227,14 +225,23 @@ std::shared_ptr<Symbol> JSONGraph2Symbol(const JSONGraph &jgraph, bool no_parse)
       // incubator-mxnet/src/nnvm/legacy_json_util.cc:UpgradeJSON_Parse
       n.node->attrs.subgraphs.push_back(JSONGraph2Symbol(subgraph, false));
     }
+    // rebuild attribute parser
+    if (!no_parse && n.node->op() != nullptr && n.node->op()->attr_parser != nullptr) {
+      n.node->op()->attr_parser(&(n.node->attrs));
+    } else if (!no_parse && n.node->is_variable()) {
+      n.node->attrs.parsed =
+        Symbol::CreateVariable(n.node->attrs.name).outputs[0].node->attrs.parsed;
+    }
   }
   // consistency check
   for (uint32_t nid : jgraph.arg_nodes) {
+    CHECK(nid < jgraph.nodes.size());
     CHECK(jgraph.nodes[nid].node->is_variable());
   }
   std::shared_ptr<Symbol> symbol = std::make_shared<Symbol>();
   symbol->outputs.reserve(jgraph.heads.size());
   for (const JSONNode::Entry &e : jgraph.heads) {
+    CHECK(e.node_id < jgraph.nodes.size());
     symbol->outputs.emplace_back(NodeEntry{jgraph.nodes[e.node_id].node, e.index, e.version});
   }
   return symbol;
diff --git a/nnvm/src/top/elemwise_op_common.h b/nnvm/src/top/elemwise_op_common.h
index e5bb0adcb078..ad8fc3d54ba8 100644
--- a/nnvm/src/top/elemwise_op_common.h
+++ b/nnvm/src/top/elemwise_op_common.h
@@ -12,7 +12,7 @@
 #include <vector>
 #include <utility>
 #include <functional>
-#include "./op_common.h"
+#include "op_common.h"
 
 namespace nnvm {
 namespace top {
diff --git a/nnvm/src/top/nn/convolution.cc b/nnvm/src/top/nn/convolution.cc
index 229d4ac30f78..df81c47823d9 100644
--- a/nnvm/src/top/nn/convolution.cc
+++ b/nnvm/src/top/nn/convolution.cc
@@ -12,7 +12,7 @@
 #include <tvm/packed_func_ext.h>
 #include <nnvm/compiler/op_attr_types.h>
 #include <tvm/tvm.h>
-#include "./nn_common.h"
+#include "nn_common.h"
 #include "../op_common.h"
 #include "../elemwise_op_common.h"
 #include "topi/nn.h"
@@ -73,16 +73,19 @@ inline bool Conv2DInferShape(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(param.channels % param.groups, 0U)
       << "output channels must divide group size";
 
-  TShape wshape({param.channels / param.groups,
+  // Restore depthwise conv2d kernel layout
+  // otherwise we will get error if we split output channel
+  // of depthwise conv2d kernel (because it will be 1 if
+  // use param.channels divide param.groups).
+  TShape wshape({param.channels,
                  dshape[1] / param.groups,
                  param.kernel_size[0],
                  param.kernel_size[1]});
-
   wshape = ConvertLayout(wshape, kOIHW, kernel_layout);
 
-  wshape[kernel_layout.indexof('O')] *= param.groups;
-
-  NNVM_ASSIGN_INPUT_SHAPE(attrs, *in_shape, Conv2DParam::kWeight, wshape);
+  if (in_shape->at(Conv2DParam::kWeight).ndim() == 0) {
+    NNVM_ASSIGN_INPUT_SHAPE(attrs, *in_shape, Conv2DParam::kWeight, wshape);
+  }
   if (param.use_bias) {
     static const Layout default_bias_layout("C");
     TShape bias_shape({param.channels});
@@ -344,7 +347,6 @@ NNVM_REGISTER_OP(_contrib_conv2d_NCHWc)
 .set_num_inputs(UseBiasNumInputs<Conv2DParam>)
 .set_support_level(2);
 
-
 NNVM_REGISTER_OP(_contrib_conv2d_winograd_weight_transform)
 .describe(R"code(Weight transformation of winograd fast convolution algorithm.
 Separate this into another nnvm symbol in order to enable Precompute Pass to compute the
diff --git a/nnvm/src/top/nn/nn.cc b/nnvm/src/top/nn/nn.cc
index 322d77b6d032..e301f167ff1d 100644
--- a/nnvm/src/top/nn/nn.cc
+++ b/nnvm/src/top/nn/nn.cc
@@ -12,7 +12,7 @@
 #include <nnvm/op_attr_types.h>
 #include <nnvm/compiler/op_attr_types.h>
 #include <nnvm/top/nn.h>
-#include "./nn_common.h"
+#include "nn_common.h"
 #include "../op_common.h"
 #include "../elemwise_op_common.h"
 #include "topi/nn/dense.h"
@@ -410,7 +410,8 @@ NNVM_REGISTER_OP(log_softmax)
                     const Array<Tensor>& inputs,
                     const Array<Tensor>& out_info) {
     const SoftmaxParam& param = nnvm::get<SoftmaxParam>(attrs.parsed);
-    CHECK_EQ(param.axis, -1) << "Currently only axis=-1 is supported";
+    CHECK(param.axis == -1 || param.axis == static_cast<int32_t>(inputs[0].ndim()) - 1)
+        << "log_softmax currently only works on last dimension";
     return Array<Tensor>{ topi::nn::log_softmax(inputs[0]) };
   })
 .set_attr<FGradient>(
@@ -619,7 +620,8 @@ NNVM_REGISTER_OP(pad)
     for (size_t i = 0; i < pad_width.ndim(); ++i) {
       pad_after.push_back(tvm::make_const(tvm::Int(32), pad_width[i][1]));
     }
-    return Array<Tensor>{ topi::pad(inputs[0], pad_before, pad_after, param.pad_value) };
+    return Array<Tensor>{ topi::pad(inputs[0], pad_before, pad_after,
+                          tvm::make_const(inputs[0]->dtype, param.pad_value)) };
 })
 .set_support_level(1);
 
diff --git a/nnvm/src/top/nn/pooling.cc b/nnvm/src/top/nn/pooling.cc
index cccd5b1c710b..6a53e1994fc1 100644
--- a/nnvm/src/top/nn/pooling.cc
+++ b/nnvm/src/top/nn/pooling.cc
@@ -10,7 +10,7 @@
 #include <nnvm/compiler/op_attr_types.h>
 #include <nnvm/compiler/util.h>
 #include <nnvm/top/nn.h>
-#include "./nn_common.h"
+#include "nn_common.h"
 #include "../op_common.h"
 #include "../elemwise_op_common.h"
 #include "topi/nn/pooling.h"
@@ -77,7 +77,7 @@ inline bool Pool2DInferShape(const nnvm::NodeAttrs& attrs,
   } else {
     oshape[hidx] = ((dshape[hidx] + pad_h - param.pool_size[0] +
                     param.strides[0] - 1) / param.strides[0]) + 1;
-    oshape[widx] = ((dshape[3] + pad_w - param.pool_size[1] +
+    oshape[widx] = ((dshape[widx] + pad_w - param.pool_size[1] +
                     param.strides[1] - 1) / param.strides[1]) + 1;
   }
   NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, 0, oshape);
diff --git a/nnvm/src/top/nn/upsampling.cc b/nnvm/src/top/nn/upsampling.cc
index 6c5e13441406..f4bbeb62aa29 100644
--- a/nnvm/src/top/nn/upsampling.cc
+++ b/nnvm/src/top/nn/upsampling.cc
@@ -11,7 +11,7 @@
 #include <nnvm/node.h>
 #include <nnvm/op_attr_types.h>
 #include <nnvm/top/nn.h>
-#include "./nn_common.h"
+#include "nn_common.h"
 #include "../op_common.h"
 #include "../elemwise_op_common.h"
 #include "topi/elemwise.h"
diff --git a/nnvm/src/top/tensor/elemwise.cc b/nnvm/src/top/tensor/elemwise.cc
index 239f44783392..3ee52008eb1c 100644
--- a/nnvm/src/top/tensor/elemwise.cc
+++ b/nnvm/src/top/tensor/elemwise.cc
@@ -307,7 +307,7 @@ NNVM_REGISTER_ELEMWISE_BINARY_OP(elemwise_mul)
 });
 
 NNVM_REGISTER_ELEMWISE_BINARY_OP(elemwise_div)
-.describe(R"code(Element-wise multiplication
+.describe(R"code(Element-wise division
 
 )code"  NNVM_ADD_FILELINE)
 .set_support_level(1)
diff --git a/nnvm/src/top/tensor/matrix_op.cc b/nnvm/src/top/tensor/matrix_op.cc
index c881e683a6c5..de95eddee1f6 100644
--- a/nnvm/src/top/tensor/matrix_op.cc
+++ b/nnvm/src/top/tensor/matrix_op.cc
@@ -3,7 +3,7 @@
  * \file matrix_op.cc
  * \brief Matrix operators
  */
-#include <topi/nn.h>
+#include <topi/transform.h>
 #include <nnvm/op.h>
 #include <nnvm/node.h>
 #include <nnvm/op_attr_types.h>
diff --git a/nnvm/src/top/tensor/reduce.cc b/nnvm/src/top/tensor/reduce.cc
index d8f426b4f4bc..105765fccc61 100644
--- a/nnvm/src/top/tensor/reduce.cc
+++ b/nnvm/src/top/tensor/reduce.cc
@@ -22,6 +22,7 @@ namespace top {
 using namespace tvm;
 using namespace nnvm::compiler;
 
+
 // reduce
 DMLC_REGISTER_PARAMETER(ReduceParam);
 
@@ -67,10 +68,11 @@ inline TShape ReduceShapeImpl(const TShape& ishape,
   if (r_axes.ndim() == indim)
     return TShape(keepdims ? indim : 1);
 
+  CHECK(r_axes.ndim() < indim);
   if (keepdims) {
     TShape oshape(ishape);
     for (unsigned i = 0, j = 0; i < indim; ++i) {
-      if (i != r_axes[j]) continue;
+      if (j >= r_axes.ndim() || i != r_axes[j]) continue;
       oshape[i] = 1;
       ++j;
     }
@@ -79,7 +81,7 @@ inline TShape ReduceShapeImpl(const TShape& ishape,
 
   TShape oshape(indim - r_axes.ndim());
   for (unsigned i = 0, j = 0, k = 0; i < indim; ++i) {
-    if (i == r_axes[j]) {
+    if (j < r_axes.ndim() && i == r_axes[j]) {
       ++j;
       continue;
     }
@@ -95,7 +97,7 @@ inline bool ReduceShape(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(out_attrs->size(), 1U);
   if ((*in_attrs)[0].ndim() == 0) return false;
   const ReduceParam& param = nnvm::get<ReduceParam>(attrs.parsed);
-  NNVM_ASSIGN_INPUT_SHAPE(
+  NNVM_ASSIGN_OUTPUT_SHAPE(
       attrs, *out_attrs, 0,
       ReduceShapeImpl((*in_attrs)[0], param.axis,
                       param.keepdims, param.exclude));
@@ -162,9 +164,9 @@ Example::
     TShape r_axes = GetReduceAxes(inputs[0]->shape.size(),
                                   param.axis, param.exclude);
     if (!r_axes.ndim()) return Array<Tensor> { topi::identity(inputs[0]) };
-    auto axis = ShapeToArray(r_axes);
+    auto axis = ShapeToIntArray(r_axes);
     return Array<Tensor>{
-      topi::sum(inputs[0], axis, param.keepdims) };
+      topi::sum(inputs[0], axis, param.keepdims, true) };
 })
 .set_attr<FGradient>(
   "FGradient", [](const NodePtr& n,
@@ -196,9 +198,9 @@ NNVM_REGISTER_REDUCE_OP(max)
     const ReduceParam& param = nnvm::get<ReduceParam>(attrs.parsed);
     TShape r_axes = GetReduceAxes(inputs[0]->shape.size(),
                                   param.axis, param.exclude);
-    auto axis = ShapeToArray(r_axes);
+    auto axis = ShapeToIntArray(r_axes);
     return Array<Tensor>{
-      topi::max(inputs[0], axis, param.keepdims) };
+      topi::max(inputs[0], axis, param.keepdims, true) };
 })
 .set_attr<FGradient>(
   "FGradient", [](const NodePtr& n,
@@ -229,9 +231,9 @@ NNVM_REGISTER_REDUCE_OP(min)
     const ReduceParam& param = nnvm::get<ReduceParam>(attrs.parsed);
     TShape r_axes = GetReduceAxes(inputs[0]->shape.size(),
                                   param.axis, param.exclude);
-    auto axis = ShapeToArray(r_axes);
+    auto axis = ShapeToIntArray(r_axes);
     return Array<Tensor>{
-      topi::min(inputs[0], axis, param.keepdims) };
+      topi::min(inputs[0], axis, param.keepdims, true) };
 })
 .set_attr<FGradient>(
   "FGradient", [](const NodePtr& n,
@@ -266,15 +268,13 @@ NNVM_REGISTER_BASE_REDUCE_OP(collapse_sum)
     return Array<Tensor>{ topi::collapse_sum(inputs[0], inputs[1]->shape) };
 });
 
-template<int Type>
 inline bool InferFixedType(const NodeAttrs& attrs,
                           std::vector<int>* in_attrs,
                           std::vector<int>* out_attrs) {
-  // Static type inference for argmax operation. Argmax return indices which
-  // should have Int32 type as shapes do.
   CHECK_EQ(in_attrs->size(), 1U);
   CHECK_EQ(out_attrs->size(), 1U);
-  NNVM_ASSIGN_OUTPUT_TYPE(attrs, *out_attrs, 0, static_cast<int>(Type));
+  const ReduceParam& param = nnvm::get<ReduceParam>(attrs.parsed);
+  NNVM_ASSIGN_OUTPUT_TYPE(attrs, *out_attrs, 0, param.dtype);
   return true;
 }
 
@@ -285,7 +285,7 @@ values over a given axis.
 )code" NNVM_ADD_FILELINE)
 .add_argument("data", "Tensor", "The input")
 .set_attr<FInferShape>("FInferShape", ReduceShape)
-.set_attr<FInferType>("FInferType", InferFixedType<kInt32>)
+.set_attr<FInferType>("FInferType", InferFixedType)
 .set_attr<FCorrectLayout>("FCorrectLayout", ElemwiseFixedLayoutUnknownOut<1, 1>)
 .set_num_inputs(1)
 .set_attr<FTVMCompute>(
@@ -295,9 +295,10 @@ values over a given axis.
     const ReduceParam& param = nnvm::get<ReduceParam>(attrs.parsed);
     TShape r_axes = GetReduceAxes(inputs[0]->shape.size(),
                                   param.axis, param.exclude);
-    auto axis = ShapeToArray(r_axes);
-    return Array<Tensor>{
-      topi::argmax(inputs[0], axis, param.keepdims) };
+    auto axis = ShapeToIntArray(r_axes);
+    Tensor out = topi::argmax(inputs[0], axis, param.keepdims, true);
+    if (param.dtype == kFloat32) out = topi::cast(out, out_info[0]->dtype);
+    return Array<Tensor>{out};
 });
 
 NNVM_REGISTER_BASE_REDUCE_OP(argmin)
@@ -307,7 +308,7 @@ values over a given axis.
 )code" NNVM_ADD_FILELINE)
 .add_argument("data", "Tensor", "The input")
 .set_attr<FInferShape>("FInferShape", ReduceShape)
-.set_attr<FInferType>("FInferType", InferFixedType<kInt32>)
+.set_attr<FInferType>("FInferType", InferFixedType)
 .set_attr<FCorrectLayout>("FCorrectLayout", ElemwiseFixedLayoutUnknownOut<1, 1>)
 .set_num_inputs(1)
 .set_attr<FTVMCompute>(
@@ -317,9 +318,74 @@ values over a given axis.
     const ReduceParam& param = nnvm::get<ReduceParam>(attrs.parsed);
     TShape r_axes = GetReduceAxes(inputs[0]->shape.size(),
                                   param.axis, param.exclude);
-    auto axis = ShapeToArray(r_axes);
+    auto axis = ShapeToIntArray(r_axes);
+    Tensor out = topi::argmin(inputs[0], axis, param.keepdims, true);
+    if (param.dtype == kFloat32) out = topi::cast(out, out_info[0]->dtype);
+    return Array<Tensor>{out};
+});
+
+NNVM_REGISTER_REDUCE_OP(mean)
+  .describe(R"code(Computes the mean of array elements over given axes.
+
+Example::
+
+  data = [[[1,2],[2,3],[1,3]],
+          [[1,4],[4,3],[5,2]],
+          [[7,1],[7,2],[7,3]]]
+
+  mean(data)
+  [3.22]
+
+  mean(data, axis=[1,2])
+  [ 2.  3.16666667  4.5]
+
+)code" NNVM_ADD_FILELINE)
+.set_attr<FTVMCompute>(
+  "FTVMCompute", [](const NodeAttrs& attrs,
+                    const Array<Tensor>& inputs,
+                    const Array<Tensor>& out_info) {
+    const ReduceParam& param = nnvm::get<ReduceParam>(attrs.parsed);
+    TShape r_axes = GetReduceAxes(inputs[0]->shape.size(),
+                                  param.axis, param.exclude);
+    if (!r_axes.ndim()) return Array<Tensor> { topi::identity(inputs[0]) };
+    auto axis = ShapeToIntArray(r_axes);
+
+    Expr count = make_const(inputs[0]->dtype, 1);
+    for (auto& i : r_axes) {
+      count *= cast(inputs[0]->dtype, inputs[0]->shape[i]);
+    }
+
+    return Array<Tensor>{
+      topi::divide(topi::sum(inputs[0], axis, param.keepdims, true), count) };
+});
+
+NNVM_REGISTER_REDUCE_OP(prod)
+  .describe(R"code(Computes the products of array elements over given axes.
+
+Example::
+
+  data = [[[1,2],[2,3],[1,3]],
+          [[1,4],[4,3],[5,2]],
+          [[7,1],[7,2],[7,3]]]
+
+  mean(data, axis=1)
+  [35562240]
+
+  mean(data, axis=[1,2])
+  [ 36  480  2058]
+
+)code" NNVM_ADD_FILELINE)
+.set_attr<FTVMCompute>(
+  "FTVMCompute", [](const NodeAttrs& attrs,
+                    const Array<Tensor>& inputs,
+                    const Array<Tensor>& out_info) {
+    const ReduceParam& param = nnvm::get<ReduceParam>(attrs.parsed);
+    TShape r_axes = GetReduceAxes(inputs[0]->shape.size(),
+                                  param.axis, param.exclude);
+    if (!r_axes.ndim()) return Array<Tensor> { topi::identity(inputs[0]) };
+    auto axis = ShapeToIntArray(r_axes);
     return Array<Tensor>{
-      topi::argmin(inputs[0], axis, param.keepdims) };
+      topi::prod(inputs[0], axis, param.keepdims, true) };
 });
 
 
diff --git a/nnvm/src/top/tensor/transform.cc b/nnvm/src/top/tensor/transform.cc
index 78255d20f040..9d259ae77d9b 100644
--- a/nnvm/src/top/tensor/transform.cc
+++ b/nnvm/src/top/tensor/transform.cc
@@ -93,23 +93,24 @@ inline bool ConcatenateInferShape(const NodeAttrs& attrs,
   TShape dshape;
   dim_t size = 0;
   bool has_zero = false;
+  int axis = param.axis >= 0 ? param.axis : in_shape->at(0).ndim() + param.axis;
   for (size_t i = 0; i < in_shape->size(); ++i) {
     TShape tmp = (*in_shape)[i];
     if (tmp.ndim()) {
-      CHECK_LT(static_cast<dim_t>(param.axis), tmp.ndim())
-          << "concat dim " << param.axis << " out of range of input shape " << tmp;
-      has_zero = tmp[param.axis] == 0 || has_zero;
-      size += tmp[param.axis];
-      tmp[param.axis] = 0;
+      CHECK_LT(static_cast<dim_t>(axis), tmp.ndim())
+          << "concat dim " << axis << " out of range of input shape " << tmp;
+      has_zero = tmp[axis] == 0 || has_zero;
+      size += tmp[axis];
+      tmp[axis] = 0;
       shape_assign(&dshape, tmp);
     }
   }
 
   TShape tmp = (*out_shape)[0];
   if (tmp.ndim()) {
-    CHECK_LT(static_cast<dim_t>(param.axis), tmp.ndim())
-        << "concat dim " << param.axis << " out of range of input shape " << tmp;
-    tmp[param.axis] = 0;
+    CHECK_LT(static_cast<dim_t>(axis), tmp.ndim())
+        << "concat dim " << axis << " out of range of input shape " << tmp;
+    tmp[axis] = 0;
     shape_assign(&dshape, tmp);
   }
 
@@ -119,7 +120,7 @@ inline bool ConcatenateInferShape(const NodeAttrs& attrs,
     NNVM_ASSIGN_INPUT_SHAPE(attrs, *in_shape, i, dshape);
   }
 
-  if (!has_zero) dshape[param.axis] = size;
+  if (!has_zero) dshape[axis] = size;
   NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, 0, dshape);
   return dshape.Size() != 0;
 }
@@ -128,15 +129,31 @@ inline bool ConcatenateCorrectLayout(const NodeAttrs& attrs,
                                      std::vector<Layout> *ilayouts,
                                      const std::vector<Layout> *last_ilayouts,
                                      std::vector<Layout> *olayouts) {
+  const ConcatenateParam& param = nnvm::get<ConcatenateParam>(attrs.parsed);
   CHECK_EQ(ilayouts->size(), last_ilayouts->size());
   CHECK_EQ(olayouts->size(), 1U);
 
-  for (size_t i = 0; i < ilayouts->size(); ++i) {
-    const Layout& input = last_ilayouts->at(i).defined() ?
-                          last_ilayouts->at(i) : ilayouts->at(i);
-    NNVM_ASSIGN_LAYOUT(*ilayouts, i, input);
+  Layout layout;
+  if (!ilayouts->at(0).defined()) {
+    layout = last_ilayouts->at(0);
+  } else if (param.axis >= static_cast<int>(ilayouts->at(0).ndim())) {
+    CHECK(last_ilayouts->at(0).defined())
+      << "Current input layout " << ilayouts->at(0)
+      << " is invalid but last input layout is not "
+         "defined for the first input.";
+    layout = last_ilayouts->at(0);
+  } else if (last_ilayouts->at(0).defined()
+             && ilayouts->at(0)[param.axis]
+                != last_ilayouts->at(0)[param.axis]) {
+    layout = last_ilayouts->at(0);
+  } else {
+    layout = ilayouts->at(0);
   }
 
+  for (size_t i = 0; i < ilayouts->size(); ++i) {
+    NNVM_ASSIGN_LAYOUT(*ilayouts, i, layout);
+  }
+  NNVM_ASSIGN_LAYOUT(*olayouts, 0, layout);
   return true;
 }
 
@@ -154,7 +171,7 @@ Example::
    y = [[3,3],[4,4],[5,5]]
    z = [[6,6], [7,7],[8,8]]
 
-   concatenate(x,y,z,dim=0) = [[ 1.,  1.],
+   concatenate(x,y,z,axis=0) = [[ 1.,  1.],
                                [ 2.,  2.],
                                [ 3.,  3.],
                                [ 4.,  4.],
@@ -166,7 +183,7 @@ Example::
    Note that you cannot concat x,y,z along dimension 1 since dimension
    0 is not the same for all the input arrays.
 
-   concatenate(y,z,dim=1) = [[ 3.,  3.,  6.,  6.],
+   concatenate(y,z,axis=1) = [[ 3.,  3.,  6.,  6.],
                              [ 4.,  4.,  7.,  7.],
                              [ 5.,  5.,  8.,  8.]]
 
@@ -327,14 +344,23 @@ inline bool SplitInferShape(const NodeAttrs& attrs,
   const TShape& dshape = (*in_shape)[0];
   if (dshape.ndim() == 0) return false;
 
+  auto axis = param.axis;
+  if (axis < 0) {
+    axis += dshape.ndim();
+  }
+  CHECK_LT(axis, dshape.ndim())
+    << "axis should be within input dimension range but got " <<  axis;
+  CHECK_GT(axis, -1)
+    << "axis should be within input dimension range but got " <<  axis;
+
   if (param.equal_split) {
     int num_outputs = param.indices_or_sections[0];
     CHECK_EQ(out_shape->size(), static_cast<size_t>(num_outputs));
-    CHECK_LT(param.axis, dshape.ndim());
     TShape oshape = dshape;
-    CHECK_EQ(oshape[param.axis] % num_outputs, 0)
-        << "indices_or_sections need to be able to divide input.shape[axis]";
-    oshape[param.axis] /= num_outputs;
+    CHECK_EQ(oshape[axis] % num_outputs, 0)
+        << "indices_or_sections need to be able to divide input.shape[axis] got sections "
+        << num_outputs << " and dimension " << oshape[axis];
+    oshape[axis] /= num_outputs;
 
     for (size_t i = 0; i < out_shape->size(); ++i) {
       NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, i, oshape);
@@ -342,19 +368,19 @@ inline bool SplitInferShape(const NodeAttrs& attrs,
   } else {
     dim_t num_outputs = param.indices_or_sections.ndim() + 1;
     CHECK_EQ(out_shape->size(), static_cast<size_t>(num_outputs));
-    CHECK_LT(param.axis, dshape.ndim());
     TShape oshape = dshape;
     dim_t begin = 0;
     for (dim_t i = 0; i < num_outputs - 1; ++i) {
       CHECK_GT(param.indices_or_sections[i], begin)
-          << "indices_or_sections need to be a sorted ascending list";
-      oshape[param.axis] = param.indices_or_sections[i] - begin;
+          << "indices_or_sections need to be a sorted ascending list got "
+          << param.indices_or_sections;
+      oshape[axis] = param.indices_or_sections[i] - begin;
       begin = param.indices_or_sections[i];
       NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, i, oshape);
     }
-    CHECK_LT(begin, dshape[param.axis])
+    CHECK_LT(begin, dshape[axis])
         << "The sum of sections must match the input.shape[axis]";
-    oshape[param.axis] = dshape[param.axis] - begin;
+    oshape[axis] = dshape[axis] - begin;
     NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, num_outputs - 1, oshape);
   }
   return true;
@@ -394,14 +420,14 @@ along which to split the array.
       return Array<Tensor>{
         topi::split_sections(inputs[0], param.indices_or_sections[0], param.axis) };
     } else {
-      Array<Expr> indices;
+      Array<Integer> indices;
       for (auto i : param.indices_or_sections) {
-        indices.push_back(tvm::make_const(tvm::Int(32), i));
+        indices.push_back(static_cast<int>(i));
       }
       return Array<Tensor>{ topi::split(inputs[0], indices, param.axis) };
     }
 })
-.set_support_level(1);
+.set_support_level(3);
 
 // cast
 DMLC_REGISTER_PARAMETER(CastParam);
@@ -605,6 +631,15 @@ The significance of each is explained below:
 })
 .set_support_level(3);
 
+inline bool ReshapeLikeInferType(const NodeAttrs &attrs,
+                                 std::vector<int> *in_attrs,
+                                 std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 2U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  NNVM_ASSIGN_OUTPUT_TYPE(attrs, *out_attrs, 0, (*in_attrs)[0]);
+  return true;
+}
+
 NNVM_REGISTER_OP(reshape_like)
   .describe(R"code(Reshapes the input array by the size of another array.
 For an input array with shape ``(d1, d2, ..., dk)``, `reshape_like` operation reshapes
@@ -625,7 +660,7 @@ the input array into an output array with the same shape as the second input arr
     NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, 0, in_attrs->at(1));
     return true;
 })
-.set_attr<FInferType>("FInferType", ElemwiseType<2, 1>)
+.set_attr<FInferType>("FInferType", ReshapeLikeInferType)
 // never transform layout of the second input array.
 .set_attr<FCorrectLayout>("FCorrectLayout", ElemwiseFixedLayoutUnknownOut<1, 1>)
 .set_attr<FGradient>(
@@ -721,8 +756,8 @@ Examples::
                     const Array<Tensor>& inputs,
                     const Array<Tensor>& out_info) {
     const SqueezeParam& param = nnvm::get<SqueezeParam>(attrs.parsed);
-    auto axis = ShapeToArray(param.axis);
-    return Array<Tensor>{ topi::squeeze(inputs[0], axis) };
+    auto axis = ShapeToIntArray(param.axis);
+    return Array<Tensor>{ topi::squeeze(inputs[0], axis, true) };
 })
 .set_attr<FGradient>(
   "FGradient", [](const NodePtr& n,
@@ -839,7 +874,7 @@ Examples::
                     const Array<Tensor>& inputs,
                     const Array<Tensor>& out_info) {
     const TransposeParam& param = nnvm::get<TransposeParam>(attrs.parsed);
-    auto axes = ShapeToArray(param.axes);
+    auto axes = ShapeToIntArray(param.axes);
     return Array<Tensor>{ topi::transpose(inputs[0], axes) };
 })
 .set_attr<FGradient>(
@@ -945,23 +980,25 @@ Examples::
                     const Array<Tensor>& inputs,
                     const Array<Tensor>& out_info) {
     const StridedSliceParam& param = nnvm::get<StridedSliceParam>(attrs.parsed);
-    Array<Expr> begin;
-    Array<Expr> end;
-    Array<Expr> stride;
+    Array<Integer> begin;
+    Array<Integer> end;
+    Array<Integer> stride;
 
     for (int64_t i : param.begin) {
-        begin.push_back(tvm::make_const(tvm::Int(32), i));
+      begin.push_back(static_cast<int>(i));
     }
 
     for (int64_t i : param.end) {
-        end.push_back(tvm::make_const(tvm::Int(32), i));
+      end.push_back(static_cast<int>(i));
     }
 
     for (int64_t i : param.stride) {
-        stride.push_back(tvm::make_const(tvm::Int(32), i));
+      stride.push_back(static_cast<int>(i));
     }
 
-    return Array<Tensor>{ topi::strided_slice(inputs[0], begin, end, stride) };
+    return Array<Tensor>{
+      topi::strided_slice(inputs[0], begin, end, stride)
+    };
 })
 .set_support_level(1);
 
@@ -977,7 +1014,7 @@ Examples::
        [ 3, 4]]
 
   flip(x) = [[ 3.,  4.],
-                  [ 1.,  2.]]
+             [ 1.,  2.]]
 
   x = [[[ 1.,  2.],
         [ 3.,  4.]],
@@ -986,16 +1023,16 @@ Examples::
         [ 7.,  8.]]]
 
   flip(x) = [[[ 5.,  6.],
-                   [ 7.,  8.]],
+              [ 7.,  8.]],
 
-                  [[ 1.,  2.],
-                   [ 3.,  4.]]]
+             [[ 1.,  2.],
+              [ 3.,  4.]]]
 
   flip(x, axis=1) = [[[ 3.,  4.],
-                                 [ 1.,  2.]],
+                      [ 1.,  2.]],
 
-                                [[ 7.,  8.],
-                                 [ 5.,  6.]]]
+                     [[ 7.,  8.],
+                      [ 5.,  6.]]]
 )code" NNVM_ADD_FILELINE)
 .add_argument("data", "Tensor", "Source input")
 .add_arguments(FlipParam::__FIELDS__())
@@ -1118,7 +1155,7 @@ Examples::
 .set_attr<FCorrectLayout>("FCorrectLayout", TakeCorrectLayout)
 .set_num_inputs(2)
 .set_num_outputs(1)
-.set_support_level(1)
+.set_support_level(3)
 .set_attr<FTVMCompute>(
     "FTVMCompute", [](const NodeAttrs& attrs,
                       const Array<Tensor>& inputs,
@@ -1175,6 +1212,15 @@ inline bool SliceLikeShape(const nnvm::NodeAttrs& attrs,
   return true;
 }
 
+// Adapter function to make int array.
+Array<Integer> GetIntArray(Array<Expr> arr) {
+  for (size_t i = 0; i < arr.size(); ++i) {
+    CHECK(!arr[i].defined() || arr[i].as<IntImm>())
+        << "Expect an int array";
+  }
+  return Array<Integer>(arr.node_);
+}
+
 NNVM_REGISTER_OP(slice_like)
 .describe(R"code(Slice the first input respect to the second input.
 )code" NNVM_ADD_FILELINE)
@@ -1226,7 +1272,10 @@ NNVM_REGISTER_OP(slice_like)
       }
     }
     return Array<Tensor>{
-      topi::strided_slice(inputs[0], begin_idx, end_idx, strides)
+      topi::strided_slice(inputs[0],
+                          GetIntArray(begin_idx),
+                          GetIntArray(end_idx),
+                          GetIntArray(strides))
     };
 })
 .set_attr<FListInputNames>("FListInputNames", [](const NodeAttrs& attrs) {
@@ -1327,5 +1376,107 @@ Examples::
 })
 .set_support_level(4);
 
+// gather_nd
+inline bool GatherNDInferShape(const nnvm::NodeAttrs& attrs,
+                               std::vector<TShape>* in_attrs,
+                               std::vector<TShape>* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 2U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  const TShape& data_shape = in_attrs->at(0);
+  const TShape& indices_shape = in_attrs->at(1);
+  CHECK_GT(indices_shape.ndim(), 1) << "indices must have at least 2 dimensions";
+  CHECK_LE(indices_shape[0], data_shape.ndim()) <<
+      "dim 0 of indices must be no more than rank of data";
+  std::vector<dim_t> oshape;
+  for (size_t i = 1; i < indices_shape.ndim(); ++i) {
+    oshape.push_back(indices_shape[i]);
+  }
+  for (size_t i = indices_shape[0]; i < data_shape.ndim(); ++i) {
+    oshape.push_back(data_shape[i]);
+  }
+  if (oshape.size() == 0) {
+    oshape.push_back(1);
+  }
+  NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, 0,
+                           TShape(oshape.begin(), oshape.end()));
+  return true;
+}
+
+inline bool GatherNDInferType(const NodeAttrs &attrs,
+                              std::vector<int> *in_attrs,
+                              std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 2U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  NNVM_ASSIGN_OUTPUT_TYPE(attrs, *out_attrs, 0, (*in_attrs)[0]);
+  return true;
+}
+
+inline bool GatherNDCorrectLayout(const NodeAttrs& attrs,
+                                  std::vector<Layout> *ilayouts,
+                                  const std::vector<Layout> *last_ilayouts,
+                                  std::vector<Layout> *olayouts) {
+  CHECK_EQ(ilayouts->size(), last_ilayouts->size());
+  CHECK_EQ(olayouts->size(), 1U);
+
+  for (size_t i = 0; i < ilayouts->size(); ++i) {
+    const Layout& input = last_ilayouts->at(i).defined() ?
+                          last_ilayouts->at(i) : ilayouts->at(i);
+    NNVM_ASSIGN_LAYOUT(*ilayouts, i, input);
+  }
+
+  return true;
+}
+
+NNVM_REGISTER_OP(gather_nd)
+.describe(R"code(
+Gather elements or slices from ``data`` into a tensor specified by ``indices``.
+
+The shape of output tensor is inferred from ``indices``. Given ``data`` with
+shape ``(X0, X1, ..., X_{N-1})`` and ``indices`` with shape ``(Y_0, ...,
+Y_{M-1})``, the output will have shape ``(Y_1, ..., Y_{M-1}, X_{Y_0}, ...,
+X_{N-1})`` when ``Y_0 < N``, or ``(Y_1, ..., Y_{M-1})`` when ``Y_0 == N``. The
+operator is invalid when ``Y_0 > N``.
+
+The element in output is defined as follows::
+
+  output[y_1, ..., y_{M-1}, x_{Y_0}, ..., x_{N-1}] = data[indices[0, y_1, ..., y_{M-1}],
+                                                     ...,
+                                                     indices[Y_0-1, y_1, ..., y_{M-1}],
+                                                     x_{Y_0}, ..., x_{N-1}]
+
+Examples::
+
+  data = [[0, 1], [2, 3]]
+  indices = [[1], [0]]
+  gather_nd(data, indices) = [2]
+
+  data = [[0, 1], [2, 3]]
+  indices = [[1, 1, 0], [0, 1, 0]]
+  gather_nd(data, indices) = [2, 3, 0]
+
+  data = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]]
+  indices = [[0, 1], [1, 0]]
+  gather_nd(data, indices) = [[3, 4], [5, 6]]
+
+)code" NNVM_ADD_FILELINE)
+.add_argument("data", "Tensor", "Input data.")
+.add_argument("indices", "Tensor", "Indices of data")
+.set_num_inputs(2)
+.set_num_outputs(1)
+.set_attr<FInferShape>("FInferShape", GatherNDInferShape)
+.set_attr<FInferType>("FInferType", GatherNDInferType)
+.set_attr<FCorrectLayout>("FCorrectLayout", GatherNDCorrectLayout)
+.set_attr<FTVMCompute>(
+    "FTVMCompute", [](const NodeAttrs& attrs,
+                      const Array<Tensor>& inputs,
+                      const Array<Tensor>& out_info) {
+      return Array<Tensor>{
+        topi::gather_nd(inputs[0], inputs[1]) };
+  })
+.set_attr<FListInputNames>("FListInputNames", [](const NodeAttrs& attrs) {
+  return std::vector<std::string>{"data", "indices"};
+})
+.set_support_level(3);
+
 }  // namespace top
 }  // namespace nnvm
diff --git a/nnvm/tests/python/compiler/test_alter_op_layout.py b/nnvm/tests/python/compiler/test_alter_op_layout.py
index 0fbf5ad3b479..cc3df61a28c7 100644
--- a/nnvm/tests/python/compiler/test_alter_op_layout.py
+++ b/nnvm/tests/python/compiler/test_alter_op_layout.py
@@ -45,9 +45,61 @@ def alter_conv2d_layout(attrs, inputs, tinfos):
 
     # check copy layouts
     for node in ["data", "relu", "flatten", "softmax", "conv_weight"]:
-        assert(layouts[node] == layouts_origin[node])
-    assert(layouts["conv_alter"] == layouts_origin["conv"])
+        assert layouts[node] == layouts_origin[node]
+    assert layouts["conv_alter"] == layouts_origin["conv"]
+
+
+def test_consecutive_alter_layout():
+    data = sym.Variable("data", shape=(1, 32, 512, 512))
+    pool1 = sym.global_avg_pool2d(data, name="global_avg_pool2d_1", layout="NCHW")
+    pool2 = sym.global_avg_pool2d(pool1, name="global_avg_pool2d_2", layout="NCHW")
+    relu = sym.relu(pool2, name="relu")
+
+    g = graph.create(relu)
+    g = g.apply("CorrectLayout")
+    g = graph_attr.set_dtype_inputs(g, "float32")
+    g = g.apply(["InferShape", "InferType"])
+    assert g.json_attr("layout") == ['NCHW', 'NCHW', 'NCHW', 'NCHW']
+
+    @reg.register_alter_op_layout("global_avg_pool2d", level=100)
+    def alter_global_avg_pool2d_layout(attrs, inputs, tinfos):
+        new_attrs = {k : attrs[k] for k in attrs.keys()}
+        new_attrs["layout"] = "NCHW16c"
+        return sym.global_avg_pool2d(inputs[0], **new_attrs)
+
+    g = g.apply("AlterOpLayout")
+
+    # pool1 get replaced - output layout of pool1 is not recorded
+    # pool2 get replaced - input layout of pool2 is not recorded
+    # thus the second entry must be undefined - it can neither recover from pool1's output,
+    # nor from pool2's input.
+    assert g.json_attr("layout") == ['NCHW', '__undef__', 'NCHW', 'NCHW']
+
+
+def test_alter_func_return_none():
+    data = sym.Variable("data", shape=(1, 32, 512, 512))
+    pool1 = sym.global_max_pool2d(data, name="pool1", layout="NCHW")
+    pool2 = sym.global_max_pool2d(pool1, name="pool2", layout="NCHW")
+    relu = sym.relu(pool2, name="relu")
+
+    g = graph.create(relu)
+    g = g.apply("CorrectLayout")
+    g = graph_attr.set_dtype_inputs(g, "float32")
+    g = g.apply(["InferShape", "InferType"])
+    assert g.json_attr("layout") == ['NCHW', 'NCHW', 'NCHW', 'NCHW']
+
+    @reg.register_alter_op_layout("global_max_pool2d", level=100)
+    def alter_global_max_pool2d_layout(attrs, inputs, tinfos):
+        return None
+
+    g = g.apply("AlterOpLayout")
+
+    # alter func return none, nothing get replaced,
+    # the layouts should remain the same
+    assert g.json_attr("layout") == ['NCHW', 'NCHW', 'NCHW', 'NCHW']
 
 
 if __name__ == "__main__":
     test_alter_conv2d_layout()
+    test_consecutive_alter_layout()
+    test_alter_func_return_none()
diff --git a/nnvm/tests/python/compiler/test_autotvm_task_extraction.py b/nnvm/tests/python/compiler/test_autotvm_task_extraction.py
new file mode 100644
index 000000000000..fd14934f8ade
--- /dev/null
+++ b/nnvm/tests/python/compiler/test_autotvm_task_extraction.py
@@ -0,0 +1,63 @@
+"""Test task extraction for autotvm"""
+
+import nnvm.testing
+import nnvm.compiler
+from tvm import autotvm
+
+def get_network(name, batch_size):
+    """Get the symbol definition and random weight of a network"""
+    input_shape = (batch_size, 3, 224, 224)
+    output_shape = (batch_size, 1000)
+
+    if name == 'resnet-18':
+        net, params = nnvm.testing.resnet.get_workload(num_layers=18, batch_size=batch_size)
+    elif name == 'mobilenet':
+        net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size)
+    elif name == 'squeezenet v1.1':
+        net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1')
+    elif name == 'vgg-16':
+        net, params = nnvm.testing.vgg.get_workload(num_layers=16, batch_size=batch_size)
+    elif name == 'dcgan':
+        net, params = nnvm.testing.dcgan.get_workload(batch_size=batch_size)
+        input_shape = (batch_size, 100)
+    else:
+        raise ValueError("Unsupported network: " + name)
+
+    return net, params, input_shape, output_shape
+
+def test_task_extraction():
+    target = 'llvm'
+    dtype = 'float32'
+
+    net, params, input_shape, out_shape = get_network('resnet-18', batch_size=1)
+    tasks = autotvm.task.extract_from_graph(net, target=target,
+                                            shape={'data': input_shape}, dtype=dtype,
+                                            symbols=(nnvm.sym.conv2d,))
+    assert len(tasks) == 12
+
+    net, params, input_shape, out_shape = get_network('resnet-18', batch_size=1)
+    tasks = autotvm.task.extract_from_graph(net, target=target,
+                                            shape={'data': input_shape}, dtype=dtype,
+                                            symbols=(nnvm.sym.dense,))
+    assert len(tasks) == 1
+
+    net, params, input_shape, out_shape = get_network('resnet-18', batch_size=1)
+    tasks = autotvm.task.extract_from_graph(net, target=target,
+                                            shape={'data': input_shape}, dtype=dtype,
+                                            symbols=(nnvm.sym.conv2d, nnvm.sym.dense))
+    assert len(tasks) == 13
+
+    net, params, input_shape, out_shape = get_network('mobilenet', batch_size=1)
+    tasks = autotvm.task.extract_from_graph(net, target=target,
+                                            shape={'data': input_shape}, dtype=dtype,
+                                            symbols=(nnvm.sym.conv2d, nnvm.sym.dense))
+    assert len(tasks) == 20
+
+    net, params, input_shape, out_shape = get_network('dcgan', batch_size=1)
+    tasks = autotvm.task.extract_from_graph(net, target=target,
+                                            shape={'data': input_shape}, dtype=dtype,
+                                            symbols=(nnvm.sym.conv2d_transpose,))
+    assert len(tasks) == 4
+
+if __name__ == '__main__':
+    test_task_extraction()
diff --git a/nnvm/tests/python/compiler/test_build.py b/nnvm/tests/python/compiler/test_build.py
index 5e1f0337c293..387225f550ab 100644
--- a/nnvm/tests/python/compiler/test_build.py
+++ b/nnvm/tests/python/compiler/test_build.py
@@ -27,7 +27,7 @@ def verify(graph, lib):
         # get outputs
         out = tvm.nd.empty(shape, dtype)
         get_output(0, out)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             out.asnumpy(), np.exp(na.asnumpy() + nb.asnumpy()))
 
     graph, lib, _ = nnvm.compiler.build(z, "llvm", shape_dict)
@@ -49,7 +49,7 @@ def test_run():
     nx = tvm.nd.array(np.random.uniform(size=shape).astype(dtype))
     ny = tvm.nd.array(np.random.uniform(size=shape).astype(dtype))
     res = _run_graph(z, {"x": nx, "y": ny})
-    np.testing.assert_allclose(
+    tvm.testing.assert_allclose(
         res[0].asnumpy(), np.exp(nx.asnumpy() + ny.asnumpy()))
 
 
@@ -73,7 +73,7 @@ def test_precompute_prune():
     m["load_params"](nnvm.compiler.save_param_dict(params))
     m.run()
     out = m.get_output(0, out=res)
-    np.testing.assert_allclose(
+    tvm.testing.assert_allclose(
         res.asnumpy(), nx.asnumpy() + 1 + ny.asnumpy() + na.asnumpy())
 
 
@@ -92,11 +92,69 @@ def test_dtypes():
         m.run(x=data)
         data = (data > 0) * data
         out = m.get_output(0, tvm.nd.empty(oshape, dtype))
-        np.testing.assert_allclose(out.asnumpy(), data, atol=1e-5, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), data, atol=1e-5, rtol=1e-5)
 
+def test_ndarray_output():
+    x = sym.Variable("x")
+    y = sym.Variable("y")
+    z = x + y
+    shape = (10, 10)
+    dtype = tvm.float32
+    nx = tvm.nd.array(np.random.uniform(size=shape).astype(dtype))
+    ny = tvm.nd.array(np.random.uniform(size=shape).astype(dtype))
+    params = {"x": nx, "ny": ny}
+    graph, lib, params = nnvm.compiler.build(
+        z, "llvm", shape={"y": ny.shape, "x": nx.shape}, params=params)
+    m = graph_runtime.create(graph, lib, tvm.cpu(0))
+    m.set_input("x", nx)
+    m.set_input("y", ny)
+    m.run()
+    out = m.get_output(0)
+    tvm.testing.assert_allclose(
+        out.asnumpy(), nx.asnumpy() + ny.asnumpy())
+
+def test_ndarray_input():
+    x = sym.Variable("x")
+    y = sym.Variable("y")
+    z = x + y
+    shape = (10, 10)
+    dtype = tvm.float32
+    nx = tvm.nd.array(np.random.uniform(size=shape).astype(dtype))
+    ny = tvm.nd.array(np.random.uniform(size=shape).astype(dtype))
+    params = {"x": nx, "ny": ny}
+    graph, lib, params = nnvm.compiler.build(
+        z, "llvm", shape={"y": ny.shape, "x": nx.shape}, params=params)
+    m = graph_runtime.create(graph, lib, tvm.cpu(0))
+    m.set_input("x", nx)
+    m.set_input("y", ny)
+    in_x = tvm.nd.empty(shape, dtype)
+    in_y = tvm.nd.empty(shape, dtype)
+    m.get_input("x", in_x)
+    m.get_input("y", in_y)
+    tvm.testing.assert_allclose(nx.asnumpy(), in_x.asnumpy())
+    tvm.testing.assert_allclose(ny.asnumpy(), in_y.asnumpy())
+    in_nx = m.get_input("x")
+    in_ny = m.get_input("y")
+    tvm.testing.assert_allclose(nx.asnumpy(), in_nx.asnumpy())
+    tvm.testing.assert_allclose(ny.asnumpy(), in_ny.asnumpy())
+
+def test_num_outputs():
+    x = sym.Variable('x')
+    z = sym.split(x, indices_or_sections=5, axis=1)
+    shape = (10, 10)
+    dtype = tvm.float32
+    nx = tvm.nd.array(np.random.uniform(size=shape).astype(dtype))
+    params = {"x": nx}
+    graph, lib, params = nnvm.compiler.build(
+        z, "llvm", shape={"x": nx.shape}, params=params)
+    m = graph_runtime.create(graph, lib, tvm.cpu(0))
+    assert m.get_num_outputs() == 5
 
 if __name__ == "__main__":
     test_precompute_prune()
     test_compile()
     test_run()
     test_dtypes()
+    test_ndarray_output()
+    test_ndarray_input()
+    test_num_outputs()
diff --git a/nnvm/tests/python/compiler/test_compiler_cache.py b/nnvm/tests/python/compiler/test_compiler_cache.py
index 970b193a6875..623f05048348 100644
--- a/nnvm/tests/python/compiler/test_compiler_cache.py
+++ b/nnvm/tests/python/compiler/test_compiler_cache.py
@@ -19,7 +19,7 @@ def verify(graph, lib):
         m.run(x=na, y=nb)
         # get outputs
         out = m.get_output(0, tvm.nd.empty(shape, dtype))
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             out.asnumpy(), np.exp(na.asnumpy() + nb.asnumpy()))
 
     engine = nnvm.compiler.engine
diff --git a/nnvm/tests/python/compiler/test_fold_axis.py b/nnvm/tests/python/compiler/test_fold_axis.py
index bbd50193b4b0..a7611fbde797 100644
--- a/nnvm/tests/python/compiler/test_fold_axis.py
+++ b/nnvm/tests/python/compiler/test_fold_axis.py
@@ -1,4 +1,5 @@
 """Unittest cases for fold_axis"""
+import tvm
 import nnvm
 import nnvm.testing.resnet
 import numpy as np
@@ -6,6 +7,7 @@
 from nnvm.compiler import graph_util, graph_attr
 
 def test_fold_axis_conv():
+    # Before simplify
     def before(x, conv_weight, conv_bias, in_scale, out_scale, channels):
         x = x * sym.expand_dims(in_scale, axis=1, num_newaxis=2)
         y = sym.conv2d(x, conv_weight, conv_bias,
@@ -31,7 +33,6 @@ def expected(x, conv_weight, conv_bias, in_scale, out_scale, channels):
         y = sym.relu(y)
         return y
 
-    # Before simplify
     def check(shape, channels):
         x = sym.Variable("x") + 1
         weight = sym.Variable("weight")
@@ -50,8 +51,55 @@ def check(shape, channels):
 
     check((2, 4, 10, 10), 2)
 
+def test_fold_axis_depthwise_conv():
+    # Before simplify
+    def before(x, conv_weight, conv_bias, in_scale, out_scale, channels):
+        x = x * sym.expand_dims(in_scale, axis=1, num_newaxis=2)
+        y = sym.conv2d(x, conv_weight, conv_bias,
+                       channels=channels,
+                       kernel_size=(3, 3),
+                       padding=(1, 1),
+                       groups=54,
+                       name="depthiwise_conv")
+        y = sym.relu(y)
+        y = y * sym.expand_dims(out_scale, axis=1, num_newaxis=2)
+        return y
+
+    def expected(x, conv_weight, conv_bias, in_scale, out_scale, channels):
+        conv_weight = conv_weight * sym.expand_dims(out_scale, axis=1, num_newaxis=3)
+        conv_weight = conv_weight * sym.expand_dims(in_scale, axis=1, num_newaxis=3)
+        conv_bias = conv_bias * out_scale
+        y = sym.conv2d(x,
+                       conv_weight,
+                       conv_bias,
+                       channels=channels,
+                       kernel_size=(3, 3),
+                       padding=(1, 1),
+                       groups=54,
+                       name="depthiwise_conv")
+        y = sym.relu(y)
+        return y
+
+    def check(shape, channels):
+        x = sym.Variable("x") + 1
+        weight = sym.Variable("weight")
+        bias = sym.Variable("bias")
+        in_scale = sym.Variable("in_scale")
+        out_scale = sym.Variable("out_scale")
+        y1 = before(x, weight, bias, in_scale, out_scale, channels)
+        y2 = expected(x, weight, bias, in_scale, out_scale, channels)
+        ishape = {"x": shape, "out_scale": (channels,), "in_scale": (shape[1],)}
+        g1 = nnvm.graph.create(y1)
+        g2 = nnvm.graph.create(y2)
+        graph_attr.set_shape_inputs(g1, ishape)
+        g1 = g1.apply("InferShape").apply("FoldScaleAxis")
+        # assert graph equals as expected
+        graph_util.check_graph_equal(g1, g2)
+
+    check((1, 54, 63, 127), 54)
 
 def test_fold_fail():
+    # Before simplify
     def before(x, scale, channels):
         y = sym.conv2d(x,
                        channels=channels,
@@ -61,7 +109,6 @@ def before(x, scale, channels):
         y = y * sym.expand_dims(scale, axis=1, num_newaxis=1)
         return y
 
-    # Before simplify
     def check(shape, channels):
         x = sym.Variable("x")
         bias = sym.Variable("bias")
@@ -101,10 +148,11 @@ def run_prune(graph, params, opt_level):
 
     x = run_prune(graph, params, 0)
     y = run_prune(graph, params, 3)
-    np.testing.assert_allclose(y[0].asnumpy(), x[0].asnumpy())
+    tvm.testing.assert_allclose(y[0].asnumpy(), x[0].asnumpy())
 
 
 if __name__ == "__main__":
     test_fold_resnet()
     test_fold_axis_conv()
     test_fold_fail()
+    test_fold_axis_depthwise_conv()
diff --git a/nnvm/tests/python/compiler/test_nhwc_layout.py b/nnvm/tests/python/compiler/test_nhwc_layout.py
index 96a8135435c3..f1aced94a0b3 100644
--- a/nnvm/tests/python/compiler/test_nhwc_layout.py
+++ b/nnvm/tests/python/compiler/test_nhwc_layout.py
@@ -50,7 +50,7 @@ def test_nhwc():
     oshape_nhwc = (1, 224, 224, out_channel)
     nchw_output = build_and_run(nchw_sym, nchw_params, data, oshape)
     nhwc_output = build_and_run(nhwc_sym, nhwc_params, data.transpose(0, 2, 3, 1), oshape_nhwc)
-    np.testing.assert_allclose(nchw_output, nhwc_output.transpose(0, 3, 1, 2), rtol=1e-5, atol=1e-5)
+    tvm.testing.assert_allclose(nchw_output, nhwc_output.transpose(0, 3, 1, 2), rtol=1e-5, atol=1e-5)
 
 
 if __name__ == "__main__":
diff --git a/nnvm/tests/python/compiler/test_op_fusion.py b/nnvm/tests/python/compiler/test_op_fusion.py
index 8d05ae02c579..4c4773773d47 100644
--- a/nnvm/tests/python/compiler/test_op_fusion.py
+++ b/nnvm/tests/python/compiler/test_op_fusion.py
@@ -22,7 +22,7 @@ def test_ewise_injective():
         x_np = np.random.uniform(size=dshape).astype(dtype)
         m.run(x=x_np)
         out = m.get_output(0, tvm.nd.empty((10, 6)))
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             out.asnumpy(),  x_np.reshape(out.shape) * 2 + 1,
             atol=1e-5, rtol=1e-5)
 
@@ -54,7 +54,7 @@ def test_conv_ewise_injective():
             data.asnumpy(), kernel.asnumpy(), (1,1), 'SAME')
         c_np = c_np + bias.asnumpy().reshape(kshape[0], 1, 1) + 1
         c_np = c_np.reshape(c_np.shape[0], np.prod(c_np.shape[1:])) + 1
-        np.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
 
 
 def test_injective_reduce_injective():
@@ -74,7 +74,111 @@ def test_injective_reduce_injective():
         c_np = np.sum(data.reshape(32, 18 * 18) + 1, axis=1)
         # get output
         out = m.get_output(0, tvm.nd.empty(c_np.shape, dtype))
-        np.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
+
+
+def test_injective_conv2d():
+    channels = 16
+    data = sym.Variable(name="data")
+    pool = sym.global_avg_pool2d(data=data)
+    weight = sym.reshape(pool, shape=[1, channels, 1, 1])
+    residual = sym.conv2d(data=data, kernel_size=(3,3), channels=channels, padding=(1, 1),
+                          layout="NCHW", kernel_layout="OIHW", use_bias=False, name="conv")
+    net = weight * data + residual
+    size = 56
+    dtype="float32"
+    dshape = (1, channels, size, size)
+    kshape = (channels, channels, 3, 3)
+    oshape = dshape
+    shape_dict = {"data": dshape}
+
+    for target, ctx in ctx_list():
+        graph, lib, _ = nnvm.compiler.build(net, target, shape_dict)
+        # data, global_avg_pool, conv weight, conv op, fused elemwise add
+        assert graph.index.num_nodes == 5
+
+        data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype))
+        kernel = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype))
+        m = graph_runtime.create(graph, lib, ctx)
+        m.run(data=data, conv_weight=kernel)
+        # get output
+        out = m.get_output(0, tvm.nd.empty(oshape, dtype))
+        residual = topi.testing.conv2d_nchw_python(
+            data.asnumpy(), kernel.asnumpy(), (1,1), 'SAME')
+        weight = np.mean(data.asnumpy(), axis=(2, 3))
+        c_np = weight[:, :, np.newaxis, np.newaxis] * data.asnumpy() + residual
+        tvm.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
+
+
+def test_concatenate_conv2d():
+    ch = 3
+    size = 8
+    data = sym.Variable(name="data")
+    concat = sym.concatenate(data, data, axis=1)
+    conv = sym.conv2d(data=concat, kernel_size=(1,1), channels=ch*2, use_bias=False, name="conv")
+    net = sym.elemwise_add(concat, conv)
+
+    dtype="float32"
+    dshape = (1, ch, size, size)
+    kshape = (ch*2, ch*2, 1, 1)
+    oshape = (1, ch*2, size, size)
+    shape_dict = {"data": dshape}
+
+    for target, ctx in ctx_list():
+        graph, lib, _ = nnvm.compiler.build(net, target, shape_dict)
+        # data, conv weight, conv op, concat
+        assert graph.index.num_nodes == 4
+
+        data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype))
+        kernel = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype))
+        m = graph_runtime.create(graph, lib, ctx)
+        m.run(data=data, conv_weight=kernel)
+        # get output
+        out = m.get_output(0, tvm.nd.empty(oshape, dtype))
+
+        concat = np.concatenate((data.asnumpy(), data.asnumpy()), axis=1)
+        conv = topi.testing.conv2d_nchw_python(
+            concat, kernel.asnumpy(), (1,1), 'SAME')
+        ref = concat + conv
+        tvm.testing.assert_allclose(out.asnumpy(), ref, rtol=1e-5)
+
+
+def test_residual_block_layout_transform():
+    ch = 16
+    size = 32
+    data = sym.Variable(name="data")
+    conv1 = sym.conv2d(data=data, kernel_size=(3,3), channels=ch, padding = (1, 1), use_bias=False, name="conv1")
+    layout_transform1 = sym.__layout_transform__(data=conv1, src_layout="NCHW", dst_layout="NCHW8c")
+    layout_transform2 = sym.__layout_transform__(data=layout_transform1, src_layout="NCHW8c", dst_layout="NCHW")
+    conv2 = sym.conv2d(data=conv1, kernel_size=(3,3), channels=ch, padding = (1, 1), use_bias=False, name="conv2")
+    elemwise_sum = sym.elemwise_add(layout_transform2, conv2)
+    out = sym.relu(elemwise_sum)
+
+    dtype="float32"
+    dshape = (1, ch, size, size)
+    kshape = (ch, ch, 3, 3)
+    oshape = (1, ch, size, size)
+    shape_dict = {"data": dshape}
+
+    target = "llvm" # only test on llvm since it involves NCHW8c layout
+    ctx = tvm.context(target, 0)
+    graph, lib, _ = nnvm.compiler.build(out, target, shape_dict)
+    # data, conv1 weight, conv1, layout transform + elemwise add + relu, conv2 weight, conv2 op
+    assert graph.index.num_nodes == 6
+
+    data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype))
+    kernel1 = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype))
+    kernel2 = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype))
+    m = graph_runtime.create(graph, lib, ctx)
+    m.run(data=data, conv1_weight=kernel1, conv2_weight=kernel2)
+    out = m.get_output(0, tvm.nd.empty(oshape, dtype))
+
+    conv1 = topi.testing.conv2d_nchw_python(
+        data.asnumpy(), kernel1.asnumpy(), (1,1), 'SAME')
+    conv2 = topi.testing.conv2d_nchw_python(
+        conv1, kernel2.asnumpy(), (1,1), 'SAME')
+    ref = np.maximum(conv1 + conv2, 0)
+    tvm.testing.assert_allclose(out.asnumpy(), ref, rtol=1e-5)
 
 
 def build_and_run(sym, params, data, out_shape, target, ctx, opt_level=2):
@@ -114,7 +218,7 @@ def get_sym(out_channel):
         _, params2 = utils.create_workload(sym2, 1, dshape[1:], seed=0)
         output1, g1 = build_and_run(sym1, params1, data, oshape, target, ctx, opt_level=2)
         output2, g2 = build_and_run(sym2, params2, data, oshape, target, ctx, opt_level=0)
-        np.testing.assert_allclose(output1, output2, rtol=1e-5, atol=1e-5)
+        tvm.testing.assert_allclose(output1, output2, rtol=1e-5, atol=1e-5)
         # data, conv weight, bias, batch norm gamma, batch norm beta, conv op
         assert g1.index.num_nodes == 6
 
@@ -123,3 +227,6 @@ def get_sym(out_channel):
     test_ewise_injective()
     test_conv_ewise_injective()
     test_fuse_conv2d_elu()
+    test_injective_conv2d()
+    test_concatenate_conv2d()
+    test_residual_block_layout_transform()
diff --git a/nnvm/tests/python/compiler/test_optimizer.py b/nnvm/tests/python/compiler/test_optimizer.py
index fd620271d861..413227d88091 100644
--- a/nnvm/tests/python/compiler/test_optimizer.py
+++ b/nnvm/tests/python/compiler/test_optimizer.py
@@ -27,7 +27,7 @@ def helper(symbol, inputs, params, update_func, run_times, target, ctx, dtype="f
         m.run()
     y_np = update_func(**np_inputs)
     out = m.get_output(0, tvm.nd.empty(y_np.shape, dtype))
-    np.testing.assert_allclose(out.asnumpy(), y_np, atol=1e-5, rtol=1e-5)
+    tvm.testing.assert_allclose(out.asnumpy(), y_np, atol=1e-5, rtol=1e-5)
 
 
 def test_sgd():
diff --git a/nnvm/tests/python/compiler/test_param_dict.py b/nnvm/tests/python/compiler/test_param_dict.py
index a6605123fa0d..447db305d98c 100644
--- a/nnvm/tests/python/compiler/test_param_dict.py
+++ b/nnvm/tests/python/compiler/test_param_dict.py
@@ -68,7 +68,7 @@ def verify_nnvm(remote, target, shape, dtype):
         m.load_params(nnvm.compiler.save_param_dict(params))
         m.run()
         out = m.get_output(0, tvm.nd.empty(shape, dtype=dtype, ctx=ctx))
-        np.testing.assert_allclose(a + 1, out.asnumpy())
+        tvm.testing.assert_allclose(a + 1, out.asnumpy())
 
     print("Test RPC connection to PowerPC...")
     remote = rpc.connect(host, port)
diff --git a/nnvm/tests/python/compiler/test_rpc_exec.py b/nnvm/tests/python/compiler/test_rpc_exec.py
index 111ba724e196..8177f1b153ab 100644
--- a/nnvm/tests/python/compiler/test_rpc_exec.py
+++ b/nnvm/tests/python/compiler/test_rpc_exec.py
@@ -43,7 +43,7 @@ def test_rpc_executor():
     # get outputs
     out = tvm.nd.empty(shape, dtype, ctx)
     get_output(0, out)
-    np.testing.assert_allclose(
+    tvm.testing.assert_allclose(
         out.asnumpy(), np.exp(na.asnumpy() + nb.asnumpy()))
     server.terminate()
 
diff --git a/nnvm/tests/python/compiler/test_simplify_inference.py b/nnvm/tests/python/compiler/test_simplify_inference.py
index e2826765995e..fd0e1e3c182e 100644
--- a/nnvm/tests/python/compiler/test_simplify_inference.py
+++ b/nnvm/tests/python/compiler/test_simplify_inference.py
@@ -10,7 +10,6 @@ def simple_bn(x, gamma, beta, moving_mean, moving_var,
         scale = sym.elemwise_mul(1 / sym.sqrt(moving_var + epsilon), gamma)
         shift = sym.elemwise_add(
             sym.elemwise_mul(sym.negative(moving_mean), scale), beta)
-        shape = [-1 if i == axis else 1 for i in range(len(shape))]
         # for 2D
         num_newaxis=len(shape) - axis - 1
         if num_newaxis:
diff --git a/nnvm/tests/python/compiler/test_to_relay.py b/nnvm/tests/python/compiler/test_to_relay.py
new file mode 100644
index 000000000000..25037cfd3587
--- /dev/null
+++ b/nnvm/tests/python/compiler/test_to_relay.py
@@ -0,0 +1,41 @@
+import nnvm
+from nnvm import testing
+from nnvm import to_relay
+import tvm
+from tvm.relay import ir_pass
+from tvm.relay import create_executor
+from tvm.contrib import graph_runtime
+import numpy as np
+
+def check_model(sym, shapes, dtypes, params):
+    net = nnvm.graph.create(sym)
+    graph_json, mod, params = nnvm.compiler.build(
+        net,
+        'llvm',
+        shape=shapes,
+        dtype=dtypes,
+        params=params)
+    nnvm_rts = graph_runtime.create(graph_json, mod, tvm.cpu(0))
+    inputs = {}
+    for name in shapes:
+        np_array = np.random.rand(*shapes[name]).astype('float32')
+        inputs[name] = tvm.nd.array(np_array)
+
+    nnvm_rts.set_input(**params)
+    nnvm_rts.run(**inputs)
+    nnvm_out = nnvm_rts.get_output(0)
+    relay_model, params = to_relay.to_relay(net, shapes, dtypes, params)
+    relay_model = ir_pass.infer_type(relay_model)
+    relay_rts = create_executor(kind='graph', ctx=tvm.cpu(0), target='llvm')
+    inputs.update(params)
+    relay_out = relay_rts.evaluate(relay_model)(*list(inputs.values()))
+    np.testing.assert_allclose(nnvm_out.asnumpy(), relay_out.asnumpy())
+
+# def test_mlp():
+#     mlp, params = testing.mlp.get_workload(1)
+#     shapes =  { "data": (10, 3, 224, 224) }
+#     dtypes =  { "data": 'float32' }
+#     check_model(mlp, shapes, dtypes, params)
+
+if __name__ == "__main__":
+    test_mlp()
diff --git a/nnvm/tests/python/compiler/test_top_assign.py b/nnvm/tests/python/compiler/test_top_assign.py
index e411385712f5..95c16c96c443 100644
--- a/nnvm/tests/python/compiler/test_top_assign.py
+++ b/nnvm/tests/python/compiler/test_top_assign.py
@@ -27,11 +27,11 @@ def check(target, ctx):
         m.set_input("w", data)
         m.run()
         out = m.get_input("w2", tvm.nd.empty(dshape, dtype))
-        np.testing.assert_allclose(out.asnumpy(), data.asnumpy() + 2, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), data.asnumpy() + 2, rtol=1e-5)
 
         m.run()
         out = m.get_input("w2", tvm.nd.empty(dshape, dtype))
-        np.testing.assert_allclose(out.asnumpy(), data.asnumpy() + 3, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), data.asnumpy() + 3, rtol=1e-5)
 
     for target, ctx in ctx_list():
         check(target, ctx)
diff --git a/nnvm/tests/python/compiler/test_top_level1.py b/nnvm/tests/python/compiler/test_top_level1.py
index d9c6655fea1d..d89bf359f2ac 100644
--- a/nnvm/tests/python/compiler/test_top_level1.py
+++ b/nnvm/tests/python/compiler/test_top_level1.py
@@ -5,49 +5,163 @@
 import nnvm.symbol as sym
 import nnvm.compiler
 from nnvm.testing.config import ctx_list
+from nnvm.testing.check_computation import check_function
 
-def helper(symbol, inputs, dtype,
-           np_forward, np_backward=None,
-           need_input=True, need_head_grads=True,
-           rnd_min=-1, rnd_max=1):
-    ishapes = {}
-    itypes = {}
-    input_syms = []
-    np_inputs = {}
-    for (name, shape, s) in inputs:
-        ishapes.update({name: shape})
-        itypes.update({name: dtype})
-        np_inputs.update({name: np.random.uniform(rnd_min, rnd_max, size=shape).astype(dtype)})
-        input_syms.append(s)
-
-    for target, ctx in ctx_list():
-        graph, lib, _ = nnvm.compiler.build(symbol, target, ishapes, itypes)
-        m = graph_runtime.create(graph, lib, ctx)
-        m.run(**np_inputs)
-        y_np = np_forward(**np_inputs)
-        out = m.get_output(0, tvm.nd.empty(y_np.shape, dtype))
-        np.testing.assert_allclose(out.asnumpy(), y_np, atol=1e-5, rtol=1e-5)
-        # backward
-        if np_backward:
-            graph._set_symbol_list_attr("grad_ys", symbol)
-            graph._set_symbol_list_attr("grad_xs", input_syms)
-            graph._set_symbol_list_attr("grad_ys_out_grad", sym.Variable("head_grads", shape=y_np.shape))
-            graph = graph.apply("Gradient")
-            ishapes.update({"head_grads": y_np.shape})
-            graph, lib, _ = nnvm.compiler.build(graph, target, ishapes)
-            m = graph_runtime.create(graph, lib, ctx)
-            head_grads = np.random.uniform(size=y_np.shape).astype(dtype)
-            y_np = np_backward(head_grads=head_grads, **np_inputs)
-            b_inputs = {}
-            if need_input:
-                b_inputs.update(np_inputs)
-            if need_head_grads:
-                b_inputs.update({"head_grads":head_grads})
-            m.run(**b_inputs)
-            for i in range(len(y_np)):
-                out = m.get_output(i, tvm.nd.empty(y_np[i].shape, dtype))
-                np.testing.assert_allclose(out.asnumpy(), y_np[i], atol=1e-5, rtol=1e-5)
+def test_check_function():
+    # test the testing function
 
+    x = sym.Variable("x")
+    y = sym.Variable("y")
+
+    # different styles of returning gradients from the backward function
+    check_function(x + 2*y, lambda x, y: x + 2*y,
+                   lambda x, y, head_grads: [head_grads, 2*head_grads],
+                   shape={'x': (1, 2), y: (1, 2)}, dtype='float32')
+    check_function(x + 2*y, lambda x, y: x + 2*y,
+                   lambda x, y, head_grads: (head_grads, 2*head_grads),
+                   shape={'x': (1, 2), y: (1, 2)}, dtype='float32')
+    check_function(x + 2*y, lambda x, y: x + 2*y,
+                   lambda x, y, head_grads: {'x': head_grads, 'y': 2*head_grads},
+                   shape={'x': (1, 2), y: (1, 2)}, dtype='float32')
+    check_function(x + 2*y, lambda x, y: x + 2*y,
+                   lambda x, y, head_grads: {'y': 2*head_grads},
+                   shape={'x': (1, 2), y: (1, 2)}, dtype='float32')
+    check_function(x + 2*y, lambda x, y: x + 2*y,
+                   lambda x, y, head_grads: [2*head_grads],
+                   grad_input_vars=[y],
+                   shape={'x': (1, 2), y: (1, 2)}, dtype='float32')
+    check_function(x + 2*y, lambda x, y: x + 2*y,
+                   lambda x, y, head_grads: 2*head_grads,
+                   grad_input_vars=[y],
+                   shape={'x': (1, 2), y: (1, 2)}, dtype='float32')
+    check_function(x + 2*y, lambda x, y: x + 2*y,
+                   lambda x, y, head_grads: 2*head_grads,
+                   grad_input_vars=[y],
+                   shape={'x': (1, 2), y: (1, 2)}, dtype='float64')
+
+    # test just numerical gradients
+    # different styles of shape and dtype passing
+    check_function(x + 2*y, shape={'x': (1, 2), y: (1, 2)},
+                   numerical_grads=True)
+    check_function(x + 2*y, shape={'x': (1, 2), y: (1, 2)}, dtype='float32',
+                   numerical_grads=True)
+    check_function(x + 2*y, shape={'x': (1, 2), y: (1, 2)}, dtype={x: 'float32', 'y': 'float32'},
+                   numerical_grads=True)
+    check_function(x + 2*y, shape=(1, 2), dtype='float32',
+                   numerical_grads=True)
+
+    # specifying variable attributes on variable creation
+    # (in this case type codes must be used)
+    x = sym.Variable("x", dtype=0, shape=(1, 2))
+    check_function(x + 2*y, shape={y: (1, 2)}, dtype={'y': 'float32'}, numerical_grads=True)
+    y = sym.Variable("y", dtype=0, shape=(1, 2))
+
+    # shape overriding
+    def _fwd1(x, y):
+        assert x.shape == (1, 1)
+        assert y.shape == (1, 2)
+        return x + 2*y
+    check_function(x + 2*y, _fwd1, shape={x: (1, 1)})
+
+    # in_range
+    def _fwd2(x, y):
+        assert x.shape == (100,)
+        assert (x <= 0.9).all()
+        assert (x >= 0.8).all()
+        return x + 2*y
+    check_function(x + 2*y, _fwd2, shape=(100,), in_range=(0.8, 0.9), numerical_grads=False)
+    check_function(x + 2*y, _fwd2, shape=(100,), in_range={'x': (0.8, 0.9)}, numerical_grads=False)
+    check_function(x + 2*y, backward=lambda x, y, head_grads: [1.0, 2.0],
+                   in_range={'head_grads_0': (1.0, 1.0)})
+    # explicit passing of values
+    check_function(x + 2*y, backward=lambda x, y, head_grads: [1.0, 2.0],
+                   values={'head_grads_0': np.full((1, 2), 1.0)})
+
+    # check that the function reports errors
+    def _check_function_must_fail(*args, **kwargs):
+        error = AssertionError
+        if 'error' in kwargs:
+            error = kwargs['error']
+            del kwargs['error']
+        try:
+            check_function(*args, quiet=True, **kwargs)
+        except error:
+            pass
+        else:
+            raise AssertionError("check_function didn't raise an exception")
+
+    _check_function_must_fail(x + 2*y, error=ValueError)
+    _check_function_must_fail(x + 2*y, lambda x, y: x + y)
+    _check_function_must_fail(x + 2*y, backward=lambda x, y, head_grads: [1.0, 2.0])
+    _check_function_must_fail(sym.block_grad(x + 2*y), numerical_grads=True)
+    _check_function_must_fail(x*x, numerical_grads=True,
+                              numerical_grads_params={'atol': 0.0, 'rtol': 0.0})
+    _check_function_must_fail(sym.log(-x*x), numerical_grads=True, error=ValueError)
+
+    # different styles of returning results from the forward function
+    check_function(x + 2*y, lambda x, y: [x + 2*y], numerical_grads=False)
+    _check_function_must_fail(x + 2*y, lambda x, y: [x + 2*y, x], numerical_grads=False,
+                              error=ValueError)
+    _check_function_must_fail(x + 2*y, lambda x, y: [], numerical_grads=False,
+                              error=ValueError)
+
+    # multiple outputs
+    z = sym.Group([2*x + y, x + 2*y])
+    check_function(z, lambda x, y: [2*x + y, x + 2*y])
+    check_function(z, lambda x, y: (2*x + y, x + 2*y))
+    check_function(z, backward=lambda x, y, head_grads: [2*head_grads[0] + head_grads[1],
+                                                         head_grads[0] + 2*head_grads[1]])
+    _check_function_must_fail(z, backward=lambda x, y, head_grads: [2*head_grads[0],
+                                                                    2*head_grads[1]])
+    check_function(z, backward=lambda x, y, head_grads: [head_grads[1], 2*head_grads[1]],
+                   in_range={'head_grads_0': (0, 0)})
+    check_function(z, numerical_grads=True)
+
+    z = sym.Group([sym.block_grad(2*x + y), x + 2*y])
+    check_function(z, lambda x, y: [2*x + y, x + 2*y], numerical_grads=False)
+    _check_function_must_fail(z, lambda x, y: [2*x + y, x + 2*y])
+    _check_function_must_fail(z, numerical_grads=True)
+
+    z = sym.Group([2*x + y, sym.block_grad(x + 2*y)])
+    _check_function_must_fail(z, numerical_grads=True)
+
+    z = sym.Group([2*x + y, x + 2*y, x, y, sym.sum(x)])
+    check_function(z, lambda x, y: [2*x + y, x + 2*y, x, y, np.sum(x)])
+
+    # passing additional parameters to forward and backward
+    def _fwd3(x, p):
+        assert p == 'v'
+        return x + 1
+    def _bwd3(x, p, head_grads):
+        assert p == 'v'
+        return head_grads
+    check_function(x + 1, _fwd3, _bwd3, additional_params={'p': 'v'})
+
+    # implicitly created variables and shape/dtype inference for inputs
+    x = sym.Variable("x", shape=(2, 3), dtype=0)
+    b = sym.Variable("b")
+    y = sym.dense(data=x, bias=b, units=4)
+    # Don't check gradients on cuda because is doesn't yet support ewise after reduce
+    check_function(y, exclude_targets={'cuda'}, numerical_grads=True)
+    check_function(y, shape={'x': (3, 4)}, exclude_targets={'cuda'}, numerical_grads=True)
+    check_function(y, dtype={'x': 'float64'}, exclude_targets={'cuda'}, numerical_grads=True)
+
+    x = sym.Variable("x")
+    b = sym.Variable("b")
+    w = sym.Variable("w")
+    y = sym.dense(data=x, bias=b, weight=w, units=4)
+    def _fwd_dense(x, w, b):
+        return np.dot(x, w.T) + b
+    check_function(y, _fwd_dense, shape={'x': (1,2)}, dtype={'x': 'float32'}, numerical_grads=False)
+    check_function(y, _fwd_dense, shape={'x': (1,2)}, dtype={'w': 'float64'}, numerical_grads=False)
+    _check_function_must_fail(y, _fwd_dense, shape={'x': (1,2)},
+                              dtype={'w': 'float64', 'b': 'float32'},
+                              numerical_grads=False,
+                              error=nnvm._base.NNVMError)
+    # fails because no shape
+    _check_function_must_fail(y, _fwd_dense, numerical_grads=False, error=ValueError)
+    # ok because type is float32 by default
+    check_function(y, _fwd_dense, shape={'x': (1,2)}, numerical_grads=False)
 
 def test_relu():
     x = sym.Variable("x")
@@ -62,10 +176,8 @@ def backward(head_grads, x):
         return [(sub > 0).astype("float") * \
                 ((x > 0).astype("float") + 0.3 * (x < 0).astype("float")) * head_grads]
 
-    dtype = "float32"
-    dshape = (1, 3, 32, 32)
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, forward, backward)
+    shape = {'x': (1, 3, 32, 32)}
+    check_function(y, forward, backward, shape=shape)
 
 def test_prelu_nchw():
     x = sym.Variable("x")
@@ -75,15 +187,8 @@ def test_prelu_nchw():
     def forward(x, a):
         return (x < 0) * (x * a.reshape(3, 1, 1)) + (x>=0) * x
 
-    dtype = "float32"
-    dshape_x = (1, 3, 32, 32)
-    dshape_w = (3,)
-
-    inputs = [
-        ('x', dshape_x, x),
-        ('a', dshape_w, a)
-    ]
-    helper(y, inputs, dtype, forward)
+    shape = {'x': (1, 3, 32, 32), 'a': (3,)}
+    check_function(y, forward, shape=shape)
 
 def test_prelu_nhwc():
     x = sym.Variable("x")
@@ -93,17 +198,8 @@ def test_prelu_nhwc():
     def forward(x, a):
         return (x < 0) * (x * a.reshape(1, 1, 3)) + (x>=0) * x
 
-    dtype = "float32"
-    dshape_x = (1, 32, 32, 3)
-    dshape_w = (3,)
-
-    inputs = [
-        ('x', dshape_x, x),
-        ('a', dshape_w, a)
-    ]
-
-
-    helper(y, inputs, dtype, forward)
+    shape = {'x': (1, 32, 32, 3), 'a': (3,)}
+    check_function(y, forward, shape=shape)
 
 def test_sym_scalar_pow():
     scalar = 3
@@ -116,10 +212,8 @@ def forward(x):
     def backward(head_grads, x):
         return [scalar * x**(scalar -  1) * head_grads]
 
-    dtype = "float32"
-    dshape = (1, 3, 32, 32)
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, forward, backward)
+    shape = {'x': (1, 3, 32, 32)}
+    check_function(y, forward, backward, shape=shape)
 
 
 def test_scalar_sym_pow():
@@ -133,10 +227,8 @@ def forward(x):
     def backward(head_grads, x):
         return [np.log(scalar) * scalar**x * head_grads]
 
-    dtype = "float32"
-    dshape = (1, 3, 32, 32)
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, forward, backward)
+    shape = {'x': (1, 3, 32, 32)}
+    check_function(y, forward, backward, shape=shape)
 
 
 def test_exp():
@@ -149,10 +241,8 @@ def forward(x):
     def backward(head_grads, x):
         return [np.exp(x) * head_grads]
 
-    dtype = "float32"
-    dshape = (1, 3, 32, 32)
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, forward, backward)
+    shape = {'x': (1, 3, 32, 32)}
+    check_function(y, forward, backward, shape=shape)
 
 
 def test_log():
@@ -165,10 +255,8 @@ def forward(x):
     def backward(head_grads, x):
         return [1. / x * head_grads]
 
-    dtype = "float32"
-    dshape = (1, 3, 32, 32)
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, forward, backward, rnd_min=0.001)
+    shape = {'x': (1, 3, 32, 32)}
+    check_function(y, forward, backward, in_range=(0.002, 2.0), shape=shape)
 
 
 def test_tanh():
@@ -182,10 +270,8 @@ def backward(head_grads, x):
         y_np = forward(x)
         return [(1 - y_np**2) * head_grads]
 
-    dtype = "float32"
-    dshape = (1, 3, 32, 32)
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, forward, backward)
+    shape = {'x': (1, 3, 32, 32)}
+    check_function(y, forward, backward, shape=shape)
 
 
 def test_sigmoid():
@@ -199,10 +285,8 @@ def backward(head_grads, x):
         y_np = forward(x)
         return [y_np *(1 - y_np) * head_grads]
 
-    dtype = "float32"
-    dshape = (1, 3, 32, 32)
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, forward, backward)
+    shape = {'x': (1, 3, 32, 32)}
+    check_function(y, forward, backward, shape=shape)
 
 
 def test_softmax():
@@ -217,10 +301,10 @@ def backward(head_grads, x):
         grad = y * (head_grads - np.sum(y * head_grads, axis=1, keepdims=True))
         return [grad]
 
-    dtype = "float32"
-    dshape = (10, 1000)
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, forward, backward)
+    check_function(y, forward, backward,
+                   shape={'x': (10, 1000)}, numerical_grads=False)
+    check_function(y, forward, backward,
+                   shape={'x': (2, 10)})
 
 
 def test_log_softmax():
@@ -235,10 +319,10 @@ def backward(head_grads, x):
         grad = head_grads - np.exp(y) * np.sum(head_grads, axis=1, keepdims=True)
         return [grad]
 
-    dtype = "float32"
-    dshape = (10, 1000)
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, forward, backward)
+    check_function(y, forward, backward,
+                   shape={'x': (10, 1000)}, numerical_grads=False)
+    check_function(y, forward, backward,
+                   shape={'x': (2, 10)})
 
 
 def test_dense():
@@ -250,13 +334,16 @@ def test_dense():
 
     def forward(x, dense_weight, dense_bias):
         return np.dot(x, dense_weight.T) + dense_bias
-    dtype = "float32"
-    inputs = [
-        ('x', (10, 100), x),
-        ('dense_weight', (3, 100), w),
-        ('dense_bias', (3,), b)
-    ]
-    helper(y, inputs, dtype, forward)
+    shape = {
+        'x': (10, 100),
+        'w': (3, 100),
+        'b': (3,)
+    }
+    # Don't check gradients on cuda because is doesn't yet support ewise after reduce
+    check_function(y, forward, shape=shape,
+                   exclude_targets={'cuda'}, numerical_grads=True)
+    check_function(y, forward, shape=shape,
+                   only_targets={'cuda'}, numerical_grads=False)
 
 
 def test_batchnorm():
@@ -272,35 +359,25 @@ def test_batchnorm():
     def forward(x, gamma, beta, moving_mean, moving_var):
         return (x - moving_mean) / np.sqrt(moving_var + eps) * gamma + beta
 
-    dtype = "float32"
-    inputs = [
-        ('x', (10, 20), x),
-        ('gamma', (20,), gamma),
-        ('beta', (20,), beta),
-        ('moving_mean', (20,), moving_var),
-        ('moving_var', (20,), moving_mean)
-    ]
+    shape = {
+        'x': (10, 20),
+        'gamma': (20,),
+        'beta': (20,),
+        'moving_mean': (20,),
+        'moving_var': (20,)
+    }
 
-    helper(y, inputs,  dtype, forward, rnd_min=0.001)
+    check_function(y, forward, in_range=(0.001, 1.0), shape=shape)
 
 
 def verify_concatenate(ishape, axis):
-    x = [sym.Variable("x%d" % i) for i in range(len(ishape))]
+    x = [sym.Variable("x%d" % i, shape=ishape[i]) for i in range(len(ishape))]
     y = sym.concatenate(*x, axis=axis) + 1
-    dtype = "float32"
-    for target, ctx in ctx_list():
-        # set input
-        data = []
-        for i, shape in enumerate(ishape):
-            data.append(np.random.uniform(size=shape).astype(dtype))
-        pdict = {"x%d" % i :  v for i, v in enumerate(data)}
-        shape = {"x%d" % i :  v.shape for i, v in enumerate(data)}
-        graph, lib, _ = nnvm.compiler.build(y, target, shape)
-        m = graph_runtime.create(graph, lib, ctx)
-        m.run(**pdict)
-        out_np = np.concatenate(data, axis=axis) + 1
-        out = m.get_output(0, tvm.nd.empty(out_np.shape))
-        np.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
+
+    def forward(**kwargs):
+        return np.concatenate(list(kwargs.values()), axis=axis) + 1
+
+    check_function(y, forward)
 
 
 def test_concatenate():
@@ -309,19 +386,13 @@ def test_concatenate():
 
 
 def verify_split(ishape, indices_or_sections, axis):
-    x = sym.Variable("x")
+    x = sym.Variable("x", shape=ishape)
     y = sym.split(x, indices_or_sections=indices_or_sections, axis=axis)
-    dtype = "float32"
-    x_np = np.random.uniform(size=ishape).astype(dtype)
-    res = np.split(x_np, indices_or_sections, axis=axis)
-    for target, ctx in ctx_list():
-        # set input
-        graph, lib, _ = nnvm.compiler.build(y, target, {"x": ishape})
-        m = graph_runtime.create(graph, lib, ctx)
-        m.run(x=x_np)
-        for i, arr  in enumerate(res):
-            out = m.get_output(i, tvm.nd.empty(arr.shape))
-            np.testing.assert_allclose(out.asnumpy(), arr, atol=1e-5, rtol=1e-5)
+
+    def forward(x):
+        return np.split(x, indices_or_sections, axis=axis)
+
+    check_function(y, forward)
 
 
 def test_split():
@@ -331,28 +402,22 @@ def test_split():
 
 def verify_strided_slice(ishape, begin, end, strideinp=None):
     stride = strideinp if strideinp else [1, 1, 1]
-    x = sym.Variable("x")
+    x = sym.Variable("x", shape=ishape)
     if strideinp:
         y = sym.strided_slice(x, begin = begin, end = end, stride = stride) + 1
     else:
         y = sym.strided_slice(x, begin = begin, end = end) + 1
-    x_np = np.random.uniform(size=ishape).astype("float32")
+
     for i in range(len(begin), 3):
         begin.append(0)
     for i in range(len(end), 3):
         end.append(ishape[i])
-    def test_forward(x, begin, end, stride):
+
+    def test_forward(x):
         return x[begin[0]:end[0]:stride[0],
                     begin[1]:end[1]:stride[1], begin[2]:end[2]:stride[2]] + 1
 
-    for target, ctx in ctx_list():
-        # set input
-        graph, lib, _ = nnvm.compiler.build(y, target, {"x": ishape})
-        m = graph_runtime.create(graph, lib, ctx)
-        m.run(x=x_np)
-        res = test_forward(x_np, begin, end, stride)
-        out = m.get_output(0, tvm.nd.empty(res.shape))
-        np.testing.assert_allclose(out.asnumpy(), res, atol=1e-5, rtol=1e-5)
+    check_function(y, test_forward)
 
 def test_strided_slice():
     verify_strided_slice((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2])
@@ -369,24 +434,18 @@ def verify_take(src_shape, indices_src, axis=None):
     src_dtype = "float32"
     indices_dtype = "int32"
     indices_src = np.array(indices_src, dtype=indices_dtype)
-    a = sym.Variable("a")
-    indices = sym.Variable("indices")
+    a = sym.Variable("a", shape=src_shape)
+    indices = sym.Variable("indices", shape=indices_src.shape)
     y = sym.take(a, indices, axis=axis)
-    for target, ctx in ctx_list():
-        # set input
-        shape_dict = {"a":src_shape, "indices":indices_src.shape}
-        type_dict = {"a":src_dtype, "indices":indices_dtype}
-        graph, lib, _ = nnvm.compiler.build(y, target, shape=shape_dict, dtype=type_dict)
-        m = graph_runtime.create(graph, lib, ctx)
-
-        shape_size = 1
-        for i in range(len(src_shape)):
-            shape_size = shape_size * src_shape[i]
-        a_src = np.arange(shape_size, dtype=src_dtype).reshape((src_shape))
-        out_np = np.take(a_src, indices_src, axis=axis)
-        m.run(a=a_src, indices=indices_src)
-        out = m.get_output(0, tvm.nd.empty(out_np.shape, dtype=src_dtype))
-        np.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
+
+    def forward(a, indices):
+        return np.take(a, indices=indices, axis=axis)
+
+    a_src = np.arange(np.prod(src_shape), dtype=src_dtype).reshape(src_shape)
+
+    check_function(y, forward,
+                   dtype={'a': src_dtype, 'indices': indices_dtype},
+                   values={'a': a_src, 'indices': indices_src})
 
 def test_take():
     verify_take((4,), [1])
@@ -399,9 +458,9 @@ def test_take():
     verify_take((4,3,5,6), [[2,1,0,0]], -2)
 
 
-def verify_squeeze(dshape, axis):
+def verify_squeeze(shape, axis):
     x = sym.Variable("x")
-    if axis:
+    if axis is not None:
         y = sym.squeeze(x, axis=axis)
     else:
         y = sym.squeeze(x)
@@ -413,9 +472,7 @@ def forward(x):
     def backward(head_grads, x):
         return [np.reshape(head_grads, x.shape)]
 
-    dtype = "float32"
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, forward, backward)
+    check_function(y, forward, backward, shape=shape)
 
 
 def test_squeeze():
@@ -433,61 +490,40 @@ def forward(x):
                       pad_width=((0, 0), (0, 0), (0, 1), (2, 3)),
                       mode='constant', constant_values=1.)
 
-    dtype = "float32"
-    inputs = [('x', (1, 3, 28, 28), x)]
-    helper(y, inputs, dtype, forward)
+    shape = {'x': (1, 3, 28, 28)}
+    check_function(y, forward, shape=shape)
 
 def verify_lrn(ishape, size, axis, bias, alpha, beta):
-    x = sym.Variable("x")
+    x = sym.Variable("x", shape=ishape)
     y = sym.lrn(x, size=size, axis=axis, bias=bias, alpha=alpha, beta=beta)
-    dtype = "float32"
-    x_np = np.random.uniform(size=ishape).astype(dtype)
 
-    for target, ctx in ctx_list():
-        graph, lib, _ = nnvm.compiler.build(y, target, {"x": ishape})
-        m = graph_runtime.create(graph, lib, ctx)
-        m.run(x=x_np)
-        out = m.get_output(0, tvm.nd.empty(ishape))
-        out_np = topi.testing.lrn_python(x_np, size, axis, bias, alpha, beta)
-        np.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
+    def forward1(x):
+        return topi.testing.lrn_python(x, size, axis, bias, alpha, beta)
+
+    check_function(y, forward1)
+
+    def forward2(x):
+        y = forward1(x)
+        return (y > 0)*y
 
     #Checking LRN op followed by elementwise op relu
-    z = sym.relu(y)
-    x_np = np.random.uniform(low=-10.0, high=10.0, size=ishape).astype(dtype)
-    for target, ctx in ctx_list():
-        graph, lib, _ = nnvm.compiler.build(z, target, {"x": ishape})
-        m = graph_runtime.create(graph, lib, ctx)
-        m.run(x=x_np)
-        out = m.get_output(0, tvm.nd.empty(ishape))
-        out_np = topi.testing.lrn_python(x_np, size, axis, bias, alpha, beta)
-        out_np = (out_np > 0) * out_np
-        np.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
+    check_function(sym.relu(y), forward2, in_range={'x': (-10.0, 10.0)})
 
 def verify_l2_normalize(ishape, eps, axis):
-    x = sym.Variable("x")
+    x = sym.Variable("x", shape=ishape)
     y = sym.l2_normalize(x, eps=eps, axis=axis)
-    dtype = "float32"
-    x_np = np.random.uniform(size=ishape).astype(dtype)
 
-    for target, ctx in ctx_list():
-        graph, lib, _ = nnvm.compiler.build(y, target, {"x": ishape})
-        m = graph_runtime.create(graph, lib, ctx)
-        m.run(x=x_np)
-        out = m.get_output(0, tvm.nd.empty(ishape))
-        out_np = topi.testing.l2_normalize_python(x_np, eps, axis)
-        np.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
+    def forward1(x):
+        return topi.testing.l2_normalize_python(x, eps, axis)
+
+    check_function(y, forward1)
+
+    def forward2(x):
+        y = forward1(x)
+        return (y > 0)*y
 
     #Checking L2 normalization op followed by elementwise op relu
-    z = sym.relu(y)
-    x_np = np.random.uniform(low=-10.0, high=10.0, size=ishape).astype(dtype)
-    for target, ctx in ctx_list():
-        graph, lib, _ = nnvm.compiler.build(z, target, {"x": ishape})
-        m = graph_runtime.create(graph, lib, ctx)
-        m.run(x=x_np)
-        out = m.get_output(0, tvm.nd.empty(ishape))
-        out_np = topi.testing.l2_normalize_python(x_np, eps, axis)
-        out_np = (out_np > 0) * out_np
-        np.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
+    check_function(sym.relu(y), forward2, in_range={'x': (-10.0, 10.0)})
 
 def test_lrn():
     verify_lrn((1, 3, 20, 20), 3, 1, 1.0, 1.0, 0.5)
@@ -497,7 +533,38 @@ def test_l2_normalize():
     verify_l2_normalize((1, 3, 20, 20), 0.001, (1,))
     verify_l2_normalize((1, 3, 20, 20), 0.001, (1, 2))
 
+def verify_gather_nd(src_shape, indices_src):
+    src_dtype = "float32"
+    indices_dtype = "int32"
+    indices_src = np.array(indices_src, dtype=indices_dtype)
+    a = sym.Variable("a", shape=src_shape)
+    indices = sym.Variable("indices", shape=indices_src.shape)
+    y = sym.gather_nd(a, indices)
+
+    def forward(a, indices):
+        return topi.testing.gather_nd_python(a, indices)
+
+    a_src = np.arange(np.prod(src_shape), dtype=src_dtype).reshape(src_shape)
+
+    check_function(y, forward,
+                   dtype={'a': src_dtype, 'indices': indices_dtype},
+                   values={'a': a_src, 'indices': indices_src})
+
+def test_gather_nd():
+    verify_gather_nd((4,), [[1]])
+    verify_gather_nd((4,), [[1, 3, 2]])
+    verify_gather_nd((2, 3), [[1]])
+    verify_gather_nd((2, 3), [[1], [0]])
+    verify_gather_nd((2, 3), [[1, 0], [0, 2]])
+    verify_gather_nd((2, 3, 4), [[1, 0], [0, 2]])
+    verify_gather_nd((2, 3, 4), [[1, 0], [0, 2], [3, 1]])
+    verify_gather_nd((2, 3, 4), [[[1, 0], [0, 1]], [[0, 2], [1, 2]],
+                                 [[3, 1], [0, 2]]])
+    verify_gather_nd((2, 3, 4, 5), [[1, 0], [0, 2]])
+    verify_gather_nd((2, 3, 4, 5), [[1, 0], [2, 1], [3, 2], [4, 2]])
+
 if __name__ == "__main__":
+    test_check_function()
     test_split()
     test_concatenate()
     test_log_softmax()
@@ -519,3 +586,4 @@ def test_l2_normalize():
     test_lrn()
     test_l2_normalize()
     test_strided_slice()
+    test_gather_nd()
diff --git a/nnvm/tests/python/compiler/test_top_level2.py b/nnvm/tests/python/compiler/test_top_level2.py
index c26f5356557f..0585f3c974b7 100644
--- a/nnvm/tests/python/compiler/test_top_level2.py
+++ b/nnvm/tests/python/compiler/test_top_level2.py
@@ -22,7 +22,7 @@ def run_test_conv2d(sym, dtype, dshape, kshape, oshape, shape_dict, padding):
             c_np = topi.testing.conv2d_nchw_python(
                 data.asnumpy(), kernel.asnumpy(), 1, padding)
             c_np = c_np + bias.asnumpy().reshape(kshape[0], 1, 1)
-            np.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
+            tvm.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
 
     x = sym.Variable("x")
     y = sym.conv2d(x, channels=10, kernel_size=(3,3),
@@ -71,7 +71,7 @@ def test_mixed_precision():
         c_np = topi.testing.conv2d_nchw_python(
             data.asnumpy().astype(out_dtype),
             kernel.asnumpy().astype(out_dtype), 1, 1)
-        np.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
 
 
 def test_dilated_conv2d():
@@ -97,7 +97,7 @@ def test_dilated_conv2d():
         c_np = topi.testing.conv2d_nchw_python(
             data.asnumpy(), dkernel_np, 1, 1)
         c_np = c_np + bias.asnumpy().reshape(kshape[0], 1, 1)
-        np.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
 
 
 def test_grouped_conv2d_nchw():
@@ -120,7 +120,7 @@ def test_grouped_conv2d_nchw():
         c_np = topi.testing.depthwise_conv2d_python_nchw(
             data.asnumpy(), kernel.asnumpy(), (1,1), 'SAME')
         c_np = c_np + bias.asnumpy().reshape(kshape[0], 1, 1)
-        np.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
 
 def test_grouped_conv2d_nhwc():
     x = sym.Variable("x")
@@ -142,7 +142,7 @@ def test_grouped_conv2d_nhwc():
         c_np = topi.testing.depthwise_conv2d_python_nhwc(
             data.asnumpy(), kernel.asnumpy(), (1,1), 'SAME')
         c_np = c_np + bias.asnumpy().reshape(1, 1, kshape[2])
-        np.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
 
 
 def test_conv2d_transpose():
@@ -167,7 +167,7 @@ def test_conv2d_transpose():
         c_np = c_np + bias.asnumpy().reshape(kshape[1], 1, 1)
         d_np = np.zeros(shape=oshape)
         d_np[:,:,0:c_np.shape[2],0:c_np.shape[3]] = c_np
-        np.testing.assert_allclose(out.asnumpy(), d_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), d_np, rtol=1e-5)
 
 
 def test_max_pool2d():
@@ -185,7 +185,7 @@ def test_max_pool2d():
         m.run(x=data)
         out = m.get_output(0, tvm.nd.empty(oshape, dtype))
         b_np = np.max(data.asnumpy().reshape(1,3,14,2,14,2), axis=(3,5))
-        np.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5)
 
 
 def test_avg_pool2d():
@@ -202,7 +202,7 @@ def test_avg_pool2d():
         m.run(x=data)
         out = m.get_output(0, tvm.nd.empty(oshape, dtype))
         b_np = np.mean(data.asnumpy().reshape(1,3,14,2,14,2), axis=(3,5))
-        np.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5)
 
 
 def test_avg_pool2d_no_count_pad():
@@ -237,7 +237,7 @@ def test_avg_pool2d_no_count_pad():
         data = tvm.nd.array(a_np)
         m.run(x=data)
         out = m.get_output(0, tvm.nd.empty((n, oc, oh, ow), dtype))
-        np.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5)
 
 
 def test_global_max_pool2d():
@@ -254,7 +254,7 @@ def test_global_max_pool2d():
         m.run(x=data)
         out = m.get_output(0, tvm.nd.empty(oshape, dtype))
         b_np = np.max(data.asnumpy(), axis=(2,3), keepdims=True)
-        np.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5)
 
 
 def test_global_avg_pool2d():
@@ -271,7 +271,7 @@ def test_global_avg_pool2d():
         m.run(x=data)
         out = m.get_output(0, tvm.nd.empty(oshape, dtype))
         b_np = np.mean(data.asnumpy(), axis=(2,3), keepdims=True)
-        np.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5)
 
 
 def test_upsampling_nearest_neighbor():
@@ -290,7 +290,7 @@ def test_upsampling_nearest_neighbor():
         m.run(x=data)
         out = m.get_output(0, tvm.nd.empty(oshape, dtype))
         b_np = topi.testing.upsampling_python(a_np, scale, "NCHW")
-        np.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5)
 
 def test_upsampling_bilinear():
     x = sym.Variable("x")
@@ -309,7 +309,7 @@ def test_upsampling_bilinear():
         m.run(x=data)
         out = m.get_output(0, tvm.nd.empty(oshape, dtype))
         b_np = topi.testing.bilinear_resize_python(a_np, (32*scale, 32*scale), "NCHW")
-        np.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5, atol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5, atol=1e-5)
 
 def test_resize_bilinear():
     x = sym.Variable("x")
@@ -327,7 +327,7 @@ def test_resize_bilinear():
         m.run(x=data)
         out = m.get_output(0, tvm.nd.empty(oshape, dtype))
         b_np = topi.testing.bilinear_resize_python(a_np, (60, 60), "NHWC")
-        np.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5, atol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), b_np, rtol=1e-5, atol=1e-5)
 
 if __name__ == "__main__":
     test_mixed_precision()
diff --git a/nnvm/tests/python/compiler/test_top_level3.py b/nnvm/tests/python/compiler/test_top_level3.py
index c8bd37c38e5b..11af2d0bc9c4 100644
--- a/nnvm/tests/python/compiler/test_top_level3.py
+++ b/nnvm/tests/python/compiler/test_top_level3.py
@@ -5,15 +5,14 @@
 import nnvm.symbol as sym
 import nnvm.compiler
 from nnvm.testing.config import ctx_list
-from test_top_level1 import helper
+from nnvm.testing.check_computation import check_function
 
 def check_map(symfunc, np_func, np_backward=None, dtype="float32", rnd_min=-1, rnd_max=1):
     x = sym.Variable("x")
     y = symfunc(x)
-    dshape = (1, 3, 32, 32)
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, lambda x: np_func(x), np_backward,
-           rnd_min=rnd_min, rnd_max=rnd_max)
+    shape = {'x': (1, 3, 32, 32)}
+    check_function(y, lambda x: np_func(x), np_backward,
+                   dtype=dtype, shape=shape, in_range=(rnd_min, rnd_max))
 
 
 def test_floor():
diff --git a/nnvm/tests/python/compiler/test_top_level4.py b/nnvm/tests/python/compiler/test_top_level4.py
index 5bf134b49a7b..fc4e62fb7156 100644
--- a/nnvm/tests/python/compiler/test_top_level4.py
+++ b/nnvm/tests/python/compiler/test_top_level4.py
@@ -6,52 +6,7 @@
 import nnvm.symbol as sym
 import nnvm.compiler
 from nnvm.testing.config import ctx_list
-
-
-def helper(symbol, inputs, dtype,
-           np_forward, np_backward=None,
-           need_input=True, need_head_grads=True, in_range={}):
-    ishapes = {}
-    input_syms = []
-    np_inputs = {}
-    for (name, shape, s) in inputs:
-        ishapes.update({name: shape})
-        if name in in_range:
-            np_inputs.update({name: np.random.uniform(size=shape,
-                                                      low=in_range[name][0],
-                                                      high=in_range[name][1]).astype(dtype)})
-        else:
-            np_inputs.update({name: np.random.uniform(size=shape).astype(dtype)})
-        input_syms.append(s)
-
-    for target, ctx in ctx_list():
-        graph, lib, _ = nnvm.compiler.build(symbol, target, ishapes, dtype=dtype)
-        m = graph_runtime.create(graph, lib, ctx)
-        m.run(**np_inputs)
-        y_np = np_forward(**np_inputs)
-        out = m.get_output(0, tvm.nd.empty(y_np.shape, dtype))
-        np.testing.assert_allclose(out.asnumpy(), y_np, atol=1e-5, rtol=1e-5)
-        # backward
-        if np_backward:
-            graph._set_symbol_list_attr("grad_ys", symbol)
-            graph._set_symbol_list_attr("grad_xs", input_syms)
-            graph._set_symbol_list_attr("grad_ys_out_grad", sym.Variable("head_grads", shape=y_np.shape))
-            graph = graph.apply("Gradient")
-            ishapes.update({"head_grads": y_np.shape})
-            graph, lib, _ = nnvm.compiler.build(graph, target, ishapes)
-            m = graph_runtime.create(graph, lib, ctx)
-            head_grads = np.random.uniform(size=y_np.shape).astype(dtype)
-            y_np = np_backward(head_grads=head_grads, **np_inputs)
-            b_inputs = {}
-            if need_input:
-                b_inputs.update(np_inputs)
-            if need_head_grads:
-                b_inputs.update({"head_grads":head_grads})
-            m.run(**b_inputs)
-            for i in range(len(y_np)):
-                out = m.get_output(i, tvm.nd.empty(y_np[i].shape, dtype))
-                np.testing.assert_allclose(out.asnumpy(), y_np[i], atol=1e-5, rtol=1e-5)
-
+from nnvm.testing.check_computation import check_function
 
 def verify_transpose(dshape, axes):
     x = sym.Variable("x")
@@ -69,22 +24,29 @@ def verify_transpose(dshape, axes):
         m.run(x=data)
         out_np = np.transpose(data.asnumpy(), axes=axes) + 1
         out = m.get_output(0, tvm.nd.empty(out_np.shape))
-        np.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
 
 def verify_reduce_explicit(dshape, data, result, fsym, oshape=None, otype='float32', **kwargs):
     """ Verify reduce operations by comparign its result with `result` """
     x = sym.Variable("x")
     y = fsym(x + 0, **kwargs)
     for target, ctx in ctx_list():
+        # TODO(yuruofei): remove when cuda reduce schedule is done
+        if target == 'cuda' and fsym == sym.mean:
+            continue
         graph, lib, _ = nnvm.compiler.build(y, target, {"x": dshape})
         m = graph_runtime.create(graph, lib, ctx)
         # set input
         m.run(x=data)
         # oshape set to None means do not test the shape-correctness
-        oshape = result.shape if oshape is None else oshape
+        oshape = result.shape if isinstance(result, np.ndarray) else (1,) if oshape is None else oshape
         out = m.get_output(0, tvm.nd.empty(oshape, dtype=otype))
-        np.testing.assert_equal(out.asnumpy().shape, result.shape)
-        np.testing.assert_allclose(out.asnumpy(), result, atol=1e-5, rtol=1e-5)
+        if isinstance(result, np.ndarray):
+            np.testing.assert_equal(out.asnumpy().shape, result.shape)
+            tvm.testing.assert_allclose(out.asnumpy(), result, atol=1e-5, rtol=1e-5)
+        else:
+            tvm_out = out.asnumpy()
+            assert abs(result - tvm_out) <= (1e-5 + 1e-5 * abs(tvm_out))
 
 def verify_reduce(dshape, fnp, fsym, oshape=None, otype='float32', **kwargs):
     """ Verify reduce operations by generating data at random and calling numpy
@@ -106,7 +68,7 @@ def verify_collapse(dshape, target_shape, fnp):
         m.run(x=data)
         out = m.get_output(0, tvm.nd.empty(target_shape))
         out_np = fnp(data)
-        np.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
 
 
 def test_transpose():
@@ -134,6 +96,13 @@ def wrapper(data, axis=None, keepdims=False):
     verify_reduce((4, 4, 3), np.min, sym.min, keepdims=True)
     verify_reduce((4, 4, 3), np.sum, sym.sum, axis=(0, 2))
     verify_reduce((4, 4, 3), np.sum, sym.sum)
+    verify_reduce((128, 24, 128), np.mean, sym.mean, axis=(0, 1), keepdims=False)
+    verify_reduce((128, 24, 128), np.mean, sym.mean, axis=(0, 2), keepdims=False)
+    verify_reduce((128, 24, 128), np.mean, sym.mean, axis=(0, 1), keepdims=True)
+    verify_reduce((128, 24, 128), np.mean, sym.mean, axis=(0, 2), keepdims=True)
+    verify_reduce((128, 24, 128), np.mean, sym.mean, keepdims=True)
+    verify_reduce((128, 24, 128), np.mean, sym.mean, keepdims=False)
+    verify_reduce((128, 24, 128), np.mean, sym.mean, axis=(0, 1, 2), keepdims=True)
 
     data = np.array([[[1,2],[3,4]],[[3,44],[5,6]]], dtype=np.float32)
     verify_reduce_explicit([2,2,2], data, np.array([[1,1],[1,0]]), sym.argmax, otype='int32', axis=[0,2], exclude=True)
@@ -144,7 +113,7 @@ def wrapper(data, axis=None, keepdims=False):
             kwargs = { 'keepdims':keepdims }
             if axis is None:
                 # FIXME: NNVM doesn't support setting `axis=None` explicitly.
-                kwargs.update({'oshape': [1,1,1] if keepdims else [] })
+                kwargs.update({'oshape': [1,1,1] if keepdims else [1] })
             else:
                 kwargs.update({'axis': axis})
                 kwargs.update({'oshape': shape[:axis]+[1]+shape[axis+1:] if keepdims else shape[:axis]+shape[axis+1:]})
@@ -180,7 +149,7 @@ def verify_flip(ishape, axis):
         m = graph_runtime.create(graph, lib, ctx)
         m.run(x=x_np)
         out = m.get_output(0, tvm.nd.empty(res.shape))
-        np.testing.assert_allclose(out.asnumpy(), res, atol=1e-5, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), res, atol=1e-5, rtol=1e-5)
 
 
 def test_flip():
@@ -205,7 +174,7 @@ def verify_reshape(dshape, oshape):
         m.run(x=data)
         out_np = data.asnumpy().reshape(oshape) + 1
         out = m.get_output(0, tvm.nd.empty(out_np.shape))
-        np.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
 
 
 def test_reshape():
@@ -228,93 +197,92 @@ def backward(head_grads, x):
         mask2 = np.less_equal(x, a_max).astype("float")
         return [head_grads * mask1 * mask2]
 
-
-    dtype = "float32"
-    inputs = [('x', (3, 4, 5), x)]
-    helper(y, inputs, dtype, forward, backward)
+    shape = {'x': (3, 4, 5)}
+    check_function(y, forward, backward, shape=shape)
 
 
 def test_broadcast():
     a = sym.Variable("a")
     b = sym.Variable("b")
-    inputs = [('a', (3, 4, 5), a),
-              ('b', (1, 5), b)]
-    dtype = "float32"
+    shape = {'a': (3, 4, 5), 'b': (1, 5)}
 
     def _collapse(g):
-        return g.reshape(-1, inputs[-1][1][-1]).sum(0, keepdims=True)
+        return g.reshape(-1, shape['b'][-1]).sum(0, keepdims=True)
 
     y = sym.broadcast_add(a, b)
     def _backward_add(head_grads, a, b):
         da = head_grads
         db = _collapse(head_grads)
         return da, db
-    helper(y, inputs, dtype, lambda a, b: a + b, _backward_add)
+    check_function(y, lambda a, b: a + b, _backward_add, shape=shape)
 
     y = sym.broadcast_sub(a, b)
     def _backward_sub(head_grads, a, b):
         da = head_grads
         db = -_collapse(head_grads)
         return da, db
-    helper(y, inputs, dtype, lambda a, b: a - b, _backward_sub)
+    check_function(y, lambda a, b: a - b, _backward_sub, shape=shape)
 
     y = sym.broadcast_mul(a, b)
     def _backward_mul(head_grads, a, b):
         da = head_grads * b
         db = _collapse(head_grads * a)
         return da, db
-    helper(y, inputs, dtype, lambda a, b: a * b, _backward_mul)
+    check_function(y, lambda a, b: a * b, _backward_mul, shape=shape)
 
     y = sym.broadcast_div(a, b)
     def _backward_div(head_grads, a, b):
         da = head_grads / b
         db = _collapse(- head_grads * a / b**2)
         return da, db
-    helper(y, inputs, dtype, lambda a, b: a / b, _backward_div)
+    # We avoid computing numerical derivatives too close to zero here
+    check_function(y, lambda a, b: a / b, _backward_div, shape=shape, numerical_grads=False)
+    check_function(y, lambda a, b: a / b, _backward_div, shape=shape,
+                   in_range={'b': (0.1, 20)})
 
     y = sym.broadcast_mod(a, b)
-    helper(y, inputs, 'int32',
-           lambda a, b: np.mod(a, b),
-           in_range={'a': (0.001, 100), 'b': (1, 100)})
+    check_function(y,
+                   lambda a, b: np.mod(a, b),
+                   in_range={'a': (0.001, 100), 'b': (1, 100)}, dtype='int32', shape=shape)
 
     y = sym.broadcast_max(a, b)
-    helper(y, inputs, dtype, lambda a, b: np.maximum(a, b))
+    check_function(y, lambda a, b: np.maximum(a, b), shape=shape)
 
     y = sym.broadcast_min(a, b)
-    helper(y, inputs, dtype, lambda a, b: np.minimum(a, b))
+    check_function(y, lambda a, b: np.minimum(a, b), shape=shape)
 
     y = sym.broadcast_pow(a, b)
-    helper(y, inputs, dtype,
-           lambda a, b: np.power(a, b),
-           in_range={'a': (0.001, 100), 'b': (0.001, 2)})
+    check_function(y,
+                   lambda a, b: np.power(a, b),
+                   in_range={'a': (0.001, 100), 'b': (0.001, 2)}, shape=shape)
 
     y = sym.broadcast_left_shift(a, b)
-    helper(y, inputs, 'int32', lambda a, b: a << b)
+    check_function(y, lambda a, b: a << b, dtype='int32', shape=shape)
 
     y = sym.broadcast_right_shift(a, b)
-    helper(y, inputs, 'int32', lambda a, b: a >> b)
+    check_function(y, lambda a, b: a >> b, dtype='int32', shape=shape)
 
     y = sym.broadcast_greater(a, b)
-    helper(y, inputs, dtype, lambda a, b: np.greater(a, b))
+    check_function(y, lambda a, b: np.greater(a, b), shape=shape)
 
     y = sym.broadcast_less(a, b)
-    helper(y, inputs, dtype, lambda a, b: np.less(a, b))
+    check_function(y, lambda a, b: np.less(a, b), shape=shape)
 
     y = sym.broadcast_equal(a, b)
-    helper(y, inputs, 'int32', lambda a, b: np.equal(a, b),
-           in_range={'a': (-2, 2), 'b': (-2, 2)})
+    check_function(y, lambda a, b: np.equal(a, b),
+                   in_range={'a': (-2, 2), 'b': (-2, 2)}, dtype='int32', shape=shape)
 
     y = sym.broadcast_not_equal(a, b)
-    helper(y, inputs, 'int32', lambda a, b: np.not_equal(a, b),
-           in_range={'a': (-2, 2), 'b': (-2, 2)})
+    check_function(y, lambda a, b: np.not_equal(a, b),
+                   in_range={'a': (-2, 2), 'b': (-2, 2)}, dtype='int32', shape=shape)
 
     y = sym.broadcast_greater_equal(a, b)
-    helper(y, inputs, 'int32', lambda a, b: np.greater_equal(a, b),
-           in_range={'a': (-3, 3), 'b': (-3, 3)})
+    check_function(y, lambda a, b: np.greater_equal(a, b),
+                   in_range={'a': (-3, 3), 'b': (-3, 3)}, dtype='int32', shape=shape)
 
     y = sym.broadcast_less_equal(a, b)
-    helper(y, inputs, 'int32', lambda a, b: np.less_equal(a, b),
-           in_range={'a': (-3, 3), 'b': (-3, 3)})
+    check_function(y, lambda a, b: np.less_equal(a, b),
+                   in_range={'a': (-3, 3), 'b': (-3, 3)}, dtype='int32', shape=shape)
 
 def test_greater():
     l = sym.Variable("l")
@@ -325,13 +293,10 @@ def forward(l, r):
         return np.greater(l, r).astype("float32")
 
     def backward(head_grads, l, r):
-        return [np.zeros_like(l)]
-
+        return {'l': np.zeros_like(l)}
 
-    dtype = "float32"
-    inputs = [('l', (3, 4, 5), l),
-              ('r', (3, 4, 5), r)]
-    helper(y, inputs, dtype, forward, backward, need_head_grads=False)
+    shape = {'l': (3, 4, 5), 'r': (3, 4, 5)}
+    check_function(y, forward, backward, shape=shape)
 
 
 def test_less():
@@ -343,13 +308,10 @@ def forward(l, r):
         return np.less(l, r).astype("float32")
 
     def backward(head_grads, l, r):
-        return [np.zeros_like(l)]
-
+        return {'l': np.zeros_like(l)}
 
-    dtype = "float32"
-    inputs = [('l', (3, 4, 5), l),
-              ('r', (3, 4, 5), r)]
-    helper(y, inputs, dtype, forward, backward, need_head_grads=False)
+    shape = {'l': (3, 4, 5), 'r': (3, 4, 5)}
+    check_function(y, forward, backward, shape=shape)
 
 
 def test_reshape_like():
@@ -364,11 +326,8 @@ def backward(head_grads, x, y):
         return [np.reshape(head_grads, x.shape),
                 np.zeros_like(y)]
 
-
-    dtype = "float32"
-    inputs = [('x', (3, 4, 5), x),
-              ('y', (5, 4, 3), y)]
-    helper(z, inputs, dtype, forward, backward)
+    shape = {'x': (3, 4, 5), 'y': (5, 4, 3)}
+    check_function(z, forward, backward, shape=shape)
 
 
 def verify_expand_like(in_shape, out_shape, axis, exclude):
@@ -412,10 +371,8 @@ def backward(head_grads, x, y):
                 np.zeros_like(y)]
 
 
-    dtype = "float32"
-    inputs = [('x', in_shape, x),
-              ('y', out_shape, y)]
-    helper(z, inputs, dtype, forward, backward, need_input=False)
+    shape = {'x': in_shape, 'y': out_shape}
+    check_function(z, forward, backward, shape=shape)
 
 
 def test_expand_like():
@@ -440,10 +397,8 @@ def forward(**inputs):
     def backward(head_grads, **inputs):
         return [head_grads] * num_args
 
-    dtype = "float32"
-    inputs = [("input" + str(i), (3, 4, 5), s[i])
-              for i in range(num_args)]
-    helper(y, inputs, dtype, forward, backward, need_input=False)
+    shape = {s[i]: (3, 4, 5) for i in range(num_args)}
+    check_function(y, forward, backward, shape=shape)
 
 
 def test_elemwise_sum():
@@ -463,9 +418,9 @@ def backward(head_grads, x):
         return [np.zeros_like(head_grads)]
 
 
-    dtype = "float32"
-    inputs = [('x', (3, 4, 5), x)]
-    helper(y, inputs, dtype, forward, backward, need_head_grads=False)
+    shape = {'x': (3, 4, 5)}
+    # Numerical grad checking would fail for this function
+    check_function(y, forward, backward, shape=shape, numerical_grads=False)
 
 
 def test_full():
@@ -480,7 +435,7 @@ def test_full():
         m = graph_runtime.create(graph, lib, ctx)
         m.run(data=np.random.uniform(size=shape).astype(dtype))
         out = m.get_output(0, tvm.nd.empty(shape, dtype=dtype))
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             out.asnumpy(),
             np.full(shape, fill_value=value, dtype=dtype),
             atol=1e-5, rtol=1e-5)
@@ -490,7 +445,7 @@ def test_full():
         m = graph_runtime.create(graph, lib, ctx)
         m.run(data=np.random.uniform(size=shape).astype(dtype))
         out = m.get_output(0, tvm.nd.empty(shape, dtype=dtype))
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             out.asnumpy(),
             np.full(shape, fill_value=1, dtype=dtype),
             atol=1e-5, rtol=1e-5)
@@ -500,7 +455,7 @@ def test_full():
         m = graph_runtime.create(graph, lib, ctx)
         m.run(data=np.random.uniform(size=shape).astype(dtype))
         out = m.get_output(0, tvm.nd.empty(shape, dtype=dtype))
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             out.asnumpy(),
             np.full(shape, fill_value=0, dtype=dtype),
             atol=1e-5, rtol=1e-5)
@@ -510,7 +465,7 @@ def test_full():
         m = graph_runtime.create(graph, lib, ctx)
         m.run()
         out = m.get_output(0, tvm.nd.empty(shape, dtype=dtype))
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             out.asnumpy(),
             np.full(shape, fill_value=value, dtype=dtype),
             atol=1e-5, rtol=1e-5)
@@ -520,7 +475,7 @@ def test_full():
         m = graph_runtime.create(graph, lib, ctx)
         m.run()
         out = m.get_output(0, tvm.nd.empty(shape, dtype=dtype))
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             out.asnumpy(),
             np.full(shape, fill_value=1, dtype=dtype),
             atol=1e-5, rtol=1e-5)
@@ -530,7 +485,7 @@ def test_full():
         m = graph_runtime.create(graph, lib, ctx)
         m.run()
         out = m.get_output(0, tvm.nd.empty(shape, dtype=dtype))
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             out.asnumpy(),
             np.full(shape, fill_value=0, dtype=dtype),
             atol=1e-5, rtol=1e-5)
@@ -579,7 +534,7 @@ def verify_multibox_prior(dshape, sizes=(1,), ratios=(1,), steps=(-1, -1),
     m.set_input("data", np.random.uniform(size=dshape).astype(dtype))
     m.run()
     out = m.get_output(0, tvm.nd.empty(np_out.shape, dtype))
-    np.testing.assert_allclose(out.asnumpy(), np_out, atol=1e-5, rtol=1e-5)
+    tvm.testing.assert_allclose(out.asnumpy(), np_out, atol=1e-5, rtol=1e-5)
 
 def test_multibox_prior():
     verify_multibox_prior((1, 3, 50, 50))
@@ -616,7 +571,7 @@ def test_multibox_transform_loc():
     m.set_input(**{"cls_prob": np_cls_prob.astype(dtype), "loc_preds": np_loc_preds.astype(dtype), "anchors": np_anchors.astype(dtype)})
     m.run()
     out = m.get_output(0, tvm.nd.empty(expected_np_out.shape, dtype))
-    np.testing.assert_allclose(out.asnumpy(), expected_np_out, atol=1e-5, rtol=1e-5)
+    tvm.testing.assert_allclose(out.asnumpy(), expected_np_out, atol=1e-5, rtol=1e-5)
 
 def test_nms():
     dshape = (1, 5, 6)
@@ -644,7 +599,7 @@ def test_nms():
     m.set_input(**{"data": np_data, "valid_count": np_valid_count})
     m.run()
     out = m.get_output(0, tvm.nd.empty(np_result.shape, "float32"))
-    np.testing.assert_allclose(out.asnumpy(), np_result, atol=1e-5, rtol=1e-5)
+    tvm.testing.assert_allclose(out.asnumpy(), np_result, atol=1e-5, rtol=1e-5)
 
 def np_slice_like(np_data, np_shape_like, axis=[]):
     begin_idx = [0 for _ in np_data.shape]
@@ -679,7 +634,7 @@ def verify_slice_like(np_data, np_shape_like, axis=[]):
         m.set_input(**{"data1": np_data, "data2": np_shape_like})
         m.run()
         out = m.get_output(0, tvm.nd.empty(np_result.shape, dtype))
-        np.testing.assert_allclose(out.asnumpy(), np_result, atol=1e-5, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), np_result, atol=1e-5, rtol=1e-5)
 
 def test_slice_like():
     np_data = np.random.uniform(size=(3, 4, 5))
@@ -718,7 +673,7 @@ def verify_where(condition, x, y):
         m.set_input(**{"condition": condition, "x": x, "y": y})
         m.run()
         out = m.get_output(0, tvm.nd.empty(x.shape, dtype))
-        np.testing.assert_allclose(out.asnumpy(), np_out, atol=1e-5, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), np_out, atol=1e-5, rtol=1e-5)
 
 def test_where():
     shape = (13, 8, 224, 224, 6)
@@ -731,6 +686,28 @@ def test_where():
     y = np.random.uniform(size=shape).astype("float32")
     verify_where(condition, x, y)
 
+def test_argmax():
+    dshape = (204800, 2)
+    oshape = (1, 320, 640)
+
+    dtype = "float32"
+    x = sym.Variable("x", shape=dshape, dtype=dtype)
+    x = sym.reshape(x, shape=(1, 320, 640, 2))
+    x = sym.transpose(x, axes=(0, 3, 1, 2))
+    y = sym.argmax(x, axis=1)
+    target_str = "llvm"
+    target = tvm.target.create(target_str)
+    ctx = tvm.context(target_str, 0)
+    with nnvm.compiler.build_config(opt_level=2):
+        graph, lib, _ = nnvm.compiler.build(y, target, {"x": dshape})
+    m = graph_runtime.create(graph, lib, ctx)
+    data = np.random.uniform(size=dshape).astype(dtype)
+    m.run(x=data)
+    np_reshape = np.reshape(data, (1, 320, 640, 2))
+    np_transpose = np.transpose(np_reshape, axes=(0, 3, 1, 2))
+    np_argmax = np.argmax(np_transpose, axis=1)
+    out = m.get_output(0)
+    np.testing.assert_allclose(out.asnumpy(), np_argmax, atol=1e-5, rtol=1e-5)
 
 if __name__ == "__main__":
     test_reshape()
@@ -752,4 +729,5 @@ def test_where():
     test_nms()
     test_slice_like()
     test_where()
+    test_argmax()
     print(nnvm.compiler.engine.dump())
diff --git a/nnvm/tests/python/frontend/caffe2/model_zoo/__init__.py b/nnvm/tests/python/frontend/caffe2/model_zoo/__init__.py
new file mode 100644
index 000000000000..302177e75288
--- /dev/null
+++ b/nnvm/tests/python/frontend/caffe2/model_zoo/__init__.py
@@ -0,0 +1,18 @@
+"""Store for caffe2 examples and common models."""
+from __future__ import absolute_import as _abs
+import os
+import importlib
+
+models = [
+    'squeezenet',
+    'resnet50',
+    'vgg19',
+]
+
+# skip download if model exist
+for model in models:
+    try:
+        locals()['c2_' + model] = importlib.import_module('caffe2.python.models.' + model)
+    except ImportError:
+        os.system("python -m caffe2.python.models.download -i -f " + model)
+        locals()['c2_' + model] = importlib.import_module('caffe2.python.models.' + model)
diff --git a/nnvm/tests/python/frontend/caffe2/model_zoo/squeezenet.py b/nnvm/tests/python/frontend/caffe2/model_zoo/squeezenet.py
new file mode 100644
index 000000000000..2de2d1075494
--- /dev/null
+++ b/nnvm/tests/python/frontend/caffe2/model_zoo/squeezenet.py
@@ -0,0 +1,118 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# coding: utf-8
+# pylint: disable=unused-argument
+
+"""
+Symbol of SqueezeNet
+
+Reference:
+Iandola, Forrest N., et al.
+"Squeezenet: Alexnet-level accuracy with 50x fewer parameters and< 0.5 mb model size." (2016).
+"""
+
+from nnvm import symbol as sym
+from nnvm.testing.utils import create_workload
+
+# Helpers
+def _make_fire(net, squeeze_channels, expand1x1_channels, expand3x3_channels):
+    net = _make_fire_conv(net, squeeze_channels, 1, 0)
+
+    left = _make_fire_conv(net, expand1x1_channels, 1, 0)
+    right = _make_fire_conv(net, expand3x3_channels, 3, 1)
+    # NOTE : Assume NCHW layout here
+    net = sym.concatenate(left, right, axis=1)
+
+    return net
+
+def _make_fire_conv(net, channels, kernel_size, padding=0):
+    net = sym.conv2d(net, channels=channels, kernel_size=(kernel_size, kernel_size),
+                     padding=(padding, padding))
+    net = sym.relu(net)
+    return net
+
+# Net
+def get_symbol(num_classes, version, **kwargs):
+    """Get symbol of SqueezeNet
+
+    Parameters
+    ----------
+    num_classes: int
+        The number of classification results
+
+    version : str, optional
+        "1.0" or "1.1" of SqueezeNet
+    """
+    assert version == '1.1', ("Unsupported SqueezeNet version {version}:"
+                              "1.1 expected".format(version=version))
+    net = sym.Variable("data")
+
+    net = sym.conv2d(net, channels=64, kernel_size=(3, 3), strides=(2, 2))
+    net = sym.relu(net)
+    net = sym.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
+    net = _make_fire(net, 16, 64, 64)
+    net = _make_fire(net, 16, 64, 64)
+    net = sym.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
+    net = _make_fire(net, 32, 128, 128)
+    net = _make_fire(net, 32, 128, 128)
+    net = sym.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
+    net = _make_fire(net, 48, 192, 192)
+    net = _make_fire(net, 48, 192, 192)
+    net = _make_fire(net, 64, 256, 256)
+    net = _make_fire(net, 64, 256, 256)
+
+    net = sym.dropout(net, rate=0.5)
+    net = sym.conv2d(net, channels=num_classes, kernel_size=(1, 1))
+    net = sym.relu(net)
+    net = sym.global_avg_pool2d(net)
+    return sym.softmax(net, axis=1)
+
+def get_workload(batch_size=1, num_classes=1000, version='1.0',
+                 image_shape=(3, 224, 224), dtype="float32", **kwargs):
+    """Get benchmark workload for SqueezeNet
+
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+
+    num_classes : int, optional
+        Number of classes
+
+    version : str, optional
+        "1.0" or "1.1" of SqueezeNet
+
+    image_shape : tuple, optional
+        The input image shape
+
+    dtype : str, optional
+        The data type
+
+    kwargs : dict
+        Extra arguments
+
+    Returns
+    -------
+    net : nnvm.Symbol
+        The computational graph
+
+    params : dict of str to NDArray
+        The parameters.
+    """
+    net = get_symbol(num_classes=num_classes, version=version, **kwargs)
+    return create_workload(net, batch_size, image_shape, dtype)
diff --git a/nnvm/tests/python/frontend/caffe2/test_forward.py b/nnvm/tests/python/frontend/caffe2/test_forward.py
new file mode 100644
index 000000000000..68a1ab7eda2b
--- /dev/null
+++ b/nnvm/tests/python/frontend/caffe2/test_forward.py
@@ -0,0 +1,93 @@
+import numpy as np
+import nnvm
+import tvm
+from tvm.contrib import graph_runtime
+from nnvm.testing.config import ctx_list
+from model_zoo import c2_squeezenet, c2_resnet50, c2_vgg19
+
+from caffe2.python import workspace
+
+
+def get_tvm_output(model,
+                   input_data,
+                   target,
+                   ctx,
+                   output_shape,
+                   output_dtype='float32'):
+    """ Generic function to execute and get tvm output"""
+    sym, params = nnvm.frontend.from_caffe2(model.init_net, model.predict_net)
+
+    # supporting multiple inputs in caffe2 in a bit tricky,
+    # because the input names can appear at the beginning or end of model.predict_net.external_input
+    assert isinstance(input_data, np.ndarray)
+
+    # here we use the first input blob to the first op to get the input name
+    input_names = model.predict_net.op[0].input[0]
+    shape_dict = {input_names: input_data.shape}
+    dtype_dict = {input_names: input_data.dtype}
+
+    graph, lib, params = nnvm.compiler.build(
+        sym, target, shape=shape_dict, dtype=dtype_dict, params=params)
+
+    ctx = tvm.cpu(0)
+    m = graph_runtime.create(graph, lib, ctx)
+
+    # set inputs
+    m.set_input(input_names, tvm.nd.array(input_data.astype(input_data.dtype)))
+    m.set_input(**params)
+
+    # execute
+    m.run()
+
+    # get outputs
+    if isinstance(output_shape, list) and isinstance(output_dtype, list):
+        tvm_output_list = []
+        for i, s in enumerate(output_shape):
+            tvm_output = m.get_output(i, tvm.nd.empty((s), output_dtype[i]))
+            tvm_output_list.append(tvm_output.asnumpy())
+        return tvm_output_list
+    else:
+        tvm_output = m.get_output(0, tvm.nd.empty((output_shape),
+                                                  output_dtype))
+        return tvm_output.asnumpy()
+
+
+def get_caffe2_output(model, x, dtype='float32'):
+    workspace.RunNetOnce(model.init_net)
+
+    input_blob = model.predict_net.op[0].input[0]
+    workspace.FeedBlob(input_blob, x.astype(dtype))
+    workspace.RunNetOnce(model.predict_net)
+
+    output_blob = model.predict_net.external_output[0]
+    c2_output = workspace.FetchBlob(output_blob)
+    return c2_output
+
+
+def verify_caffe2_forward_impl(model, data_shape, out_shape):
+    dtype = 'float32'
+    data = np.random.uniform(size=data_shape).astype(dtype)
+    c2_out = get_caffe2_output(model, data, dtype)
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, data, target, ctx, out_shape, dtype)
+        tvm.testing.assert_allclose(c2_out, tvm_out, rtol=1e-5, atol=1e-5)
+
+
+def verify_squeezenet1_1():
+    verify_caffe2_forward_impl(c2_squeezenet, (1, 3, 224, 224),
+                               (1, 1000, 1, 1))
+
+
+def verify_resnet50():
+    verify_caffe2_forward_impl(c2_resnet50, (1, 3, 224, 224),
+                               (1, 1000))
+
+
+def verify_vgg19():
+    verify_caffe2_forward_impl(c2_vgg19, (1, 3, 224, 224), (1, 1000))
+
+
+if __name__ == '__main__':
+    verify_squeezenet1_1()
+    verify_resnet50()
+    verify_vgg19()
diff --git a/nnvm/tests/python/frontend/caffe2/test_graph.py b/nnvm/tests/python/frontend/caffe2/test_graph.py
new file mode 100755
index 000000000000..425fc9a6201d
--- /dev/null
+++ b/nnvm/tests/python/frontend/caffe2/test_graph.py
@@ -0,0 +1,24 @@
+"""Test graph equality of caffe2 models."""
+import nnvm
+from nnvm.compiler import graph_util, graph_attr
+from model_zoo import c2_squeezenet, squeezenet
+
+def compare_graph(init, predict, nnvm_sym, ishape):
+    caffe2_sym, params = nnvm.frontend.from_caffe2(init, predict)
+    g1 = nnvm.graph.create(caffe2_sym)
+    g2 = nnvm.graph.create(nnvm_sym)
+    input_name = predict.external_input[0]
+    ishapes = {input_name: ishape}
+    graph_attr.set_shape_inputs(g1, ishapes)
+    graph_attr.set_shape_inputs(g2, ishapes)
+    g1 = g1.apply("InferShape").apply("SimplifyInference")
+    g2 = g2.apply("InferShape").apply("SimplifyInference")
+    graph_util.check_graph_equal(g1, g2)
+
+def test_squeeze_net():
+    symbol, params = squeezenet.get_workload(version='1.1')
+    compare_graph(c2_squeezenet.init_net, c2_squeezenet.predict_net, symbol, ishape=(1, 3, 224, 224))
+
+
+if __name__ == '__main__':
+    test_squeeze_net()
diff --git a/nnvm/tests/python/frontend/coreml/model_zoo/__init__.py b/nnvm/tests/python/frontend/coreml/model_zoo/__init__.py
index 87b9b5668432..0a39053b6d47 100644
--- a/nnvm/tests/python/frontend/coreml/model_zoo/__init__.py
+++ b/nnvm/tests/python/frontend/coreml/model_zoo/__init__.py
@@ -25,7 +25,7 @@ def get_resnet50():
 
 def get_cat_image():
     url = 'https://gist.githubusercontent.com/zhreshold/bcda4716699ac97ea44f791c24310193/raw/fa7ef0e9c9a5daea686d6473a62aacd1a5885849/cat.png'
-    dst = 'cat.jpg'
+    dst = 'cat.png'
     real_dst = os.path.abspath(os.path.join(os.path.dirname(__file__), dst))
     download(url, real_dst)
     img = Image.open(real_dst).resize((224, 224))
diff --git a/nnvm/tests/python/frontend/coreml/test_forward.py b/nnvm/tests/python/frontend/coreml/test_forward.py
index d5c460e56987..214c917cb96d 100644
--- a/nnvm/tests/python/frontend/coreml/test_forward.py
+++ b/nnvm/tests/python/frontend/coreml/test_forward.py
@@ -1,8 +1,12 @@
 import numpy as np
 
-import topi
+from coremltools.models.neural_network import NeuralNetworkBuilder
+from coremltools.models import datatypes
+
 import tvm
 from tvm.contrib import graph_runtime
+import topi
+import topi.testing
 import nnvm.symbol as sym
 import nnvm.compiler
 from nnvm.testing.config import ctx_list
@@ -40,6 +44,311 @@ def test_resnet50_checkonly():
     model_file = model_zoo.get_resnet50()
     test_model_checkonly(model_file, 'resnet50')
 
+def run_tvm_graph(graph_def, input_data, input_name, output_shape, output_dtype='float32'):
+    """ Generic function to compile on nnvm and execute on tvm """
+
+    sym, params = nnvm.frontend.from_coreml(graph_def)
+    target = 'llvm'
+    if isinstance(input_data, list):
+        shape_dict = {}
+        dtype_dict = {}
+        for i, e in enumerate(input_name):
+            shape_dict[e] = input_data[i].shape
+            dtype_dict[e] = input_data[i].dtype
+    else:
+        shape_dict = {input_name: input_data.shape}
+        dtype_dict = {input_name: input_data.dtype}
+
+    graph, lib, params = nnvm.compiler.build(sym, target, shape_dict,
+                                             dtype=dtype_dict, params=params)
+
+    ctx = tvm.cpu(0)
+    from tvm.contrib import graph_runtime
+    m = graph_runtime.create(graph, lib, ctx)
+    # set inputs
+    if isinstance(input_data, list):
+        for i, e in enumerate(input_name):
+            m.set_input(e, tvm.nd.array(input_data[i].astype(input_data[i].dtype)))
+    else:
+        m.set_input(input_name, tvm.nd.array(input_data.astype(input_data.dtype)))
+
+    m.set_input(**params)
+    # execute
+    m.run()
+    # get outputs
+    if isinstance(output_shape, list) and isinstance(output_dtype, list):
+        tvm_output_list = []
+        for i, s in enumerate(output_shape):
+            tvm_output = m.get_output(i, tvm.nd.empty((s), output_dtype[i]))
+            tvm_output_list.append(tvm_output.asnumpy())
+        return tvm_output_list
+    else:
+        tvm_output = m.get_output(0, tvm.nd.empty((output_shape), output_dtype))
+        return tvm_output.asnumpy()
+
+def verify_AddLayerParams(input_dim, alpha=2):
+    dtype = 'float32'
+
+    a_np1 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np2 = np.random.uniform(size=input_dim).astype(dtype)
+
+    b_np = np.add(a_np1, a_np2) + alpha
+    inputs = [('input1', datatypes.Array(*input_dim)),
+              ('input2', datatypes.Array(*input_dim))]
+    output = [('output', datatypes.Array(*b_np.shape))]
+    builder = NeuralNetworkBuilder(inputs, output)
+    builder.add_elementwise(name='Add',
+                            alpha=alpha,
+                            input_names=['input1', 'input2'],
+                            output_name='output',
+                            mode='ADD')
+    model = cm.models.MLModel(builder.spec)
+    for target, ctx in ctx_list():
+        out = run_tvm_graph(model,
+                           [a_np1, a_np2],
+                           ['input1', 'input2'],
+                           b_np.shape,
+                           dtype)
+        tvm.testing.assert_allclose(out, b_np, rtol=1e-5)
+
+def test_forward_AddLayerParams():
+    verify_AddLayerParams((1, 2, 2), 0)
+    verify_AddLayerParams((1, 2, 2), 1)
+    verify_AddLayerParams((1, 3, 3), 2)
+
+def verify_MultiplyLayerParams(input_dim, alpha):
+    dtype = 'float32'
+
+    a_np1 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np2 = np.random.uniform(size=input_dim).astype(dtype)
+
+    b_np = np.multiply(a_np1, a_np2) * alpha
+    inputs = [('input1', datatypes.Array(*input_dim)),
+              ('input2', datatypes.Array(*input_dim))]
+    output = [('output', datatypes.Array(*b_np.shape))]
+    builder = NeuralNetworkBuilder(inputs, output)
+    builder.add_elementwise(name='Mul',
+                            alpha=alpha,
+                            input_names=['input1', 'input2'],
+                            output_name='output',
+                            mode='MULTIPLY')
+    model = cm.models.MLModel(builder.spec)
+    for target, ctx in ctx_list():
+        out = run_tvm_graph(model,
+                           [a_np1, a_np2],
+                           ['input1', 'input2'],
+                           b_np.shape,
+                           dtype)
+        tvm.testing.assert_allclose(out, b_np, rtol=1e-5)
+
+def test_forward_MultiplyLayerParams():
+    verify_MultiplyLayerParams((1, 2, 2), 0)
+    verify_MultiplyLayerParams((1, 2, 2), 1)
+    verify_MultiplyLayerParams((1, 3, 3), 2)
+
+def verify_ConcatLayerParams(input1_dim, input2_dim):
+    dtype = 'float32'
+
+    a_np1 = np.random.uniform(size=input1_dim).astype(dtype)
+    a_np2 = np.random.uniform(size=input2_dim).astype(dtype)
+
+    b_np = np.concatenate((a_np1, a_np2), axis=1)
+    inputs = [('input1', datatypes.Array(*input1_dim)),
+              ('input2', datatypes.Array(*input2_dim))]
+    output = [('output', datatypes.Array(*b_np.shape))]
+    builder = NeuralNetworkBuilder(inputs, output)
+    builder.add_elementwise(name='Concate',
+                            input_names=['input1', 'input2'],
+                            output_name='output',
+                            mode='CONCAT')
+    model = cm.models.MLModel(builder.spec)
+    for target, ctx in ctx_list():
+        out = run_tvm_graph(model,
+                           [a_np1, a_np2],
+                           ['input1', 'input2'],
+                           b_np.shape,
+                           dtype)
+        tvm.testing.assert_allclose(out, b_np, rtol=1e-5)
+
+def test_forward_ConcatLayerParams():
+    verify_ConcatLayerParams((1, 1, 2, 2), (1, 2, 2, 2))
+    verify_ConcatLayerParams((1, 2, 4, 4), (1, 3, 4, 4))
+
+def verify_UpsampleLayerParams(input_dim, scale, mode):
+    dtype = "float32"
+
+    a_np = np.full(input_dim, 1, dtype=dtype)
+    if mode == 'NN':
+        b_np = topi.testing.upsampling_python(a_np, scale)
+    else:
+        new_h = input_dim[2] * scale
+        new_w = input_dim[3] * scale
+        b_np = topi.testing.bilinear_resize_python(a_np, (new_h, new_w), 'NCHW')
+
+    input = [('input', datatypes.Array(*input_dim))]
+    output = [('output', datatypes.Array(*b_np.shape))]
+    builder = NeuralNetworkBuilder(input, output)
+    builder.add_upsample(name='Upsample',
+                         scaling_factor_h=scale,
+                         scaling_factor_w=scale,
+                         mode=mode,
+                         input_name='input',
+                         output_name='output')
+
+    model = cm.models.MLModel(builder.spec)
+    for target, ctx in ctx_list():
+        out = run_tvm_graph(model, a_np, 'input', b_np.shape, dtype)
+        tvm.testing.assert_allclose(out, b_np, rtol=1e-5)
+
+def test_forward_UpsampleLayerParams():
+    verify_UpsampleLayerParams((1, 16, 32, 32), 2, 'NN')
+    verify_UpsampleLayerParams((1, 4, 6, 6), 3, 'BILINEAR')
+
+def verify_l2_normalize(input_dim, eps):
+    dtype = "float32"
+
+    a_np = np.random.uniform(size=input_dim).astype(dtype)
+    b_np = topi.testing.l2_normalize_python(a_np, eps, 1)
+
+    input = [('input', datatypes.Array(*input_dim))]
+    output = [('output', datatypes.Array(*b_np.shape))]
+    builder = NeuralNetworkBuilder(input, output)
+    builder.add_l2_normalize(name='L2', epsilon=eps, input_name='input', output_name='output')
+
+    model = cm.models.MLModel(builder.spec)
+    for target, ctx in ctx_list():
+        out = run_tvm_graph(model, a_np, 'input', b_np.shape, dtype)
+        tvm.testing.assert_allclose(out, b_np, rtol=1e-5)
+
+def test_forward_l2_normalize():
+    verify_l2_normalize((1, 3, 20, 20), 0.001)
+
+def verify_lrn(input_dim, size, bias, alpha, beta):
+    dtype = "float32"
+    axis=1
+    a_np = np.random.uniform(size=input_dim).astype(dtype)
+    b_np = topi.testing.lrn_python(a_np, size, axis, bias, alpha, beta)
+
+    input = [('input', datatypes.Array(*input_dim))]
+    output = [('output', datatypes.Array(*b_np.shape))]
+    builder = NeuralNetworkBuilder(input, output)
+    builder.add_lrn(name='LRN',
+                    input_name='input',
+                    output_name='output',
+                    alpha=alpha,
+                    beta=beta,
+                    k=bias,
+                    local_size=size)
+
+    model = cm.models.MLModel(builder.spec)
+    for target, ctx in ctx_list():
+        out = run_tvm_graph(model, a_np, 'input', b_np.shape, dtype)
+        tvm.testing.assert_allclose(out, b_np, rtol=1e-5)
+
+def test_forward_lrn():
+    verify_lrn((1, 3, 10, 20), 3, 1.0, 1.0, 0.5)
+
+def verify_average(input_dim1, input_dim2, axis=0):
+    dtype = 'float32'
+
+    a_np1 = np.random.uniform(size=input_dim1).astype(dtype)
+    a_np2 = np.random.uniform(size=input_dim2).astype(dtype)
+
+    b_np = np.mean((a_np1, a_np2), axis=axis)
+
+    inputs = [('input1', datatypes.Array(*input_dim1)),
+              ('input2', datatypes.Array(*input_dim2))]
+    output = [('output', datatypes.Array(*b_np.shape))]
+    builder = NeuralNetworkBuilder(inputs, output)
+    builder.add_elementwise(name='MEAN',
+                            input_names=['input1', 'input2'],
+                            output_name='output',
+                            mode='AVE')
+    model = cm.models.MLModel(builder.spec)
+    for target, ctx in ctx_list():
+        out = run_tvm_graph(model,
+                           [a_np1, a_np2],
+                           ['input1', 'input2'],
+                           b_np.shape,
+                           dtype)
+        tvm.testing.assert_allclose(out, b_np, rtol=1e-5)
+
+def test_forward_average():
+    verify_average((1, 3, 20, 20), (1, 3, 20, 20))
+    verify_average((3, 20, 20), (1, 3, 20, 20))
+    verify_average((20, 20), (1, 3, 20, 20))
+
+def verify_max(input_dim):
+    dtype = 'float32'
+
+    a_np1 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np2 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np3 = np.random.uniform(size=input_dim).astype(dtype)
+
+    b_np = np.max((a_np1, a_np2, a_np3), axis=0)
+
+    inputs = [('input1', datatypes.Array(*input_dim)),
+              ('input2', datatypes.Array(*input_dim)),
+              ('input3', datatypes.Array(*input_dim))]
+    output = [('output', datatypes.Array(*b_np.shape))]
+    builder = NeuralNetworkBuilder(inputs, output)
+    builder.add_elementwise(name='Max',
+                            input_names=['input1', 'input2', 'input3'],
+                            output_name='output',
+                            mode='MAX')
+    model = cm.models.MLModel(builder.spec)
+    for target, ctx in ctx_list():
+        out = run_tvm_graph(model,
+                           [a_np1, a_np2, a_np3],
+                           ['input1', 'input2', 'input3'],
+                           b_np.shape,
+                           dtype)
+        tvm.testing.assert_allclose(out, b_np, rtol=1e-5)
+
+def test_forward_max():
+    verify_max((1, 3, 20, 20))
+    verify_max((20, 20))
+
+def verify_min(input_dim):
+    dtype = 'float32'
+
+    a_np1 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np2 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np3 = np.random.uniform(size=input_dim).astype(dtype)
+
+    b_np = np.min((a_np1, a_np2, a_np3), axis=0)
+
+    inputs = [('input1', datatypes.Array(*input_dim)),
+              ('input2', datatypes.Array(*input_dim)),
+              ('input3', datatypes.Array(*input_dim))]
+    output = [('output', datatypes.Array(*b_np.shape))]
+    builder = NeuralNetworkBuilder(inputs, output)
+    builder.add_elementwise(name='Min',
+                            input_names=['input1', 'input2', 'input3'],
+                            output_name='output',
+                            mode='MIN')
+    model = cm.models.MLModel(builder.spec)
+    for target, ctx in ctx_list():
+        out = run_tvm_graph(model,
+                           [a_np1, a_np2, a_np3],
+                           ['input1', 'input2', 'input3'],
+                           b_np.shape,
+                           dtype)
+        tvm.testing.assert_allclose(out, b_np, rtol=1e-5)
+
+def test_forward_min():
+    verify_min((1, 3, 20, 20))
+    verify_min((20, 20))
+
 if __name__ == '__main__':
     test_mobilenet_checkonly()
     test_resnet50_checkonly()
+    test_forward_AddLayerParams()
+    test_forward_ConcatLayerParams()
+    test_forward_MultiplyLayerParams()
+    test_forward_UpsampleLayerParams()
+    test_forward_l2_normalize()
+    test_forward_lrn()
+    test_forward_average()
+    test_forward_max()
+    test_forward_min()
diff --git a/nnvm/tests/python/frontend/darknet/test_forward.py b/nnvm/tests/python/frontend/darknet/test_forward.py
index e68aed085664..1f5e89c6e4d5 100644
--- a/nnvm/tests/python/frontend/darknet/test_forward.py
+++ b/nnvm/tests/python/frontend/darknet/test_forward.py
@@ -13,6 +13,7 @@
 import tvm
 from tvm.contrib import graph_runtime
 from nnvm import frontend
+from nnvm.testing.darknet import LAYERTYPE
 from nnvm.testing.darknet import __darknetffi__
 import nnvm.compiler
 if sys.version_info >= (3,):
@@ -44,20 +45,30 @@ def _download(url, path, overwrite=False, sizecompare=False):
     except:
         urllib.urlretrieve(url, path)
 
-DARKNET_LIB = 'libdarknet.so'
+DARKNET_LIB = 'libdarknet2.0.so'
 DARKNETLIB_URL = 'https://github.com/siju-samuel/darknet/blob/master/lib/' \
                                     + DARKNET_LIB + '?raw=true'
 _download(DARKNETLIB_URL, DARKNET_LIB)
 LIB = __darknetffi__.dlopen('./' + DARKNET_LIB)
 
-def _get_tvm_output(net, data):
+def _read_memory_buffer(shape, data, dtype='float32'):
+    length = 1
+    for x in shape:
+        length *= x
+    data_np = np.zeros(length, dtype=dtype)
+    for i in range(length):
+        data_np[i] = data[i]
+    return data_np.reshape(shape)
+
+def _get_tvm_output(net, data, build_dtype='float32'):
     '''Compute TVM output'''
     dtype = 'float32'
     sym, params = frontend.darknet.from_darknet(net, dtype)
 
     target = 'llvm'
     shape_dict = {'data': data.shape}
-    graph, library, params = nnvm.compiler.build(sym, target, shape_dict, dtype, params=params)
+    graph, library, params = nnvm.compiler.build(sym, target, shape_dict,
+                                                 build_dtype, params=params)
     # Execute on TVM
     ctx = tvm.cpu(0)
     m = graph_runtime.create(graph, library, ctx)
@@ -66,14 +77,50 @@ def _get_tvm_output(net, data):
     m.set_input(**params)
     m.run()
     # get outputs
-    out_shape = (net.outputs,)
-    tvm_out = m.get_output(0, tvm.nd.empty(out_shape, dtype)).asnumpy()
+    tvm_out = []
+    for i in range(m.get_num_outputs()):
+        tvm_out.append(m.get_output(i).asnumpy())
     return tvm_out
 
-def test_forward(net):
+def test_forward(net, build_dtype='float32'):
     '''Test network with given input image on both darknet and tvm'''
     def get_darknet_output(net, img):
-        return LIB.network_predict_image(net, img)
+        LIB.network_predict_image(net, img)
+        out = []
+        for i in range(net.n):
+            layer = net.layers[i]
+            if layer.type == LAYERTYPE.REGION:
+                attributes = np.array([layer.n, layer.out_c, layer.out_h,
+                                       layer.out_w, layer.classes,
+                                       layer.coords, layer.background],
+                                      dtype=np.int32)
+                out.insert(0, attributes)
+                out.insert(0, _read_memory_buffer((layer.n*2, ), layer.biases))
+                layer_outshape = (layer.batch, layer.out_c,
+                                  layer.out_h, layer.out_w)
+                out.insert(0, _read_memory_buffer(layer_outshape, layer.output))
+            elif layer.type == LAYERTYPE.YOLO:
+                attributes = np.array([layer.n, layer.out_c, layer.out_h,
+                                       layer.out_w, layer.classes,
+                                       layer.total],
+                                      dtype=np.int32)
+                out.insert(0, attributes)
+                out.insert(0, _read_memory_buffer((layer.total*2, ), layer.biases))
+                out.insert(0, _read_memory_buffer((layer.n, ), layer.mask, dtype='int32'))
+                layer_outshape = (layer.batch, layer.out_c,
+                                  layer.out_h, layer.out_w)
+                out.insert(0, _read_memory_buffer(layer_outshape, layer.output))
+            elif i == net.n-1:
+                if layer.type == LAYERTYPE.CONNECTED:
+                    darknet_outshape = (layer.batch, layer.out_c)
+                elif layer.type in [LAYERTYPE.SOFTMAX]:
+                    darknet_outshape = (layer.batch, layer.outputs)
+                else:
+                    darknet_outshape = (layer.batch, layer.out_c,
+                                        layer.out_h, layer.out_w)
+                out.insert(0, _read_memory_buffer(darknet_outshape, layer.output))
+        return out
+
     dtype = 'float32'
 
     test_image = 'dog.jpg'
@@ -81,11 +128,7 @@ def get_darknet_output(net, img):
     _download(img_url, test_image)
     img = LIB.letterbox_image(LIB.load_image_color(test_image.encode('utf-8'), 0, 0), net.w, net.h)
     darknet_output = get_darknet_output(net, img)
-    darknet_out = np.zeros(net.outputs, dtype='float32')
-    for i in range(net.outputs):
-        darknet_out[i] = darknet_output[i]
     batch_size = 1
-
     data = np.empty([batch_size, img.c, img.h, img.w], dtype)
     i = 0
     for c in range(img.c):
@@ -94,8 +137,9 @@ def get_darknet_output(net, img):
                 data[0][c][h][k] = img.data[i]
                 i = i + 1
 
-    tvm_out = _get_tvm_output(net, data)
-    np.testing.assert_allclose(darknet_out, tvm_out, rtol=1e-3, atol=1e-3)
+    tvm_out = _get_tvm_output(net, data, build_dtype)
+    for tvm_outs, darknet_out in zip(tvm_out, darknet_output):
+        tvm.testing.assert_allclose(darknet_out, tvm_outs, rtol=1e-3, atol=1e-3)
 
 def test_rnn_forward(net):
     '''Test network with given input data on both darknet and tvm'''
@@ -106,12 +150,15 @@ def get_darknet_network_predict(net, data):
     np_arr = np.zeros([1, net.inputs], dtype='float32')
     np_arr[0, 84] = 1
     cffi_arr = ffi.cast('float*', np_arr.ctypes.data)
-    tvm_out = _get_tvm_output(net, np_arr)
+    tvm_out = _get_tvm_output(net, np_arr)[0]
     darknet_output = get_darknet_network_predict(net, cffi_arr)
     darknet_out = np.zeros(net.outputs, dtype='float32')
     for i in range(net.outputs):
         darknet_out[i] = darknet_output[i]
-    np.testing.assert_allclose(darknet_out, tvm_out, rtol=1e-4, atol=1e-4)
+    last_layer = net.layers[net.n-1]
+    darknet_outshape = (last_layer.batch, last_layer.outputs)
+    darknet_out = darknet_out.reshape(darknet_outshape)
+    tvm.testing.assert_allclose(darknet_out, tvm_out, rtol=1e-4, atol=1e-4)
 
 def test_forward_extraction():
     '''test extraction model'''
@@ -152,8 +199,8 @@ def test_forward_resnet50():
     test_forward(net)
     LIB.free_network(net)
 
-def test_forward_yolo():
-    '''test yolo model'''
+def test_forward_yolov2():
+    '''test yolov2 model'''
     model_name = 'yolov2'
     cfg_name = model_name + '.cfg'
     weights_name = model_name + '.weights'
@@ -162,7 +209,22 @@ def test_forward_yolo():
     _download(cfg_url, cfg_name)
     _download(weights_url, weights_name)
     net = LIB.load_network(cfg_name.encode('utf-8'), weights_name.encode('utf-8'), 0)
-    test_forward(net)
+    build_dtype = {}
+    test_forward(net, build_dtype)
+    LIB.free_network(net)
+
+def test_forward_yolov3():
+    '''test yolov3 model'''
+    model_name = 'yolov3'
+    cfg_name = model_name + '.cfg'
+    weights_name = model_name + '.weights'
+    cfg_url = 'https://github.com/pjreddie/darknet/blob/master/cfg/' + cfg_name + '?raw=true'
+    weights_url = 'http://pjreddie.com/media/files/' + weights_name + '?raw=true'
+    _download(cfg_url, cfg_name)
+    _download(weights_url, weights_name)
+    net = LIB.load_network(cfg_name.encode('utf-8'), weights_name.encode('utf-8'), 0)
+    build_dtype = {}
+    test_forward(net, build_dtype)
     LIB.free_network(net)
 
 def test_forward_convolutional():
@@ -239,6 +301,8 @@ def test_forward_shortcut():
     layer_2 = LIB.make_convolutional_layer(1, 111, 111, 32, 32, 1, 1, 1, 0, 1, 0, 0, 0, 0)
     layer_3 = LIB.make_shortcut_layer(1, 0, 111, 111, 32, 111, 111, 32)
     layer_3.activation = 1
+    layer_3.alpha = 1
+    layer_3.beta = 1
     net.layers[0] = layer_1
     net.layers[1] = layer_2
     net.layers[2] = layer_3
@@ -269,6 +333,44 @@ def test_forward_region():
     net.layers[1] = layer_2
     net.w = net.h = 224
     LIB.resize_network(net, 224, 224)
+    build_dtype = {}
+    test_forward(net, build_dtype)
+    LIB.free_network(net)
+
+def test_forward_yolo_op():
+    '''test yolo layer'''
+    net = LIB.make_network(2)
+    layer_1 = LIB.make_convolutional_layer(1, 224, 224, 3, 14, 1, 3, 2, 0, 1, 0, 0, 0, 0)
+    layer_2 = LIB.make_yolo_layer(1, 111, 111, 2, 9, __darknetffi__.NULL, 2)
+    net.layers[0] = layer_1
+    net.layers[1] = layer_2
+    net.w = net.h = 224
+    LIB.resize_network(net, 224, 224)
+    build_dtype = {}
+    test_forward(net, build_dtype)
+    LIB.free_network(net)
+
+def test_forward_upsample():
+    '''test upsample layer'''
+    net = LIB.make_network(1)
+    layer = LIB.make_upsample_layer(1, 19, 19, 3, 3)
+    layer.scale = 1
+    net.layers[0] = layer
+    net.w = net.h = 19
+    LIB.resize_network(net, 19, 19)
+    test_forward(net)
+    LIB.free_network(net)
+
+def test_forward_l2normalize():
+    '''test l2 normalization layer'''
+    net = LIB.make_network(1)
+    layer = LIB.make_l2norm_layer(1, 224*224*3)
+    layer.c = layer.out_c = 3
+    layer.h = layer.out_h = 224
+    layer.w = layer.out_w = 224
+    net.layers[0] = layer
+    net.w = net.h = 224
+    LIB.resize_network(net, 224, 224)
     test_forward(net)
     LIB.free_network(net)
 
@@ -287,7 +389,7 @@ def test_forward_softmax():
     '''test softmax layer'''
     net = LIB.make_network(1)
     layer_1 = LIB.make_softmax_layer(1, 75, 1)
-    layer_1.temperature=1
+    layer_1.temperature = 1
     net.layers[0] = layer_1
     net.w = net.h = 5
     LIB.resize_network(net, net.w, net.h)
@@ -298,7 +400,7 @@ def test_forward_softmax_temperature():
     '''test softmax layer'''
     net = LIB.make_network(1)
     layer_1 = LIB.make_softmax_layer(1, 75, 1)
-    layer_1.temperature=0.8
+    layer_1.temperature = 0.8
     net.layers[0] = layer_1
     net.w = net.h = 5
     LIB.resize_network(net, net.w, net.h)
@@ -306,7 +408,7 @@ def test_forward_softmax_temperature():
     LIB.free_network(net)
 
 def test_forward_rnn():
-    '''test softmax layer'''
+    '''test RNN layer'''
     net = LIB.make_network(1)
     batch = 1
     inputs = 256
@@ -325,7 +427,7 @@ def test_forward_rnn():
     LIB.free_network(net)
 
 def test_forward_crnn():
-    '''test softmax layer'''
+    '''test CRNN layer'''
     net = LIB.make_network(1)
     batch = 1
     c = 3
@@ -349,6 +451,42 @@ def test_forward_crnn():
     test_forward(net)
     LIB.free_network(net)
 
+def test_forward_lstm():
+    '''test LSTM layer'''
+    net = LIB.make_network(1)
+    batch = 1
+    inputs = 256
+    outputs = 256
+    steps = 1
+    batch_normalize = 0
+    adam = 0
+    layer_1 = LIB.make_lstm_layer(batch, inputs, outputs, steps, batch_normalize, adam)
+    net.layers[0] = layer_1
+    net.inputs = inputs
+    net.outputs = outputs
+    net.w = net.h = 0
+    LIB.resize_network(net, net.w, net.h)
+    test_rnn_forward(net)
+    LIB.free_network(net)
+
+def test_forward_gru():
+    '''test GRU layer'''
+    net = LIB.make_network(1)
+    batch = 1
+    inputs = 256
+    outputs = 256
+    steps = 1
+    batch_normalize = 0
+    adam = 0
+    layer_1 = LIB.make_gru_layer(batch, inputs, outputs, steps, batch_normalize, adam)
+    net.layers[0] = layer_1
+    net.inputs = inputs
+    net.outputs = outputs
+    net.w = net.h = 0
+    LIB.resize_network(net, net.w, net.h)
+    test_rnn_forward(net)
+    LIB.free_network(net)
+
 def test_forward_activation_logistic():
     '''test logistic activation layer'''
     net = LIB.make_network(1)
@@ -379,7 +517,8 @@ def test_forward_activation_logistic():
     test_forward_resnet50()
     test_forward_alexnet()
     test_forward_extraction()
-    test_forward_yolo()
+    test_forward_yolov2()
+    test_forward_yolov3()
     test_forward_convolutional()
     test_forward_maxpooling()
     test_forward_avgpooling()
@@ -392,7 +531,12 @@ def test_forward_activation_logistic():
     test_forward_rnn()
     test_forward_reorg()
     test_forward_region()
+    test_forward_yolo_op()
+    test_forward_upsample()
+    test_forward_l2normalize()
     test_forward_elu()
     test_forward_rnn()
     test_forward_crnn()
-    test_forward_activation_logistic()
\ No newline at end of file
+    test_forward_lstm()
+    test_forward_gru()
+    test_forward_activation_logistic()
diff --git a/nnvm/tests/python/frontend/keras/test_forward.py b/nnvm/tests/python/frontend/keras/test_forward.py
index 17c9fc1329d7..618af3b2e417 100644
--- a/nnvm/tests/python/frontend/keras/test_forward.py
+++ b/nnvm/tests/python/frontend/keras/test_forward.py
@@ -13,14 +13,13 @@
 set_session(tf.Session(config=config))
 
 
-def verify_keras_frontend(keras_model):
+def verify_keras_frontend(keras_model, need_transpose=True):
     # Keras frontend currently supports tensorflow backend only.
     assert(keras.backend.backend() == 'tensorflow')
 
     in_shapes = []
     for layer in keras_model._input_layers:
         in_shapes.append(tuple(dim.value if dim.value is not None else 1 for dim in layer.input.shape))
-    out_shape = [dim.value if dim.value is not None else 1 for dim in keras_model._output_layers[0].output.shape]
 
     def get_keras_output(xs, dtype='float32'):
         return keras_model.predict(xs)
@@ -35,15 +34,25 @@ def get_tvm_output(xs, target, ctx, dtype='float32'):
             m.set_input(name, tvm.nd.array(x.astype(dtype)))
         m.set_input(**params)
         m.run()
-        out = m.get_output(0, tvm.nd.empty(out_shape, dtype))
-        return out.asnumpy()
 
-    xs = [np.random.uniform(size=shape) for shape in in_shapes]
+        return [m.get_output(i).asnumpy() for i in range(m.get_num_outputs())]
+
+    def to_channels_first(arr):
+        return arr.transpose([0, -1] + list(range(1, arr.ndim - 1)))
+
+    def to_channels_last(arr):
+        return arr.transpose([0] + list(range(2, arr.ndim)) + [1])
+
+    xs = [np.random.uniform(size=shape, low=-1.0, high=1.0) for shape in in_shapes]
     keras_out = get_keras_output(xs)
-    for target, ctx in ctx_list():
-        tvm_out = get_tvm_output([x.transpose([0,3,1,2]) for x in xs], target, ctx)
-        np.testing.assert_allclose(keras_out, tvm_out, rtol=1e-5, atol=1e-5)
 
+    keras_out = keras_out if isinstance(keras_out, list) else [keras_out]
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output([to_channels_first(x) for x in xs] if need_transpose else xs, target, ctx)
+        for kout, tout in zip(keras_out, tvm_out):
+            if need_transpose:
+                tout = to_channels_last(tout)
+            tvm.testing.assert_allclose(kout, tout, rtol=1e-5, atol=1e-5)
 
 def test_forward_elemwise_add():
     r = []
@@ -64,81 +73,110 @@ def test_forward_elemwise_add():
     keras_model = keras.models.Model(data, y)
     verify_keras_frontend(keras_model)
 
-def test_forward_dense():
-    data = keras.layers.Input(shape=(32,32,3))
-    x = keras.layers.MaxPooling2D(pool_size=(2,2))(data)
-    x = keras.layers.Flatten()(x)
+
+def _test_forward_dense():
+    data = keras.layers.Input(shape=(32,32,1))
+    x = keras.layers.Flatten()(data)
     x = keras.layers.Dropout(0.5)(x)
     x = keras.layers.Dense(10, activation='relu', kernel_initializer='uniform')(x)
     keras_model = keras.models.Model(data, x)
     verify_keras_frontend(keras_model)
 
+def _test_forward_dense_with_3d_inp():
+    data = keras.layers.Input(shape=(1, 20))
+    x = keras.layers.Dense(10, activation='relu', kernel_initializer='uniform')(data)
+    keras_model = keras.models.Model(data, x)
+    verify_keras_frontend(keras_model, need_transpose=False)
 
-def test_forward_transpose_conv():
-    data = keras.layers.Input(shape=(32,32,3))
-    x = keras.layers.Conv2D(filters=10, kernel_size=(3,3), strides=(2,2), padding='same')(data)
-    x = keras.layers.DepthwiseConv2D(kernel_size=(3,3), padding='same')(x)
-    x = keras.layers.Conv2DTranspose(filters=64, kernel_size=(3,3), padding='valid')(x)
-    x = keras.layers.GlobalMaxPooling2D()(x)
+def test_forward_dense():
+    _test_forward_dense()
+    _test_forward_dense_with_3d_inp()
+
+def test_forward_pool():
+    data = keras.layers.Input(shape=(32,32,1))
+    # maxpool
+    x = keras.layers.MaxPooling2D((3, 3), strides=(1, 1), padding='same')(data)
     keras_model = keras.models.Model(data, x)
     verify_keras_frontend(keras_model)
+    # avgpool
+    y = keras.layers.AveragePooling2D((3, 3), strides=(1, 1), padding='same')(data)
+    keras_model = keras.models.Model(data, y)
+    verify_keras_frontend(keras_model)
 
 
-def test_forward_separable_conv():
+def test_forward_conv():
     data = keras.layers.Input(shape=(32,32,3))
-    x = keras.layers.SeparableConv2D(filters=10, kernel_size=(3,3),
-        padding='same', activation='relu')(data)
-    x = keras.layers.BatchNormalization(scale=True, center=False,
-        beta_initializer='uniform', gamma_initializer='uniform')(x)
-    x = keras.layers.GlobalAveragePooling2D()(x)
-    keras_model = keras.models.Model(data, x)
-    verify_keras_frontend(keras_model)
+    conv_funcs = [keras.layers.Conv2D(filters=10, kernel_size=(3,3),
+                                      strides=(2,2), padding='same'),
+                  keras.layers.Conv2D(filters=10, kernel_size=(3,3),
+                                      dilation_rate=(2,2), padding='same'),
+                  keras.layers.DepthwiseConv2D(kernel_size=(3,3), padding='same'),
+                  keras.layers.Conv2DTranspose(filters=10, kernel_size=(3,3), padding='valid'),
+                  keras.layers.SeparableConv2D(filters=10, kernel_size=(3,3), padding='same')]
+    for conv_func in conv_funcs:
+        x = conv_func(data)
+        keras_model = keras.models.Model(data, x)
+        verify_keras_frontend(keras_model)
 
 
 def test_forward_upsample():
     data = keras.layers.Input(shape=(32,32,3))
     x = keras.layers.UpSampling2D(size=(3,3))(data)
-    x = keras.layers.GlobalAveragePooling2D()(x)
     keras_model = keras.models.Model(data, x)
     verify_keras_frontend(keras_model)
 
+
 def test_forward_reshape():
     data = keras.layers.Input(shape=(32,32,3))
     x = keras.layers.Reshape(target_shape=(32,32,3))(data)
-    x = keras.layers.GlobalAveragePooling2D()(x)
+    keras_model = keras.models.Model(data, x)
+    verify_keras_frontend(keras_model)
+
+
+def test_forward_crop():
+    data = keras.layers.Input(shape=(32,32,3))
+    x = keras.layers.Cropping2D(cropping=((1, 1), (1, 1)))(data)
+    x = keras.layers.Cropping2D(cropping=(1, 1))(x)
+    x = keras.layers.Cropping2D(cropping=1)(x)
+    x = keras.layers.Cropping2D(cropping=((0, 1), (1, 0)))(x)
+    x = keras.layers.Cropping2D(cropping=(1, 0))(x)
+    x = keras.layers.Cropping2D(cropping=0)(x)
+    x = keras.layers.Add()([x, x])
     keras_model = keras.models.Model(data, x)
     verify_keras_frontend(keras_model)
 
 
 def test_forward_vgg16():
-    keras_model = keras.applications.vgg16.VGG16(include_top=True, weights=None,
+    keras_model = keras.applications.vgg16.VGG16(include_top=True, weights='imagenet',
         input_shape=(224,224,3), classes=1000)
     verify_keras_frontend(keras_model)
 
 
 def test_forward_xception():
-    keras_model = keras.applications.xception.Xception(include_top=True, weights=None,
+    keras_model = keras.applications.xception.Xception(include_top=True, weights='imagenet',
         input_shape=(299,299,3), classes=1000)
     verify_keras_frontend(keras_model)
 
 
 def test_forward_resnet50():
-    keras_model = keras.applications.resnet50.ResNet50(include_top=True, weights=None,
+    keras_model = keras.applications.resnet50.ResNet50(include_top=True, weights='imagenet',
         input_shape=(224,224,3), classes=1000)
     verify_keras_frontend(keras_model)
 
 
 def test_forward_mobilenet():
-    keras_model = keras.applications.mobilenet.MobileNet(include_top=True, weights=None,
+    keras_model = keras.applications.mobilenet.MobileNet(include_top=True, weights='imagenet',
         input_shape=(224,224,3), classes=1000)
     verify_keras_frontend(keras_model)
 
+
 def test_forward_activations():
     data = keras.layers.Input(shape=(32,32,3))
     weights = np.random.rand(1, 32, 32, 3)
     act_funcs = [keras.layers.Activation('softmax'),
                  keras.layers.Activation('softplus'),
                  keras.layers.ReLU(),
+                 keras.layers.ReLU(max_value=6.),
                  keras.layers.LeakyReLU(alpha=0.3),
                  keras.layers.PReLU(weights=weights, alpha_initializer="zero"),
                  keras.layers.ELU(alpha=0.5),
@@ -151,10 +189,10 @@ def test_forward_activations():
                  keras.layers.Activation('linear')]
     for act_func in act_funcs:
         x = act_func(data)
-        x = keras.layers.GlobalMaxPooling2D()(x)
         keras_model = keras.models.Model(data, x)
         verify_keras_frontend(keras_model)
 
+
 def test_forward_multi_inputs():
     data1 = keras.layers.Input(shape=(32,32,3))
     data2 = keras.layers.Input(shape=(32,32,3))
@@ -166,6 +204,16 @@ def test_forward_multi_inputs():
     verify_keras_frontend(keras_model)
 
 
+def test_forward_multi_outputs():
+    data = keras.layers.Input(shape=(32,32,3))
+    x = keras.layers.Conv2D(8, (3, 3), padding="same")(data)
+    x = keras.layers.GlobalAveragePooling2D()(x)
+    y = keras.layers.Conv2D(8, (3, 3), padding="same")(data)
+    y = keras.layers.GlobalAveragePooling2D()(y)
+    keras_model = keras.models.Model(data, [x, y])
+    verify_keras_frontend(keras_model)
+
+
 def test_forward_reuse_layers():
     # reuse conv2d
     data = keras.layers.Input(shape=(32,32,3))
@@ -187,19 +235,104 @@ def test_forward_reuse_layers():
     keras_model = keras.models.Model(data, z)
     verify_keras_frontend(keras_model)
 
+def _test_LSTM(time_steps, inputs, hidden, return_state=True):
+    data = keras.layers.Input(shape=(time_steps, inputs))
+    lstm_out = keras.layers.LSTM(hidden,
+                                 return_state=return_state,
+                                 recurrent_activation='sigmoid',
+                                 activation='tanh')
+    x = lstm_out(data)
+    keras_model = keras.models.Model(data, x)
+    verify_keras_frontend(keras_model, need_transpose=False)
+
+def _test_LSTM_MultiLayer(inputs, hidden):
+    inputs = keras.layers.Input(shape=(1, inputs))
+    layer = keras.layers.LSTM(hidden, return_state=True, return_sequences=True,
+                                 recurrent_activation='sigmoid',
+                                 activation='tanh')
+    outputs = layer(inputs)
+    output, state = outputs[0], outputs[1:]
+    output = keras.layers.LSTM(hidden, recurrent_activation='sigmoid',
+                               activation='tanh')(output, initial_state=state)
+    keras_model = keras.models.Model(inputs, output)
+    verify_keras_frontend(keras_model, need_transpose=False)
+
+
+def test_forward_LSTM():
+    _test_LSTM(1, 8, 8, return_state=True)
+    _test_LSTM(1, 4, 4, return_state=False)
+    _test_LSTM(20, 16, 256, return_state=False)
+    _test_LSTM_MultiLayer(4, 4)
+
+def _test_RNN(inputs, units):
+    data = keras.layers.Input(shape=(1, inputs))
+    rnn_out = keras.layers.SimpleRNN(units, return_state=True,
+                                 activation='tanh')
+    x = rnn_out(data)
+    keras_model = keras.models.Model(data, x)
+    verify_keras_frontend(keras_model, need_transpose=False)
+
+def _test_RNN_MultiLayer(inputs, units):
+    inputs = keras.layers.Input(shape=(1, inputs))
+    layer = keras.layers.SimpleRNN(units, return_state=True, return_sequences=True,
+                                   activation='tanh')
+    outputs = layer(inputs)
+    output, state = outputs[0], outputs[1:]
+    output = keras.layers.SimpleRNN(units, activation='tanh')(output, initial_state=state)
+    keras_model = keras.models.Model(inputs, output)
+    verify_keras_frontend(keras_model, need_transpose=False)
+
+def test_forward_RNN():
+    _test_RNN(2, 4)
+    _test_RNN(4, 3)
+    _test_RNN_MultiLayer(4, 12)
+
+def _test_GRU(inputs, units):
+    data = keras.layers.Input(shape=(1, inputs))
+    gru_out = keras.layers.GRU(units,
+                               return_state=True,
+                               recurrent_activation='sigmoid',
+                               activation='tanh')
+    x = gru_out(data)
+    keras_model = keras.models.Model(data, x)
+    verify_keras_frontend(keras_model, need_transpose=False)
+
+def _test_GRU_MultiLayer(inputs, units):
+    inputs = keras.layers.Input(shape=(1, inputs))
+    layer = keras.layers.GRU(units,
+                             return_state=True,
+                             return_sequences=True,
+                             recurrent_activation='sigmoid',
+                             activation='tanh')
+    outputs = layer(inputs)
+    output, state = outputs[0], outputs[1:]
+    output = keras.layers.GRU(units, recurrent_activation='sigmoid',
+                              activation='tanh')(output, initial_state=state)
+    keras_model = keras.models.Model(inputs, output)
+    verify_keras_frontend(keras_model, need_transpose=False)
+
+def test_forward_GRU():
+    _test_GRU(2, 4)
+    _test_GRU(4, 3)
+    _test_GRU_MultiLayer(4, 4)
 
 if __name__ == '__main__':
     test_forward_elemwise_add()
     test_forward_activations()
     test_forward_dense()
-    test_forward_transpose_conv()
-    test_forward_separable_conv()
+    test_forward_pool()
+    test_forward_conv()
     test_forward_upsample()
     test_forward_reshape()
+    test_forward_crop()
     test_forward_vgg16()
     test_forward_xception()
     test_forward_resnet50()
     test_forward_mobilenet()
 
     test_forward_multi_inputs()
+    test_forward_multi_outputs()
     test_forward_reuse_layers()
+    test_forward_LSTM()
+    test_forward_RNN()
+    test_forward_GRU()
diff --git a/nnvm/tests/python/frontend/mxnet/model_zoo/__init__.py b/nnvm/tests/python/frontend/mxnet/model_zoo/__init__.py
index e3c9acdf23ef..66e743ad9c33 100644
--- a/nnvm/tests/python/frontend/mxnet/model_zoo/__init__.py
+++ b/nnvm/tests/python/frontend/mxnet/model_zoo/__init__.py
@@ -1,11 +1,8 @@
 """MXNet and NNVM model zoo."""
 from __future__ import absolute_import
-from . import mlp, resnet, vgg, dqn, dcgan, squeezenet
+from . import mlp, resnet, vgg, dqn, dcgan, squeezenet, inception_v3
 import nnvm.testing
 
-__all__ = ['mx_mlp', 'nnvm_mlp', 'mx_resnet', 'nnvm_resnet', 'mx_vgg', 'nnvm_vgg',
-           'mx_squeezenet', 'nnvm_squeezenet']
-
 _num_class = 1000
 
 # mlp fc
@@ -35,6 +32,10 @@
     mx_squeezenet[version] = squeezenet.get_symbol(version=version)
     nnvm_squeezenet[version] = nnvm.testing.squeezenet.get_workload(1, version=version)[0]
 
+# inception
+mx_inception_v3 = inception_v3.get_symbol()
+nnvm_inception_v3 = nnvm.testing.inception_v3.get_workload(1)[0]
+
 # dqn
 mx_dqn = dqn.get_symbol()
 nnvm_dqn = nnvm.testing.dqn.get_workload(1)[0]
diff --git a/nnvm/tests/python/frontend/mxnet/model_zoo/inception_v3.py b/nnvm/tests/python/frontend/mxnet/model_zoo/inception_v3.py
new file mode 100644
index 000000000000..b8585bf05037
--- /dev/null
+++ b/nnvm/tests/python/frontend/mxnet/model_zoo/inception_v3.py
@@ -0,0 +1,170 @@
+"""
+Inception V3, suitable for images with around 299 x 299
+
+Reference:
+Szegedy, Christian, et al. "Rethinking the Inception Architecture for Computer Vision." arXiv preprint arXiv:1512.00567 (2015).
+
+Adopted from https://github.com/apache/incubator-mxnet/blob/
+             master/example/image-classification/symbols/inception-v3.py
+"""
+import mxnet as mx
+import numpy as np
+
+def Conv(data, num_filter, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name=None, suffix=''):
+    conv = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, no_bias=True, name='%s%s_conv2d' %(name, suffix))
+    bn = mx.sym.BatchNorm(data=conv, eps=2e-5, name='%s%s_batchnorm' % (name, suffix))
+    act = mx.sym.Activation(data=bn, act_type='relu', name='%s%s_relu' %(name, suffix))
+    return act
+
+
+def Inception7A(data,
+                num_1x1,
+                num_3x3_red, num_3x3_1, num_3x3_2,
+                num_5x5_red, num_5x5,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data, num_1x1, name=('%s_conv' % name))
+    tower_5x5 = Conv(data, num_5x5_red, name=('%s_tower' % name), suffix='_conv')
+    tower_5x5 = Conv(tower_5x5, num_5x5, kernel=(5, 5), pad=(2, 2), name=('%s_tower' % name), suffix='_conv_1')
+    tower_3x3 = Conv(data, num_3x3_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_3x3 = Conv(tower_3x3, num_3x3_1, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_3x3 = Conv(tower_3x3, num_3x3_2, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_2')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(pooling, proj, name=('%s_tower_2' %  name), suffix='_conv')
+    concat = mx.sym.Concat(*[tower_1x1, tower_5x5, tower_3x3, cproj], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+# First Downsample
+def Inception7B(data,
+                num_3x3,
+                num_d3x3_red, num_d3x3_1, num_d3x3_2,
+                pool,
+                name):
+    tower_3x3 = Conv(data, num_3x3, kernel=(3, 3), pad=(0, 0), stride=(2, 2), name=('%s_conv' % name))
+    tower_d3x3 = Conv(data, num_d3x3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d3x3 = Conv(tower_d3x3, num_d3x3_1, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name=('%s_tower' % name), suffix='_conv_1')
+    tower_d3x3 = Conv(tower_d3x3, num_d3x3_2, kernel=(3, 3), pad=(0, 0), stride=(2, 2), name=('%s_tower' % name), suffix='_conv_2')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(0,0), pool_type="max", name=('max_pool_%s_pool' % name))
+    concat = mx.sym.Concat(*[tower_3x3, tower_d3x3, pooling], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def Inception7C(data,
+                num_1x1,
+                num_d7_red, num_d7_1, num_d7_2,
+                num_q7_red, num_q7_1, num_q7_2, num_q7_3, num_q7_4,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name))
+    tower_d7 = Conv(data=data, num_filter=num_d7_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d7 = Conv(data=tower_d7, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3), name=('%s_tower' % name), suffix='_conv_1')
+    tower_d7 = Conv(data=tower_d7, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0), name=('%s_tower' % name), suffix='_conv_2')
+    tower_q7 = Conv(data=data, num_filter=num_q7_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_1, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_2, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_2')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_3, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_3')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_4, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_4')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' %  name), suffix='_conv')
+    # concat
+    concat = mx.sym.Concat(*[tower_1x1, tower_d7, tower_q7, cproj], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def Inception7D(data,
+                num_3x3_red, num_3x3,
+                num_d7_3x3_red, num_d7_1, num_d7_2, num_d7_3x3,
+                pool,
+                name):
+    tower_3x3 = Conv(data=data, num_filter=num_3x3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_3x3 = Conv(data=tower_3x3, num_filter=num_3x3, kernel=(3, 3), pad=(0,0), stride=(2, 2), name=('%s_tower' % name), suffix='_conv_1')
+    tower_d7_3x3 = Conv(data=data, num_filter=num_d7_3x3_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_2')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_3x3, kernel=(3, 3), stride=(2, 2), name=('%s_tower_1' % name), suffix='_conv_3')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
+    # concat
+    concat = mx.sym.Concat(*[tower_3x3, tower_d7_3x3, pooling], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def Inception7E(data,
+                num_1x1,
+                num_d3_red, num_d3_1, num_d3_2,
+                num_3x3_d3_red, num_3x3, num_3x3_d3_1, num_3x3_d3_2,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name))
+    tower_d3 = Conv(data=data, num_filter=num_d3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d3_a = Conv(data=tower_d3, num_filter=num_d3_1, kernel=(1, 3), pad=(0, 1), name=('%s_tower' % name), suffix='_mixed_conv')
+    tower_d3_b = Conv(data=tower_d3, num_filter=num_d3_2, kernel=(3, 1), pad=(1, 0), name=('%s_tower' % name), suffix='_mixed_conv_1')
+    tower_3x3_d3 = Conv(data=data, num_filter=num_3x3_d3_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_3x3_d3 = Conv(data=tower_3x3_d3, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_3x3_d3_a = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_1, kernel=(1, 3), pad=(0, 1), name=('%s_tower_1' % name), suffix='_mixed_conv')
+    tower_3x3_d3_b = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_2, kernel=(3, 1), pad=(1, 0), name=('%s_tower_1' % name), suffix='_mixed_conv_1')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' %  name), suffix='_conv')
+    # concat
+    concat = mx.sym.Concat(*[tower_1x1, tower_d3_a, tower_d3_b, tower_3x3_d3_a, tower_3x3_d3_b, cproj], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def get_symbol(num_classes=1000, **kwargs):
+    data = mx.sym.Variable(name="data")
+    # stage 1
+    conv = Conv(data, 32, kernel=(3, 3), stride=(2, 2), name="conv")
+    conv_1 = Conv(conv, 32, kernel=(3, 3), name="conv_1")
+    conv_2 = Conv(conv_1, 64, kernel=(3, 3), pad=(1, 1), name="conv_2")
+    pool = mx.sym.Pooling(data=conv_2, kernel=(3, 3), stride=(2, 2), pool_type="max", name="pool")
+    # stage 2
+    conv_3 = Conv(pool, 80, kernel=(1, 1), name="conv_3")
+    conv_4 = Conv(conv_3, 192, kernel=(3, 3), name="conv_4")
+    pool1 = mx.sym.Pooling(data=conv_4, kernel=(3, 3), stride=(2, 2), pool_type="max", name="pool1")
+
+    # # stage 3
+    in3a = Inception7A(pool1, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 32, "mixed")
+    in3b = Inception7A(in3a, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 64, "mixed_1")
+    in3c = Inception7A(in3b, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 64, "mixed_2")
+    in3d = Inception7B(in3c, 384,
+                       64, 96, 96,
+                       "max", "mixed_3")
+    # stage 4
+    in4a = Inception7C(in3d, 192,
+                       128, 128, 192,
+                       128, 128, 128, 128, 192,
+                       "avg", 192, "mixed_4")
+    in4b = Inception7C(in4a, 192,
+                       160, 160, 192,
+                       160, 160, 160, 160, 192,
+                       "avg", 192, "mixed_5")
+    in4c = Inception7C(in4b, 192,
+                       160, 160, 192,
+                       160, 160, 160, 160, 192,
+                       "avg", 192, "mixed_6")
+    in4d = Inception7C(in4c, 192,
+                       192, 192, 192,
+                       192, 192, 192, 192, 192,
+                       "avg", 192, "mixed_7")
+    in4e = Inception7D(in4d, 192, 320,
+                       192, 192, 192, 192,
+                       "max", "mixed_8")
+    # stage 5
+    in5a = Inception7E(in4e, 320,
+                       384, 384, 384,
+                       448, 384, 384, 384,
+                       "avg", 192, "mixed_9")
+    in5b = Inception7E(in5a, 320,
+                       384, 384, 384,
+                       448, 384, 384, 384,
+                       "max", 192, "mixed_10")
+    # pool
+    pool = mx.sym.Pooling(data=in5b, kernel=(8, 8), stride=(1, 1), pool_type="avg", name="global_pool")
+    flatten = mx.sym.Flatten(data=pool, name="flatten")
+    fc1 = mx.sym.FullyConnected(data=flatten, num_hidden=num_classes, name='fc1', flatten=False)
+    softmax = mx.sym.SoftmaxOutput(data=fc1, name='softmax')
+    return softmax
diff --git a/nnvm/tests/python/frontend/mxnet/model_zoo/resnet.py b/nnvm/tests/python/frontend/mxnet/model_zoo/resnet.py
index 42a62af023e7..3f9a870d31c0 100644
--- a/nnvm/tests/python/frontend/mxnet/model_zoo/resnet.py
+++ b/nnvm/tests/python/frontend/mxnet/model_zoo/resnet.py
@@ -46,14 +46,13 @@ def residual_unit(data, num_filter, stride, dim_match, name, bottle_neck=True, b
         Workspace used in convolution operator
     """
     if bottle_neck:
-        # the same as https://github.com/facebook/fb.resnet.torch#notes, a bit difference with origin paper
         bn1 = mx.sym.BatchNorm(data=data, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn1')
         act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1')
-        conv1 = mx.sym.Convolution(data=act1, num_filter=int(num_filter*0.25), kernel=(1,1), stride=(1,1), pad=(0,0),
+        conv1 = mx.sym.Convolution(data=act1, num_filter=int(num_filter*0.25), kernel=(1,1), stride=stride, pad=(0,0),
                                    no_bias=True, workspace=workspace, name=name + '_conv1')
         bn2 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn2')
         act2 = mx.sym.Activation(data=bn2, act_type='relu', name=name + '_relu2')
-        conv2 = mx.sym.Convolution(data=act2, num_filter=int(num_filter*0.25), kernel=(3,3), stride=stride, pad=(1,1),
+        conv2 = mx.sym.Convolution(data=act2, num_filter=int(num_filter*0.25), kernel=(3,3), stride=(1,1), pad=(1,1),
                                    no_bias=True, workspace=workspace, name=name + '_conv2')
         bn3 = mx.sym.BatchNorm(data=conv2, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn3')
         act3 = mx.sym.Activation(data=bn3, act_type='relu', name=name + '_relu3')
diff --git a/nnvm/tests/python/frontend/mxnet/test_forward.py b/nnvm/tests/python/frontend/mxnet/test_forward.py
index 6c086cb367e8..66ae9d6e9de4 100644
--- a/nnvm/tests/python/frontend/mxnet/test_forward.py
+++ b/nnvm/tests/python/frontend/mxnet/test_forward.py
@@ -14,7 +14,7 @@
 
 
 def verify_mxnet_frontend_impl(mx_symbol, data_shape=(1, 3, 224, 224), out_shape=(1, 1000),
-                               gluon_impl=False, name=None):
+                               gluon_impl=False, name=None, dtype='float32'):
     """Use name different from test to avoid let nose pick it up"""
     if gluon_impl:
         def get_gluon_output(name, x):
@@ -57,19 +57,18 @@ def get_tvm_output(symbol, x, args, auxs, target, ctx, dtype='float32'):
         return out.asnumpy()
 
     # random input
-    dtype = 'float32'
     x = np.random.uniform(size=data_shape)
     if gluon_impl:
         gluon_out, gluon_sym = get_gluon_output(name, x)
         for target, ctx in ctx_list():
             tvm_out = get_tvm_output(gluon_sym, x, None, None, target, ctx, dtype)
-            np.testing.assert_allclose(gluon_out, tvm_out, rtol=1e-5, atol=1e-5)
+            tvm.testing.assert_allclose(gluon_out, tvm_out, rtol=1e-5, atol=1e-5)
     else:
         mx_out, args, auxs = get_mxnet_output(mx_symbol, x, dtype)
         assert "data" not in args
         for target, ctx in ctx_list():
             tvm_out = get_tvm_output(mx_symbol, x, args, auxs, target, ctx, dtype)
-            np.testing.assert_allclose(mx_out, tvm_out, rtol=1e-5, atol=1e-5)
+            tvm.testing.assert_allclose(mx_out, tvm_out, rtol=1e-5, atol=1e-5)
 
 def test_forward_mlp():
     mlp = model_zoo.mx_mlp
@@ -154,6 +153,38 @@ def test_forward_lrn():
     mx_sym = mx.sym.LRN(data, alpha=2, beta=2, knorm=1, nsize=5)
     verify_mxnet_frontend_impl(mx_sym, (1, 10, 24, 24), (1, 10, 24, 24))
 
+def test_forward_ones():
+    data = mx.sym.var('data')
+    ones = mx.sym.ones(shape=(2, 3, 4), dtype='float32')
+    mx_sym = mx.sym.elemwise_add(data, ones)
+    verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4))
+    
+def test_forward_zeros():
+    data = mx.sym.var('data')
+    zeros = mx.sym.zeros(shape=(2, 3, 4), dtype='float32')
+    mx_sym = mx.sym.elemwise_add(data, zeros)
+    verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4))
+
+def test_forward_ones_like():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.ones_like(data, dtype='float32')
+    verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4))
+
+def test_forward_zeros_like():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.zeros_like(data, dtype='float32')
+    verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4))
+
+def test_forward_argmax():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.argmax(data, axis=1)
+    verify_mxnet_frontend_impl(mx_sym, (5, 3), (5,))
+
+def test_forward_argmin():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.argmin(data, axis=0)
+    verify_mxnet_frontend_impl(mx_sym, (5, 4), (4,))
+    
 if __name__ == '__main__':
     test_forward_mlp()
     test_forward_vgg()
@@ -169,3 +200,10 @@ def test_forward_lrn():
     test_forward_expand_dims()
     test_forward_pooling()
     test_forward_lrn()
+    test_forward_ones()
+    test_forward_zeros()
+    test_forward_ones_like()
+    test_forward_zeros_like()
+    test_forward_argmax()
+    test_forward_argmin()
+    
diff --git a/nnvm/tests/python/frontend/mxnet/test_graph.py b/nnvm/tests/python/frontend/mxnet/test_graph.py
index 18e124ad6ffc..e89224cd969e 100644
--- a/nnvm/tests/python/frontend/mxnet/test_graph.py
+++ b/nnvm/tests/python/frontend/mxnet/test_graph.py
@@ -39,17 +39,23 @@ def test_squeezenet():
         nnvm_sym = model_zoo.nnvm_squeezenet[version]
         compare_graph(from_mx_sym, nnvm_sym)
 
+def test_inception_v3():
+    mx_sym = model_zoo.mx_inception_v3
+    from_mx_sym, _ = nnvm.frontend.from_mxnet(mx_sym)
+    nnvm_sym = model_zoo.nnvm_inception_v3
+    compare_graph(from_mx_sym, nnvm_sym, ishape=(2, 3, 299, 299))
+
 def test_dqn():
     mx_sym = model_zoo.mx_dqn
     from_mx_sym, _ = nnvm.frontend.from_mxnet(mx_sym)
     nnvm_sym = model_zoo.nnvm_dqn
-    compare_graph(from_mx_sym, nnvm_sym)
+    compare_graph(from_mx_sym, nnvm_sym, ishape=(2, 4, 84, 84))
 
 def test_dcgan():
     mx_sym = model_zoo.mx_dcgan
     from_mx_sym, _ = nnvm.frontend.from_mxnet(mx_sym)
     nnvm_sym = model_zoo.nnvm_dcgan
-    compare_graph(from_mx_sym, nnvm_sym)
+    compare_graph(from_mx_sym, nnvm_sym, ishape=(2, 100))
 
 def test_multi_outputs():
     def compose(F, **kwargs):
@@ -70,3 +76,4 @@ def compose(F, **kwargs):
     test_dqn()
     test_dcgan()
     test_squeezenet()
+    test_inception_v3()
diff --git a/nnvm/tests/python/frontend/onnx/model_zoo/squeezenet.py b/nnvm/tests/python/frontend/onnx/model_zoo/squeezenet.py
new file mode 100644
index 000000000000..2de2d1075494
--- /dev/null
+++ b/nnvm/tests/python/frontend/onnx/model_zoo/squeezenet.py
@@ -0,0 +1,118 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# coding: utf-8
+# pylint: disable=unused-argument
+
+"""
+Symbol of SqueezeNet
+
+Reference:
+Iandola, Forrest N., et al.
+"Squeezenet: Alexnet-level accuracy with 50x fewer parameters and< 0.5 mb model size." (2016).
+"""
+
+from nnvm import symbol as sym
+from nnvm.testing.utils import create_workload
+
+# Helpers
+def _make_fire(net, squeeze_channels, expand1x1_channels, expand3x3_channels):
+    net = _make_fire_conv(net, squeeze_channels, 1, 0)
+
+    left = _make_fire_conv(net, expand1x1_channels, 1, 0)
+    right = _make_fire_conv(net, expand3x3_channels, 3, 1)
+    # NOTE : Assume NCHW layout here
+    net = sym.concatenate(left, right, axis=1)
+
+    return net
+
+def _make_fire_conv(net, channels, kernel_size, padding=0):
+    net = sym.conv2d(net, channels=channels, kernel_size=(kernel_size, kernel_size),
+                     padding=(padding, padding))
+    net = sym.relu(net)
+    return net
+
+# Net
+def get_symbol(num_classes, version, **kwargs):
+    """Get symbol of SqueezeNet
+
+    Parameters
+    ----------
+    num_classes: int
+        The number of classification results
+
+    version : str, optional
+        "1.0" or "1.1" of SqueezeNet
+    """
+    assert version == '1.1', ("Unsupported SqueezeNet version {version}:"
+                              "1.1 expected".format(version=version))
+    net = sym.Variable("data")
+
+    net = sym.conv2d(net, channels=64, kernel_size=(3, 3), strides=(2, 2))
+    net = sym.relu(net)
+    net = sym.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
+    net = _make_fire(net, 16, 64, 64)
+    net = _make_fire(net, 16, 64, 64)
+    net = sym.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
+    net = _make_fire(net, 32, 128, 128)
+    net = _make_fire(net, 32, 128, 128)
+    net = sym.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
+    net = _make_fire(net, 48, 192, 192)
+    net = _make_fire(net, 48, 192, 192)
+    net = _make_fire(net, 64, 256, 256)
+    net = _make_fire(net, 64, 256, 256)
+
+    net = sym.dropout(net, rate=0.5)
+    net = sym.conv2d(net, channels=num_classes, kernel_size=(1, 1))
+    net = sym.relu(net)
+    net = sym.global_avg_pool2d(net)
+    return sym.softmax(net, axis=1)
+
+def get_workload(batch_size=1, num_classes=1000, version='1.0',
+                 image_shape=(3, 224, 224), dtype="float32", **kwargs):
+    """Get benchmark workload for SqueezeNet
+
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+
+    num_classes : int, optional
+        Number of classes
+
+    version : str, optional
+        "1.0" or "1.1" of SqueezeNet
+
+    image_shape : tuple, optional
+        The input image shape
+
+    dtype : str, optional
+        The data type
+
+    kwargs : dict
+        Extra arguments
+
+    Returns
+    -------
+    net : nnvm.Symbol
+        The computational graph
+
+    params : dict of str to NDArray
+        The parameters.
+    """
+    net = get_symbol(num_classes=num_classes, version=version, **kwargs)
+    return create_workload(net, batch_size, image_shape, dtype)
diff --git a/nnvm/tests/python/frontend/onnx/test_forward.py b/nnvm/tests/python/frontend/onnx/test_forward.py
index 9fb3aed2da10..82b5d319f92f 100644
--- a/nnvm/tests/python/frontend/onnx/test_forward.py
+++ b/nnvm/tests/python/frontend/onnx/test_forward.py
@@ -1,6 +1,8 @@
 import numpy as np
 import math
 import nnvm
+import topi
+import topi.testing
 import tvm
 from tvm.contrib import graph_runtime
 from nnvm.testing.config import ctx_list
@@ -8,7 +10,7 @@
 from model_zoo import super_resolution, squeezenet1_1, lenet, resnet18_1_0
 from onnx import helper, TensorProto
 
-def get_tvm_output(graph_def, input_data, target, ctx, output_shape, output_dtype='float32'):
+def get_tvm_output(graph_def, input_data, target, ctx, output_shape=None, output_dtype='float32'):
     """ Generic function to execute and get tvm output"""
 
     sym, params = nnvm.frontend.from_onnx(graph_def)
@@ -45,12 +47,12 @@ def get_tvm_output(graph_def, input_data, target, ctx, output_shape, output_dtyp
     # get outputs
     if isinstance(output_shape, list) and isinstance(output_dtype, list):
         tvm_output_list = []
-        for i, s in enumerate(output_shape):
-            tvm_output = m.get_output(i, tvm.nd.empty((s), output_dtype[i]))
+        for i, _ in enumerate(output_shape):
+            tvm_output = m.get_output(i)
             tvm_output_list.append(tvm_output.asnumpy())
         return tvm_output_list
     else:
-        tvm_output = m.get_output(0, tvm.nd.empty((output_shape), output_dtype))
+        tvm_output = m.get_output(0)
         return tvm_output.asnumpy()
 
 def get_caffe2_output(model, x, dtype='float32'):
@@ -64,11 +66,11 @@ def get_caffe2_output(model, x, dtype='float32'):
 def verify_onnx_forward_impl(graph_file, data_shape, out_shape):
     dtype = 'float32'
     x = np.random.uniform(size=data_shape)
-    model = onnx.load(graph_file)
+    model = onnx.load_model(graph_file)
     c2_out = get_caffe2_output(model, x, dtype)
     for target, ctx in ctx_list():
         tvm_out = get_tvm_output(model, x, target, ctx, out_shape, dtype)
-        np.testing.assert_allclose(c2_out, tvm_out, rtol=1e-5, atol=1e-5)
+        tvm.testing.assert_allclose(c2_out, tvm_out, rtol=1e-5, atol=1e-5)
 
 def verify_super_resolution_example():
     verify_onnx_forward_impl(super_resolution, (1, 1, 224, 224), (1, 1, 672, 672))
@@ -110,7 +112,7 @@ def test_reshape():
         x = np.random.uniform(size=in_shape).astype('int32')
         tvm_out = get_tvm_output(model, x, target, ctx, ref_shape, 'float32')
 
-    np.testing.assert_allclose(ref_shape, tvm_out.shape)
+    tvm.testing.assert_allclose(ref_shape, tvm_out.shape)
 
 def test_reshape_like():
     in_shape = (4, 3, 3, 4)
@@ -140,7 +142,7 @@ def test_reshape_like():
         x = np.random.uniform(size=in_shape).astype('float32')
         tvm_out = get_tvm_output(model, x, target, ctx, ref_shape, 'float32')
 
-    np.testing.assert_allclose(ref_shape, tvm_out.shape)
+    tvm.testing.assert_allclose(ref_shape, tvm_out.shape)
 
 def _test_power_iteration(x_shape, y_shape):
     if isinstance(y_shape, int):
@@ -166,7 +168,7 @@ def _test_power_iteration(x_shape, y_shape):
 
     for target, ctx in ctx_list():
         tvm_out = get_tvm_output(model, [x, y], target, ctx, np_res.shape)
-        np.testing.assert_allclose(np_res, tvm_out, rtol=1e-5, atol=1e-5)
+        tvm.testing.assert_allclose(np_res, tvm_out, rtol=1e-5, atol=1e-5)
 
 def test_power():
     _test_power_iteration((1, 3), (1))
@@ -176,7 +178,7 @@ def test_power():
 def test_squeeze():
     in_shape = (1, 3, 1, 3, 1, 1)
     out_shape = (3, 3)
-    y = helper.make_node("Squeeze", ['in'], ['out'])
+    y = helper.make_node("Squeeze", ['in'], ['out'], axes=[0, 2, 4, 5])
 
     graph = helper.make_graph([y],
                               'squeeze_test',
@@ -191,7 +193,7 @@ def test_squeeze():
         x = np.random.uniform(size=in_shape).astype('float32')
         tvm_out = get_tvm_output(model, x, target, ctx, out_shape, 'float32')
 
-    np.testing.assert_allclose(out_shape, tvm_out.shape)
+    tvm.testing.assert_allclose(out_shape, tvm_out.shape)
 
 def test_unsqueeze():
     in_shape = (3, 3)
@@ -212,7 +214,7 @@ def test_unsqueeze():
         x = np.random.uniform(size=in_shape).astype('float32')
         tvm_out = get_tvm_output(model, x, target, ctx, out_shape, 'float32')
 
-    np.testing.assert_allclose(out_shape, tvm_out.shape)
+    tvm.testing.assert_allclose(out_shape, tvm_out.shape)
 
 def verify_gather(in_shape, indices, axis, dtype):
     x = np.random.uniform(size=in_shape).astype(dtype)
@@ -233,7 +235,7 @@ def verify_gather(in_shape, indices, axis, dtype):
 
     for target, ctx in ctx_list():
         tvm_out = get_tvm_output(model, [x, indices], target, ctx, out_np.shape)
-        np.testing.assert_allclose(out_np, tvm_out)
+        tvm.testing.assert_allclose(out_np, tvm_out)
 
 def test_gather():
     verify_gather((4,), [1], 0, 'int32')
@@ -261,7 +263,7 @@ def _test_slice_iteration(indata, outdata, starts, ends, axes=None):
     for target, ctx in ctx_list():
         tvm_out = get_tvm_output(model, indata, target, ctx, outdata.shape, 'float32')
 
-    np.testing.assert_allclose(outdata, tvm_out)
+    tvm.testing.assert_allclose(outdata, tvm_out)
 
 def test_slice():
     x = np.random.randn(20, 10, 5).astype(np.float32)
@@ -271,7 +273,7 @@ def test_slice():
     _test_slice_iteration(x, x[:, 0:-1], (0), (-1), (1))
 
 def _test_onnx_op_elementwise(inshape, outfunc, npargs, dtype, opname, kwargs):
-    indata = np.random.uniform(size=(2, 4, 5, 6)).astype(dtype)
+    indata = np.random.uniform(-1, 1, size=inshape).astype(dtype)
     outdata = outfunc(indata, **npargs)
 
     y = helper.make_node(opname, ['in'], ['out'], **kwargs)
@@ -288,7 +290,7 @@ def _test_onnx_op_elementwise(inshape, outfunc, npargs, dtype, opname, kwargs):
     for target, ctx in ctx_list():
         tvm_out = get_tvm_output(model, indata, target, ctx, outdata.shape, dtype)
 
-    np.testing.assert_allclose(outdata, tvm_out)
+    tvm.testing.assert_allclose(outdata, tvm_out)
 
 def test_floor():
     _test_onnx_op_elementwise((2, 4, 5, 6), np.floor, {}, 'float32', 'Floor', {})
@@ -327,7 +329,7 @@ def test_matmul():
 
     for target, ctx in ctx_list():
         tvm_out = get_tvm_output(model, [a_array, b_array], target, ctx, out_np.shape)
-        np.testing.assert_allclose(out_np, tvm_out, rtol=1e-5, atol=1e-5)
+        tvm.testing.assert_allclose(out_np, tvm_out, rtol=1e-5, atol=1e-5)
 
 def verify_lrn(shape, nsize, dtype, alpha=None, beta=None, bias=None):
     in_array = np.random.uniform(size=shape).astype(dtype)
@@ -374,12 +376,635 @@ def _get_python_lrn():
         # get outputs
         tvm_out = m.get_output(0, tvm.nd.empty(shape, dtype))
         py_out = _get_python_lrn()
-        np.testing.assert_allclose(py_out, tvm_out.asnumpy(), rtol=1e-5, atol=1e-5)
+        tvm.testing.assert_allclose(py_out, tvm_out.asnumpy(), rtol=1e-5, atol=1e-5)
 
 def test_lrn():
     verify_lrn((5, 5, 5, 5), 3, 'float32')
     verify_lrn((5, 5, 5, 5), 3, 'float32', alpha=0.0002, beta=0.5, bias=2.0)
 
+def _test_upsample_nearest():
+    scale = 2
+    in_shape = (1, 1, 3, 3)
+    out_shape = (1, 1, 3*scale, 3*scale)
+    y = helper.make_node("Upsample", ['in'], ['out'], mode='nearest', scales=[1.0, 1.0, 2.0, 2.0])
+
+    in_array = np.random.uniform(size=in_shape).astype(np.float32)
+    out_array = topi.testing.upsampling_python(in_array, scale, "NCHW")
+
+    graph = helper.make_graph([y],
+                              'upsample_nearest_test',
+                              inputs = [helper.make_tensor_value_info("in", TensorProto.FLOAT, list(in_shape))],
+                              outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))])
+
+    model = helper.make_model(graph, producer_name='upsample_nearest_test')
+
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, in_array, target, ctx, out_shape, 'float32')
+        tvm.testing.assert_allclose(out_array, tvm_out)
+
+def _test_upsample_bilinear():
+    scale = 2
+    in_shape = (1, 1, 3, 3)
+    out_shape = (1, 1, 3*scale, 3*scale)
+    y = helper.make_node("Upsample", ['in'], ['out'], mode='linear', scales=[1.0, 1.0, 2.0, 2.0])
+
+    in_array = np.random.uniform(size=in_shape).astype(np.float32)
+    out_array = topi.testing.bilinear_resize_python(in_array, (3*scale, 3*scale), "NCHW")
+
+    graph = helper.make_graph([y],
+                              'upsample_bilinear_test',
+                              inputs = [helper.make_tensor_value_info("in", TensorProto.FLOAT, list(in_shape))],
+                              outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))])
+
+    model = helper.make_model(graph, producer_name='upsample_bilinear_test')
+
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, in_array, target, ctx, out_shape, 'float32')
+        tvm.testing.assert_allclose(out_array, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_upsample():
+    _test_upsample_nearest()
+    _test_upsample_bilinear()
+
+def _test_softmax(inshape, axis):
+    opname = 'Softmax'
+    indata = np.random.uniform(size=inshape).astype(np.float32)
+    outshape = inshape
+    outdata = topi.testing.softmax_python(indata)
+    if isinstance(axis, int):
+        y = helper.make_node(opname, ['in'], ['out'], axis = axis)
+    elif axis is None:
+        y = helper.make_node(opname, ['in'], ['out'])
+
+    graph = helper.make_graph([y],
+                              opname+'_test',
+                              inputs = [helper.make_tensor_value_info("in",
+                                            TensorProto.FLOAT, list(indata.shape))],
+                              outputs = [helper.make_tensor_value_info("out",
+                                            TensorProto.FLOAT, list(outdata.shape))])
+
+    model = helper.make_model(graph, producer_name=opname+'_test')
+
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, indata, target, ctx, outshape, 'float32')
+        tvm.testing.assert_allclose(outdata, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_softmax():
+    _test_softmax((1, 10), None)
+    _test_softmax((1, 10), 1)
+
+def verify_min(input_dim):
+    dtype = 'float32'
+
+    a_np1 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np2 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np3 = np.random.uniform(size=input_dim).astype(dtype)
+
+    b_np = np.min((a_np1, a_np2, a_np3), axis=0)
+
+    min_node = helper.make_node("Min", ["a_np1", "a_np2", "a_np3"], ["out"])
+
+    graph = helper.make_graph([min_node],
+                              "Min_test",
+                              inputs = [helper.make_tensor_value_info("a_np1",
+                                            TensorProto.FLOAT, list(input_dim)),
+                                        helper.make_tensor_value_info("a_np2",
+                                            TensorProto.FLOAT, list(input_dim)),
+                                        helper.make_tensor_value_info("a_np3",
+                                            TensorProto.FLOAT, list(input_dim))],
+                              outputs = [helper.make_tensor_value_info("out",
+                                            TensorProto.FLOAT, list(b_np.shape))])
+
+    model = helper.make_model(graph, producer_name='Min_test')
+
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, [a_np1, a_np2, a_np3], target, ctx, b_np.shape)
+        tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_forward_min():
+    verify_min((1, 3, 20, 20))
+    verify_min((20, 20))
+
+def verify_max(input_dim):
+    dtype = 'float32'
+
+    a_np1 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np2 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np3 = np.random.uniform(size=input_dim).astype(dtype)
+
+    b_np = np.max((a_np1, a_np2, a_np3), axis=0)
+
+    max_node = helper.make_node("Max", ["a_np1", "a_np2", "a_np3"], ["out"])
+
+    graph = helper.make_graph([max_node],
+                              "Max_test",
+                              inputs = [helper.make_tensor_value_info("a_np1",
+                                            TensorProto.FLOAT, list(input_dim)),
+                                        helper.make_tensor_value_info("a_np2",
+                                            TensorProto.FLOAT, list(input_dim)),
+                                        helper.make_tensor_value_info("a_np3",
+                                            TensorProto.FLOAT, list(input_dim))],
+                              outputs = [helper.make_tensor_value_info("out",
+                                            TensorProto.FLOAT, list(b_np.shape))])
+
+    model = helper.make_model(graph, producer_name='Max_test')
+
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, [a_np1, a_np2, a_np3], target, ctx, b_np.shape)
+        tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_forward_max():
+    verify_max((1, 3, 20, 20))
+    verify_max((20, 20))
+
+def verify_mean(input_dim):
+    dtype = 'float32'
+
+    a_np1 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np2 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np3 = np.random.uniform(size=input_dim).astype(dtype)
+
+    b_np = np.mean((a_np1, a_np2, a_np3), axis=0)
+
+    mean_node = helper.make_node("Mean", ["a_np1", "a_np2", "a_np3"], ["out"])
+
+    graph = helper.make_graph([mean_node],
+                              "Mean_test",
+                              inputs = [helper.make_tensor_value_info("a_np1",
+                                            TensorProto.FLOAT, list(input_dim)),
+                                        helper.make_tensor_value_info("a_np2",
+                                            TensorProto.FLOAT, list(input_dim)),
+                                        helper.make_tensor_value_info("a_np3",
+                                            TensorProto.FLOAT, list(input_dim))],
+                              outputs = [helper.make_tensor_value_info("out",
+                                            TensorProto.FLOAT, list(b_np.shape))])
+
+    model = helper.make_model(graph, producer_name='Mean_test')
+
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, [a_np1, a_np2, a_np3], target, ctx, b_np.shape)
+        tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_forward_mean():
+    verify_mean((1, 3, 20, 20))
+    verify_mean((20, 20))
+
+def verify_hardsigmoid(input_dim, alpha, beta):
+    dtype = 'float32'
+
+    a_np1 = np.random.uniform(size=input_dim).astype(dtype)
+
+    b_np = np.clip(a_np1 * alpha + beta, 0, 1)
+
+    hardsigmoid_node = helper.make_node("HardSigmoid", ["a_np1"], ["out"], alpha=alpha, beta=beta)
+
+    graph = helper.make_graph([hardsigmoid_node],
+                              "HardSigmoid_test",
+                              inputs = [helper.make_tensor_value_info("a_np1",
+                                            TensorProto.FLOAT, list(input_dim))],
+                              outputs = [helper.make_tensor_value_info("out",
+                                            TensorProto.FLOAT, list(b_np.shape))])
+
+    model = helper.make_model(graph, producer_name='HardSigmoid_test')
+
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, [a_np1], target, ctx, b_np.shape)
+        tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_forward_hardsigmoid():
+    verify_hardsigmoid((1, 3, 20, 20), 0.5, 0.6)
+    verify_hardsigmoid((20, 20), 0.3, 0.4)
+
+def verify_argmin(input_dim, axis=None, keepdims=None):
+    def _argmin_numpy(data, axis=0, keepdims=True):
+        result = np.argmin(data, axis=axis)
+        if (keepdims == 1):
+            result = np.expand_dims(result, axis)
+        return result.astype(data.dtype)
+
+    a_np1 = np.random.uniform(-10, 10, input_dim).astype(np.int32)
+    if keepdims is None and axis is None:
+        b_np = _argmin_numpy(a_np1)
+        node = onnx.helper.make_node('ArgMin',
+                                     inputs=['a_np1'],
+                                     outputs=['out'])
+    elif axis is None:
+        b_np = _argmin_numpy(a_np1, keepdims=keepdims)
+        node = onnx.helper.make_node('ArgMin',
+                                     inputs=['a_np1'],
+                                     outputs=['out'],
+                                     keepdims=keepdims)
+    elif keepdims is None:
+        b_np = _argmin_numpy(a_np1, axis=axis)
+        node = onnx.helper.make_node('ArgMin',
+                                     inputs=['a_np1'],
+                                     outputs=['out'],
+                                     axis=axis)
+    else:
+        b_np = _argmin_numpy(a_np1, axis=axis, keepdims=keepdims)
+        node = onnx.helper.make_node('ArgMin',
+                                     inputs=['a_np1'],
+                                     outputs=['out'],
+                                     axis=axis,
+                                     keepdims=keepdims)
+    graph = helper.make_graph([node],
+                              "argmin_test",
+                              inputs = [helper.make_tensor_value_info("a_np1",
+                                            TensorProto.INT32, list(a_np1.shape))],
+                              outputs = [helper.make_tensor_value_info("out",
+                                            TensorProto.INT32, list(b_np.shape))])
+
+    model = helper.make_model(graph, producer_name='argmin_test')
+
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, [a_np1], target, ctx, b_np.shape, b_np.dtype)
+        tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
+
+def verify_argmax(input_dim, axis=None, keepdims=None):
+    def _argmax_numpy(data, axis=0, keepdims=True):
+        result = np.argmax(data, axis=axis)
+        if (keepdims == 1):
+            result = np.expand_dims(result, axis)
+        return result.astype(data.dtype)
+
+    a_np1 = np.random.uniform(-10, 10, input_dim).astype(np.int32)
+
+    if keepdims is None and axis is None:
+        b_np = _argmax_numpy(a_np1)
+        node = onnx.helper.make_node('ArgMax',
+                                     inputs=['a_np1'],
+                                     outputs=['out'])
+    elif axis is None:
+        b_np = _argmax_numpy(a_np1, keepdims=keepdims)
+        node = onnx.helper.make_node('ArgMax',
+                                     inputs=['a_np1'],
+                                     outputs=['out'],
+                                     keepdims=keepdims)
+    elif keepdims is None:
+        b_np = _argmax_numpy(a_np1, axis=axis)
+        node = onnx.helper.make_node('ArgMax',
+                                     inputs=['a_np1'],
+                                     outputs=['out'],
+                                     axis=axis)
+    else:
+        b_np = _argmax_numpy(a_np1, axis=axis, keepdims=keepdims)
+        node = onnx.helper.make_node('ArgMax',
+                                     inputs=['a_np1'],
+                                     outputs=['out'],
+                                     axis=axis,
+                                     keepdims=keepdims)
+
+    graph = helper.make_graph([node],
+                              "argmax_test",
+                              inputs = [helper.make_tensor_value_info("a_np1",
+                                            TensorProto.INT32, list(a_np1.shape))],
+                              outputs = [helper.make_tensor_value_info("out",
+                                            TensorProto.INT32, list(b_np.shape))])
+
+    model = helper.make_model(graph, producer_name='argmax_test')
+
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, [a_np1], target, ctx, b_np.shape, b_np.dtype)
+        tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_forward_arg_min_max():
+    '''Verify argmin and argmax'''
+    verify_argmin([3,4,4])
+    verify_argmax([3,4,4])
+    verify_argmin([3,4,4], axis=1)
+    verify_argmax([3,4,4], axis=0)
+    verify_argmin([3,4,4], keepdims=0)
+    verify_argmax([3,4,4], keepdims=1)
+    for axis in [0,1,2]:
+        for keepdims in [True,False]:
+            verify_argmin([3,4,4], axis, keepdims)
+            verify_argmax([3,4,4], axis, keepdims)
+
+def verify_constantfill(is_shape, input_dim, out_dim, value, dtype, **kwargs):
+    input_a = np.random.uniform(size=input_dim).astype(dtype)
+    out = np.empty(shape=out_dim, dtype=dtype)
+    out.fill(value)
+
+    if is_shape == True:
+        fill_node = helper.make_node("ConstantFill", [], ["out"], shape=input_dim, value=value, **kwargs)
+    else:
+        fill_node = helper.make_node("ConstantFill", ["input_a"], ["out"], value=value, dtype=dtype, **kwargs)
+
+    graph = helper.make_graph([fill_node],
+                              "fill_test",
+                              inputs = [helper.make_tensor_value_info("input_a",
+                                            TensorProto.FLOAT, list(input_dim))],
+                              outputs = [helper.make_tensor_value_info("out",
+                                            TensorProto.FLOAT, list(out.shape))])
+
+    model = helper.make_model(graph, producer_name='fill_test')
+
+    for target, ctx in ctx_list():
+        if is_shape == True:
+            tvm_out = get_tvm_output(model, [], target, ctx, out.shape)
+        else:
+            tvm_out = get_tvm_output(model, [input_a], target, ctx, out.shape)
+
+        tvm.testing.assert_allclose(out, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_constantfill():
+    verify_constantfill(True, (2, 3, 4, 5), (2, 3, 4, 5), 10, 'float32')
+    verify_constantfill(False, (2, 3, 4, 5), (2, 3, 4, 5), 10, 'float32')
+    verify_constantfill(True, (2, 3, 4, 5), (2, 3, 4, 5, 4, 5, 6), 10, 'float32', extra_shape=(4, 5, 6))
+
+
+def verify_pad(indata, pads, value=0.0):
+    indata = np.array(indata).astype(np.float32)
+    #  numpy expect result
+    len_dim = len(pads) // 2
+    np_pads = [(pads[i], pads[i+len_dim]) for i in range(len_dim)]
+    outdata = np.pad(indata, pad_width=np_pads, mode='constant', constant_values=value)
+    #  onnx graph
+    node = helper.make_node(
+        'Pad',
+        inputs=['input'],
+        outputs=['output'],
+        mode='constant',
+        pads=pads,
+        value=value
+    )
+    graph = helper.make_graph([node],
+                              'pad_test',
+                              inputs = [helper.make_tensor_value_info("input",
+                                            TensorProto.FLOAT, list(indata.shape))],
+                              outputs = [helper.make_tensor_value_info("output",
+                                            TensorProto.FLOAT, list(outdata.shape))])
+    model = helper.make_model(graph, producer_name='pad_test')
+    #  tvm result
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, indata, target, ctx, outdata.shape, 'float32')
+    tvm.testing.assert_allclose(outdata, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_pad():
+    verify_pad(np.random.randn(2, 2).astype(np.float32), [0, 1, 0, 0], 0.0)
+    verify_pad(np.random.randn(2, 3).astype(np.float32), [1, 0, 0, 1], 0.0)
+    verify_pad(np.random.randn(3, 2).astype(np.float32), [0, 0, 1, 0], 5.0)
+
+def verify_reduce_x(name, indata, axis, keepdims):
+    indata = np.array(indata).astype(np.float32)
+    #  numpy expect result
+    if name == 'ReduceMax':
+        outdata = np.maximum.reduce(indata, axis=axis, keepdims=keepdims == 1)
+    elif name == 'ReduceMin':
+        outdata = np.minimum.reduce(indata, axis=axis, keepdims=keepdims == 1)
+    elif name == 'ReduceSum':
+        outdata = np.sum(indata, axis=axis, keepdims=keepdims == 1)
+    elif name == 'ReduceMean':
+        outdata = np.mean(indata, axis=axis, keepdims=keepdims == 1)
+    else:
+        raise Exception('unsupport op: {}'.format(name))
+    if len(np.asarray(outdata).shape) == 0:
+        outdata = np.asarray([outdata])
+    #  onnx graph
+    if axis is None:
+        node = helper.make_node(name, inputs=['input'], outputs=['output'],
+                                keepdims=keepdims)
+    else:
+        node = helper.make_node(name, inputs=['input'], outputs=['output'],
+                                axis=axis, keepdims=keepdims)
+    graph = helper.make_graph([node],
+                              '{}_test'.format(name),
+                              inputs = [helper.make_tensor_value_info("input",
+                                            TensorProto.FLOAT, list(indata.shape))],
+                              outputs = [helper.make_tensor_value_info("output",
+                                            TensorProto.FLOAT, list(outdata.shape))])
+    model = helper.make_model(graph, producer_name='{}_test'.format(name))
+    #  tvm result
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, indata, target, ctx, outdata.shape, 'float32')
+    tvm.testing.assert_allclose(outdata, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_reduce_max():
+    verify_reduce_x("ReduceMax",
+                    np.random.randn(3, 2, 2).astype(np.float32),
+                    axis=None, keepdims=1)
+    verify_reduce_x("ReduceMax",
+                    np.random.randn(3, 2, 3).astype(np.float32),
+                    axis=None, keepdims=0)
+    verify_reduce_x("ReduceMax",
+                    np.random.randn(3, 3, 3).astype(np.float32),
+                    axis=(1,), keepdims=1)
+
+def test_reduce_min():
+    verify_reduce_x("ReduceMin",
+                    np.random.randn(3, 2, 2).astype(np.float32),
+                    axis=None, keepdims=1)
+    verify_reduce_x("ReduceMin",
+                    np.random.randn(3, 2, 3).astype(np.float32),
+                    axis=None, keepdims=0)
+    verify_reduce_x("ReduceMin",
+                    np.random.randn(3, 3, 3).astype(np.float32),
+                    axis=(1,), keepdims=1)
+
+def test_reduce_sum():
+    verify_reduce_x("ReduceSum",
+                    np.random.randn(3, 2, 2).astype(np.float32),
+                    axis=None, keepdims=1)
+    verify_reduce_x("ReduceSum",
+                    np.random.randn(3, 2, 3).astype(np.float32),
+                    axis=None, keepdims=0)
+    verify_reduce_x("ReduceSum",
+                    np.random.randn(3, 3, 3).astype(np.float32),
+                    axis=(1,), keepdims=1)
+
+def test_reduce_mean():
+    verify_reduce_x("ReduceMean",
+                    np.random.randn(3, 2, 2).astype(np.float32),
+                    axis=None, keepdims=1)
+    verify_reduce_x("ReduceMean",
+                    np.random.randn(3, 2, 3).astype(np.float32),
+                    axis=None, keepdims=0)
+    verify_reduce_x("ReduceMean",
+                    np.random.randn(3, 3, 3).astype(np.float32),
+                    axis=(1,), keepdims=1)
+
+def verify_split(indata, outdatas, split, axis=0):
+    indata = np.array(indata).astype(np.float32)
+    outdatas = [np.array(o).astype(np.float32) for o in outdatas]
+    node = helper.make_node(
+        'Split',
+        inputs=['input'],
+        outputs=['output_{}'.format(i) for i in range(len(split))],
+        axis=axis,
+        split=split
+    )
+    graph = helper.make_graph([node],
+                              'split_test',
+                              inputs = [helper.make_tensor_value_info("input",
+                                            TensorProto.FLOAT, list(indata.shape))],
+                              outputs = [helper.make_tensor_value_info("output_{}".format(i),
+                                            TensorProto.FLOAT, list(outdatas[i].shape))
+                                            for i in range(len(split))
+                                         ])
+    model = helper.make_model(graph, producer_name='split_test')
+
+    for target, ctx in ctx_list():
+        output_shape = [o.shape for o in outdatas]
+        output_type = ['float32', 'float32', 'float32']
+        tvm_out = get_tvm_output(model, indata, target, ctx, output_shape, output_type)
+    for o, t in zip(outdatas, tvm_out):
+        tvm.testing.assert_allclose(o, t)
+
+def test_split():
+    # 1D
+    verify_split([1., 2., 3., 4., 5., 6.], [[1., 2.], [3., 4.], [5., 6.]], [2, 2, 2], 0)
+    verify_split([1., 2., 3., 4., 5., 6.], [[1., 2.], [3.], [4., 5., 6.]], [2, 1, 3], 0)
+    # 2D
+    verify_split([[1., 2., 3., 4.], [7., 8., 9., 10.]],
+                 [[[1., 2.], [7., 8.]], [[3., 4.], [9., 10.]]], [2, 2], 1)
+
+def test_binary_ops():
+    in_shape = (1, 2, 3, 3)
+    dtype = "float32"
+    out_shape = in_shape
+
+    def verify_binary_ops(op, x, y, out_np, broadcast=None):
+        if broadcast is None:
+            z = helper.make_node(op, ['in1', 'in2'], ['out'])
+        else:
+            z = helper.make_node(op, ['in1', 'in2'], ['out'], broadcast=1)
+        graph = helper.make_graph([z],
+                                   '_test',
+                                  inputs = [helper.make_tensor_value_info("in1",
+                                                TensorProto.FLOAT, list(in_shape)),
+                                            helper.make_tensor_value_info("in2",
+                                                TensorProto.FLOAT, list(in_shape))],
+                                  outputs = [helper.make_tensor_value_info("out",
+                                                TensorProto.FLOAT, list(out_shape))])
+        model = helper.make_model(graph, producer_name='_test')
+        for target, ctx in ctx_list():
+            tvm_out = get_tvm_output(model, [x, y], target, ctx)
+            tvm.testing.assert_allclose(out_np, tvm_out)
+
+    x = np.random.uniform(size=in_shape).astype(dtype)
+    y = np.random.uniform(size=in_shape).astype(dtype)
+    z = np.random.uniform(size=(3,)).astype(dtype)
+    verify_binary_ops("Add",x, y, x + y, broadcast=None)
+    verify_binary_ops("Add", x, z,  x + z, broadcast=True)
+    verify_binary_ops("Sub", x, y, x - y, broadcast=None)
+    verify_binary_ops("Sub", x, z, x - z, broadcast=True)
+    verify_binary_ops("Mul",x, y, x * y, broadcast=None)
+    verify_binary_ops("Mul", x, z,  x * z, broadcast=True)
+    verify_binary_ops("Div", x, y, x / y, broadcast=None)
+    verify_binary_ops("Div", x, z, x / z, broadcast=True)
+    verify_binary_ops("Sum", x, y, x + y, broadcast=None)
+
+def test_single_ops():
+    in_shape = (1, 2, 3, 3)
+    dtype = "float32"
+    out_shape = in_shape
+
+    def verify_single_ops(op, x, out_np):
+        z = helper.make_node(op, ['in1'], ['out'])
+        graph = helper.make_graph([z],
+                                   '_test',
+                                  inputs = [helper.make_tensor_value_info("in1",
+                                                TensorProto.FLOAT, list(in_shape)),],
+                                  outputs = [helper.make_tensor_value_info("out",
+                                                TensorProto.FLOAT, list(out_shape))])
+        model = helper.make_model(graph, producer_name='_test')
+        for target, ctx in ctx_list():
+            tvm_out = get_tvm_output(model, [x], target, ctx)
+            tvm.testing.assert_allclose(out_np, tvm_out)
+
+    x = np.random.uniform(size=in_shape).astype(dtype)
+    verify_single_ops("Neg",x, -x)
+    verify_single_ops("Abs",x, np.abs(x))
+    verify_single_ops("Reciprocal",x, 1/x)
+    verify_single_ops("Sqrt",x, np.sqrt(x))
+    verify_single_ops("Relu",x, np.maximum(x, 0))
+    verify_single_ops("Exp",x, np.exp(x))
+    verify_single_ops("Log",x, np.log(x))
+    verify_single_ops("Log",x, np.log(x))
+    verify_single_ops("Tanh",x, np.tanh(x))
+    verify_single_ops("Sigmoid",x, 1 / (1 + np.exp(-x)))
+    verify_single_ops("Softsign",x, x / (1 + np.abs(x)))
+    verify_single_ops("SoftPlus",x, np.log(1 + np.exp(x)))
+
+def test_leaky_relu():
+    def leaky_relu_x(x, alpha):
+        return np.where(x >= 0, x, x * alpha)
+    _test_onnx_op_elementwise((2, 4, 5, 6),
+                              leaky_relu_x,
+                              {'alpha': 0.25},
+                              'float32',
+                              'LeakyRelu',
+                              {'alpha': 0.25})
+
+def test_elu():
+    def elu_x(x, alpha):
+        return np.where(x > 0, x, alpha * (np.exp(x) - 1.0))
+    _test_onnx_op_elementwise((2, 4, 5, 6),
+                              elu_x,
+                              {'alpha': 0.25},
+                              'float32',
+                              'Elu',
+                              {'alpha': 0.25})
+
+def test_selu():
+    def selu_x(x, alpha, gamma):
+        return gamma * np.where(x > 0, x, alpha * (np.exp(x) - 1.0))
+    _test_onnx_op_elementwise((2, 4, 5, 6),
+                              selu_x,
+                              {'alpha': 0.25, 'gamma': 0.3},
+                              'float32',
+                              'Selu',
+                              {'alpha': 0.25, 'gamma': 0.3})
+
+def test_ThresholdedRelu():
+    def ThresholdedRelu_x(x, alpha):
+        out_np = np.clip(x, alpha, np.inf)
+        out_np[out_np == alpha] = 0
+        return out_np
+    _test_onnx_op_elementwise((2, 4, 5, 6),
+                              ThresholdedRelu_x,
+                              {'alpha': 0.25},
+                              'float32',
+                              'ThresholdedRelu',
+                              {'alpha': 0.25})
+
+def test_ScaledTanh():
+    def ScaledTanh_x(x, alpha, beta):
+        return alpha * np.tanh(beta * x)
+    _test_onnx_op_elementwise((2, 4, 5, 6),
+                              ScaledTanh_x,
+                              {'alpha': 0.25, 'beta': 0.3},
+                              'float32',
+                              'ScaledTanh',
+                              {'alpha': 0.25, 'beta': 0.3})
+
+def test_ParametricSoftplus():
+    def ParametricSoftplus_x(x, alpha, beta):
+        return alpha * np.log(np.exp(beta * x) + 1)
+    _test_onnx_op_elementwise((2, 4, 5, 6),
+                              ParametricSoftplus_x,
+                              {'alpha': 0.25, 'beta': 0.3},
+                              'float32',
+                              'ParametricSoftplus',
+                              {'alpha': 0.25, 'beta': 0.3})
+
+def test_Scale():
+    def Scale_x(x, scale):
+        return scale * x
+    _test_onnx_op_elementwise((2, 4, 5, 6),
+                              Scale_x,
+                              {'scale': 0.25},
+                              'float32',
+                              'Scale',
+                              {'scale': 0.25})
+
+def test_LogSoftmax():
+    _test_onnx_op_elementwise((1, 4),
+                              topi.testing.log_softmax_python,
+                              {},
+                              'float32',
+                              'LogSoftmax',
+                              {'axis': 1})
 
 if __name__ == '__main__':
     # verify_super_resolution_example()
@@ -398,3 +1023,27 @@ def test_lrn():
     test_matmul()
     test_gather()
     test_lrn()
+    test_upsample()
+    test_forward_min()
+    test_forward_max()
+    test_forward_mean()
+    test_forward_hardsigmoid()
+    test_forward_arg_min_max()
+    test_softmax()
+    test_constantfill()
+    test_pad()
+    test_reduce_max()
+    test_reduce_min()
+    test_reduce_sum()
+    test_reduce_mean()
+    test_split()
+    test_binary_ops()
+    test_single_ops()
+    test_leaky_relu()
+    test_elu()
+    test_selu()
+    test_ThresholdedRelu()
+    test_ScaledTanh()
+    test_ParametricSoftplus()
+    test_Scale()
+    test_LogSoftmax()
diff --git a/nnvm/tests/python/frontend/onnx/test_graph.py b/nnvm/tests/python/frontend/onnx/test_graph.py
old mode 100644
new mode 100755
index 7fa705ef4c65..b3961c1a38fd
--- a/nnvm/tests/python/frontend/onnx/test_graph.py
+++ b/nnvm/tests/python/frontend/onnx/test_graph.py
@@ -3,9 +3,10 @@
 import onnx
 from nnvm.compiler import graph_util, graph_attr
 from model_zoo import super_resolution, super_resolution_sym
+from model_zoo import squeezenet as squeezenet
 
 def compare_graph(onnx_file, nnvm_sym, ishape):
-    onnx_model = onnx.load(onnx_file)
+    onnx_model = onnx.load_model(onnx_file)
     onnx_sym, params = nnvm.frontend.from_onnx(onnx_model)
     g1 = nnvm.graph.create(onnx_sym)
     g2 = nnvm.graph.create(nnvm_sym)
@@ -18,8 +19,16 @@ def compare_graph(onnx_file, nnvm_sym, ishape):
     graph_util.check_graph_equal(g1, g2)
 
 def test_super_resolution_example():
-    fname, symbol = super_resolution, super_resolution_sym
+    fname, symbol = "super_resolution.onnx", super_resolution_sym
     compare_graph(fname, symbol, ishape=(1, 1, 224, 224))
 
+def test_squeeze_net():
+    # Only works for model downloaded from
+    # https://github.com/onnx/models/tree/master/squeezenet
+    fname = "squeezenet1_1.onnx"
+    symbol, params = squeezenet.get_workload(version='1.1')
+    compare_graph(fname, symbol, ishape=(1, 3, 224, 224))
+
 if __name__ == '__main__':
     test_super_resolution_example()
+    test_squeeze_net()
diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py
index 495852f9e5d6..5b8f11695790 100644
--- a/nnvm/tests/python/frontend/tensorflow/test_forward.py
+++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py
@@ -26,11 +26,21 @@
 #######################################################################
 # Generic run functions for TVM & tensorflow
 # ------------------------------------------
-def run_tvm_graph(graph_def, input_data, input_node, output_shape, output_dtype):
+def convert_to_list(x):
+    if not isinstance(x, list):
+        x = [x]
+    return x
+
+def run_tvm_graph(graph_def, input_data, input_node, num_output=1, target='llvm', out_names=None):
     """ Generic function to compile on nnvm and execute on tvm """
+    input_data = convert_to_list(input_data)
+    input_node = convert_to_list(input_node)
+
+    layout = None
+    if target == "cuda":
+        layout = "NCHW"
+    target_host = 'llvm'
 
-    sym, params = nnvm.frontend.from_tensorflow(graph_def)
-    target = 'llvm'
     if isinstance(input_data, list):
         shape_dict = {}
         dtype_dict = {}
@@ -40,140 +50,158 @@ def run_tvm_graph(graph_def, input_data, input_node, output_shape, output_dtype)
     else:
         shape_dict = {input_node: input_data.shape}
         dtype_dict = {input_node: input_data.dtype}
-
-    graph, lib, params = nnvm.compiler.build(sym, target, shape_dict,
+   
+    sym, params = nnvm.frontend.from_tensorflow(graph_def, layout=layout, shape=shape_dict, outputs=out_names)
+    graph, lib, params = nnvm.compiler.build(sym, target=target, target_host=target_host, shape=shape_dict,
                                              dtype=dtype_dict, params=params)
 
-    ctx = tvm.cpu(0)
+    ctx = tvm.context(target, 0)
     from tvm.contrib import graph_runtime
     m = graph_runtime.create(graph, lib, ctx)
     # set inputs
-    if isinstance(input_data, list):
-        for i, e in enumerate(input_node):
-            m.set_input(e, tvm.nd.array(input_data[i].astype(input_data[i].dtype)))
-    else:
-        m.set_input(input_node, tvm.nd.array(input_data.astype(input_data.dtype)))
+    for i, e in enumerate(input_node):
+        m.set_input(e, tvm.nd.array(input_data[i].astype(input_data[i].dtype)))
 
     m.set_input(**params)
     # execute
     m.run()
     # get outputs
-    if isinstance(output_shape, list) and isinstance(output_dtype, list):
-        tvm_output_list = []
-        for i, s in enumerate(output_shape):
-            tvm_output = m.get_output(i, tvm.nd.empty((s), output_dtype[i]))
-            tvm_output_list.append(tvm_output.asnumpy())
-        return tvm_output_list
-    else:
-        tvm_output = m.get_output(0, tvm.nd.empty((output_shape), output_dtype))
-        return tvm_output.asnumpy()
+    assert out_names is None or num_output == len(out_names),"out_names: {} num_output: {}".format(
+                                                              out_names, num_output)
+    tvm_output_list = []
+    for i in range(0, num_output):
+        tvm_output = m.get_output(i)
+        tvm_output_list.append(tvm_output.asnumpy())
+    return tvm_output_list
 
 def run_tf_graph(sess, input_data, input_node, output_node):
     """ Generic function to execute tensorflow """
+    input_data = convert_to_list(input_data)
+    input_node = convert_to_list(input_node)
+    output_node = convert_to_list(output_node)
 
-    tensor = sess.graph.get_tensor_by_name(output_node)
+    tensor = [0] * len(output_node)
+    for i in range(len(output_node)):
+        tensor[i] = sess.graph.get_tensor_by_name(output_node[i])
 
-    if isinstance(input_data, list):
-        input_dict = {}
-        for i, e in enumerate(input_node):
-            input_dict[e] = input_data[i]
-    else:
-        input_dict = {input_node: input_data}
+    input_dict = {}
+    for i, e in enumerate(input_node):
+        input_dict[e] = input_data[i]
 
     output_data = sess.run(tensor, input_dict)
     return output_data
 
+
+def compare_tf_with_tvm(in_data, in_name, out_name, init_global_variables=False, no_gpu=False):
+    """Generic function to generate and compare tensorflow and TVM output"""
+
+    out_name = convert_to_list(out_name)
+    out_node = [0]*len(out_name)
+    for i in range(len(out_name)):
+        out_node[i] = out_name[i].split(':')[0] if ":" in out_name[i] else out_name[i]
+
+    in_data = convert_to_list(in_data)
+    in_name = convert_to_list(in_name)
+    in_node = [0]*len(in_name)
+    for i in range(len(in_name)):
+        in_node[i] = in_name[i].split(':')[0] if ":" in in_name[i] else in_name[i]
+
+    with tf.Session() as sess:
+        if init_global_variables:
+            sess.run(variables.global_variables_initializer())
+        final_graph_def = tf.graph_util.convert_variables_to_constants(
+            sess,
+            sess.graph.as_graph_def(add_shapes=True),
+            out_node,
+            )
+        tf_output = run_tf_graph(sess, in_data, in_name, out_name)
+
+        for device in ["llvm", "cuda"]:
+            ctx = tvm.context(device, 0)
+            if not ctx.exist:
+                print("Skip because %s is not enabled" % device)
+                continue
+            if no_gpu and device == 'cuda':
+                continue
+
+            tvm_output = run_tvm_graph(final_graph_def, in_data, in_node,
+                                       num_output=len(out_node), target=device, out_names=out_name)
+            # since the names from tensorflow and nnvm runs are not exactly same, 
+            # first len(tf_output) will be compared
+            for i in range(len(tf_output)):
+                tvm.testing.assert_allclose(tf_output[i], tvm_output[i], atol=1e-5, rtol=1e-5)
+
+        sess.close()
+
+def is_gpu_available():
+    from tensorflow.python.client import device_lib
+    local_device_protos = device_lib.list_local_devices()
+    gpu_list = [x.name for x in local_device_protos if x.device_type == 'GPU']
+    if len(gpu_list) < 0:
+        print("Tensorflow GPU:", gpu_list)
+        return True
+    else:
+        return False
+
 #######################################################################
 # Pooling
 # -------
-def _test_pooling(input_shape, **kwargs):
+def _test_pooling_iteration(input_shape, **kwargs):
     """ One iteration of pool operation with given shapes and attributes """
 
     x = -np.arange(
         np.prod(input_shape), dtype=np.float32).reshape(input_shape) - 1
 
     with tf.Graph().as_default():
-        in_data = constant_op.constant(x, shape=input_shape, dtype='float32')
-        # pylint: disable=unused-variable
-        pool = nn_ops.pool(in_data, **kwargs)
-        # pylint: enable=unused-variable
+        in_data = array_ops.placeholder(shape=input_shape, dtype='float32')
+        nn_ops.pool(in_data, **kwargs)
 
         if kwargs['pooling_type'] == 'MAX':
-            out_node = 'max_pool'
             out_name = 'max_pool:0'
         else:
-            out_node = 'avg_pool'
             out_name = 'avg_pool:0'
 
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                [out_node],
-                )
+        compare_tf_with_tvm(x, 'Placeholder:0', out_name)
 
-            tf_output = run_tf_graph(sess, x, 'Const:0', out_name)
-            tvm_output = run_tvm_graph(graph_def, x.astype('float32'),
-                                       "Const", tf_output.shape, 'float32')
-            np.testing.assert_allclose(tf_output, tvm_output, atol=1e-3, rtol=1e-3)
+def _test_pooling(input_shape, **kwargs):
+    _test_pooling_iteration(input_shape, **kwargs)
 
-            sess.close()
+    if is_gpu_available():
+        input_shape = [input_shape[ii] for ii in (0, 3, 1, 2)]
+        kwargs['data_layout'] = 'NCHW'
+        _test_pooling_iteration(input_shape, **kwargs)
 
 def test_forward_pooling():
     """ Pooling """
 
-    _test_pooling(input_shape=[2, 9, 10, 2],
-                 window_shape=[1, 1],
-                 padding='SAME',
-                 pooling_type='MAX',
-                 dilation_rate=[1, 1],
-                 strides=[1, 1])
-    _test_pooling(input_shape=[2, 9, 10, 2],
-                 window_shape=[1, 1],
-                 padding='SAME',
-                 pooling_type='AVG',
-                 dilation_rate=[1, 1],
-                 strides=[1, 1])
-
-    _test_pooling(input_shape=[2, 10, 9, 2],
-                 window_shape=[1, 1],
-                 padding='SAME',
-                 pooling_type='MAX',
-                 dilation_rate=[1, 1],
-                 strides=[1, 1])
-    _test_pooling(input_shape=[2, 10, 9, 2],
-                 window_shape=[1, 1],
-                 padding='SAME',
-                 pooling_type='AVG',
-                 dilation_rate=[1, 1],
-                 strides=[1, 1])
-
-    _test_pooling(input_shape=[2, 9, 10, 2],
-                 window_shape=[2, 1],
-                 padding='SAME',
-                 pooling_type='MAX',
-                 dilation_rate=[1, 1],
-                 strides=[1, 1])
-    _test_pooling(input_shape=[2, 9, 10, 2],
-                 window_shape=[2, 1],
-                 padding='SAME',
-                 pooling_type='AVG',
-                 dilation_rate=[1, 1],
-                 strides=[2, 1])
-
-    _test_pooling(input_shape=[2, 10, 9, 2],
-                 window_shape=[2, 3],
-                 padding='SAME',
-                 pooling_type='MAX',
-                 dilation_rate=[1, 1],
-                 strides=[2, 1])
-    _test_pooling(input_shape=[2, 10, 9, 2],
-                 window_shape=[2, 3],
-                 padding='SAME',
-                 pooling_type='AVG',
-                 dilation_rate=[1, 1],
-                 strides=[1, 2])
-
+    for pool_type in ['AVG', 'MAX']:
+            _test_pooling(input_shape=[2, 9, 10, 2],
+                         window_shape=[1, 1],
+                         padding='SAME',
+                         pooling_type=pool_type,
+                         dilation_rate=[1, 1],
+                         strides=[1, 1])
+
+            _test_pooling(input_shape=[2, 10, 9, 2],
+                         window_shape=[1, 1],
+                         padding='SAME',
+                         pooling_type=pool_type,
+                         dilation_rate=[1, 1],
+                         strides=[1, 1])
+
+            _test_pooling(input_shape=[2, 9, 10, 2],
+                         window_shape=[2, 1],
+                         padding='SAME',
+                         pooling_type=pool_type,
+                         dilation_rate=[1, 1],
+                         strides=[1, 1])
+
+            _test_pooling(input_shape=[2, 10, 9, 2],
+                         window_shape=[2, 3],
+                         padding='SAME',
+                         pooling_type=pool_type,
+                         dilation_rate=[1, 1],
+                         strides=[2, 1])
 
 #######################################################################
 # Convolution
@@ -195,37 +223,27 @@ def _test_convolution(tensor_in_sizes, filter_in_sizes,
     filter_array = [f * 1.0 for f in range(1, total_size_2 + 1)]
 
     with tf.Graph().as_default():
-        in_data = constant_op.constant(data_array, shape=tensor_in_sizes, dtype='float32')
+        in_data = array_ops.placeholder(shape=tensor_in_sizes, dtype='float32')
         in_filter = constant_op.constant(filter_array, shape=filter_in_sizes, dtype='float32')
         strides = [1] + strides + [1]
         dilations = [1] + dilations + [1]
 
-        # pylint: disable=unused-variable
-        conv = nn_ops.conv2d(in_data,
-                             in_filter,
-                             strides=strides,
-                             padding=padding,
-                             data_format=data_format)
-        # pylint: enable=unused-variable
+        nn_ops.conv2d(in_data,
+                      in_filter,
+                      strides=strides,
+                      padding=padding,
+                      data_format=data_format)
 
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                ['Conv2D'],
-                )
-
-            tf_output = run_tf_graph(sess, np.reshape(data_array, tensor_in_sizes),
-                                     'Const:0', 'Conv2D:0')
-            tvm_output = run_tvm_graph(graph_def,
-                                       np.reshape(data_array, tensor_in_sizes).astype('float32'),
-                                       "Const", tf_output.shape, 'float32')
-
-            np.testing.assert_allclose(tf_output, tvm_output, atol=1e-3, rtol=1e-3)
-
-            sess.close()
+        compare_tf_with_tvm(np.reshape(data_array, tensor_in_sizes).astype('float32'),
+                            'Placeholder:0', 'Conv2D:0')
 
 def test_forward_convolution():
+    if is_gpu_available():
+        _test_convolution([4, 176, 8, 8], [1, 1, 176, 32], [1, 1], [1, 1], 'SAME', 'NCHW')
+        _test_convolution([4, 19, 17, 17], [3, 3, 19, 19], [1, 1], [2, 2], 'VALID', 'NCHW')
+        _test_convolution([4, 124, 17, 17], [1, 1, 124, 19], [1, 1], [1, 1], 'SAME', 'NCHW')
+        _test_convolution([4, 12, 17, 17], [3, 3, 12, 32], [1, 1], [2, 2], 'VALID', 'NCHW')
+
     _test_convolution([4, 8, 8, 176], [1, 1, 176, 32], [1, 1], [1, 1], 'SAME', 'NHWC')
     _test_convolution([4, 17, 17, 19], [3, 3, 19, 19], [1, 1], [2, 2], 'VALID', 'NHWC')
     _test_convolution([4, 17, 17, 124], [1, 1, 124, 19], [1, 1], [1, 1], 'SAME', 'NHWC')
@@ -239,28 +257,10 @@ def _test_reshape(data, out_shape):
     """ One iteration of reshape operation with given data and out shape """
 
     with tf.Graph().as_default():
-        in_data = constant_op.constant(data, shape=data.shape, dtype=data.dtype)
-
-        # pylint: disable=unused-variable
-        reshape_out = array_ops.reshape(in_data, out_shape)
-        # pylint: enable=unused-variable
+        in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype)
+        array_ops.reshape(in_data, out_shape)
 
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                ['Reshape'],
-                )
-
-            tf_output = run_tf_graph(sess, data,
-                                     'Const:0', 'Reshape:0')
-            tvm_output = run_tvm_graph(graph_def,
-                                       data,
-                                       "Const", tf_output.shape, data.dtype)
-
-            np.testing.assert_allclose(tf_output, tvm_output)
-
-            sess.close()
+        compare_tf_with_tvm(data, 'Placeholder:0', 'Reshape:0')
 
 def test_forward_reshape():
     _test_reshape(np.arange(6.0), [2, 3])
@@ -268,6 +268,7 @@ def test_forward_reshape():
     _test_reshape(np.arange(6), [3, -1])
     _test_reshape(np.arange(6), [-1])
 
+#######################################################################
 #######################################################################
 # Squeeze
 # -------
@@ -279,31 +280,14 @@ def _test_squeeze(data, squeeze_dims=None):
         squeeze_dims = []
 
     with tf.Graph().as_default():
-        in_data = constant_op.constant(data, shape=data.shape, dtype=data.dtype)
+        in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype)
 
-        # pylint: disable=unused-variable
         if squeeze_dims:
-            squeeze_out = array_ops.squeeze(in_data, squeeze_dims)
+            array_ops.squeeze(in_data, squeeze_dims)
         else:
-            squeeze_out = array_ops.squeeze(in_data)
-        # pylint: enable=unused-variable
-
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                ['Squeeze'],
-                )
-
-            tf_output = run_tf_graph(sess, data,
-                                     'Const:0', 'Squeeze:0')
-            tvm_output = run_tvm_graph(graph_def,
-                                       data,
-                                       "Const", tf_output.shape, data.dtype)
+            array_ops.squeeze(in_data)
 
-            np.testing.assert_allclose(tf_output, tvm_output)
-
-            sess.close()
+        compare_tf_with_tvm(data, 'Placeholder:0', 'Squeeze:0')
 
 def test_forward_squeeze():
     """ Squeeze """
@@ -336,28 +320,10 @@ def _test_concat_v2(data, dim):
     """ One iteration of ConcatV2 """
 
     with tf.Graph().as_default():
+        gen_array_ops._concat_v2(data, dim)
 
-        # pylint: disable=unused-variable
-        concat_out = gen_array_ops._concat_v2(data, dim)
-        # pylint: enable=unused-variable
-
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                ['ConcatV2'],
-                )
-
-            tf_output = run_tf_graph(sess, data,
-                                     ['ConcatV2/values_0:0', 'ConcatV2/values_1:0'], 'ConcatV2:0')
-            tvm_output = run_tvm_graph(graph_def,
-                                       data,
-                                       ["ConcatV2/values_0", 'ConcatV2/values_1'],
-                                       tf_output.shape, tf_output.dtype)
-
-            np.testing.assert_allclose(tf_output, tvm_output)
-
-            sess.close()
+        compare_tf_with_tvm(data, ['ConcatV2/values_0:0', 'ConcatV2/values_1:0'],
+                            'ConcatV2:0')
 
 def _test_forward_concat_v2():
     t1 = np.array([])
@@ -377,28 +343,10 @@ def _test_sigmoid(data):
     """ One iteration of sigmoid """
 
     with tf.Graph().as_default():
-        in_data = constant_op.constant(data, shape=data.shape, dtype=data.dtype)
-
-        # pylint: disable=unused-variable
+        in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype)
         sigmoid_out = math_ops.sigmoid(in_data)
-        # pylint: enable=unused-variable
-
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                ['Sigmoid'],
-                )
 
-            tf_output = run_tf_graph(sess, data,
-                                     'Const:0', 'Sigmoid:0')
-            tvm_output = run_tvm_graph(graph_def,
-                                       data,
-                                       "Const", tf_output.shape, data.dtype)
-
-            np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5)
-
-            sess.close()
+        compare_tf_with_tvm(data, 'Placeholder:0', 'Sigmoid:0')
 
 def test_forward_sigmoid():
     """ Sigmoid """
@@ -412,36 +360,44 @@ def test_forward_sigmoid():
 def _test_argx(func, data, **kwargs):
 
     with tf.Graph().as_default():
-        inp = constant_op.constant(data, shape=data.shape, dtype=data.dtype, name="c0")
-
-        # pylint: disable=unused-variable
-        out = func(inp, name="argx0", **kwargs)
-        # pylint: enable=unused-variable
-
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess=sess,
-                input_graph_def=sess.graph.as_graph_def(add_shapes=True),
-                output_node_names=["argx0"])
+        inp = array_ops.placeholder(shape=data.shape, dtype=data.dtype, name="c0")
+        func(inp, name="argx0", **kwargs, output_type=tf.int32)
 
-            tf_output = run_tf_graph(sess, data, input_node="c0:0", output_node="argx0:0")
-            tvm_output = run_tvm_graph(graph_def, data, "c0", tf_output.shape, output_dtype='int32')
+        compare_tf_with_tvm(data, 'c0:0', 'argx0:0')
 
-            np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5)
-
-            sess.close()
-
-def test_argmin_argmax():
+def test_forward_argminmax():
     for axis in [None,0,1,2]:
         data = np.random.uniform(size=(8,4,9)).astype('float32')
         _test_argx(tf.argmax, data=data, axis=axis)
         _test_argx(tf.argmin, data=data, axis=axis)
 
+#######################################################################
+# Reduce
+# ------
+
+def _test_reduce(func, data, **kwargs):
+    """ One iteration of a reduce operation"""
+
+    with tf.Graph().as_default():
+        inp = array_ops.placeholder(shape=data.shape, dtype=data.dtype, name="c0")
+        func(inp, name="reducex0", **kwargs)
+
+        compare_tf_with_tvm(data, 'c0:0', 'reducex0:0')
+
+def test_forward_reduce():
+    data = np.random.uniform(size=(8,4,9)).astype('float32')
+    _test_reduce(tf.reduce_sum, data=data)
+    _test_reduce(tf.reduce_sum, data=data, axis=0)
+    _test_reduce(tf.reduce_sum, data=data, axis=(0,1))
+
+
 #######################################################################
 # Variable
 # --------
 
 def _test_variable(data):
+    """ One iteration of a variable """
+
     tf.reset_default_graph()
     input_op = array_ops.placeholder(shape=data.shape, dtype=data.dtype)
     input_tensor = array_ops.reshape(input_op, data.shape)
@@ -450,84 +406,15 @@ def _test_variable(data):
     with variable_scope.variable_scope("linear", reuse=None):
         w = variable_scope.get_variable(
             "w", shape=[size, size], dtype=input_tensor.dtype)
-    # pylint: disable=unused-variable
-    output_op = math_ops.matmul(input_tensor, w)
-    # pylint: enable=unused-variable
-
-    with tf.Session() as sess:
-        sess.run(variables.global_variables_initializer())
-        final_graph_def = tf.graph_util.convert_variables_to_constants(
-            sess,
-            sess.graph.as_graph_def(add_shapes=True),
-            ['MatMul'],
-            )
+    math_ops.matmul(input_tensor, w)
 
-        tf_output = run_tf_graph(sess, data, 'Placeholder:0', 'MatMul:0')
-        tvm_output = run_tvm_graph(final_graph_def, data,
-                                   "Placeholder", tf_output.shape, data.dtype)
-
-        np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5)
-        sess.close()
+    compare_tf_with_tvm(data, 'Placeholder:0', 'MatMul:0', init_global_variables=True)
 
 def test_forward_variable():
     """Variable type op test"""
     _test_variable(np.random.uniform(size=(32, 100)).astype('float32'))
 
 
-#######################################################################
-# LSTM
-# ----
-def _test_lstm_cell(batch_size, num_hidden, num_layers, forget_bias, dtype):
-    tf.reset_default_graph()
-    input_size = num_hidden
-    input_data = np.full((batch_size, input_size), 1., dtype=dtype)
-    in_state_c = np.full((num_layers, batch_size, num_hidden), 0.1, dtype=dtype)
-    in_state_h = np.full((num_layers, batch_size, num_hidden), 0.1, dtype=dtype)
-
-    def _get_tensorflow_output():
-        with tf.Session() as sess:
-            with variable_scope.variable_scope(
-                "root", initializer=init_ops.constant_initializer(0.5)):
-                m0 = array_ops.zeros([batch_size, num_hidden])
-                m1 = array_ops.zeros([batch_size, num_hidden])
-                x=tf.placeholder(shape=(batch_size, input_size), dtype=dtype)
-                g, ((out_m0, out_m1)) = \
-                     tf.contrib.rnn.LSTMBlockCell(num_hidden,
-                                                  forget_bias=forget_bias)(x, ((m0, m1)))
-                sess.run([variables.global_variables_initializer()])
-                res = sess.run([g, out_m0, out_m1], {
-                    x.name: np.array([[1., 1.]]),
-                    m0.name: 0.1 * np.ones([batch_size, num_hidden]),
-                    m1.name: 0.1 * np.ones([batch_size, num_hidden]),
-                })
-            graph_def = sess.graph.as_graph_def(add_shapes=True)
-            final_graph_def = graph_util.convert_variables_to_constants(
-                sess,
-                graph_def,
-                ['root/lstm_cell/LSTMBlockCell'])
-            return final_graph_def, res
-
-    graph_def, tf_out = _get_tensorflow_output()
-    tvm_output = run_tvm_graph(graph_def, [input_data, in_state_c, in_state_h],
-                               ['root/Placeholder', 'root/lstm_cell/LSTMBlockCell_c',
-                                'root/lstm_cell/LSTMBlockCell_h'],
-                               [tf_out[0].shape, (2, batch_size, num_hidden)],
-                               [tf_out[0].dtype, tf_out[1].dtype])
-
-    if isinstance(tvm_output, list):
-        out = tvm_output[0]
-        out_state = tvm_output[1]
-        out_state_tup = np.split(out_state, indices_or_sections=2, axis=0)
-        out_state_c = np.reshape(out_state_tup[0], (batch_size, num_hidden))
-        out_state_h = np.reshape(out_state_tup[1], (batch_size, num_hidden))
-        tvm_out = [out, out_state_c, out_state_h]
-        np.testing.assert_allclose(tf_out, tvm_out, rtol=1e-3, atol=1e-3)
-
-def test_forward_lstm():
-    '''test LSTM block cell'''
-    _test_lstm_cell(1, 2, 1, 0.0, 'float32')
-
-
 #######################################################################
 # StridedSlice
 # ------------
@@ -535,6 +422,8 @@ def test_forward_lstm():
 def _test_stridedslice(ip_shape, begin, end, stride, dtype,
                              begin_mask=0, end_mask=0, new_axis_mask=0,
                              shrink_axis_mask=0, ellipsis_mask=0):
+    """ One iteration of a Stridedslice """
+
     tf.reset_default_graph()
     in_data = tf.placeholder(dtype, ip_shape, name="in_data")
     tf.strided_slice(in_data, begin, end, stride, begin_mask=begin_mask,
@@ -543,24 +432,19 @@ def _test_stridedslice(ip_shape, begin, end, stride, dtype,
                          ellipsis_mask=ellipsis_mask, name="strided_slice")
     np_data = np.random.uniform(size=ip_shape).astype(dtype)
 
-    with tf.Session() as sess:
-        final_graph_def = tf.graph_util.convert_variables_to_constants(
-            sess,
-            sess.graph.as_graph_def(add_shapes=True),
-            ['strided_slice'])
-        tf_output = run_tf_graph(sess, np_data,
-                                 'in_data:0', 'strided_slice:0')
-        tvm_output = run_tvm_graph(final_graph_def, np_data,
-                                   "in_data", tf_output.shape, np_data.dtype)
-        np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5)
-        sess.close()
+    compare_tf_with_tvm(np_data, 'in_data:0', 'strided_slice:0')
 
 def test_forward_stridedslice():
     '''test StridedSlice'''
+
     _test_stridedslice((3, 4, 3), [1, -1, 0], [4, -5, 3], [2, -1, 1], 'float32')
     _test_stridedslice((3, 4, 3), [1, 0], [4, 3], [2, 1], 'float32', ellipsis_mask=8)
+    _test_stridedslice((3, 4, 3), [1, 0], [4, 2], [2, 1], 'float32', ellipsis_mask=2)
+    _test_stridedslice((3, 4, 5, 3), [1, 0], [4, 2], [2, 1], 'float32', ellipsis_mask=2)
+    _test_stridedslice((3, 4, 5, 3), [1, 0, 1], [4, 2, 2], [2, 1, 1], 'float32', ellipsis_mask=2)
     _test_stridedslice((3, 4, 3), [1, 1, 0], [4, 4, 2], [2, 1, 1], 'float32', new_axis_mask=5)
     _test_stridedslice((3, 4, 3), [1, 1, 1], [4, 4, 1], [2, 1, 1], 'float32', ellipsis_mask=2, new_axis_mask=4)
+    _test_stridedslice((6, 4, 5), [1, 1, 1], [6, 3, 4], [2, 1, 1], 'float32', ellipsis_mask=2, new_axis_mask=5)
     _test_stridedslice((3, 4, 3), [1, 1, 2], [4, 4, 3], [2, 1, 1], 'float32', ellipsis_mask=4, new_axis_mask=2)
     _test_stridedslice((3, 4, 3), [1, 1, 2], [4, 4, 3], [2, 1, 1], 'float32', ellipsis_mask=2, new_axis_mask=3)
     _test_stridedslice((3, 4, 3), [1, 1, 0], [4, 4, 1], [2, 1, 1], 'float32', ellipsis_mask=2, new_axis_mask=3)
@@ -579,6 +463,7 @@ def test_forward_stridedslice():
     _test_stridedslice((3, 4, 5, 4, 5, 6), [1, 2, 0, -3], [4, 5, 3, 3], [2, 2, 1, 1],
                        'float32', shrink_axis_mask=8, new_axis_mask=1, ellipsis_mask=2, begin_mask=5,
                        end_mask=8)
+    _test_stridedslice((1), [0], [1], [1], 'float32', shrink_axis_mask=1)
 
 
 #######################################################################
@@ -586,6 +471,8 @@ def test_forward_stridedslice():
 # ------
 
 def _test_gather(ip_shape, indice_shape, indice_value, axis, dtype):
+    """ One iteration of a Gather """
+
     tf.reset_default_graph()
     in_data = tf.placeholder(dtype, ip_shape, name="in_data")
     indices = tf.placeholder("int32", indice_shape, name="indices")
@@ -601,17 +488,7 @@ def _fill_indices(indice_value):
         return indices
     np_indices = _fill_indices(indice_value)
 
-    with tf.Session() as sess:
-        final_graph_def = tf.graph_util.convert_variables_to_constants(
-            sess,
-            sess.graph.as_graph_def(add_shapes=True),
-            ['GatherV2'])
-        tf_output = run_tf_graph(sess, [np_data, np_indices], ['in_data:0',
-                                 'indices:0'], 'GatherV2:0')
-        tvm_output = run_tvm_graph(final_graph_def, [np_data, np_indices],
-                                   ['in_data', 'indices'], tf_output.shape, dtype)
-        np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5)
-        sess.close()
+    compare_tf_with_tvm([np_data, np_indices], ['in_data:0', 'indices:0'], 'GatherV2:0')
 
 def test_forward_gather():
     '''test gather layer'''
@@ -627,6 +504,85 @@ def test_forward_gather():
     _test_gather((4,3,5,6), (1,4), [[2,1,0,0]], 0, 'float32')
 
 
+#######################################################################
+# Split
+# -----
+
+def _test_split(in_shape, axis, num_or_size_splits, dtype):
+    np_data = np.random.uniform(-5, 5, size=in_shape).astype(dtype)
+
+    """ One iteration of a Split """
+    tf.reset_default_graph()
+    in_data = tf.placeholder(dtype, in_shape, name="in_data")
+    num_split = len(num_or_size_splits) if isinstance(num_or_size_splits, list) else num_or_size_splits
+    tf.split(in_data, num_or_size_splits, axis=axis)
+
+    compare_tf_with_tvm([np_data], ['in_data:0'], [f'split:{n}' for n in range(num_split)])
+
+    # and now test together with concat
+    tf.reset_default_graph()
+    in_data = tf.placeholder(dtype, in_shape, name="in_data")
+    splitted = tf.split(in_data, num_or_size_splits, axis=axis)
+    tf.concat(splitted, axis)
+
+    compare_tf_with_tvm([np_data], 'in_data:0', 'concat:0')
+
+def test_forward_split():
+    '''test split layer'''
+    # rank 1
+    _test_split((3,), 0, 1, 'float32')
+    _test_split((3,), 0, 3, 'float32')
+    _test_split((6,), 0, 3, 'float32')
+    # rank 2
+    _test_split((6, 2), 0, 3, 'float32')
+    _test_split((2, 6), 1, 6, 'float32')
+    # rank 3
+    _test_split((6, 2, 4), 0, 2, 'int32')
+    _test_split((2, 6, 4), 1, 3, 'float32')
+    _test_split((2, 4, 6), 2, 1, 'float32')
+    # rank 4
+    _test_split((6, 1, 3, 5), 0, 3, 'float32')
+    _test_split((1, 6, 3, 5), 1, 3, 'float32')
+    _test_split((1, 3, 6, 5), 2, 3, 'float32')
+    _test_split((1, 3, 5, 6), 3, 3, 'float32')
+    # split along negative axis
+    _test_split((6, 1, 3, 5), -4, 3, 'float32')
+    _test_split((1, 6, 3, 5), -3, 3, 'float32')
+    _test_split((1, 3, 6, 5), -2, 3, 'float32')
+    _test_split((1, 3, 5, 6), -1, 3, 'float32')
+    # size_splits list
+    _test_split((6,), 0, [1, 2, 3], 'int32')
+    _test_split((3, 6, 4), -2, [1, 4, 1], 'float32')
+
+
+#######################################################################
+# Unstack
+# -------
+
+def _test_unstack(ip_shape, axis, dtype):
+    np_data = np.random.uniform(-5, 5, size=ip_shape).astype(dtype)
+
+    tf.reset_default_graph()
+    in_data = tf.placeholder(dtype, ip_shape, name="in_data")
+    tf.unstack(in_data, axis=axis)
+
+    compare_tf_with_tvm([np_data], ['in_data:0'], [f'unstack:{n}' for n in range(ip_shape[axis])])
+
+    tf.reset_default_graph()
+    in_data = tf.placeholder(dtype, ip_shape, name="in_data")
+    tf.stack(tf.unstack(in_data, axis=axis), axis=axis)
+
+    compare_tf_with_tvm([np_data], ['in_data:0'], 'stack:0')
+
+def test_forward_unstack():
+    '''test unstack layer'''
+    _test_unstack((6,), 0, 'int32')
+    _test_unstack((2,6), 1, 'float64')
+    # negative axis
+    _test_unstack((1,4), -1, 'int32')
+    _test_unstack((3,6,4), -2, 'float32')
+
+
 #######################################################################
 # Multi Input to graph
 # --------------------
@@ -640,28 +596,40 @@ def test_forward_multi_input():
 
         out1 = tf.add(in1, in2, name='out1')
         out2 = tf.subtract(in3, in4, name='out2')
-
         out = tf.multiply(out1, out2, name='out')
+        in_data = np.arange(9, dtype='int32').reshape([3, 3])
 
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                ['out'],
-                )
+        compare_tf_with_tvm([in_data, in_data, in_data, in_data],
+                            ['in1:0', 'in2:0', 'in3:0', 'in4:0'], 'out:0')
 
-            in_data = np.arange(9, dtype='int32').reshape([3, 3])
-
-            tf_output = run_tf_graph(sess, [in_data, in_data, in_data, in_data ],
-                                     ['in1:0', 'in2:0', 'in3:0', 'in4:0'], 'out:0')
-            tvm_output = run_tvm_graph(graph_def,
-                                       [in_data, in_data, in_data, in_data ],
-                                       ['in1', 'in2', 'in3', 'in4'],
-                                       tf_output.shape, tf_output.dtype)
+#######################################################################
+# Multi Output to Graph
+# ---------------------
 
-            np.testing.assert_allclose(tf_output, tvm_output)
+def test_forward_multi_output():
+    with tf.Graph().as_default():
+        in1 = tf.placeholder(tf.int32, shape=[3, 3], name='in1')
+        in2 = tf.placeholder(tf.int32, shape=[3, 3], name='in2')
+        in3 = tf.placeholder(tf.int32, shape=[3, 3], name='in3')
+        in4 = tf.placeholder(tf.int32, shape=[3, 3], name='in4')
 
-            sess.close()
+        out1 = tf.add(in1, in2, name='out1')
+        out2 = tf.subtract(in3, in4, name='out2')
+        in_data = np.arange(9, dtype='int32').reshape([3, 3])
+        in_data = [in_data] * 4
+        in_name = ['in1:0', 'in2:0', 'in3:0', 'in4:0']
+        out_name = ['out1:0', 'out2:0']
+        out_node = [out.strip(':0') for out in out_name]
+        in_node = [inp.strip(':0') for inp in in_name]
+        
+        with tf.Session() as sess:
+            final_graph_def = tf.graph_util.convert_variables_to_constants(
+                sess, sess.graph.as_graph_def(add_shapes=True), out_node,)
+            tf_output = run_tf_graph(sess, in_data, in_name, out_name)
+            tvm_output = run_tvm_graph(final_graph_def, in_data, in_node, target='llvm',
+                                       out_names=out_node, num_output=2)
+            for i in range(len(tf_output)):
+                tvm.testing.assert_allclose(tf_output[i], tvm_output[i], atol=1e-5, rtol=1e-5)
 
 #######################################################################
 # Resize Bilinear
@@ -674,36 +642,97 @@ def _test_resize_bilinear(in_shape, to_shape, align_corners):
     shape_data = np.array(to_shape).astype('int32')
 
     with tf.Graph().as_default():
-        in_data = constant_op.constant(data, shape=data.shape, dtype=data.dtype)
+        in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype)
         shape_data = constant_op.constant(shape_data, shape=shape_data.shape, dtype=shape_data.dtype)
+        tf.image.resize_bilinear(in_data, shape_data, align_corners=align_corners)
+
+        compare_tf_with_tvm(data, 'Placeholder:0', 'ResizeBilinear:0')
 
-        # pylint: disable=unused-variable
-        resize_out = tf.image.resize_bilinear(in_data, shape_data, align_corners=align_corners)
-        # pylint: enable=unused-variable
+def test_forward_resize_bilinear():
+    """ Resize Bilinear """
+
+    _test_resize_bilinear((4, 16, 32, 32), [50, 50], False)
+    _test_resize_bilinear((6, 32, 64, 64), [20, 20], True)
 
+
+#######################################################################
+# LSTM
+# ----
+
+def _test_lstm_cell(batch_size, num_hidden, num_layers, forget_bias, dtype):
+    """ One iteration of a LSTM cell """
+
+    tf.reset_default_graph()
+    input_size = num_hidden
+    input_data = np.full((batch_size, input_size), 1., dtype=dtype)
+    in_state_c = np.full((num_layers, batch_size, num_hidden), 0.1, dtype=dtype)
+    in_state_h = np.full((num_layers, batch_size, num_hidden), 0.1, dtype=dtype)
+
+    def _get_tensorflow_output():
         with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
+            with variable_scope.variable_scope(
+                "root", initializer=init_ops.constant_initializer(0.5)):
+                m0 = array_ops.zeros([batch_size, num_hidden])
+                m1 = array_ops.zeros([batch_size, num_hidden])
+                x=tf.placeholder(shape=(batch_size, input_size), dtype=dtype)
+                g, ((out_m0, out_m1)) = \
+                     tf.contrib.rnn.LSTMBlockCell(num_hidden,
+                                                  forget_bias=forget_bias)(x, ((m0, m1)))
+                sess.run([variables.global_variables_initializer()])
+                res = sess.run([g, out_m0, out_m1], {
+                    x.name: np.array([[1., 1.]]),
+                    m0.name: 0.1 * np.ones([batch_size, num_hidden]),
+                    m1.name: 0.1 * np.ones([batch_size, num_hidden]),
+                })
+            graph_def = sess.graph.as_graph_def(add_shapes=True)
+            final_graph_def = graph_util.convert_variables_to_constants(
                 sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                ['ResizeBilinear'],
-                )
+                graph_def,
+                ['root/lstm_cell/LSTMBlockCell'])
+            return final_graph_def, res
+
+    graph_def, tf_out = _get_tensorflow_output()
+    tvm_output = run_tvm_graph(graph_def, [input_data, in_state_c, in_state_h],
+                               ['root/Placeholder', 'root/lstm_cell/LSTMBlockCell_c',
+                                'root/lstm_cell/LSTMBlockCell_h'], num_output=2)
+    assert isinstance(tvm_output, list)
 
-            tf_output = run_tf_graph(sess, data,
-                    'Const:0', 'ResizeBilinear:0')
+    out = tvm_output[0]
+    out_state = tvm_output[1]
+    out_state_tup = np.split(out_state, indices_or_sections=2, axis=1)
+    out_state_c = np.reshape(out_state_tup[0], (batch_size, num_hidden))
+    out_state_h = np.reshape(out_state_tup[1], (batch_size, num_hidden))
+    tvm_out = [out, out_state_c, out_state_h]
+    tvm.testing.assert_allclose(tf_out[0], tvm_out[0], rtol=1e-3, atol=1e-3)
 
-            tvm_output = run_tvm_graph(graph_def,
-                                       data,
-                                       "Const", tf_output.shape, data.dtype)
+def test_forward_lstm():
+    '''test LSTM block cell'''
+    _test_lstm_cell(1, 2, 1, 0.0, 'float32')
 
-            np.testing.assert_allclose(tf_output, tvm_output, atol=1e-3, rtol=1e-3)
 
-            sess.close()
 
-def test_forward_resize_bilinear():
-    """ Resize Bilinear """
+#######################################################################
+# Pack
+# ---
+def _test_pack(axis, shape, **kwargs):
 
-    _test_resize_bilinear((4, 16, 32, 32), [50, 50], False)
-    _test_resize_bilinear((6, 32, 64, 64), [20, 20], True)
+    a = np.arange(np.prod(shape), dtype=np.float32).reshape(shape)
+    b = np.arange(np.prod(shape), dtype=np.float32).reshape(shape)
+
+    with tf.Graph().as_default():
+        tf_a = array_ops.placeholder(shape=shape, dtype='float32', name='pl_a')
+        tf_b = array_ops.placeholder(shape=shape, dtype='float32', name='pl_b')
+        tf_c = tf.stack([tf_a,tf_b], axis=axis, **kwargs)
+        assert tf_c.op.op_def.name == 'Pack', "tf.stack() is expected to produce 'Pack' operation"
+
+        compare_tf_with_tvm([a,b], ['pl_a:0','pl_b:0'], 'stack:0')
+
+def test_forward_pack():
+    for axis in range(-3,3):
+        _test_pack(axis, [3,2,1])
+    for axis in range(-1,1):
+        _test_pack(axis, [3])
+    _test_pack(0, [])
 
 #######################################################################
 # Pad
@@ -714,30 +743,17 @@ def _test_pad(input_shape, paddings, mode, **kwargs):
     x = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape)
 
     with tf.Graph().as_default():
-        in_data = constant_op.constant(x, shape=input_shape, dtype='float32')
+        in_data = array_ops.placeholder(shape=input_shape, dtype='float32')
         pad_values = constant_op.constant(paddings)
         pad = tf.pad(in_data, paddings=pad_values, mode=mode, **kwargs)
 
         if mode == 'CONSTANT':
             if 'constant_values' in kwargs:
-                out_node = 'PadV2'
                 out_name = 'PadV2:0'
             else:
-                out_node = 'Pad'
                 out_name = 'Pad:0'
 
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                [out_node],
-                )
-
-            tf_output = run_tf_graph(sess, x, 'Const:0', out_name)
-            tvm_output = run_tvm_graph(graph_def, x.astype('float32'),
-                                       "Const", tf_output.shape, 'float32')
-            np.testing.assert_allclose(tf_output, tvm_output)
-            sess.close()
+        compare_tf_with_tvm(x, 'Placeholder:0', out_name)
 
 def test_forward_pad():
     """ Pad """
@@ -759,8 +775,8 @@ def test_forward_inception_v3():
 
         with tf.Session() as sess:
             tf_output = run_tf_graph(sess, data, 'input:0', 'InceptionV3/Predictions/Reshape_1:0')
-            tvm_output = run_tvm_graph(graph_def, data, 'input', tf_output.shape, 'float32')
-            np.testing.assert_allclose(tf_output, tvm_output, rtol=1e-5, atol=1e-5)
+            tvm_output = run_tvm_graph(graph_def, data, 'input')
+            tvm.testing.assert_allclose(tf_output[0], tvm_output[0], rtol=1e-5, atol=1e-5)
 
 #######################################################################
 # Inception V1
@@ -795,26 +811,50 @@ def test_forward_inception_v1():
 
         with tf.Session() as sess:
             tf_output = run_tf_graph(sess, data, 'DecodeJpeg/contents:0', 'softmax:0')
-            tvm_output = run_tvm_graph(graph_def, tvm_data, 'DecodeJpeg/contents', tf_output.shape, 'float32')
-            np.testing.assert_allclose(tf_output, tvm_output, rtol=1e-5, atol=1e-5)
+            tvm_output = run_tvm_graph(graph_def, tvm_data, 'DecodeJpeg/contents')
+            tvm.testing.assert_allclose(tf_output[0], tvm_output[0], rtol=1e-5, atol=1e-5)
 
 #######################################################################
 # Mobilenet
 # ---------
 def test_forward_mobilenet():
     '''test mobilenet model'''
+    # MobilenetV2
     with tf.Graph().as_default():
-        graph_def = nnvm.testing.tf.get_workload("MobilenetV1/mobilenet_v1_1.0_224_frozen-with-shapes.pb")
+        graph_def = nnvm.testing.tf.get_workload(
+            "https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_1.4_224.tgz",
+            "mobilenet_v2_1.4_224_frozen.pb")
         # Call the utility to import the graph definition into default graph.
         graph_def = nnvm.testing.tf.ProcessGraphDefParam(graph_def)
 
         data = np.random.uniform(size=(1, 224, 224, 3)).astype('float32')
-        out_node = 'MobilenetV1/Predictions/Reshape_1'
+        out_node = 'MobilenetV2/Predictions/Reshape_1'
 
         with tf.Session() as sess:
+            # Add shapes to the graph.
+            graph_def = nnvm.testing.tf.AddShapesToGraphDef(sess, out_node)
             tf_output = run_tf_graph(sess, data, 'input:0', out_node + ':0')
-            tvm_output = run_tvm_graph(graph_def, data, 'input', tf_output.shape, 'float32')
-            np.testing.assert_allclose(np.squeeze(tvm_output), np.squeeze(tf_output), rtol=1e-5, atol=1e-5)
+            tvm_output = run_tvm_graph(graph_def, data, 'input')
+            tvm.testing.assert_allclose(np.squeeze(tvm_output[0]), np.squeeze(tf_output[0]), rtol=1e-5, atol=1e-5)
+
+#######################################################################
+# ResnetV2
+# ---------
+def test_forward_resnetv2():
+    '''test resnet model'''
+    if is_gpu_available():
+        with tf.Graph().as_default():
+            graph_def = nnvm.testing.tf.get_workload("ResnetV2/resnet-20180601_resnet_v2_imagenet-shapes.pb")
+            # Call the utility to import the graph definition into default graph.
+            graph_def = nnvm.testing.tf.ProcessGraphDefParam(graph_def)
+
+            data = np.random.uniform(size=(128, 224, 224, 3)).astype('float32')
+            out_node = 'ArgMax'
+
+            with tf.Session() as sess:
+                tf_output = run_tf_graph(sess, data, 'input_tensor:0', out_node + ':0')
+                tvm_output = run_tvm_graph(graph_def, data, 'input_tensor', tf_output.shape, 'float32')
+                tvm.testing.assert_allclose(np.squeeze(tvm_output[0]), np.squeeze(tf_output[0]), rtol=1e-5, atol=1e-5)
 
 #######################################################################
 # PTB
@@ -880,6 +920,7 @@ def _get_sample(data, state):
             state_output = model.get_output(1, tvm.nd.empty(out_state_shape,
                                                         "float32")).asnumpy()
             sample = nnvm.testing.tf.pick_from_weight(tvm_output[0])
+
             return sample, state_output
 
         for x in data:
@@ -922,7 +963,7 @@ def _get_sample(data, state):
                                 in_state, cnt_sample)
         tf_sample_str = _pretty_print(tf_samples, False, id_to_word)
         inpt = tvm_sample_str
-        np.testing.assert_allclose(tf_samples, tvm_samples, rtol=1e-5, atol=1e-5)
+        tvm.testing.assert_allclose(tf_samples, tvm_samples, rtol=1e-5, atol=1e-5)
         assert(tvm_sample_str == tf_sample_str)
 
 #######################################################################
@@ -944,17 +985,7 @@ def _test_lrn(ishape, size, axis, bias, alpha, beta):
                                             alpha=alpha,
                                             beta=beta)
 
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                ['lrn'],)
-            tf_output = run_tf_graph(sess, inp_array, 'lrn0_data:0', 'lrn:0')
-            tvm_output = run_tvm_graph(graph_def,
-                                       inp_array,
-                                       "lrn0_data", tf_output.shape, tf_output.dtype)
-            np.testing.assert_allclose(tf_output, tvm_output, atol=1e-3, rtol=1e-3)
-            sess.close()
+        compare_tf_with_tvm(inp_array, 'lrn0_data:0', 'lrn:0')
 
 def test_forward_lrn():
     _test_lrn((1, 3, 20, 20), 3, 1, 1.0, 1.0, 0.5)
@@ -962,59 +993,197 @@ def test_forward_lrn():
 #######################################################################
 # l2_normalize
 # ------------
+
 def _test_l2_normalize(ishape, eps, axis):
     """ testing l2 normalize (uses max, sum, square, sqrt frontend operators)"""
 
     inp_array = np.random.uniform(size=ishape).astype(np.float32)
-    inp_array.fill(1)
 
     with tf.Graph().as_default():
-        in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype, name="Placeholder")
+        in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype)
         nn.l2_normalize(in1,
                         axis=axis,
                         epsilon=eps,
                         name=None,
                         dim=None)
 
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                ['l2_normalize'],
-                )
-            tf_output = run_tf_graph(sess, inp_array, 'Placeholder:0', 'Placeholder:0')
-            tvm_output = run_tvm_graph(graph_def,
-                                       inp_array,
-                                       "Placeholder",
-                                       tf_output.shape,
-                                       tf_output.dtype)
-
-            np.testing.assert_allclose(tf_output, tvm_output, atol=1e-3, rtol=1e-3)
-            sess.close()
+        compare_tf_with_tvm(inp_array, 'Placeholder:0', 'l2_normalize:0')
+
 def test_forward_l2_normalize():
     _test_l2_normalize((1, 3, 20, 20), 0.001, (0,))
 
+#######################################################################
+# transpose
+# ---------
+def _test_forward_transpose(ishape, axes=None):
+    input = np.random.uniform(size=ishape).astype(np.float32)
+
+    with tf.Graph().as_default():
+        in1 = tf.placeholder(shape=input.shape, dtype=input.dtype, name="transpose_data")
+
+        if axes is None:
+            tf.transpose(in1)
+        else:
+            tf.transpose(in1, perm=axes)
+
+        compare_tf_with_tvm(input, 'transpose_data:0', 'transpose:0')
+
+def test_forward_transpose():
+    _test_forward_transpose((2, 3, 4))
+    _test_forward_transpose((7, 8, 8, 10))
+    _test_forward_transpose((2, 3, 4), (1, 2, 0))
+    _test_forward_transpose((2, 3, 4), (0, 1, 2))
+    _test_forward_transpose((2, 3, 4, 5), (3, 0, 1, 2))
+
+
+def test_forward_ceil():
+    ishape = (1, 3, 10, 10)
+    inp_array = np.random.uniform(size=ishape).astype(np.float32)
+    with tf.Graph().as_default():
+        in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype)
+        tf.ceil(in1)
+        compare_tf_with_tvm(inp_array, 'Placeholder:0', 'Ceil:0')
+
+def test_forward_floor():
+    ishape = (1, 3, 10, 10)
+    inp_array = np.random.uniform(size=ishape).astype(np.float32)
+    with tf.Graph().as_default():
+        in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype)
+        tf.floor(in1)
+        compare_tf_with_tvm(inp_array, 'Placeholder:0', 'Floor:0')
+
+def test_forward_relu():
+    ishape = (1, 3, 10, 10)
+    inp_array = np.random.uniform(-5, 5, size=ishape).astype(np.float32)
+    with tf.Graph().as_default():
+        in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype)
+        tf.nn.relu(in1)
+        compare_tf_with_tvm(inp_array, 'Placeholder:0', 'Relu:0')
+
+def test_forward_leaky_relu():
+    ishape = (1, 3, 10, 10)
+    inp_array = np.random.uniform(-5, 5, size=ishape).astype(np.float32)
+    with tf.Graph().as_default():
+        in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype)
+        tf.nn.leaky_relu(in1, alpha=0.4)
+        compare_tf_with_tvm(inp_array, 'Placeholder:0', 'LeakyRelu/mul:0')
+
+def test_forward_elu():
+    ishape = (1, 3, 10, 10)
+    inp_array = np.random.uniform(-5, 5, size=ishape).astype(np.float32)
+    with tf.Graph().as_default():
+        in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype)
+        tf.nn.elu(in1)
+        compare_tf_with_tvm(inp_array, 'Placeholder:0', 'Elu:0')
+
+def test_forward_selu():
+    ishape = (1, 3, 10, 10)
+    inp_array = np.random.uniform(-5, 5, size=ishape).astype(np.float32)
+    with tf.Graph().as_default():
+        in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype)
+        tf.nn.selu(in1)
+        compare_tf_with_tvm(inp_array, 'Placeholder:0', 'Selu:0')
+
+def test_forward_tanh():
+    ishape = (1, 3, 10, 10)
+    inp_array = np.random.uniform(-5, 5, size=ishape).astype(np.float32)
+    with tf.Graph().as_default():
+        in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype)
+        tf.nn.tanh(in1)
+        compare_tf_with_tvm(inp_array, 'Placeholder:0', 'Tanh:0')
+
+#######################################################################
+# Mean
+# ----
+def test_forward_mean():
+    def check_mean(ishape, **kwargs):
+        inp_array = np.random.uniform(size=ishape).astype(np.float32)
+        with tf.Graph().as_default():
+            in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype)
+            tf.keras.backend.mean(in1, **kwargs)
+            compare_tf_with_tvm(inp_array, 'Placeholder:0', 'Mean:0', no_gpu=True)
+
+    check_mean((10, 8, 16, 32))
+    check_mean((10, 8, 16, 32), axis=(2,3))
+    check_mean((10, 8, 16, 32), axis=(1,2), keepdims=True)
+
+#######################################################################
+# Relational operators
+# --------------------
+def _test_forward_rel_op(data, func):
+    with tf.Graph().as_default():
+        in1 = tf.placeholder(shape=data[0].shape, dtype=data[0].dtype, name='in1')
+        in2 = tf.placeholder(shape=data[1].shape, dtype=data[1].dtype, name='in2')
+        op = func(in1, in2, name='op')
+        out = tf.cast(op, tf.int32, name='out1')
+        compare_tf_with_tvm([data[0], data[1]], ['in1:0', 'in2:0'], 'out1:0')
+
+def test_forward_rel_ops():
+    t1 = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+    t2 = np.array([[9, 8, 7], [6, 5, 4], [3, 2, 1]])
+    _test_forward_rel_op([t1, t2], math_ops.less)
+    _test_forward_rel_op([t1, t2], math_ops.greater)
+    _test_forward_rel_op([t1, t2], math_ops.less_equal)
+    _test_forward_rel_op([t1, t2], math_ops.greater_equal)
+    _test_forward_rel_op([t1, t2], math_ops.equal)
+    _test_forward_rel_op([t1, t2], math_ops.not_equal)
+
+
 #######################################################################
 # Main
 # ----
 if __name__ == '__main__':
-    test_forward_convolution()
-    test_forward_pooling()
+    # Transforms
+    test_forward_transpose()
     test_forward_reshape()
     test_forward_squeeze()
+    test_forward_pack()
+    test_forward_resize_bilinear()
+    test_forward_pad()
+    test_forward_gather()
+    test_forward_stridedslice()
+    test_forward_split()
+    test_forward_unstack()
+
+    # Activations
     test_forward_sigmoid()
+    test_forward_relu()
+    test_forward_leaky_relu()
+    test_forward_elu()
+    test_forward_selu()
+    test_forward_tanh()
+
+    # Reductions
+    test_forward_argminmax()
+    test_forward_reduce()
+    test_forward_mean()
+
+    # NN
+    test_forward_convolution()
+    test_forward_pooling()
     if tf.__version__ == '1.4.1':
         _test_forward_concat_v2()
+    test_forward_lrn()
+    test_forward_l2_normalize()
+
+    # General
     test_forward_multi_input()
+    test_forward_multi_output()
+    test_forward_variable()
+
+    # End to End
     test_forward_inception_v3()
     test_forward_inception_v1()
     test_forward_mobilenet()
-    test_forward_variable()
-    test_forward_resize_bilinear()
-    test_forward_pad()    
-    test_forward_lstm()
-    test_forward_stridedslice()
-    test_forward_gather()
+    test_forward_resnetv2()
     test_forward_ptb()
-    test_forward_lrn()
-    test_forward_l2_normalize()
+
+    # RNN
+    test_forward_lstm()
+
+    # Elementwise
+    test_forward_ceil()
+    test_forward_floor()
+
+    # Relational ops
+    test_forward_rel_ops()
diff --git a/nnvm/tests/python/unittest/test_correct_layout.py b/nnvm/tests/python/unittest/test_correct_layout.py
index 6176586284a7..8961498a579e 100644
--- a/nnvm/tests/python/unittest/test_correct_layout.py
+++ b/nnvm/tests/python/unittest/test_correct_layout.py
@@ -77,14 +77,25 @@ def test_concatenate():
     g, ldict = correct_layout(z, {"x": "HW", "y": "HW"})
     assert(ldict["x"][0] == "HW")
     assert(ldict["y"][0] == "HW")
-    assert(ldict["concat"][0] == "__undef__")
+    assert(ldict["concat"][0] == "HW")
     # second pass will insert layout transform
     _, ldict = correct_layout(g, {"x": "HW16w", "y": "HW16w"})
     assert(ldict["x"][0] == "HW16w")
     assert(ldict["y"][0] == "HW16w")
-    assert(ldict["x_HW"][0] == "HW")
-    assert(ldict["y_HW"][0] == "HW")
-    assert(ldict["concat"][0] == "__undef__")
+    assert(ldict["concat"][0] == "HW16w")
+
+    x1 = sym.Variable("x", shape=(10, 20, 60))
+    x2 = sym.Variable("y", shape=(10, 20, 40))
+    z = sym.concatenate(x1, x2, axis=2, name="concat")
+    g, ldict = correct_layout(z, {"x": "H20wW", "y": "H20wW"})
+    assert(ldict["x"][0] == "H20wW")
+    assert(ldict["y"][0] == "H20wW")
+    assert(ldict["concat"][0] == "H20wW")
+    # second pass will insert layout transform
+    _, ldict = correct_layout(g, {"x": "HW", "y": "HW"})
+    assert(ldict["x_H20wW"][0] == "H20wW")
+    assert(ldict["x_H20wW"][0] == "H20wW")
+    assert(ldict["concat"][0] == "H20wW")
 
 
 def test_expand_dims():
@@ -349,4 +360,4 @@ def test_reduce():
     test_transpose()
     test_broadcast_to()
     test_broadcast_binary()
-    test_reduce()
\ No newline at end of file
+    test_reduce()
diff --git a/nnvm/tests/python/unittest/test_infer_shape.py b/nnvm/tests/python/unittest/test_infer_shape.py
index 51e0e9576781..bbd92cea7b5f 100644
--- a/nnvm/tests/python/unittest/test_infer_shape.py
+++ b/nnvm/tests/python/unittest/test_infer_shape.py
@@ -84,6 +84,10 @@ def test_split():
     sdict = infer_shape(z)
     assert(sdict["y"][0] == [10, 10])
     assert(sdict["y"][1] == [10, 10])
+    z = sym.split(x1, indices_or_sections=[6], axis=-1, name="y")
+    sdict = infer_shape(z)
+    assert(sdict["y"][0] == [10, 6])
+    assert(sdict["y"][1] == [10, 14])
 
 
 def test_batchnorm():
@@ -352,6 +356,26 @@ def check(in_shape, out_shape, **kwargs):
     check((4, 5, 10), (1, 5, 1), axis=(0, 2), keepdims=True)
 
 
+def test_gather_nd():
+    def check(data_shape, indices_shape, out_shape):
+        x = sym.Variable("x", shape=data_shape)
+        indices = sym.Variable("indices", shape=indices_shape)
+        y = sym.gather_nd(x, indices, name="y")
+        sdict = infer_shape(y)
+        assert(tuple(sdict["y"][0]) == tuple(out_shape))
+
+    check((4,), (1, 1), (1,))
+    check((4,), (1, 3), (3,))
+    check((2, 3), (1, 1), (1, 3))
+    check((2, 3), (2, 1), (1,))
+    check((2, 3), (2, 5, 6), (5, 6))
+    check((2, 3, 4), (1, 1), (1, 3, 4))
+    check((2, 3, 4), (2, 1), (1, 4))
+    check((2, 3, 4), (2, 5), (5, 4))
+    check((2, 3, 4), (2, 5, 6), (5, 6, 4))
+    check((2, 3, 4, 5), (2, 6, 7), (6, 7, 4, 5))
+
+
 if __name__ == "__main__":
     test_conv2d_packed()
     test_expand_dims()
@@ -372,3 +396,4 @@ def check(in_shape, out_shape, **kwargs):
     test_transpose()
     test_prelu()
     test_squeeze()
+    test_gather_nd()
diff --git a/nnvm/tests/python/unittest/test_pass_saveload_json.py b/nnvm/tests/python/unittest/test_pass_saveload_json.py
new file mode 100644
index 000000000000..7b5f5ea6867a
--- /dev/null
+++ b/nnvm/tests/python/unittest/test_pass_saveload_json.py
@@ -0,0 +1,17 @@
+import nnvm
+from tvm.contrib import util
+
+
+def test_variable_node_parsed():
+    sym = nnvm.sym.Variable('data')
+    tempdir = util.tempdir()
+    json_filename = 'test_nnvm_symbol.json'
+    with open(tempdir.relpath(json_filename), 'w') as fo:
+        fo.write(nnvm.graph.create(sym).json())
+    sym_str = open(tempdir.relpath(json_filename), 'r').read()
+    sym = nnvm.graph.load_json(sym_str).symbol()
+    sym = nnvm.sym.relu(sym)
+
+
+if __name__ == '__main__':
+    test_variable_node_parsed()
diff --git a/python/.gitignore b/python/.gitignore
index c37a64c453dd..a4d2483a90e2 100644
--- a/python/.gitignore
+++ b/python/.gitignore
@@ -1,2 +1,3 @@
 build
-*.cpp
\ No newline at end of file
+dist
+*.cpp
diff --git a/python/setup.py b/python/setup.py
index cbf8c5591703..71d61a52e349 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -74,8 +74,8 @@ def config_cython():
                 "tvm._ffi.%s.%s" % (subdir, fn[:-4]),
                 ["tvm/_ffi/_cython/%s" % fn],
                 include_dirs=["../include/",
-                              "../dmlc-core/include",
-                              "../dlpack/include",
+                              "../3rdparty/dmlc-core/include",
+                              "../3rdparty/dlpack/include",
                 ],
                 library_dirs=library_dirs,
                 libraries=libraries,
diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py
index a028dfeddf36..67dd54d1db4d 100644
--- a/python/tvm/__init__.py
+++ b/python/tvm/__init__.py
@@ -13,10 +13,12 @@
 from . import schedule
 from . import module
 from . import node
+from . import attrs
 from . import ir_builder
 from . import target
 from . import generic
 from . import hybrid
+from . import testing
 
 from . import ndarray as nd
 from .ndarray import context, cpu, gpu, opencl, cl, vulkan, metal, mtl
diff --git a/python/tvm/_ffi/_ctypes/function.py b/python/tvm/_ffi/_ctypes/function.py
index 79f3c6033a1f..3c2a7a5f8c9b 100644
--- a/python/tvm/_ffi/_ctypes/function.py
+++ b/python/tvm/_ffi/_ctypes/function.py
@@ -15,8 +15,9 @@
 from .ndarray import NDArrayBase, _make_array
 from .types import TVMValue, TypeCode
 from .types import TVMPackedCFunc, TVMCFuncFinalizer
-from .types import RETURN_SWITCH, C_TO_PY_ARG_SWITCH, _wrap_arg_func
+from .types import RETURN_SWITCH, C_TO_PY_ARG_SWITCH, _wrap_arg_func, _ctx_to_int64
 from .node import NodeBase
+from . import node as _node
 
 FunctionHandle = ctypes.c_void_p
 ModuleHandle = ctypes.c_void_p
@@ -109,7 +110,7 @@ def _make_tvm_args(args, temp_args):
             values[i].v_str = c_str(str(arg))
             type_codes[i] = TypeCode.STR
         elif isinstance(arg, TVMContext):
-            values[i].v_ctx = arg
+            values[i].v_int64 = _ctx_to_int64(arg)
             type_codes[i] = TypeCode.TVM_CONTEXT
         elif isinstance(arg, bytearray):
             arr = TVMByteArray()
@@ -186,6 +187,23 @@ def __call__(self, *args):
         _ = args
         return RETURN_SWITCH[ret_tcode.value](ret_val)
 
+
+def __init_handle_by_constructor__(fconstructor, args):
+    """Initialize handle by constructor"""
+    temp_args = []
+    values, tcodes, num_args = _make_tvm_args(args, temp_args)
+    ret_val = TVMValue()
+    ret_tcode = ctypes.c_int()
+    check_call(_LIB.TVMFuncCall(
+        fconstructor.handle, values, tcodes, ctypes.c_int(num_args),
+        ctypes.byref(ret_val), ctypes.byref(ret_tcode)))
+    _ = temp_args
+    _ = args
+    assert ret_tcode.value == TypeCode.NODE_HANDLE
+    handle = ret_val.v_handle
+    return handle
+
+
 def _return_module(x):
     """Return function"""
     handle = x.v_handle
@@ -202,6 +220,7 @@ def _handle_return_func(x):
 
 
 # setup return handle for function type
+_node.__init_by_constructor__ = __init_handle_by_constructor__
 RETURN_SWITCH[TypeCode.FUNC_HANDLE] = _handle_return_func
 RETURN_SWITCH[TypeCode.MODULE_HANDLE] = _return_module
 RETURN_SWITCH[TypeCode.NDARRAY_CONTAINER] = lambda x: _make_array(x.v_handle, False)
diff --git a/python/tvm/_ffi/_ctypes/ndarray.py b/python/tvm/_ffi/_ctypes/ndarray.py
index df877679fc7d..8b88e7dc98ea 100644
--- a/python/tvm/_ffi/_ctypes/ndarray.py
+++ b/python/tvm/_ffi/_ctypes/ndarray.py
@@ -1,11 +1,47 @@
+# pylint: disable=invalid-name
 """Runtime NDArray api"""
 from __future__ import absolute_import
 
 import ctypes
-from ..base import _LIB, check_call
+from ..base import _LIB, check_call, c_str
 from ..runtime_ctypes import TVMArrayHandle
 from .types import RETURN_SWITCH, C_TO_PY_ARG_SWITCH, _wrap_arg_func, _return_handle
 
+
+TVMPyCapsuleDestructor = ctypes.CFUNCTYPE(None, ctypes.c_void_p)
+_c_str_dltensor = c_str('dltensor')
+_c_str_used_dltensor = c_str('used_dltensor')
+
+
+# used for PyCapsule manipulation
+if hasattr(ctypes, 'pythonapi'):
+    ctypes.pythonapi.PyCapsule_GetName.restype = ctypes.c_char_p
+    ctypes.pythonapi.PyCapsule_GetPointer.restype = ctypes.c_void_p
+    ctypes.pythonapi.PyCapsule_New.restype = ctypes.py_object
+
+
+def _from_dlpack(dltensor):
+    dltensor = ctypes.py_object(dltensor)
+    if ctypes.pythonapi.PyCapsule_IsValid(dltensor, _c_str_dltensor):
+        ptr = ctypes.pythonapi.PyCapsule_GetPointer(dltensor, _c_str_dltensor)
+        handle = TVMArrayHandle()
+        check_call(_LIB.TVMArrayFromDLPack(ptr, ctypes.byref(handle)))
+        ctypes.pythonapi.PyCapsule_SetName(dltensor, _c_str_used_dltensor)
+        ctypes.pythonapi.PyCapsule_SetDestructor(dltensor, TVMPyCapsuleDestructor(0))
+        return _make_array(handle, False)
+    raise ValueError("Expect a dltensor field, PyCapsule can only be consumed once")
+
+
+def _dlpack_deleter(pycapsule):
+    pycapsule = ctypes.cast(pycapsule, ctypes.py_object)
+    if ctypes.pythonapi.PyCapsule_IsValid(pycapsule, _c_str_dltensor):
+        ptr = ctypes.pythonapi.PyCapsule_GetPointer(pycapsule, _c_str_dltensor)
+        _LIB.TVMDLManagedTensorCallDeleter(ptr)
+        ctypes.pythonapi.PyCapsule_SetDestructor(dltensor, TVMPyCapsuleDestructor(0))
+
+_c_dlpack_deleter = TVMPyCapsuleDestructor(_dlpack_deleter)
+
+
 class NDArrayBase(object):
     """A simple Device/CPU Array object in runtime."""
     __slots__ = ["handle", "is_view"]
@@ -29,6 +65,17 @@ def __del__(self):
     def _tvm_handle(self):
         return ctypes.cast(self.handle, ctypes.c_void_p).value
 
+    def to_dlpack(self):
+        """Produce an array from a DLPack Tensor without copying memory
+
+        Returns
+        -------
+        dlpack : DLPack tensor view of the array data
+        """
+        handle = ctypes.c_void_p()
+        check_call(_LIB.TVMArrayToDLPack(self.handle, ctypes.byref(handle)))
+        return ctypes.pythonapi.PyCapsule_New(handle, _c_str_dltensor, _c_dlpack_deleter)
+
 
 def _make_array(handle, is_view):
     handle = ctypes.cast(handle, TVMArrayHandle)
diff --git a/python/tvm/_ffi/_ctypes/node.py b/python/tvm/_ffi/_ctypes/node.py
index 01244519532b..ccfaa6dd77a2 100644
--- a/python/tvm/_ffi/_ctypes/node.py
+++ b/python/tvm/_ffi/_ctypes/node.py
@@ -1,5 +1,5 @@
 # pylint: disable=invalid-name, protected-access
-# pylint: disable=no-member, missing-docstring
+# pylint: disable=no-member, missing-docstring, not-callable
 from __future__ import absolute_import
 
 import ctypes
@@ -9,6 +9,7 @@
 from .types import RETURN_SWITCH, C_TO_PY_ARG_SWITCH, _wrap_arg_func
 
 NodeHandle = ctypes.c_void_p
+__init_by_constructor__ = None
 
 """Maps node type to its constructor"""
 NODE_TYPE = {}
@@ -24,7 +25,13 @@ def _return_node(x):
         handle = NodeHandle(handle)
     tindex = ctypes.c_int()
     check_call(_LIB.TVMNodeGetTypeIndex(handle, ctypes.byref(tindex)))
-    return NODE_TYPE.get(tindex.value, NodeBase)(handle)
+    cls = NODE_TYPE.get(tindex.value, NodeBase)
+    # Avoid calling __init__ of cls, instead directly call __new__
+    # This allows child class to implement their own __init__
+    node = cls.__new__(cls)
+    node.handle = handle
+    return node
+
 
 RETURN_SWITCH[TypeCode.NODE_HANDLE] = _return_node
 C_TO_PY_ARG_SWITCH[TypeCode.NODE_HANDLE] = _wrap_arg_func(
@@ -34,16 +41,6 @@ def _return_node(x):
 class NodeBase(object):
     __slots__ = ["handle"]
     # pylint: disable=no-member
-    def __init__(self, handle):
-        """Initialize the function with handle
-
-        Parameters
-        ----------
-        handle : SymbolHandle
-            the handle to the underlying C++ Symbol
-        """
-        self.handle = handle
-
     def __del__(self):
         if _LIB is not None:
             check_call(_LIB.TVMNodeFree(self.handle))
@@ -62,4 +59,28 @@ def __getattr__(self, name):
                 "'%s' object has no attribute '%s'" % (str(type(self)), name))
         return RETURN_SWITCH[ret_type_code.value](ret_val)
 
+    def __init_handle_by_constructor__(self, fconstructor, *args):
+        """Initialize the handle by calling constructor function.
+
+        Parameters
+        ----------
+        fconstructor : Function
+            Constructor function.
+
+        args: list of objects
+            The arguments to the constructor
+
+        Note
+        ----
+        We have a special calling convention to call constructor functions.
+        So the return handle is directly set into the Node object
+        instead of creating a new Node.
+        """
+        # assign handle first to avoid error raising
+        self.handle = None
+        handle = __init_by_constructor__(fconstructor, args)
+        if not isinstance(handle, NodeHandle):
+            handle = NodeHandle(handle)
+        self.handle = handle
+
 _set_class_node_base(NodeBase)
diff --git a/python/tvm/_ffi/_ctypes/types.py b/python/tvm/_ffi/_ctypes/types.py
index 08337b08b521..b3fcad9cfefb 100644
--- a/python/tvm/_ffi/_ctypes/types.py
+++ b/python/tvm/_ffi/_ctypes/types.py
@@ -3,8 +3,9 @@
 from __future__ import absolute_import as _abs
 
 import ctypes
+import struct
 from ..base import py_str, check_call, _LIB
-from ..runtime_ctypes import TVMByteArray, TypeCode
+from ..runtime_ctypes import TVMByteArray, TypeCode, TVMContext
 
 class TVMValue(ctypes.Union):
     """TVMValue in C API"""
@@ -36,7 +37,7 @@ def _return_handle(x):
     return handle
 
 def _return_bytes(x):
-    """return handle"""
+    """return bytes"""
     handle = x.v_handle
     if not isinstance(handle, ctypes.c_void_p):
         handle = ctypes.c_void_p(handle)
@@ -48,6 +49,15 @@ def _return_bytes(x):
         raise RuntimeError('memmove failed')
     return res
 
+def _return_context(value):
+    """return TVMContext"""
+    # use bit unpacking from int64 view
+    # We use this to get around ctypes issue on Union of Structure
+    data = struct.pack("=q", value.v_int64)
+    arr = struct.unpack("=ii", data)
+    return TVMContext(arr[0], arr[1])
+
+
 def _wrap_arg_func(return_f, type_code):
     tcode = ctypes.c_int(type_code)
     def _wrap_func(x):
@@ -55,13 +65,20 @@ def _wrap_func(x):
         return return_f(x)
     return _wrap_func
 
+def _ctx_to_int64(ctx):
+    """Pack context into int64 in native endian"""
+    data = struct.pack("=ii", ctx.device_type, ctx.device_id)
+    return struct.unpack("=q", data)[0]
+
+
 RETURN_SWITCH = {
     TypeCode.INT: lambda x: x.v_int64,
     TypeCode.FLOAT: lambda x: x.v_float64,
     TypeCode.HANDLE: _return_handle,
     TypeCode.NULL: lambda x: None,
     TypeCode.STR: lambda x: py_str(x.v_str),
-    TypeCode.BYTES: _return_bytes
+    TypeCode.BYTES: _return_bytes,
+    TypeCode.TVM_CONTEXT: _return_context
 }
 
 C_TO_PY_ARG_SWITCH = {
@@ -70,5 +87,6 @@ def _wrap_func(x):
     TypeCode.HANDLE: _return_handle,
     TypeCode.NULL: lambda x: None,
     TypeCode.STR: lambda x: py_str(x.v_str),
-    TypeCode.BYTES: _return_bytes
+    TypeCode.BYTES: _return_bytes,
+    TypeCode.TVM_CONTEXT: _return_context
 }
diff --git a/python/tvm/_ffi/_cython/base.pxi b/python/tvm/_ffi/_cython/base.pxi
index 50a99245f793..ac5532835c47 100644
--- a/python/tvm/_ffi/_cython/base.pxi
+++ b/python/tvm/_ffi/_cython/base.pxi
@@ -1,6 +1,7 @@
 from ..base import TVMError
 from libcpp.vector cimport vector
 from cpython.version cimport PY_MAJOR_VERSION
+from cpython cimport pycapsule
 from libc.stdint cimport int64_t, uint64_t, uint8_t, uint16_t
 import ctypes
 
@@ -40,6 +41,11 @@ cdef extern from "tvm/runtime/c_runtime_api.h":
         int64_t* strides
         uint64_t byte_offset
 
+    ctypedef struct DLManagedTensor:
+        DLTensor dl_tensor
+        void* manager_ctx
+        void (*deleter)(DLManagedTensor* self)
+
     ctypedef struct TVMValue:
         int64_t v_int64
         double v_float64
@@ -49,7 +55,7 @@ cdef extern from "tvm/runtime/c_runtime_api.h":
         DLContext v_ctx
 
 ctypedef int64_t tvm_index_t
-ctypedef void* DLTensorHandle
+ctypedef DLTensor* DLTensorHandle
 ctypedef void* TVMStreamHandle
 ctypedef void* TVMRetValueHandle
 ctypedef void* TVMFunctionHandle
@@ -92,11 +98,16 @@ cdef extern from "tvm/runtime/c_runtime_api.h":
     int TVMArrayCopyFromTo(DLTensorHandle src,
                            DLTensorHandle to,
                            TVMStreamHandle stream)
+    int TVMArrayFromDLPack(DLManagedTensor* arr_from,
+                           DLTensorHandle* out)
+    int TVMArrayToDLPack(DLTensorHandle arr_from,
+                         DLManagedTensor** out)
+    void TVMDLManagedTensorCallDeleter(DLManagedTensor* dltensor)
 
 cdef extern from "tvm/c_dsl_api.h":
     int TVMNodeFree(NodeHandle handle)
-    TVMNodeTypeKey2Index(const char* type_key,
-                         int* out_index)
+    int TVMNodeTypeKey2Index(const char* type_key,
+                             int* out_index)
     int TVMNodeGetTypeIndex(NodeHandle handle,
                             int* out_index)
     int TVMNodeGetAttr(NodeHandle handle,
diff --git a/python/tvm/_ffi/_cython/function.pxi b/python/tvm/_ffi/_cython/function.pxi
index 989f5b8e7b47..dcbf4c665e66 100644
--- a/python/tvm/_ffi/_cython/function.pxi
+++ b/python/tvm/_ffi/_cython/function.pxi
@@ -196,37 +196,54 @@ cdef inline object make_ret(TVMValue value, int tcode):
     raise ValueError("Unhandled type code %d" % tcode)
 
 
-cdef inline object FuncCall3(void* chandle, tuple args, int nargs):
+cdef inline int FuncCall3(void* chandle,
+                          tuple args,
+                          int nargs,
+                          TVMValue* ret_val,
+                          int* ret_tcode) except -1:
     cdef TVMValue[3] values
     cdef int[3] tcodes
-    cdef TVMValue ret_val
-    cdef int ret_code
     nargs = len(args)
     temp_args = []
     for i in range(nargs):
         make_arg(args[i], &values[i], &tcodes[i], temp_args)
     CALL(TVMFuncCall(chandle, &values[0], &tcodes[0],
-                     nargs, &ret_val, &ret_code))
-    return make_ret(ret_val, ret_code)
+                     nargs, ret_val, ret_tcode))
+    return 0
 
-cdef inline object FuncCall(void* chandle, tuple args):
+cdef inline int FuncCall(void* chandle,
+                         tuple args,
+                         TVMValue* ret_val,
+                         int* ret_tcode) except -1:
     cdef int nargs
     nargs = len(args)
     if nargs <= 3:
-        return FuncCall3(chandle, args, nargs)
+        FuncCall3(chandle, args, nargs, ret_val, ret_tcode)
+        return 0
 
     cdef vector[TVMValue] values
     cdef vector[int] tcodes
-    cdef TVMValue ret_val
-    cdef int ret_code
     values.resize(max(nargs, 1))
     tcodes.resize(max(nargs, 1))
     temp_args = []
     for i in range(nargs):
         make_arg(args[i], &values[i], &tcodes[i], temp_args)
     CALL(TVMFuncCall(chandle, &values[0], &tcodes[0],
-                     nargs, &ret_val, &ret_code))
-    return make_ret(ret_val, ret_code)
+                     nargs, ret_val, ret_tcode))
+    return 0
+
+
+cdef inline int ConstructorCall(void* constructor_handle,
+                                int type_code,
+                                tuple args,
+                                void** handle) except -1:
+    """Call contructor of a handle function"""
+    cdef TVMValue ret_val
+    cdef int ret_tcode
+    FuncCall(constructor_handle, args, &ret_val, &ret_tcode)
+    assert ret_tcode == type_code
+    handle[0] = ret_val.v_handle
+    return 0
 
 
 cdef class FunctionBase:
@@ -264,7 +281,10 @@ cdef class FunctionBase:
             CALL(TVMFuncFree(self.chandle))
 
     def __call__(self, *args):
-        return FuncCall(self.chandle, args)
+        cdef TVMValue ret_val
+        cdef int ret_tcode
+        FuncCall(self.chandle, args, &ret_val, &ret_tcode)
+        return make_ret(ret_val, ret_tcode)
 
 _CLASS_FUNCTION = None
 _CLASS_MODULE = None
diff --git a/python/tvm/_ffi/_cython/ndarray.pxi b/python/tvm/_ffi/_cython/ndarray.pxi
index 44b0a544609d..0a507affec1c 100644
--- a/python/tvm/_ffi/_cython/ndarray.pxi
+++ b/python/tvm/_ffi/_cython/ndarray.pxi
@@ -1,5 +1,29 @@
 from ..runtime_ctypes import TVMArrayHandle
 
+cdef const char* _c_str_dltensor = "dltensor"
+cdef const char* _c_str_used_dltensor = "used_dltensor"
+
+
+cdef void _c_dlpack_deleter(object pycaps):
+    cdef DLManagedTensor* dltensor
+    if pycapsule.PyCapsule_IsValid(pycaps, _c_str_dltensor):
+        dltensor = <DLManagedTensor*>pycapsule.PyCapsule_GetPointer(pycaps, _c_str_dltensor)
+        TVMDLManagedTensorCallDeleter(dltensor)
+
+
+def _from_dlpack(object dltensor):
+    cdef DLManagedTensor* ptr
+    cdef DLTensorHandle chandle
+    if pycapsule.PyCapsule_IsValid(dltensor, _c_str_dltensor):
+        ptr = <DLManagedTensor*>pycapsule.PyCapsule_GetPointer(dltensor, _c_str_dltensor)
+        CALL(TVMArrayFromDLPack(ptr, &chandle))
+        # set name and destructor to be empty
+        pycapsule.PyCapsule_SetDestructor(dltensor, NULL)
+        pycapsule.PyCapsule_SetName(dltensor, _c_str_used_dltensor)
+        return c_make_array(chandle, 0)
+    raise ValueError("Expect a dltensor field, pycapsule.PyCapsule can only be consumed once")
+
+
 cdef class NDArrayBase:
     cdef DLTensor* chandle
     cdef int c_is_view
@@ -35,12 +59,26 @@ cdef class NDArrayBase:
         if self.c_is_view == 0:
             CALL(TVMArrayFree(self.chandle))
 
+    def to_dlpack(self):
+        """Produce an array from a DLPack Tensor without copying memory
+
+        Returns
+        -------
+        dlpack : DLPack tensor view of the array data
+        """
+        cdef DLManagedTensor* dltensor
+        if self.c_is_view != 0:
+            raise ValueError("to_dlpack do not work with memory views")
+        CALL(TVMArrayToDLPack(self.chandle, &dltensor))
+        return pycapsule.PyCapsule_New(dltensor, _c_str_dltensor, _c_dlpack_deleter)
+
 
 cdef c_make_array(void* chandle, is_view):
     ret = _CLASS_NDARRAY(None, is_view)
     (<NDArrayBase>ret).chandle = <DLTensor*>chandle
     return ret
 
+
 cdef _TVM_COMPATS = ()
 
 cdef _TVM_EXT_RET = {}
diff --git a/python/tvm/_ffi/_cython/node.pxi b/python/tvm/_ffi/_cython/node.pxi
index a563af5237f9..73ead2b4b447 100644
--- a/python/tvm/_ffi/_cython/node.pxi
+++ b/python/tvm/_ffi/_cython/node.pxi
@@ -1,3 +1,4 @@
+from ... import _api_internal
 from ..base import string_types
 from ..node_generic import _set_class_node_base
 
@@ -10,6 +11,7 @@ def _register_node(int index, object cls):
         NODE_TYPE.append(None)
     NODE_TYPE[index] = cls
 
+
 cdef inline object make_ret_node(void* chandle):
     global NODE_TYPE
     cdef int tindex
@@ -20,14 +22,15 @@ cdef inline object make_ret_node(void* chandle):
     if tindex < len(node_type):
         cls = node_type[tindex]
         if cls is not None:
-            obj = cls(None)
+            obj = cls.__new__(cls)
         else:
-            obj = NodeBase(None)
+            obj = NodeBase.__new__(NodeBase)
     else:
-        obj = NodeBase(None)
+        obj = NodeBase.__new__(NodeBase)
     (<NodeBase>obj).chandle = chandle
     return obj
 
+
 cdef class NodeBase:
     cdef void* chandle
 
@@ -49,9 +52,6 @@ cdef class NodeBase:
         def __set__(self, value):
             self._set_handle(value)
 
-    def __init__(self, handle):
-        self._set_handle(handle)
-
     def __dealloc__(self):
         CALL(TVMNodeFree(self.chandle))
 
@@ -65,4 +65,29 @@ cdef class NodeBase:
                 "'%s' object has no attribute '%s'" % (type(self), name))
         return make_ret(ret_val, ret_type_code)
 
+    def __init_handle_by_constructor__(self, fconstructor, *args):
+        """Initialize the handle by calling constructor function.
+
+        Parameters
+        ----------
+        fconstructor : Function
+            Constructor function.
+
+        args: list of objects
+            The arguments to the constructor
+
+        Note
+        ----
+        We have a special calling convention to call constructor functions.
+        So the return handle is directly set into the Node object
+        instead of creating a new Node.
+        """
+        # avoid error raised during construction.
+        self.chandle = NULL
+        cdef void* chandle
+        ConstructorCall(
+            (<FunctionBase>fconstructor).chandle,
+            kNodeHandle, args, &chandle)
+        self.chandle = chandle
+
 _set_class_node_base(NodeBase)
diff --git a/python/tvm/_ffi/base.py b/python/tvm/_ffi/base.py
index 4c1e979cb684..2579f22e44af 100644
--- a/python/tvm/_ffi/base.py
+++ b/python/tvm/_ffi/base.py
@@ -14,13 +14,19 @@
 #----------------------------
 if sys.version_info[0] == 3:
     string_types = (str,)
-    numeric_types = (float, int, np.float32, np.int32)
+    integer_types = (int, np.int32)
+    numeric_types = integer_types + (float, np.float32)
     # this function is needed for python3
     # to convert ctypes.char_p .value back to python str
-    py_str = lambda x: x.decode('utf-8')
+    if sys.platform == "win32":
+        encoding = 'cp' + str(ctypes.cdll.kernel32.GetACP())
+        py_str = lambda x: x.decode(encoding)
+    else:
+        py_str = lambda x: x.decode('utf-8')
 else:
     string_types = (basestring,)
-    numeric_types = (float, int, long, np.float32, np.int32)
+    integer_types = (int, long, np.int32)
+    numeric_types = integer_types + (float, np.float32)
     py_str = lambda x: x
 
 
diff --git a/python/tvm/_ffi/function.py b/python/tvm/_ffi/function.py
index cfda2a35f9b9..ca1812d4109a 100644
--- a/python/tvm/_ffi/function.py
+++ b/python/tvm/_ffi/function.py
@@ -262,23 +262,7 @@ def _list(name, func):
 def _get_api(f):
     flocal = f
     flocal.is_global = True
-    def my_api_func(*args):
-        """
-
-        This is a type erased API that calls into Global PackedFunc.
-        These APIs corresponds to functions registered from C++ backend
-        and can be used as developer functions.
-
-        args : list
-          The positional arguments to the function call.
-
-        Returns
-        -------
-        value : int, float, None, Node or Function
-        The result of the API function call.
-        """
-        return flocal(*args)
-    return my_api_func
+    return flocal
 
 def _init_api(namespace, target_module_name=None):
     """Initialize api for a given module name
diff --git a/python/tvm/_ffi/libinfo.py b/python/tvm/_ffi/libinfo.py
index 390849f8536d..6ad2e06939b1 100644
--- a/python/tvm/_ffi/libinfo.py
+++ b/python/tvm/_ffi/libinfo.py
@@ -25,7 +25,7 @@ def find_lib_path(name=None, search_path=None, optional=False):
     # inplace) or the install directory (if TVM is installed).
     # An installed TVM's curr_path will look something like:
     #   $PREFIX/lib/python3.6/site-packages/tvm/_ffi
-    ffi_dir = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+    ffi_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__)))
     source_dir = os.path.join(ffi_dir, "..", "..", "..")
     install_lib_dir = os.path.join(ffi_dir, "..", "..", "..", "..")
 
@@ -49,7 +49,7 @@ def find_lib_path(name=None, search_path=None, optional=False):
 
     dll_path.append(install_lib_dir)
 
-    dll_path = [os.path.abspath(x) for x in dll_path]
+    dll_path = [os.path.realpath(x) for x in dll_path]
     if search_path is not None:
         if search_path is list:
             dll_path = dll_path + search_path
@@ -99,6 +99,68 @@ def find_lib_path(name=None, search_path=None, optional=False):
     return lib_found
 
 
+def find_include_path(name=None, search_path=None, optional=False):
+    """Find header files for C compilation.
+
+    Parameters
+    ----------
+    name : list of str
+        List of directory names to be searched.
+
+    Returns
+    -------
+    include_path : list(string)
+        List of all found paths to header files.
+    """
+    ffi_dir = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+    source_dir = os.path.join(ffi_dir, "..", "..", "..")
+    install_include_dir = os.path.join(ffi_dir, "..", "..", "..", "..")
+    third_party_dir = os.path.join(source_dir, "3rdparty")
+
+    header_path = []
+
+    if os.environ.get('TVM_INCLUDE_PATH', None):
+        header_path.append(os.environ['TVM_INCLUDE_PATH'])
+
+    header_path.append(install_include_dir)
+    header_path.append(source_dir)
+    header_path.append(third_party_dir)
+
+    header_path = [os.path.abspath(x) for x in header_path]
+    if search_path is not None:
+        if search_path is list:
+            header_path = header_path + search_path
+        else:
+            header_path.append(search_path)
+    if name is not None:
+        if isinstance(name, list):
+            tvm_include_path = []
+            for n in name:
+                tvm_include_path += [os.path.join(p, n) for p in header_path]
+        else:
+            tvm_include_path = [os.path.join(p, name) for p in header_path]
+        dlpack_include_path = []
+    else:
+        tvm_include_path = [os.path.join(p, 'include') for p in header_path]
+        dlpack_include_path = [os.path.join(p, 'dlpack/include') for p in header_path]
+
+        # try to find include path
+        include_found = [p for p in tvm_include_path if os.path.exists(p) and os.path.isdir(p)]
+        include_found += [p for p in dlpack_include_path if os.path.exists(p) and os.path.isdir(p)]
+
+    if not include_found:
+        message = ('Cannot find the files.\n' +
+                   'List of candidates:\n' +
+                   str('\n'.join(tvm_include_path + dlpack_include_path)))
+        if not optional:
+            raise RuntimeError(message)
+        return None
+
+    return include_found
+
+
 # current version
-# We use the version of the incoming release for code that is under development
-__version__ = "0.4.0"
+# We use the version of the incoming release for code
+# that is under development.
+# The following line is set by tvm/python/update_version.py
+__version__ = "0.5.dev"
diff --git a/python/tvm/_ffi/ndarray.py b/python/tvm/_ffi/ndarray.py
index 3788c07ac440..e49c3b62f473 100644
--- a/python/tvm/_ffi/ndarray.py
+++ b/python/tvm/_ffi/ndarray.py
@@ -5,7 +5,7 @@
 import sys
 import ctypes
 import numpy as np
-from .base import _LIB, check_call, c_array, string_types, _FFI_MODE
+from .base import _LIB, check_call, c_array, string_types, _FFI_MODE, c_str
 from .runtime_ctypes import TVMType, TVMContext, TVMArray, TVMArrayHandle
 from .runtime_ctypes import TypeCode, tvm_shape_index_t
 
@@ -17,14 +17,14 @@
     if _FFI_MODE == "ctypes":
         raise ImportError()
     if sys.version_info >= (3, 0):
-        from ._cy3.core import _set_class_ndarray, _reg_extension, _make_array
+        from ._cy3.core import _set_class_ndarray, _reg_extension, _make_array, _from_dlpack
         from ._cy3.core import NDArrayBase as _NDArrayBase
     else:
-        from ._cy2.core import _set_class_ndarray, _reg_extension, _make_array
+        from ._cy2.core import _set_class_ndarray, _reg_extension, _make_array, _from_dlpack
         from ._cy2.core import NDArrayBase as _NDArrayBase
 except IMPORT_EXCEPT:
     # pylint: disable=wrong-import-position
-    from ._ctypes.ndarray import _set_class_ndarray, _reg_extension, _make_array
+    from ._ctypes.ndarray import _set_class_ndarray, _reg_extension, _make_array, _from_dlpack
     from ._ctypes.ndarray import NDArrayBase as _NDArrayBase
 
 
@@ -62,6 +62,7 @@ def context(dev_type, dev_id=0):
         dev_type = TVMContext.STR2MASK[dev_type]
     return TVMContext(dev_type, dev_id)
 
+
 def numpyasarray(np_data):
     """Return a TVMArray representation of a numpy array.
     """
@@ -112,6 +113,26 @@ def empty(shape, dtype="float32", ctx=context(1, 0)):
         ctypes.byref(handle)))
     return _make_array(handle, False)
 
+
+def from_dlpack(dltensor):
+    """Produce an array from a DLPack tensor without memory copy.
+    Retreives the underlying DLPack tensor's pointer to create an array from the
+    data. Removes the original DLPack tensor's destructor as now the array is
+    responsible for destruction.
+
+    Parameters
+    ----------
+    dltensor : DLPack tensor
+        Input DLManagedTensor, can only be consumed once.
+
+    Returns
+    -------
+    arr: tvm.nd.NDArray
+        The array view of the tensor data.
+    """
+    return _from_dlpack(dltensor)
+
+
 class NDArrayBase(_NDArrayBase):
     """A simple Device/CPU Array object in runtime."""
     @property
@@ -260,6 +281,7 @@ def copyto(self, target):
             raise ValueError("Unsupported target type %s" % str(type(target)))
         return target
 
+
 def free_extension_handle(handle, type_code):
     """Free c++ extension type handle
 
diff --git a/python/tvm/_ffi/node.py b/python/tvm/_ffi/node.py
index d9e7397ae71f..98ece19f77f2 100644
--- a/python/tvm/_ffi/node.py
+++ b/python/tvm/_ffi/node.py
@@ -21,6 +21,12 @@
     # pylint: disable=wrong-import-position
     from ._ctypes.node import _register_node, NodeBase as _NodeBase
 
+
+def _new_object(cls):
+    """Helper function for pickle"""
+    return cls.__new__(cls)
+
+
 class NodeBase(_NodeBase):
     """NodeBase is the base class of all TVM language AST object."""
     def __repr__(self):
@@ -46,7 +52,8 @@ def __ne__(self, other):
         return not self.__eq__(other)
 
     def __reduce__(self):
-        return (type(self), (None,), self.__getstate__())
+        cls = type(self)
+        return (_new_object, (cls, ), self.__getstate__())
 
     def __getstate__(self):
         handle = self.handle
diff --git a/python/tvm/_ffi/node_generic.py b/python/tvm/_ffi/node_generic.py
index b7230f29da59..e86453499faa 100644
--- a/python/tvm/_ffi/node_generic.py
+++ b/python/tvm/_ffi/node_generic.py
@@ -56,6 +56,8 @@ def convert_to_node(value):
         return _api_internal._Map(*vlist)
     elif isinstance(value, NodeGeneric):
         return value.asnode()
+    elif value is None:
+        return None
     else:
         raise ValueError("don't know how to convert type %s to node" % type(value))
 
diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py
index 4c36e82a81ec..ef5316b5e267 100644
--- a/python/tvm/_ffi/runtime_ctypes.py
+++ b/python/tvm/_ffi/runtime_ctypes.py
@@ -48,6 +48,13 @@ def __init__(self, type_str):
         super(TVMType, self).__init__()
         if isinstance(type_str, np.dtype):
             type_str = str(type_str)
+
+        if type_str == "bool":
+            self.bits = 1
+            self.type_code = 1
+            self.lanes = 1
+            return
+
         arr = type_str.split("x")
         head = arr[0]
         self.lanes = int(arr[1]) if len(arr) > 1 else 1
@@ -67,12 +74,14 @@ def __init__(self, type_str):
             bits = 64
             head = ""
         else:
-            raise ValueError("Donot know how to handle type %s" % type_str)
+            raise ValueError("Do not know how to handle type %s" % type_str)
         bits = int(head) if head else bits
         self.bits = bits
 
 
     def __repr__(self):
+        if self.bits == 1 and self.lanes == 1:
+            return "bool"
         x = "%s%d" % (TVMType.CODE2STR[self.type_code], self.bits)
         if self.lanes != 1:
             x += "x%d" % self.lanes
@@ -109,12 +118,14 @@ class TVMContext(ctypes.Structure):
         'llvm': 1,
         'stackvm': 1,
         'cpu': 1,
+        'c': 1,
         'gpu': 2,
         'cuda': 2,
         'nvptx': 2,
         'cl': 4,
         'opencl': 4,
         'aocl' : 5,
+        'aocl_sw_emu' : 5,
         'sdaccel': 6,
         'vulkan': 7,
         'metal': 8,
diff --git a/python/tvm/api.py b/python/tvm/api.py
index 75debc33db66..e275c1122c36 100644
--- a/python/tvm/api.py
+++ b/python/tvm/api.py
@@ -45,6 +45,28 @@ def const(value, dtype=None):
     return _api_internal._const(value, dtype)
 
 
+def get_env_func(name):
+    """Get an EnvFunc by a global name.
+
+    Parameters
+    ----------
+    name: str
+        The name of the global function.
+
+    Returns
+    -------
+    env_func : EnvFunc
+        The result env function.
+
+    Note
+    ----
+    EnvFunc is a Node wrapper around
+    global function that can be serialized via its name.
+    This can be used to serialize function field in the language.
+    """
+    return _api_internal._EnvFuncGet(name)
+
+
 def convert(value):
     """Convert value to TVM node or function.
 
@@ -134,9 +156,9 @@ def any(*args):
         raise ValueError("Any must take at least 1 argument")
     if len(args) == 1:
         return args[0]
-    ret = _make.Or(args[0], args[1])
+    ret = _make._OpOr(args[0], args[1])
     for i in range(2, len(args)):
-        ret = _make.Or(ret, args[i])
+        ret = _make._OpOr(ret, args[i])
     return ret
 
 
@@ -158,9 +180,9 @@ def all(*args):
         raise ValueError("Any must take at least 1 argument")
     if len(args) == 1:
         return args[0]
-    ret = _make.And(args[0], args[1])
+    ret = _make._OpAnd(args[0], args[1])
     for i in range(2, len(args)):
-        ret = _make.And(ret, args[i])
+        ret = _make._OpAnd(ret, args[i])
     return ret
 
 
@@ -216,29 +238,48 @@ def compute(shape, fcompute, name="compute", tag="", attrs=None):
     tensor: Tensor
         The created tensor
     """
-    if _tag.TagScope.current is not None:
+    if _tag.TagScope.get_current() is not None:
         if tag != "":
             raise ValueError("nested tag is not allowed for now")
-        tag = _tag.TagScope.current.tag
+        tag = _tag.TagScope.get_current().tag
     shape = (shape,) if isinstance(shape, _expr.Expr) else shape
+    # for python3
+    shape = tuple([int(s) if isinstance(s, float) else s for s in shape])
     ndim = len(shape)
     code = fcompute.__code__
 
-    if fcompute.__code__.co_argcount == 0:
+    out_ndim = ndim
+    if code.co_argcount == 0:
         arg_names = ["i%d" % i for i in range(ndim)]
     else:
         arg_names = code.co_varnames[:code.co_argcount]
+        out_ndim = code.co_argcount
 
-    if ndim != len(arg_names):
+    if out_ndim != len(arg_names):
         raise ValueError("fcompute do not match dimension, ndim=%d" % ndim)
 
-    dim_var = [_IterVar((0, s), x, 0) for x, s in zip(arg_names, shape)]
+    dim_var = [_IterVar((0, s), x, 0) for x, s in zip(arg_names, shape[:out_ndim])]
     body = fcompute(*[v.var for v in dim_var])
-    if not isinstance(body, (list, tuple)):
-        body = [body]
-    body = convert(body)
-    op_node = _api_internal._ComputeOp(
-        name, tag, attrs, dim_var, body)
+
+    if isinstance(body, _tensor.TensorIntrinCall):
+        for i, s in enumerate(shape[out_ndim:]):
+            var_name = "ax" + str(i)
+            dim_var.append(_IterVar((0, s), var_name, 4))
+        op_node = _api_internal._TensorComputeOp(name,
+                                                 tag,
+                                                 dim_var,
+                                                 body.reduce_axis,
+                                                 out_ndim,
+                                                 body.intrin,
+                                                 body.tensors,
+                                                 body.regions)
+    else:
+        if not isinstance(body, (list, tuple)):
+            body = [body]
+        body = convert(body)
+        op_node = _api_internal._ComputeOp(
+            name, tag, attrs, dim_var, body)
+
     num = op_node.num_outputs
     outputs = tuple(op_node.output(i) for i in range(num))
     return outputs[0] if num == 1 else outputs
@@ -289,10 +330,10 @@ def scan(init, update, state_placeholder, inputs=None, name="scan", tag="", attr
       s_update = tvm.compute((m, n), lambda t, i: s_state[t-1, i] + X[t, i])
       res = tvm.scan(s_init, s_update, s_state, X)
     """
-    if _tag.TagScope.current is not None:
+    if _tag.TagScope.get_current() is not None:
         if tag != "":
             raise ValueError("nested tag is not allowed for now")
-        tag = _tag.TagScope.current.tag
+        tag = _tag.TagScope.get_current().tag
     if isinstance(init, _tensor.Tensor):
         init = [init]
     if isinstance(update, _tensor.Tensor):
@@ -385,10 +426,10 @@ def extern(shape,
                           "tvm.contrib.cblas.matmul",
                             ins[0], ins[1], outs[0], 0, 0), name="C")
     """
-    if _tag.TagScope.current is not None:
+    if _tag.TagScope.get_current() is not None:
         if tag != "":
             raise ValueError("nested tag is not allowed for now")
-        tag = _tag.TagScope.current.tag
+        tag = _tag.TagScope.get_current().tag
     shape = (shape,) if isinstance(shape, (_expr.Expr, _Integral)) else shape
     shape = [shape] if isinstance(shape[0], (_expr.Expr, _Integral)) else shape
     if in_buffers is not None:
@@ -507,14 +548,14 @@ def decl_buffer(shape,
     dtype = float32 if dtype is None else dtype
     strides = () if strides is None else strides
     if offset_factor != 0 and elem_offset is None:
-        elem_offset = var('%s_elem_offset' % name, shape[0].dtype)
+        shape_dtype = shape[0].dtype if hasattr(shape[0], "dtype") else "int32"
+        elem_offset = var('%s_elem_offset' % name, shape_dtype)
     if data is None:
         data = var(name, "handle")
     return _api_internal._Buffer(
         data, dtype, shape, strides, elem_offset, name, scope,
         data_alignment, offset_factor)
 
-
 def _IterVar(dom, name, iter_type, thread_tag=''):
     """Internal function to create IterVar
 
@@ -616,7 +657,7 @@ def select(cond, t, f):
     node : Node
         The tvm.expr.Select node
     """
-    return _make.Select(convert(cond), convert(t), convert(f))
+    return _expr.Select(convert(cond), convert(t), convert(f))
 
 
 def comm_reducer(fcombine, fidentity, name="reduce"):
@@ -699,7 +740,7 @@ def _make_reduce(expr, axis, where=None):
         axis = convert(axis if isinstance(axis, (list, tuple)) else [axis])
         if where is None:
             where = convert(True)
-        outputs = tuple(_make.Reduce(combiner, expr, axis, where, i)
+        outputs = tuple(_expr.Reduce(combiner, expr, axis, where, i)
                         for i in range(size))
         return outputs[0] if size == 1 else outputs
 
@@ -751,5 +792,5 @@ def reducer(expr, axis, where=None, *args):
 _init_api("tvm.api")
 #pylint: disable=unnecessary-lambda
 sum = comm_reducer(lambda x, y: x+y, lambda t: const(0, dtype=t), name="sum")
-min = comm_reducer(lambda x, y: _make.Min(x, y), max_value, name='min')
-max = comm_reducer(lambda x, y: _make.Max(x, y), min_value, name='max')
+min = comm_reducer(lambda x, y: _make._OpMin(x, y), max_value, name='min')
+max = comm_reducer(lambda x, y: _make._OpMax(x, y), min_value, name='max')
diff --git a/python/tvm/attrs.py b/python/tvm/attrs.py
new file mode 100644
index 000000000000..529dbcc14c13
--- /dev/null
+++ b/python/tvm/attrs.py
@@ -0,0 +1,40 @@
+""" TVM Attribute module, which is mainly used for defining attributes of operators"""
+from ._ffi.node import NodeBase, register_node as _register_tvm_node
+from ._ffi.function import _init_api
+from . import _api_internal
+
+
+@_register_tvm_node
+class Attrs(NodeBase):
+    """Attribute node, which is mainly use for defining attributes of relay operators.
+
+    Used by function registered in python side, such as compute, schedule and alter_layout.
+    Attrs is passed as the first argument to these functions.
+    """
+    def list_field_info(self):
+        """ Get fields information
+
+        Returns
+        -------
+        infos: list of AttrFieldInfo
+            List of field information
+        """
+        return _api_internal._AttrsListFieldInfo(self)
+
+    def keys(self):
+        """Get list of names in the attribute.
+
+        Returns
+        -------
+        keys : list of str
+            List of keys
+        """
+        fields = self.list_field_info()
+        for field in fields:
+            yield field.name
+
+    def __getitem__(self, item):
+        return self.__getattr__(item)
+
+
+_init_api("tvm.attrs")
diff --git a/python/tvm/autotvm/__init__.py b/python/tvm/autotvm/__init__.py
index 20426be84aa1..08cfbb2a95da 100644
--- a/python/tvm/autotvm/__init__.py
+++ b/python/tvm/autotvm/__init__.py
@@ -22,8 +22,11 @@
 from . import tophub
 
 # some shortcuts
-from .measure import measure_option, MeasureInput, MeasureResult, MeasureErrorNo, use_rpc
+from .measure import measure_option, MeasureInput, MeasureResult, MeasureErrorNo, \
+    LocalBuilder, LocalRunner, RPCRunner
 from .tuner import callback
 from .task import template, get_config, create, ConfigSpace, ConfigEntity, \
-    ApplyHistoryBest as apply_history_best
+    register_topi_compute, register_topi_schedule, \
+    DispatchContext, FallbackContext, ApplyHistoryBest as apply_history_best, \
+    ApplyGraphBest as apply_graph_best
 from .env import GLOBAL_SCOPE
diff --git a/python/tvm/autotvm/measure/__init__.py b/python/tvm/autotvm/measure/__init__.py
index b9bd3c37b01d..8a6126641a99 100644
--- a/python/tvm/autotvm/measure/__init__.py
+++ b/python/tvm/autotvm/measure/__init__.py
@@ -1,7 +1,7 @@
 """Distributed executor infrastructure to scale up the tuning"""
 
-from .measure import MeasureInput, MeasureResult, MeasureErrorNo, measure_option
-from .measure_methods import request_remote, check_remote, create_measure_batch, use_rpc
-
+from .measure import MeasureInput, MeasureResult, MeasureErrorNo, measure_option, \
+    create_measure_batch
+from .measure_methods import LocalBuilder, LocalRunner, RPCRunner, request_remote
+from .executor import Executor
 from .local_executor import LocalExecutor
-from .executor import Future, Executor
diff --git a/python/tvm/autotvm/measure/executor.py b/python/tvm/autotvm/measure/executor.py
index 17ea1d7fda9e..f3ba4236ce63 100644
--- a/python/tvm/autotvm/measure/executor.py
+++ b/python/tvm/autotvm/measure/executor.py
@@ -6,7 +6,7 @@ class Executor(object):
     Allows submit asynchronous jobs and returns the Future object.
     """
     # timeout for jobs that may hang
-    DEFAULT_TIMEOUT = 60
+    DEFAULT_TIMEOUT = 120
 
     def submit(self, func, *args, **kwargs):
         """
diff --git a/python/tvm/autotvm/measure/local_executor.py b/python/tvm/autotvm/measure/local_executor.py
index 8a045ecfb4c0..63d995c3580c 100644
--- a/python/tvm/autotvm/measure/local_executor.py
+++ b/python/tvm/autotvm/measure/local_executor.py
@@ -37,7 +37,8 @@ def _execute_func(func, queue, args, kwargs):
         res = exc
     queue.put(res)
 
-def timeout_monitor(queue, timeout, func, args, kwargs):
+
+def call_with_timeout(queue, timeout, func, args, kwargs):
     """A wrapper to support timeout of a function call"""
 
     # start a new process for timeout (cannot use thread because we have c function)
@@ -45,17 +46,12 @@ def timeout_monitor(queue, timeout, func, args, kwargs):
     p.start()
     p.join(timeout=timeout)
 
-    alive = p.is_alive()
+    queue.put(executor.TimeoutError())
+
     kill_child_processes(p.pid)
     p.terminate()
     p.join()
 
-    if alive:
-        queue.put(executor.TimeoutError())
-    else:
-        if queue.empty():
-            queue.put(executor.ExecutionError("Fatal error in local executor"))
-
 
 class LocalFuture(executor.Future):
     """Local wrapper for the future
@@ -133,8 +129,8 @@ def submit(self, func, *args, **kwargs):
         if not self.do_fork:
             return LocalFutureNoFork(func(*args, **kwargs))
 
-        queue = Queue(1)
-        process = Process(target=timeout_monitor,
+        queue = Queue(2)
+        process = Process(target=call_with_timeout,
                           args=(queue, self.timeout, func, args, kwargs))
         process.start()
         return LocalFuture(process, queue)
diff --git a/python/tvm/autotvm/measure/measure.py b/python/tvm/autotvm/measure/measure.py
index 6a05e1a6a349..8a8940817237 100644
--- a/python/tvm/autotvm/measure/measure.py
+++ b/python/tvm/autotvm/measure/measure.py
@@ -1,5 +1,6 @@
 # pylint: disable=pointless-string-statement,consider-using-enumerate,invalid-name
 """User facing API for specifying how to measure the generated code"""
+import multiprocessing
 from collections import namedtuple
 
 class MeasureInput(namedtuple("MeasureInput", ["target", "task", "config"])):
@@ -16,6 +17,7 @@ class MeasureInput(namedtuple("MeasureInput", ["target", "task", "config"])):
         Specific configuration.
     """
 
+
 class MeasureResult(namedtuple("MeasureResult", ["costs", "error_no", "all_cost", "timestamp"])):
     """
     Stores all the results of a measurement
@@ -23,8 +25,8 @@ class MeasureResult(namedtuple("MeasureResult", ["costs", "error_no", "all_cost"
     Parameters
     ----------
     costs: Array of float or Array of Exception
-        If no error occurs for this measurement, it is an array of measured running times.
-        If some error occurs during the measurement, it is an array of the exception objections.
+        If no error occurs during measurement, it is an array of measured running times.
+        If an error occurs during measurement, it is an array of the exception objections.
     error_no: int
         Denote error type, defined by MeasureErrorNo
     all_cost: float
@@ -37,91 +39,210 @@ class MeasureResult(namedtuple("MeasureResult", ["costs", "error_no", "all_cost"
 class MeasureErrorNo(object):
     """Error type for MeasureResult"""
     NO_ERROR = 0              # no error
-    INSTANTIATION_ERROR = 1   # error when calling template function
+    INSTANTIATION_ERROR = 1   # actively detected error in instantiating a template with a config
     COMPILE_HOST = 2          # error when compiling code on host (e.g. tvm.build)
-    COMPILE_DEVICE = 3        # error when compiling code on device (e.g. opencl JIT on device)
+    COMPILE_DEVICE = 3        # error when compiling code on device (e.g. OpenCL JIT on the device)
     RUNTIME_DEVICE = 4        # error when run program on device
     WRONG_ANSWER = 5          # answer is wrong when compared to a golden output
-    FLEET_ERROR = 6           # error of measure infrastructure
+    BUILD_TIMEOUT = 6         # timeout during compilation
+    RUN_TIMEOUT = 7           # timeout during run
+    UNKNOWN_ERROR = 8         # unknown error
 
 
-def measure_option(measure_func,
-                   number=1,
-                   repeat=1,
-                   timeout=60,
-                   parallel_num=1,
-                   do_fork=True,
-                   build_func='default',
-                   check_correctness=False,
-                   replay_db=None):
-    """Configure how to do measurement
+class Builder(object):
+    """Builder that builds programs in tuning
 
     Parameters
     ----------
-    measure_func: str or callable
-        'local': use the local device for measurement. The tuner will start a tracker
-        and a RPC server silently for the user.
-
-        callable: It is a callable function for measurement.
-                  See the return value of measure/measure_methods.py::use_rpc for example.
-    number : int, optional
-        Number of times to do the measurement for average
-    repeat : int, optional
-        Number of times to repeat the measurement.
-        In total, the generated code will be run (1 + number x repeat) times,
-        where the first one is warm up. The returned result contains `repeat` costs,
-        each of which is the average of `number` test run.
-    timeout: int, optional
-        Timeout for a whole batch. TimeoutError will be returned as the result if a
-        task timeouts.
-    parallel_num: int, optional
-        The number of measurement task that can run in parallel.
-        Set this according to the number of cpu cores (for compilation) and
-        the number of devices you have (for measuring generate code).
-    do_fork: bool, optional
-        Whether use multiprocessing (based on fork) for running measure jobs in parallel.
-        Set this to False if you want to debug (see trackback) or using fork is not suitable.
-        NOTE: If this is False, parallel and timeout do not work.
-    build_func: str or callable, optional
-        'default': call default builder. This works for normal target (llvm, cuda)
-
-        'ndk': use Android NDK to create shared library. Use this for android target.
-
-        callable: customized build function for other backends (e.g. VTA).
-                  See measure/measure_methods.py::default_build_func for example.
-    check_correctness: bool
-        Whether check correctness after measurement. This will use llvm cpu as reference.
-    replay_db : Database, optional
-        The database that we retrieve saved MeasureResult from.
+    timeout: float, optional
+        The timeout of a build task
+    n_parallel: int, optional
+        The number of tasks submitted in parallel
+        By default it will use all cpu cores
+    """
+    def __init__(self, timeout=10, n_parallel=None):
+        self.timeout = timeout
+        self.n_parallel = n_parallel or multiprocessing.cpu_count()
+        self.build_kwargs = {}
+        self.task = None
+
+    def set_task(self, task, build_kwargs=None):
+        """
+        Initialize for a new tuning task
+
+        Parameters
+        ----------
+        task: Task
+            The tuning task
+        build_kwargs: dict, optional
+            The additional kwargs for build function
+        """
+        self.task = task
+        self.build_kwargs = build_kwargs
+
+    def build(self, measure_inputs):
+        """Build programs
+
+        Parameters
+        ----------
+        measure_inputs: List of MeasureInput
+            The measure input
+
+        Returns
+        -------
+        build_results: List of BuildResult
+            The build result.
+        """
+        raise NotImplementedError()
+
+
+class Runner(object):
+    """Runner that runs and measures the time cost of a generated program in tuning
 
-    Returns
-    -------
-    options: dict
-        A dict to store all options
+    Parameters
+    ----------
+    timeout: float, optional
+        The timeout of a build task
+    n_parallel: int, optional
+        The number of tasks submitted in parallel
+        By default it will use all cpu cores
+    """
+    def __init__(self, timeout=5, n_parallel=None):
+        self.timeout = timeout
+        self.n_parallel = n_parallel or multiprocessing.cpu_count()
+        self.task = None
+
+    def set_task(self, task):
+        """
+        Initialize for a new tuning task
+
+        Parameters
+        ----------
+        task: Task
+            The tuning task
+        """
+        self.task = task
+
+    def get_build_kwargs(self):
+        """
+        Get device specific build arguments (e.g. maximum shared memory size)
+
+        Returns
+        ----------
+        kwargs: dict
+            The additional keyword arguments
+        """
+        raise NotImplementedError()
+
+    def run(self, measure_inputs, build_results):
+        """Run amd measure built programs
+
+        Parameters
+        ----------
+        measure_inputs: List of MeasureInput
+            The raw measure input
+        build_results: List of BuildResults
+            The build results
+
+        Returns
+        -------
+        measure_results: List of MeasureResult
+            The final results of measurement
+        """
+        raise NotImplementedError()
+
+
+def measure_option(builder, runner):
+    """
+    Set options for measure. To measure a config, we will build it and run it.
+    So we have to set options for these two steps.
+    They have their own options on timeout, parallel, etc.
+
+    Parameters
+    ----------
+    builder: Builder
+        Specify how to build programs
+    runner: Runner
+        Specify how to run programs
+
+    Examples
+    --------
+    # example setting for using local devices
+    >>> measure_option = autotvm.measure_option(
+    >>>     builder=autotvm.LocalBuilder(),      # use all local cpu cores for compilation
+    >>>     runner=autotvm.LocalRunner(          # measure them sequentially
+    >>>         number=10,
+    >>>         timeout=5)
+    >>> )
+
+    # example setting for using remote devices
+    >>> measure_option = autotvm.measure_option(
+    >>>    builder=autotvm.LocalBuilder(),  # use all local cpu cores for compilation
+    >>>    runner=autotvm.RPCRunner(
+    >>>        'rasp3b', 'locahost', 9190, # device key, host and port of the rpc tracker
+    >>>        number=4,
+    >>>        timeout=4) # timeout of a run on the device. RPC request waiting time is excluded.
+    >>>)
 
     Note
     ----
-    To support customized measure, you can pass callable `measure_func` or
-    `build_func` in. The `measure_func` will call `build_func` to build binary library
-    and handle the logic of measurement.
-
-    Signature:
-    * measure_func (see the return value of measure/measure_methods.py::use_rpc for example)
-    def measure_func(input_pack, build_func, build_kwargs, number, repeat, ref_input, ref_output):
-        return measure_results
-
-    * build_func (see measure/measure_methods.py::default_build_func for example)
-    def build_func(inp, tmp_dir, **kwargs):
-        return func, args, filename
+    To make measurement results accurate, you should pick the correct value for the argument
+    `number` and `repeat` in Runner(). Using `min_repeat_ms` can dynamically adjusts `number`,
+    so it is recommended. The typical value for NVIDIA GPU is 100 ms.
     """
-    return {
-        'measure_func': measure_func,
-        'number': number,
-        'repeat': repeat,
-        'timeout': timeout,
-        'parallel_num': parallel_num,
-        'do_fork': do_fork,
-        'build_func': build_func,
-        'check_correctness': check_correctness,
-        'replay_db': replay_db,
+    from .measure_methods import LocalBuilder, LocalRunner
+
+    if isinstance(builder, str):
+        if builder == 'local':
+            builder = LocalBuilder()
+        else:
+            raise ValueError("Invalid builder: " + builder)
+
+    if isinstance(runner, str):
+        if runner == 'local':
+            runner = LocalRunner()
+        else:
+            raise ValueError("Invalid runner: " + runner)
+
+    opt = {
+        'builder': builder,
+        'runner': runner,
     }
+
+    return opt
+
+
+def create_measure_batch(task, option):
+    """Get a standard measure_batch function.
+
+    Parameters
+    ----------
+    task: tvm.autotvm.task.Task
+        The tuning task
+    option: dict
+        The option for measuring generated code.
+        You should use the return value of function :any:`measure_option` for this argument.
+
+    Returns
+    -------
+    measure_batch: callable
+        a callback function to measure a batch of configs
+    """
+    builder = option['builder']
+    runner = option['runner']
+
+    attach_objects = runner.set_task(task)
+
+    # feed device related information from runner to builder
+    # (e.g. max shared memory for validity checking)
+    build_kwargs = runner.get_build_kwargs()
+    builder.set_task(task, build_kwargs)
+
+    def measure_batch(measure_inputs):
+        build_results = builder.build(measure_inputs)
+        results = runner.run(measure_inputs, build_results)
+        return results
+
+    measure_batch.n_parallel = builder.n_parallel
+    measure_batch.attach_objects = attach_objects
+    return measure_batch
diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py
index 6e95a6e435d0..ff93704edb44 100644
--- a/python/tvm/autotvm/measure/measure_methods.py
+++ b/python/tvm/autotvm/measure/measure_methods.py
@@ -1,129 +1,339 @@
-# pylint: disable=consider-using-enumerate,invalid-name,too-many-function-args
+# pylint: disable=invalid-name,too-many-function-args,too-many-nested-blocks
 """
 Functions that run on executor for measurement.
-These functions are responsible for building tvm module, uploading it to
-remote devices, recording the running time costs and checking the correctness of output
+
+These functions are responsible for building the tvm module, uploading it to
+remote devices, recording the running time costs, and checking the correctness of the output.
 """
 
 import logging
+import shutil
 import os
+import threading
 import time
 from random import getrandbits
-import threading
+from collections import namedtuple
+import tempfile
 
 import numpy as np
 
-from ... import rpc, ir_pass, build, build_config, nd, context, TVMError, register_func, \
-    target as _target
-from ...contrib import nvcc, util, ndk
+from ... import ir_pass, build, build_config, nd, TVMError, register_func, \
+    rpc as _rpc, target as _target
+from ...contrib import nvcc, ndk
 
 from ..util import get_const_tuple
 from ..env import AutotvmGlobalScope
 from ..task.space import InstantiationError
 
-from .measure import MeasureResult, MeasureErrorNo
+from .measure import MeasureResult, MeasureErrorNo, Builder, Runner
 from .local_executor import LocalExecutor
 
 logger = logging.getLogger('autotvm')
 
-class HashMismatchError(ValueError):
-    """Raised when the code hash of a submitted config doesn't match that on the
-       measure side """
-    pass
-
-
-def request_remote(device_key, tracker_addr=None, priority=1, timeout=60):
-    """request a remote session
+class BuildResult(namedtuple("BuildResult", ('filename', 'arg_info', 'error', 'time_cost'))):
+    """
+    Stores all the necessary inputs for a measurement.
 
     Parameters
     ----------
-    device_key: string
-        device key of registered device in tracker
-    tracker_addr: Tuple(string, int), optional
-        The address of rpc tracker in (host, port) format.
-        If is none, will use environment variable "TVM_TRACKER_HOST"
-        and "TVM_TRACKER_PORT"
-    priority: int, optional
-        The priority of this request, larger is more prior
-    timeout: float, optional
-        The timeout of this session (units: seconds)
-
-    Returns
-    ------
-    session: RPCSession
+    filename : str
+        The filename of generated library
+    arg_info : Tuple
+        The shape and dtype information of tvm tensor arguments
+    error : Exception
+        The error happens during compilation.
+    time_cost : float
+        The time cost of building
     """
-    # connect to the tracker
-    if tracker_addr:
-        host = tracker_addr[0] or os.environ['TVM_TRACKER_HOST']
-        port = tracker_addr[1] or int(os.environ['TVM_TRACKER_PORT'])
-    else:
-        host = os.environ['TVM_TRACKER_HOST']
-        port = int(os.environ['TVM_TRACKER_PORT'])
-
-    tracker = rpc.connect_tracker(host, port)
-    remote = tracker.request(device_key, priority=priority,
-                             session_timeout=timeout)
-    return remote
 
-def check_remote(target, device_key, tracker_addr=None, priority=2, timeout=10):
-    """
-    Check the availability of a remote device
+class LocalBuilder(Builder):
+    """Run compilation on local machine
 
     Parameters
     ----------
-    target: Target
-        The wanted compilation target
-    device_key: string
-        device key of registered device in tracker
-    tracker_addr: Tuple(string, int), optional
-        The address of rpc tracker in (host, port) format.
-        If is none, will use environment variable "TVM_TRACKER_HOST"
-        and "TVM_TRACKER_PORT"
-    priority: int, optional
-        The priority of this request, larger is more prior
-    timeout: float, optional
-        The timeout of this check (units: seconds).
-        If time is out, a RuntimerError will be raised.
+    timeout: float
+        The timeout of a compilation
+    n_parallel: int
+        The number of tasks run in parallel. "None" will use all cpu cores
+    build_func: callable or str
+        If is 'default', use default build function
+        If is 'ndk', use function for android ndk
+        If is callable, use it as custom build function
     """
-    def _check():
-        remote = request_remote(device_key, tracker_addr, priority)
-        remote.context(str(target))
-    t = threading.Thread(target=_check,)
-    t.start()
-    t.join(timeout)
-    return not t.is_alive()
+    def __init__(self, timeout=10, n_parallel=None, build_func='default'):
+        super(LocalBuilder, self).__init__(timeout, n_parallel)
+
+        if isinstance(build_func, str):
+            if build_func == 'default':
+                build_func = default_build_func
+            elif build_func == 'ndk':
+                build_func = android_ndk_build_func
+            else:
+                raise ValueError("Invalid build_func" + build_func)
+
+        self.build_func = build_func
+        self.executor = LocalExecutor(timeout=timeout)
+        self.tmp_dir = tempfile.mkdtemp()
+
+    def build(self, measure_inputs):
+        results = []
+
+        shutil.rmtree(self.tmp_dir)
+        self.tmp_dir = tempfile.mkdtemp()
+
+        for i in range(0, len(measure_inputs), self.n_parallel):
+            futures = []
+            for inp in measure_inputs[i:i + self.n_parallel]:
+                ret = self.executor.submit(self.build_func,
+                                           inp,
+                                           self.tmp_dir,
+                                           **self.build_kwargs)
+                futures.append(ret)
+
+            for future in futures:
+                res = future.get()
+
+                if isinstance(res, Exception):
+                    # timeout or fleet error, return MeasureResult directly
+                    results.append(MeasureResult((res,), MeasureErrorNo.BUILD_TIMEOUT,
+                                                 self.timeout, time.time()))
+                elif res.error is not None:
+                    # instantiation error
+                    if isinstance(res.error, InstantiationError):
+                        results.append(MeasureResult((res.error,),
+                                                     MeasureErrorNo.INSTANTIATION_ERROR,
+                                                     res.time_cost, time.time()))
+                    else:
+                        if "InstantiationError" in str(res.error):
+                            msg = str(res.error)
+                            try:
+                                msg = msg.split('\n')[-2].split(": ")[1]
+                            except Exception:  # pylint: disable=broad-except
+                                pass
+                            results.append(MeasureResult((InstantiationError(msg),),
+                                                         MeasureErrorNo.INSTANTIATION_ERROR,
+                                                         res.time_cost, time.time()))
+                        else:  # tvm error
+                            results.append(MeasureResult((res.error,),
+                                                         MeasureErrorNo.COMPILE_HOST,
+                                                         res.time_cost, time.time()))
+                else:
+                    # return BuildResult
+                    results.append(res)
+
+        return results
 
-def create_measure_batch(task, option):
-    """Get a standard measure_batch function.
+
+class RPCRunner(Runner):
+    """Run generated code on remove devices.
+    This function will ask a RPC Tracker to get device for measurement.
 
     Parameters
     ----------
-    task: tvm.autotvm.task.Task
-        The tuning task
-    option: dict
-        The option for measuring generated code.
-        You should use the return value of function :any:`measure_option` for this argument.
-
-    Returns
-    -------
-    measure_batch: callable
-        a callback function to measure a batch of configs
+    timeout: float
+        The timeout of a compilation
+    n_parallel: int
+        The number of tasks run in parallel. "None" will use all cpu cores
+    key: str
+        The key of the device registered in the tracker
+    host: str
+        The host address of RPC Tracker
+    port: int
+        The port of RPC Tracker
+    number : int, optional
+        Number of times to do measurement for tasking average
+    repeat : int, optional
+        Number of times to repeat the measurement.
+        In total, the generated code will be run (1 + number x repeat) times,
+        where the first one is warm up. The returned result contains `repeat` costs,
+    min_repeat_ms : float, optional
+        Minimum duration of a timer measurement in milliseconds.
+        When the run time of a measurement trial falls below this time, the
+        `number` parameter will be automatically increased.
+        Set this to improve the accuracy of perf measurement, e.g., when timers
+        are not precise enough to capture short-running tasks. This parameter is
+        also critical when devices need a certain minimum running time to "warm
+        up," such as GPUs that need time to reach a performance power state.
+    cooldown_interval: float, optional
+        The cool down interval between two measurements.
+    check_correctness: bool, optional
+        Whether check correctness after measurement. This will use llvm cpu target to
+        call your template and get the reference output.
+        This can work for TOPI templates, but may not work for your custom template.
     """
-    from ..database import filter_inputs
+    def __init__(self,
+                 key, host, port, priority=1,
+                 timeout=10, n_parallel=None,
+                 number=4, repeat=3, min_repeat_ms=0, cooldown_interval=0.1,
+                 check_correctness=False):
+        super(RPCRunner, self).__init__(timeout, n_parallel)
+
+        self.key = key
+        self.host = host
+        self.port = port
+        self.priority = priority
+        self.timeout = timeout
+
+        self.number = number
+        self.repeat = repeat
+        self.min_repeat_ms = min_repeat_ms
+        self.cur_number = number
+
+        self.ref_input = None
+        self.ref_output = None
+        self.check_correctness = check_correctness
+        self.cooldown_interval = cooldown_interval
+
+        self.executor = LocalExecutor()
+
+    def set_task(self, task):
+        self.task = task
+        self.cur_number = self.number
+
+        if check_remote(task.target, self.key, self.host, self.port):
+            logger.info("Get devices for measurement successfully!")
+        else:
+            raise RuntimeError("Cannot get remote devices from the tracker. "
+                               "Please check the status of tracker by "
+                               "'python -m tvm.exec.query_rpc_tracker --port [THE PORT YOU USE]' "
+                               "and make sure you have free devices on the queue status.")
 
-    measure_func = option['measure_func']
-    number, repeat = option['number'], option['repeat']
-    timeout, parallel_num, do_fork = option['timeout'], option['parallel_num'], option['do_fork']
-    build_func = option['build_func']
-    check_correctness = option['check_correctness']
-    replay_db = option['replay_db']
+        if self.check_correctness:
+            # use llvm cpu to generate a reference input/output
+            # this option works for tuning topi, but might not work for you custom op
+            with _target.create("llvm"):
+                s, arg_bufs = task.instantiate(task.config_space.get(0))
+            self.ref_input = [np.random.uniform(size=get_const_tuple(x.shape)).astype(x.dtype)
+                              for x in arg_bufs]
+            func = build(s, arg_bufs, "llvm")
+            tvm_buf = [nd.array(x) for x in self.ref_input]
+            func(*tvm_buf)
+            self.ref_output = [x.asnumpy() for x in tvm_buf]
+
+    def get_build_kwargs(self):
+        kwargs = {}
+        if 'cuda' in self.task.target.keys or 'opencl' in self.task.target.keys:
+            remote = request_remote(self.key, self.host, self.port)
+            ctx = remote.context(str(self.task.target), 0)
+            max_dims = ctx.max_thread_dimensions
+            kwargs['check_gpu'] = {
+                'max_shared_memory_per_block': ctx.max_shared_memory_per_block,
+                'max_threads_per_block': ctx.max_threads_per_block,
+                'max_thread_x': max_dims[0],
+                'max_thread_y': max_dims[1],
+                'max_thread_z': max_dims[2],
+            }
+
+            if 'cuda' in self.task.target.keys:
+                kwargs["cuda_arch"] = "sm_" + "".join(ctx.compute_version.split('.'))
+
+        return kwargs
+
+    def run(self, measure_inputs, build_results):
+        results = []
+        remote_args = (self.key, self.host, self.port, self.priority, self.timeout)
+
+        for i in range(0, len(measure_inputs), self.n_parallel):
+            futures = []
+            for measure_inp, build_res in zip(measure_inputs[i:i+self.n_parallel],
+                                              build_results[i:i+self.n_parallel]):
+                ret = self.executor.submit(run_through_rpc,
+                                           measure_inp,
+                                           build_res,
+                                           self.cur_number,
+                                           self.repeat,
+                                           self.cooldown_interval,
+                                           remote_args,
+                                           self.ref_input,
+                                           self.ref_output)
+                futures.append(ret)
+
+            for future in futures:
+                res = future.get()
+                if isinstance(res, Exception):   # executor error or timeout
+                    results.append(MeasureResult((str(res),), MeasureErrorNo.RUN_TIMEOUT,
+                                                 self.timeout, time.time()))
+                else:
+                    results.append(res)
+
+        # If some runs were too fast, do remeasure for them
+        # to meet the requirement of `min_repeat_ms`
+        remeasure = np.zeros((len(measure_inputs),), dtype=np.bool)
+        pre_number = next_number = self.cur_number
+        min_repeat_duration = self.min_repeat_ms / 1000.0
+        for i, res in enumerate(results):
+            if res.error_no == MeasureErrorNo.NO_ERROR:
+                if np.mean(res.costs) * pre_number <= min_repeat_duration:
+                    next_number = max(next_number,
+                                      int(np.ceil(min_repeat_duration / np.mean(res.costs))))
+                    remeasure[i] = True
+
+        if pre_number != next_number:
+            self.cur_number = next_number
+            msg = "increasing number to %d" % self.cur_number
+            logger.info(msg)
+
+            re_measure_inputs = [x for i, x in enumerate(measure_inputs) if remeasure[i]]
+            re_build_results = [x for i, x in enumerate(build_results) if remeasure[i]]
+            re_res = self.run(re_measure_inputs, re_build_results)
+            ct = 0
+            for i, rerun in enumerate(remeasure):
+                if rerun:
+                    results[i] = re_res[ct]
+                    ct += 1
+
+        return results
 
-    executor = LocalExecutor(timeout=timeout, do_fork=do_fork)
+class LocalRunner(RPCRunner):
+    """Run generated code on local devices.
+
+    Parameters
+    ----------
+    timeout: float
+        The timeout of a compilation
+    number : int, optional
+        Number of times to do measurement for tasking average
+    repeat : int, optional
+        Number of times to repeat the measurement.
+        In total, the generated code will be run (1 + number x repeat) times,
+        where the first one is warm up. The returned result contains `repeat` costs,
+        each of which is the average of `number` test run.
+    min_repeat_ms : float, optional
+        Minimum duration of a timer measurement in milliseconds.
+        When the run time of a measurement trial falls below this time, the
+        `number` parameter will be automatically increased.
+        Set this to improve the accuracy of perf measurement, e.g., when timers
+        are not precise enough to capture short-running tasks. This parameter is
+        also critical when devices need a certain minimum running time to "warm
+        up," such as GPUs that need time to reach a performance power state.
+    cooldown_interval: float, optional
+        The cool down interval between two measurements.
+    check_correctness: bool, optional
+        Whether check correctness after measurement. This will use llvm cpu target to
+        call your template and get the reference output.
+        This can work for TOPI templates, but may not work for your custom template.
+
+    Note
+    ----
+    This is a "fake" local mode. We start a silent rpc tracker and rpc server
+    for the user. In this way we reuse timeout/isolation mechanism in RPC infrastructure.
+    """
+    def __init__(self,
+                 timeout=10,
+                 number=4, repeat=3, min_repeat_ms=0, cooldown_interval=0.1,
+                 check_correctness=False):
+        super(LocalRunner, self).__init__('', None, None, 0,
+                                          timeout=timeout, n_parallel=1,
+                                          number=number, repeat=repeat,
+                                          min_repeat_ms=min_repeat_ms,
+                                          cooldown_interval=cooldown_interval,
+                                          check_correctness=check_correctness)
+        self.tracker = None
+        self.server = None
+
+    def set_task(self, task):
+        self.task = task
 
-    # convert convenient string to function object
-    attach_objects = None
-    if measure_func == 'local':
-        # start temporary rpc tracker and rpc server for the user
         from ...rpc.tracker import Tracker
         from ...rpc.server import Server
 
@@ -133,352 +343,227 @@ def create_measure_batch(task, option):
                         key=device_key,
                         use_popen=True, silent=True,
                         tracker_addr=(tracker.host, tracker.port))
+        self.key = device_key
+        self.host = tracker.host
+        self.port = tracker.port
 
-        measure_func = use_rpc(device_key, tracker.host, tracker.port)
-        attach_objects = (server, tracker)
+        super(LocalRunner, self).set_task(task)
+        return server, tracker
 
-    build_kwargs = {}
-    if build_func == 'default':
-        build_func = default_build_func
-    if build_func == 'ndk':
-        build_func = default_build_func
-        build_kwargs['use_ndk'] = True
 
-    # check the availability of remote devices
-    if hasattr(measure_func, 'rpc_info'):
-        rpc_info = measure_func.rpc_info
-        if check_remote(task.target, rpc_info['key'], (rpc_info['host'], rpc_info['port'])):
-            logger.info("Get devices for measurement successfully!")
-        else:
-            raise RuntimeError("Cannot get remote devices from the tracker. "
-                               "Please check the status of tracker by "
-                               "'python -m tvm.exec.query_rpc_tracker --port [THE PORT YOU USE]' "
-                               "and make sure you have free devices on the queue status.")
+def _build_func_common(measure_input, check_gpu=None, cuda_arch=None, build_option=None):
+    """Common part for building a configuration"""
+    target, task, config = measure_input
 
-    # add device info of cuda and opencl target
-    if ('cuda' in task.target.keys or 'opencl' in task.target.keys) \
-            and hasattr(measure_func, 'rpc_info'):
-        rpc_info = measure_func.rpc_info
-        add_gpu_target_info(task.target, rpc_info["key"], (rpc_info["host"], rpc_info["port"]),
-                            build_kwargs)
-
-    if check_correctness:
-        # use llvm cpu to generate a reference input/output
-        # this option works for tuning topi, but might not work for you custom op
-        with _target.create("llvm"):
-            s, arg_bufs = task.instantiate(task.config_space.get(0))
-        ref_input = [np.random.uniform(size=get_const_tuple(x.shape)).astype(x.dtype)
-                     for x in arg_bufs]
-        func = build(s, arg_bufs, "llvm")
-        tvm_buf = [nd.array(x) for x in ref_input]
-        func(*tvm_buf)
-        ref_output = [x.asnumpy() for x in tvm_buf]
-    else:
-        ref_input = ref_output = None
-
-    def measure_batch(measure_inputs):
-        """measure the time cost for a batch of configs in real machines"""
-        if replay_db is not None:
-            partial_results, measure_inputs = \
-                filter_inputs(replay_db, measure_inputs, retry=False)
-
-        # launch measure jobs in parallel
-        pack_size = getattr(measure_func, "pack_size", 1)  # measure `pack_size` inputs in one job
-        futures = []
-        for i in range(0, len(measure_inputs), pack_size):
-            input_pack = measure_inputs[i:i + pack_size]
-            ret = executor.submit(
-                measure_func,
-                input_pack,
-                build_func,
-                build_kwargs,
-                number,
-                repeat,
-                ref_input,
-                ref_output)
-            futures.append(ret)
-
-        # transform results
-        results = []
-        for future in futures:
-            result = future.get()
-            if isinstance(result, Exception):
-                tstamp = time.time()
-                results.extend([MeasureResult((result,), MeasureErrorNo.FLEET_ERROR,
-                                              timeout, tstamp)] * pack_size)
-            else:
-                results.extend(result)
-
-        if replay_db is not None:
-            result_idx = 0
-            for i in range(len(partial_results)):
-                if partial_results[i] is None:
-                    partial_results[i] = results[result_idx]
-                    result_idx += 1
-            return partial_results
-        return results
+    with target:
+        s, args = task.instantiate(config)
 
-    measure_batch.parallel_num = parallel_num
-    # attach server and tracker object to avoid them of being garbage-collected
-    measure_batch.attach_objects = attach_objects
-    return measure_batch
+        # check invalidity of template and code hash consistency
+        if not config.valid():
+            raise InstantiationError(config.errors)
+
+        opts = build_option or {}
+        if check_gpu:  # Add verify pass to filter out invalid configs in advance.
+            opts["add_lower_pass"] = [(2, gpu_verify_pass(**check_gpu))]
+        if cuda_arch:
+            set_cuda_target_arch(cuda_arch)
 
+        with build_config(**opts):
+            func = build(s, args, target_host=task.target_host)
+    return func, tuple((get_const_tuple(x.shape), x.dtype) for x in args)
 
-def use_rpc(key,
-            host=None,
-            port=None,
-            priority=1,
-            session_timeout=60,
-            pack_size=1):
+
+def default_build_func(measure_input, tmp_dir, **kwargs):
     """
-    Create a standard measure_func which uses RPC Tracker for measurement.
-    This measure_func will request a device from the RPC Tracker and
-    upload the built binary library to that device for measurement.
+    Default build func. This can work for cuda, opencl, llvm backend
 
     Parameters
     ----------
-    key: str
-        The registered key of the device in tracker. The tuner will request devices for
-        measurement by this key.
-    host: str, optional
-        The hostname of RPC Tracker. If not set, will use environment variable "TVM_TRACKER_HOST"
-    port: int, optional
-        The port of RPC Tracker. If not set, will use environment variable "TVM_TRACKER_PORT"
-    priority: int, optional
-        Priority of this task, used by scheduler in tracker
-    session_timeout: int, optional
-        Timeout of rpc session
-    pack_size: int, optional
-        The number of configs measure in one RPC session.
-        Usually this can be set to 1. If your device has high overhead to establish a
-        rpc connection, set this higher.
+    measure_input: MeasureInput
+        The input of measurement
+    tmp_dir: str
+        The path of temporary directory to export generated library
+    """
+    tic = time.time()
+    try:
+        filename = os.path.join(tmp_dir, "tmp_func_%0x.tar" % getrandbits(64))
+        func, arg_info = _build_func_common(measure_input, **kwargs)
+        func.export_library(filename)
+    except Exception as e:  # pylint: disable=broad-except
+        return BuildResult(None, None, e, time.time() - tic)
+    return BuildResult(filename, arg_info, None, time.time() - tic)
+
+
+def android_ndk_build_func(measure_input, tmp_dir, **kwargs):
+    """
+    Build function for android device using ndk.
+
+    Parameters
+    ----------
+    measure_input: MeasureInput
+        The input of measurement
+    tmp_dir: str
+        The path of temporary directory to export generated library
     """
-    def fmeasure(input_pack, build_func, build_kwargs, number, repeat, ref_input, ref_output):
-        """Do measurement for a list of inputs inside a same RPC session.
-
-        Parameters
-        ----------
-        input_pack: List of MeasureInput
-            The inputs of measurement
-        build_func: callable
-            Function for building the code. see :any:`default_build_func` for example
-        build_kwargs: dict
-            Extra arguments for build_func
-        number : int, optional
-            Number of times to do the measurement for average
-        repeat : int, optional
-            Number of times to repeat the measurement.
-            In total, the generated code will be run (1 + number x repeat) times,
-            where the first one is warm up. The returned result contains `repeat` costs,
-            each of which is the average of `number` test run.
-        ref_input: List of numpy array
-            Reference input for correctness check
-        ref_output: List of numpy array
-            Reference output for correctness check
-
-        Returns
-        -------
-        results: List of MeasureResult
-            The results for input_pack
-        """
-        remote = request_remote(key, (host, port), priority, session_timeout)
-
-        res = _measure_common(input_pack, build_func, build_kwargs, number, repeat,
-                              ref_input, ref_output,
-                              remote)
-        return res
-
-    fmeasure.pack_size = pack_size
-    fmeasure.rpc_info = {"key": key, "host": host, "port": port}
-    return fmeasure
-
-
-def _measure_common(input_pack, build_func, build_kwargs, number, repeat,
-                    ref_input=None, ref_output=None, remote=None):
-    """Measure the time cost for a pack of inputs.
-
-    (Note: A pack is a list of inputs which will be measured inside a same RPC session)
+    tic = time.time()
+    try:
+        filename = os.path.join(tmp_dir, "tmp_func_%0x.so" % getrandbits(64))
+        func, arg_info = _build_func_common(measure_input, **kwargs)
+        func.export_library(filename, ndk.create_shared)
+    except Exception as e:  # pylint: disable=broad-except
+        return BuildResult(None, None, e, time.time() - tic)
+    return BuildResult(filename, arg_info, None, time.time() - tic)
+
+
+def run_through_rpc(measure_input, build_result,
+                    number, repeat, cooldown_interval,
+                    remote_args, ref_input=None, ref_output=None):
+    """Run a generated library through rpc
 
     Parameters
     ----------
-    input_pack : list of MeasureInput
-        The inputs we need to evaluate
-    build_func : function takes MeasureInput returns tuple of (time_func, ctx, args)
-        The build function used to build each input.
-    build_kwargs: Dict
-        The extra keyword arguments to build_func
+    measure_input: MeasureInput
+        The raw measure input
+    build_result: BuildResult
+        The result returned from Builder. This contains the path to the generated library.
     number : int, optional
-        Number of times to do the measurement for average
+        Number of times to do measurement for tasking average
     repeat : int, optional
         Number of times to repeat the measurement.
         In total, the generated code will be run (1 + number x repeat) times,
         where the first one is warm up. The returned result contains `repeat` costs,
         each of which is the average of `number` test run.
-    ref_input: Array of np.ndarray, optional
-        Reference input for checking correctness
-    ref_output: Array of np.ndarray, optional
-        Reference output for checking correctness
-    remote: RPCSession, optional
-        The remote RPC session
-
-    Returns
-    -------
-    res_pack : Array of MeasureResult
-        The list of results of measurement.
+    cooldown_interval: float
+        The cool down interval between two measurements
+    remote_args: Tuple
+        The argument for request_remote
+    ref_input: List of np.ndarray
+        The reference input used for checking correctness
+    ref_output: List of np.ndarray
+        The reference output used for checking correctness
     """
-    res_pack = []
-    tmp_dir = util.tempdir() if remote else None
-
-    for inp in input_pack:
-        tic = time.time()
-
-        # build function
-        try:
-            func, arg_bufs, filename = build_func(inp, tmp_dir, **build_kwargs)
-        except TVMError as exc:
-            tstamp = time.time()
-            msg = str(exc)
-            if "Stack trace returned" in msg:
-                msg = msg[:msg.index("Stack trace returned")]
-            if "InstantiationError" in msg:
-                try:
-                    msg = msg.split('\n')[-2].split(": ")[1]
-                except Exception:  # pylint: disable=broad-except
-                    pass
-                res_pack.append(MeasureResult((InstantiationError(msg),),
-                                              MeasureErrorNo.INSTANTIATION_ERROR,
-                                              tstamp - tic, tstamp))
-            else:
-                res_pack.append(MeasureResult((RuntimeError(msg),),
-                                              MeasureErrorNo.COMPILE_HOST,
-                                              tstamp - tic, tstamp))
-            continue
-        except InstantiationError as e:
-            tstamp = time.time()
-            res_pack.append(MeasureResult((InstantiationError(str(e)),),
-                                          MeasureErrorNo.INSTANTIATION_ERROR,
-                                          tstamp - tic, tstamp))
-            continue
+    if isinstance(build_result, MeasureResult):
+        return build_result
 
+    tic = time.time()
+    errno = MeasureErrorNo.NO_ERROR
+    try:
         # upload built module
-        if remote:
-            remote.upload(tmp_dir.relpath(filename))
-            func = remote.load_module(filename)
-            ctx = remote.context(str(inp.target), 0)
-            time_f = func.time_evaluator(
-                func.entry_name, ctx, number=number, repeat=repeat)
+        remote = request_remote(*remote_args)
+        remote.upload(build_result.filename)
+        func = remote.load_module(os.path.split(build_result.filename)[1])
+        ctx = remote.context(str(measure_input.target), 0)
+        time_f = func.time_evaluator(
+            func.entry_name, ctx, number=number, repeat=repeat)
+
+        # set input
+        if ref_input:
+            args = [nd.array(x, ctx=ctx) for x in ref_input]
         else:
-            ctx = context(str(inp.target), 0)
-            time_f = func.time_evaluator(
-                func.entry_name, ctx, number=number, repeat=repeat)
-
-        # measure time
-        errno = MeasureErrorNo.NO_ERROR
-        try:
-            if ref_input:
-                args = [nd.array(x, ctx=ctx) for x in ref_input]
-            else:
-                args = [nd.empty(get_const_tuple(x.shape), dtype=x.dtype, ctx=ctx)
-                        for x in arg_bufs]
-            costs = time_f(*args).results
-            if len(costs) > 2:  # remove largest and smallest value to reduce variance
-                costs = list(costs)
-                costs.sort()
-                costs = tuple(costs[1:-1])
-            if ref_output:
-                for expected, real in zip(ref_output, args):
-                    if not np.allclose(expected, real.asnumpy(), rtol=1e-4):
-                        logger.warning("Wrong Answer!")
-                        errno = MeasureErrorNo.WRONG_ANSWER
-        except TVMError as exc:
-            msg = str(exc)
-            if "Stack trace returned" in msg:
-                msg = msg[:msg.index("Stack trace returned")]
-            costs = (RuntimeError(msg),)
-            errno = MeasureErrorNo.RUNTIME_DEVICE
-        tstamp = time.time()
-        res_pack.append(MeasureResult(costs, errno, tstamp - tic, tstamp))
-    return res_pack
-
-
-def default_build_func(inp, tmp_dir=None, **kwargs):
-    """Build function module. Exception will be raised when any error occurs
+            # create empty arrays on the remote device and copy them once.
+            # This can avoid some memory issues that make the measurment results unreliable.
+            args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info]
+            args = [nd.array(x, ctx=ctx) for x in args]
+            ctx.sync()
+
+        costs = time_f(*args).results
+
+        # clean up remote files
+        remote.remove(build_result.filename)
+        remote.remove(os.path.splitext(build_result.filename)[0] + '.so')
+        remote.remove('')
+
+        if len(costs) > 2:  # remove largest and smallest value to reduce variance
+            costs = list(costs)
+            costs.sort()
+            costs = tuple(costs[1:-1])
+
+        # check correctness of output
+        if ref_output:
+            for expected, real in zip(ref_output, args):
+                if not np.allclose(expected, real.asnumpy(), rtol=1e-4):
+                    logger.warning("Wrong Answer!")
+                    errno = MeasureErrorNo.WRONG_ANSWER
+    except TVMError as exc:
+        msg = str(exc)
+        if "Stack trace returned" in msg:
+            msg = msg[:msg.index("Stack trace returned")]
+        if "CUDA Source" in msg:
+            msg = msg[:msg.index("CUDA Source")]
+        costs = (RuntimeError(msg[:1024]),)
+        errno = MeasureErrorNo.RUNTIME_DEVICE
+    tstamp = time.time()
+    time.sleep(cooldown_interval)
+    return MeasureResult(costs, errno, tstamp - tic + build_result.time_cost, tstamp)
+
+
+def request_remote(device_key, host=None, port=None, priority=1, timeout=60):
+    """Request a remote session
 
     Parameters
     ----------
-    inp: MeasureInput
-       The input of this measurement
-    tmp_dir: tvm.contrib.util.TempDirectory, optional
-       The temporary directory for exporting built binary library.
-       If is not None (in RPC mode), the library in this directory will be uploaded to
-       remote devices.
-    kwargs: Dict, optional
-        Other extra arguments
+    device_key: string
+        The device key of registered device in tracker
+    host: host, optional
+        The host address of rpc tracker.
+        If is none, will use environment variable "TVM_TRACKER_HOST"
+    port: int, optional
+        The port of rpc tracker.
+        If is none, will use environment variable "TVM_TRACKER_PORT"
+    priority: int, optional
+        The priority of this request, larger is more prior
+    timeout: float, optional
+        The timeout of this session (units: second)
 
     Returns
-    -------
-    func: Function
-        TVM built function. Typically this is the return value of tvm.build.
-    args: Array of Buffer or Tensor
-        The argument list for the function. Typically this is the second argument of tvm.build.
-    filename: str
-        The filename of the output build library
+    ------
+    session: RPCSession
     """
-    # build function
-    with inp.target:
-        s, args = inp.task.instantiate(inp.config)
+    # connect to the tracker
+    host = host or os.environ['TVM_TRACKER_HOST']
+    port = port or int(os.environ['TVM_TRACKER_PORT'])
 
-        # check invalidity of template and code hash consistency
-        if not inp.config.valid():
-            raise InstantiationError(inp.config.errors)
-        code_hash = getattr(s, 'code_hash', None)
-        if inp.config.code_hash != code_hash:
-            raise HashMismatchError('got {0:s}, expected {1:s}'
-                                    .format(str(inp.config.code_hash), str(code_hash)))
-
-        opts = {}
-        if "check_gpu" in kwargs:  # Add verify pass to filter out invalid configs in advance.
-            opts["add_lower_pass"] = [(2, gpu_verify_pass(**kwargs['check_gpu']))]
-        if 'cuda_arch' in kwargs:
-            set_cuda_target_arch(kwargs['cuda_arch'])
+    tracker = _rpc.connect_tracker(host, port)
+    remote = tracker.request(device_key, priority=priority,
+                             session_timeout=timeout)
+    return remote
 
-        with build_config(**opts):
-            func = build(s, args, target_host=inp.task.target_host)
 
-    # export library to temp directory
-    if tmp_dir:
-        if kwargs.get('use_ndk', False):  # for Android NDK
-            filename = "tmp_func_%0x.so" % getrandbits(64)
-            func.export_library(tmp_dir.relpath(filename), ndk.create_shared)
-        else:
-            filename = "tmp_func_%0x.tar" % getrandbits(64)
-            func.export_library(tmp_dir.relpath(filename))
-    else:
-        filename = None
-
-    return func, args, filename
-
-
-def add_gpu_target_info(target, device_key, rpc_tracker_addr, kwargs):
-    """Add device info for gpu target.
-    The info will be used to check the validity of generated code."""
-    remote = request_remote(device_key, rpc_tracker_addr)
-    ctx = remote.context(str(target), 0)
-    max_dims = ctx.max_thread_dimensions
-    kwargs['check_gpu'] = {
-        'max_shared_memory_per_block': ctx.max_shared_memory_per_block,
-        'max_threads_per_block': ctx.max_threads_per_block,
-        'max_thread_x': max_dims[0],
-        'max_thread_y': max_dims[1],
-        'max_thread_z': max_dims[2],
-    }
-
-    if 'cuda' in target.keys:
-        kwargs["cuda_arch"] = "sm_" + "".join(ctx.compute_version.split('.'))
+def check_remote(target, device_key, host=None, port=None, priority=100, timeout=10):
+    """
+    Check the availability of a remote device
 
-def set_cuda_target_arch(arch):
-    """set target architecture of nvcc compiler"""
-    AutotvmGlobalScope.current.cuda_target_arch = arch
+    Parameters
+    ----------
+    target: Target
+        The wanted compilation target
+    device_key: string
+        device key of registered device in tracker
+    host: host, optional
+        The host address of rpc tracker.
+        If is none, will use environment variable "TVM_TRACKER_HOST"
+    port: int, optional
+        The port address of rpc tracker.
+        If is none, will use environment variable "TVM_TRACKER_PORT"
+    priority: int, optional
+        The priority of this request, larger is more prior
+    timeout: float, optional
+        The timeout of this check (units: seconds).
+
+    Returns
+    -------
+    available: bool
+        True if can find available device
+    """
+    def _check():
+        remote = request_remote(device_key, host, port, priority)
+        ctx = remote.context(str(target))
+        while not ctx.exist:  # wait until we get an available device
+            pass
+    t = threading.Thread(target=_check,)
+    t.start()
+    t.join(timeout)
+    return not t.is_alive()
 
 
 @register_func
@@ -488,6 +573,17 @@ def tvm_callback_cuda_compile(code):
     return ptx
 
 
+def set_cuda_target_arch(arch):
+    """set target architecture of nvcc compiler
+
+    Parameters
+    ----------
+    arch: str
+        The argument of nvcc -arch. (e.g. "sm_51", "sm_62")
+    """
+    AutotvmGlobalScope.current.cuda_target_arch = arch
+
+
 def gpu_verify_pass(**kwargs):
     """Verify the validity of a gpu kernel.
     This pass will check memory usage and number of threads per block.
diff --git a/python/tvm/autotvm/record.py b/python/tvm/autotvm/record.py
index 77d9b6190a78..5adfae465ce3 100644
--- a/python/tvm/autotvm/record.py
+++ b/python/tvm/autotvm/record.py
@@ -252,13 +252,13 @@ def pick_best(in_file, out_file):
 This record executable module has three modes.
 
 * Print log file in readable format
-e.g. python -m autotvm.record --mode read --i collect_conv.log --begin 0 --end 5 --ir --code
+e.g. python -m tvm.autotvm.record --mode read --i collect_conv.log --begin 0 --end 5 --ir --code
 
 * Extract history best from a large log file
-e.g. python -m autotvm.record --mode pick --i collect.log
+e.g. python -m tvm.autotvm.record --mode pick --i collect.log
 
 * Split a log file into separate files, each of which contains only a single wkl
-e.g. python -m autotvm.record --mode split --i collect.log
+e.g. python -m tvm.autotvm.record --mode split --i collect.log
 """
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
@@ -271,7 +271,7 @@ def pick_best(in_file, out_file):
     parser.add_argument("--code", action='store_true')
 
     args = parser.parse_args()
-    logger.basicConfig(level=logger.INFO)
+    logging.basicConfig(level=logging.INFO)
 
     if args.mode == 'pick':
         args.o = args.o or args.i + ".best.log"
diff --git a/python/tvm/autotvm/task/__init__.py b/python/tvm/autotvm/task/__init__.py
index 0d43f92656cd..f6ea07c272d0 100644
--- a/python/tvm/autotvm/task/__init__.py
+++ b/python/tvm/autotvm/task/__init__.py
@@ -9,7 +9,9 @@
 from .task import Task, create, register, template, get_config, args_to_workload
 from .space import ConfigSpace, ConfigEntity
 from .code_hash import attach_code_hash, attach_code_hash_to_arg
-from .dispatcher import DispatchContext, ApplyConfig, ApplyHistoryBest, dispatcher
+from .dispatcher import dispatcher, DispatchContext, ApplyConfig, ApplyHistoryBest, \
+    FallbackContext, clear_fallback_cache, ApplyGraphBest
 
 from .topi_integration import register_topi_compute, register_topi_schedule
-from .nnvm_integration import extract_from_graph
+from .nnvm_integration import extract_from_graph, extract_from_multiple_graph
+from .relay_integration import extract_from_program, extract_from_multiple_program
diff --git a/python/tvm/autotvm/task/dispatcher.py b/python/tvm/autotvm/task/dispatcher.py
index 93f6d584abfa..c5464f94f285 100644
--- a/python/tvm/autotvm/task/dispatcher.py
+++ b/python/tvm/autotvm/task/dispatcher.py
@@ -16,12 +16,12 @@
 
 import logging
 
-from decorator import decorate
 import numpy as np
+from decorator import decorate
 
 from tvm import target as _target
 
-from .space import ConfigSpace
+from .space import FallbackConfigEntity
 
 logger = logging.getLogger('autotvm')
 
@@ -34,9 +34,83 @@ class DispatchContext(object):
     """
     current = None
 
+    def __init__(self):
+        self._old_ctx = DispatchContext.current
+
     def query(self, target, workload):
         """
-        Query the context to get the specific implementation.
+        Query the context to get the specific config for a template.
+        If cannot find the result inside this context, this function will query it
+        from the upper contexts.
+
+        Parameters
+        ----------
+        target: Target
+            The current target
+        workload : Workload
+            The current workload.
+
+        Returns
+        -------
+        cfg : ConfigSpace
+            The specific configuration.
+        """
+        ret = self._query_inside(target, workload)
+        if ret is None:
+            ret = self._old_ctx.query(target, workload)
+        return ret
+
+    def update(self, target, workload, cfg):
+        """
+        Update context with a specific config.
+
+        Parameters
+        ----------
+        target: Target
+            The current target
+        workload : Workload
+            The current workload.
+        cfg : ConfigSpace
+            The specific configuration.
+
+        Note
+        ----
+        This interface is for cases when TVM decides to replace an operator in the graph.
+        For example, `AlterOpLayout` pass (enables when `opt_level = 3`) replaces `NCHW`
+        convolution with `NCHW[x]c` implementation on x86 CPUs.
+        Thus in TOPI, we first query schedule using original `NCHW` workload,
+        then update the dispatcher with the new `NCHW[x]c` workload.
+        So that later on, `NCHW[x]c` convolution can get schedule from the dispatcher using
+        its own workload directly.
+
+        .. code-block:: python
+
+            @conv2d_alter_layout.register("cpu")
+            def _alter_conv2d_layout(attrs, inputs, tinfo):
+                workload = get_conv2d_workload(...)
+                dispatch_ctx = autotvm.task.DispatchContext.current
+                target = tvm.target.current_target()
+                config = dispatch_ctx.query(target, workload)
+
+                # Get conv2d_NCHWc workload from config
+                # new_workload = ...
+                # new_inputs = ...
+                # new_attrs = ...
+
+                # Store altered operator's config
+                dispatch_ctx.update(target, new_workload, config)
+                return sym.contrib.conv2d_NCHWc(*new_inputs, **new_attrs)
+
+        We directly store `config` back because `conv2d_NCHW` and `conv2d_NCHWc`
+        share the same schedule parameters.
+        One can construct a new `ConfigEntity` if this is not the case.
+        """
+        raise NotImplementedError()
+
+    def _query_inside(self, target, workload):
+        """
+        Query the context to get the specific config for a template.
+        This function only query config inside this context.
 
         Parameters
         ----------
@@ -117,17 +191,17 @@ def _do_reg(myf):
     def dispatch_func(func, *args, **kwargs):
         """The wrapped dispatch function"""
         tgt = _target.current_target()
-        context = DispatchContext.current
-        if context is None:
-            raise RuntimeError("DispatchContext is not initialized")
         workload = func(*args, **kwargs)
-        cfg = context.query(tgt, workload)
-        if cfg.template_key:
-            return dispatch_dict[cfg.template_key](cfg, *args, **kwargs)
-        else:
-            assert dispatch_dict, "No func registered for this dispatcher"
+        cfg = DispatchContext.current.query(tgt, workload)
+        if cfg.is_fallback and not cfg.template_key:
+            # first try 'direct' template
+            if 'direct' in dispatch_dict:
+                return dispatch_dict['direct'](cfg, *args, **kwargs)
+            # otherwise pick a random template
             for v in dispatch_dict.values():
                 return v(cfg, *args, **kwargs)
+        else:
+            return dispatch_dict[cfg.template_key](cfg, *args, **kwargs)
 
     fdecorate = decorate(fworkload, dispatch_func)
     fdecorate.register = register
@@ -135,7 +209,7 @@ def dispatch_func(func, *args, **kwargs):
 
 
 class ApplyConfig(DispatchContext):
-    """Apply a specific config entity during query.
+    """Apply a deterministic config entity for all queries.
 
     Parameters
     ----------
@@ -147,11 +221,16 @@ def __init__(self, config):
         self._config = config
         self.workload = None
 
-    def query(self, target, workload):
+    def _query_inside(self, target, workload):
         """Override query"""
         self.workload = workload
         return self._config
 
+    def update(self, target, workload, cfg):
+        """Override update"""
+        self.workload = workload
+        self._config = cfg
+
 
 class ApplyHistoryBest(DispatchContext):
     """
@@ -164,20 +243,13 @@ class ApplyHistoryBest(DispatchContext):
         If is str, then it should be the filename of a records log file.
                    Each row of this file is an encoded record pair.
         Otherwise, it is an iterator.
-    default: ConfigEntity, optional
-        The default config to return when no history records
-    allow_fallback: bool
-        Whether allow to use a fallback configuration if cannot find
-        tuned result.
     """
-    def __init__(self, records, default=None, allow_fallback=False):
+    def __init__(self, records):
         super(ApplyHistoryBest, self).__init__()
 
         self.best_by_targetkey = {}
         self.best_by_model = {}
-        self._default = default
-        self._allow_fallback = allow_fallback
-        self.fallback = {}
+        self._best_user_defined = {}
 
         if records:
             self.load(records)
@@ -220,54 +292,176 @@ def load(self, records):
                         best_by_targetkey[key] = (inp, res)
 
             # use model as key to build best map
-            for opt in inp.target.options:
-                if opt.startswith("-model"):
-                    model = opt[7:]
-                    key = (model, inp.task.workload)
-                    if key not in best_by_model:
-                        best_by_model[key] = (inp, res)
-                    else:
-                        _, other_res = best_by_model[key]
-                        if np.mean(other_res.costs) > np.mean(res.costs):
-                            best_by_model[key] = (inp, res)
-                    break
+            key = (inp.target.model, inp.task.workload)
+            if key not in best_by_model:
+                best_by_model[key] = (inp, res)
+            else:
+                _, other_res = best_by_model[key]
+                if np.mean(other_res.costs) > np.mean(res.costs):
+                    best_by_model[key] = (inp, res)
 
         logger.debug("Finish loading %d records", counter)
 
-    def query(self, target, workload):
+    def _query_inside(self, target, workload):
         if target is None:
             raise RuntimeError("Need a target context to find the history best. "
                                "Hint: If your target is llvm, use `with tvm.target.create('llvm'):`"
                                " above the dispatcher call. So does other target. ")
 
         # first try matching by model
-        for opt in target.options:
-            if opt.startswith("-model"):
-                model = opt[7:]
-                key = (model, workload)
-                if key in self.best_by_model:
-                    return self.best_by_model[key][0].config
+        key = (target.model, workload)
+        if key in self._best_user_defined:
+            return self._best_user_defined[key]
+        if key in self.best_by_model:
+            return self.best_by_model[key][0].config
 
         # then try matching by target key
         for k in target.keys:
             key = (k, workload)
+            if key in self._best_user_defined:
+                return self._best_user_defined[key]
             if key in self.best_by_targetkey:
                 return self.best_by_targetkey[key][0].config
 
-        if self._default:
-            return self._default
-
-        if self._allow_fallback:
-            key = (target, workload)
-            if key in self.fallback:
-                return self.fallback[key]
-            logger.warning(
-                "Cannot find config for target=%s, workload=%s. A fallback configuration "
-                "is used, which may bring great performance regression.", target, workload)
-            cfg = ConfigSpace()
-            self.fallback[key] = cfg
+        return None
+
+    def update(self, target, workload, cfg):
+        model = target.model
+        key = (model, workload)
+        self._best_user_defined[key] = cfg
+
+        for k in target.keys:
+            key = (k, workload)
+            self._best_user_defined[key] = cfg
+
+
+class FallbackContext(DispatchContext):
+    """
+    A fallback dispatch context.
+
+    Any tunable template can be called under this context.
+    This is the root context.
+    """
+
+    def __init__(self):
+        super(FallbackContext, self).__init__()
+        self.memory = {}
+        self.silent = False
+
+        # a set to prevent print duplicated message
+        self.messages = set()
+
+    def _query_inside(self, target, workload):
+        key = (str(target), workload)
+        if key in self.memory:
+            return self.memory[key]
+
+        if not self.silent:
+            msg = "Cannot find config for target=%s, workload=%s. A fallback configuration "\
+                  "is used, which may bring great performance regression." % (target, workload)
+            if msg not in self.messages:
+                self.messages.add(msg)
+                logger.warning(msg)
+        cfg = FallbackConfigEntity()
+
+        # cache this config
+        self.memory[key] = cfg
+        return cfg
+
+    def clear_cache(self, target, workload):
+        """Clear fallback cache. Pass the same argument as _query_inside to this function
+        to clean the cache.
+
+        Parameters
+        ----------
+        target: Target
+            The current target
+        workload : Workload
+            The current workload.
+        """
+        key = (str(target), workload)
+        if key in self.memory:
+            del self.memory[key]
+
+    def update(self, target, workload, cfg):
+        key = (str(target), workload)
+        self.memory[key] = cfg
+
+DispatchContext.current = FallbackContext()
+
+def clear_fallback_cache(target, workload):
+    """Clear fallback cache. Pass the same argument as _query_inside to this function
+    to clean the cache.
+
+    Parameters
+    ----------
+    target: Target
+        The current target
+    workload : Workload
+        The current workload.
+
+    Note
+    ----
+    This is used in alter_op_layout to clear the bad cache created before call topi compute function
+    """
+    context = DispatchContext.current
+    while not isinstance(context, FallbackContext):
+        context = context._old_ctx
+    context.clear_cache(target, workload)
+
+class ApplyGraphBest(DispatchContext):
+    """Load the graph level tuning optimal schedules.
+
+    The input records should be in the ascending order of
+    node index for target operator. Usually this can be obtained
+    with graph tuner.
+
+    This context maintains an internal counter to indicate the current
+    node index.
+    """
+    def __init__(self, records):
+        """
+        Parameters
+        ----------
+        records : str or iterator of (MeasureInput, MeasureResult)
+            Collection of tuning records.
+            If is str, then it should be the filename of a records log file.
+                   Each row of this file is an encoded record pair.
+            Otherwise, it is an iterator.
+        """
+        from ..record import load_from_file
+
+        super(ApplyGraphBest, self).__init__()
+        if isinstance(records, str):
+            records = load_from_file(records)
+        self._records = list(records)
+        self._counter = 0
+        self._global_cfg_dict = {}
+
+    def _query_inside(self, target, workload):
+        """
+        Query the context to get config from records.
+
+        Parameters
+        ----------
+        target : Target
+            The current target
+        workload : Workload
+            The current workload.
+
+        Returns
+        -------
+        cfg : ConfigSpace
+            The specific configuration.
+        """
+        if self._counter < len(self._records):
+            cfg = self._records[self._counter][0].config
+            self._counter += 1
+            self.update(target, workload, cfg)
             return cfg
+        key = (str(target), workload)
+        return self._global_cfg_dict[key]
 
-        raise RuntimeError(
-            "Cannot find config for target=%s, workload=%s. You need to do tuning "
-            "for this workload to get the config." % (target, workload))
+    def update(self, target, workload, cfg):
+        key = (str(target), workload)
+        self._global_cfg_dict[key] = cfg
diff --git a/python/tvm/autotvm/task/nnvm_integration.py b/python/tvm/autotvm/task/nnvm_integration.py
index 338b46784a75..cd7337586519 100644
--- a/python/tvm/autotvm/task/nnvm_integration.py
+++ b/python/tvm/autotvm/task/nnvm_integration.py
@@ -4,157 +4,106 @@
 
 """
 import warnings
+import logging
 
-from ... import tensor, placeholder, target as _target
 
-from ..util import get_const_tuple
-from .task import create, register
+from ... import target as _target
 
+from .task import create
+from .topi_integration import TaskExtractEnv
 
-def serialize_args(args):
-    """serialize arguments of a topi function to a hashable tuple.
+logger = logging.getLogger('autotvm')
 
-    Parameters
-    ----------
-    args: list of hashable or Tensor
-    """
-    ret = []
-    for t in args:
-        if isinstance(t, tensor.Tensor):
-            ret.append(('TENSOR', get_const_tuple(t.shape), t.dtype))
-        else:
-            ret.append(t)
-    return tuple(ret)
 
+def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None):
+    """ Extract tuning tasks from a nnvm graph.
 
-def deserialize_args(args):
-    """The inverse function of :code:`serialize_args`.
+    This function collects tuning tasks by building the graph
+    with a "tracing" target and tracing all the calls to topi.
 
     Parameters
     ----------
-    args: list of hashable or Tensor
+    graph : Graph
+        The graph to tune
+    shape : dict of str to tuple
+        The input shape to the graph
+    dtype : str or dict of str to str
+        The input types to the graph
+    target: tvm.target.Target
+        The compilation target
+    symbols : Array of nnvm.symbol
+        Array of nnvm symbols want to be tuned
+    target_host: tvm.target.Target
+        The host compilation target
+
+    Returns
+    -------
+    task: Array of autotvm.task.Task
+        collected tasks
     """
-    ret = []
-    for t in args:
-        if isinstance(t, tuple) and t[0] == 'TENSOR':
-            ret.append(placeholder(shape=t[1], dtype=t[2]))
+    import nnvm.compiler
+    import nnvm
+    import topi
+
+    env = TaskExtractEnv.get()
+
+    #NOTE: To add more symbols, you only need to change the following lists
+    #nnvm symbol -> topi compute
+    SYMBOL2TOPI = {
+        nnvm.sym.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw,
+                          topi.nn.group_conv2d_nchw],
+        nnvm.sym.conv2d_transpose: [topi.nn.conv2d_transpose_nchw],
+        nnvm.sym.dense: [topi.nn.dense],
+    }
+
+    topi_funcs = []
+    for sym_name in symbols:
+        if sym_name in SYMBOL2TOPI:
+            topi_funcs.extend(SYMBOL2TOPI[sym_name])
         else:
-            ret.append(t)
-    return ret
-
-
-# Task extractor for nnvm graph
-class TaskExtractEnv:
-    """Global environment for extracting tuning tasks from nnvm graph"""
-    current = None
-
-    def __init__(self):
-        import topi
-        import nnvm
-
-        self.symbol2topi = {
-            nnvm.sym.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw],
-            nnvm.sym.conv2d_transpose: [topi.nn.conv2d_transpose],
-        }
-
-        self.topi_to_task = {
-            topi.nn.conv2d: "topi_nn_conv2d",
-            topi.nn.depthwise_conv2d_nchw: "topi_nn_depthwise_conv2d_nchw",
-            topi.nn.conv2d_transpose_nchw: "topi_nn_conv2d_transpose_nchw",
-        }
-
-        self._register_dummy()
-        self._register_topi_task()
-        self.task_collection = []
-
-    def _register_dummy(self):
-        """Register dummy function to track the topi function call"""
-        for func in self.topi_to_task:
-            def _local_scope(local_func):
-                """build a scope to holds the function"""
-                @local_func.register("dummy", )
-                def _dummy_func(*args, **kwargs):
-                    assert not kwargs, "Do not support extracting tuning tasks when" \
-                                       "kwargs is used in TOPI function call." \
-                                       "Please modify it to use only positional args."
-
-                    if (self.topi_to_task[local_func], serialize_args(args)) \
-                            not in self.task_collection:
-                        self.task_collection.append((self.topi_to_task[local_func],
-                                                     serialize_args(args)))
-                    with _target.create("opencl"):
-                        return local_func(*args)
-
-            _local_scope(func)
-
-    def _register_topi_task(self):
-        """register tuning wrapper for topi function"""
-        import topi
-
-        # Tuning wrapper for topi functions
-        @register("topi_nn_conv2d")
-        def _topi_nn_conv2d(*args, **kwargs):
-            assert not kwargs, "Do not support kwargs in template function call"
-            args = deserialize_args(args)
-            A, W = args[:2]
-            layout = args[-2]
-            assert layout == 'NCHW', "only support NCHW currently"
-            C = topi.nn.conv2d(*args, **kwargs)
-            s = topi.generic.schedule_conv2d_nchw([C])
-            return s, [A, W, C]
-
-        @register("topi_nn_depthwise_conv2d_nchw")
-        def _topi_nn_depthwise_conv2d_nchw(*args, **kwargs):
-            assert not kwargs, "Do not support kwargs in template function call"
-            args = deserialize_args(args)
-            A, W = args[:2]
-            C = topi.nn.depthwise_conv2d_nchw(*args, **kwargs)
-            s = topi.generic.schedule_depthwise_conv2d_nchw([C])
-            return s, [A, W, C]
-
-        @register("topi_nn_conv2d_transpose_nchw")
-        def _topi_nn_conv2d_transpose_nchw(*args, **kwargs):
-            assert not kwargs, "Do not support kwargs in template function call"
-            args = deserialize_args(args)
-            A, W = args[:2]
-            C = topi.nn.conv2d_transpose_nchw(*args, **kwargs)
-            s = topi.generic.schedule_conv2d_transpose_nchw([C])
-            return s, [A, W, C]
-
-    def reset(self):
-        """Reset task collections"""
-        self.task_collection = []
-
-    def get_tasks(self):
-        """Get collected tasks"""
-        return self.task_collection
-
-    @staticmethod
-    def get():
-        """Get the single instance of TaskExtractEnv"""
-        if not TaskExtractEnv.current:
-            TaskExtractEnv.current = TaskExtractEnv()
-        return TaskExtractEnv.current
+            warnings.warn("Symbol %s is not tunable, ignored" % sym_name)
 
+    # run compiler to collect all TOPI calls during compilation
+    env.reset(topi_funcs)
+
+    # disable logger temporarily
+    old_state = logger.disabled
+    logger.disabled = True
+
+    # use a "tracing" target to do a fake compile for collecting topi calls
+    tracing_target = _target.create("llvm -device=tracing")
+    nnvm.compiler.engine.clear_cache()
+    nnvm.compiler.build(graph, target=tracing_target, shape=shape, dtype=dtype)
+
+    logger.disabled = old_state
+
+    # create tasks for target
+    tasks = []
+    for task_name, args in env.get_tasks():
+        tasks.append(create(task_name, args,
+                            target=target, target_host=target_host,
+                            template_key='direct'))
+
+    return tasks
 
-def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None):
-    """ Extract tuning tasks from a nnvm graph.
 
-    This function collects tunning tasks by building the graph
-    with a "dummy" target and tracing all the calls to topi.
+def extract_from_multiple_graph(graphs, shapes, dtypes, target, symbols, target_host=None):
+    """ Extract tuning tasks from multiple nnvm graphs.
+
+    This function is the multiple graph version of extract_from_graph
 
     Parameters
     ----------
-    graph : Graph
-        The graph to tune
-    shape : dict of str to tuple, optional
+    graphs : List of Graph
+        The list of graphs to tune
+    shapes : List of dict of str to tuple
         The input shape to the graph
-    dtype : str or dict of str to str
+    dtypes : List of str or dict of str to str
         The input types to the graph
     target: tvm.target.Target
         The compilation target
     symbols : Array of nnvm.symbol
-        Array of nnvm symbols
+        Array of nnvm symbols want to be tuned
     target_host: tvm.target.Target
         The host compilation target
 
@@ -164,21 +113,44 @@ def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None):
         collected tasks
     """
     import nnvm.compiler
+    import nnvm
+    import topi
 
     env = TaskExtractEnv.get()
 
+    #NOTE: To add more symbols, you only need to change the following lists
+    #nnvm symbol -> topi compute
+    SYMBOL2TOPI = {
+        nnvm.sym.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw,
+                          topi.nn.group_conv2d_nchw],
+        nnvm.sym.conv2d_transpose: [topi.nn.conv2d_transpose_nchw],
+        nnvm.sym.dense: [topi.nn.dense],
+    }
+
     topi_funcs = []
     for sym_name in symbols:
-        if sym_name in env.symbol2topi:
-            topi_funcs.extend(env.symbol2topi[sym_name])
+        if sym_name in SYMBOL2TOPI:
+            topi_funcs.extend(SYMBOL2TOPI[sym_name])
         else:
             warnings.warn("Symbol %s is not tunable, ignored" % sym_name)
 
     # run compiler to collect all TOPI calls during compilation
-    env.reset()
-    dummy_target = _target.create("opencl -device=dummy")
-    nnvm.compiler.build(graph, target=dummy_target, shape=shape, dtype=dtype)
+    env.reset(topi_funcs)
+
+    # disable logger temporarily
+    old_state = logger.disabled
+    logger.disabled = True
+
+    # use a "tracing" target to do a fake compile for collecting topi calls
+    tracing_target = _target.create("llvm -device=tracing")
+
+    nnvm.compiler.engine.clear_cache()
+    for graph, shape, dtype in zip(graphs, shapes, dtypes):
+        nnvm.compiler.build(graph, target=tracing_target, shape=shape, dtype=dtype)
+
+    logger.disabled = old_state
 
+    # create tasks for target
     tasks = []
     for task_name, args in env.get_tasks():
         tasks.append(create(task_name, args,
diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py
new file mode 100644
index 000000000000..21acf257f9ac
--- /dev/null
+++ b/python/tvm/autotvm/task/relay_integration.py
@@ -0,0 +1,200 @@
+# pylint: disable=unused-variable,invalid-name
+"""
+Decorator and utilities for the integration with TOPI and Relay
+99.9% copy-paste of implementation by @MerryMercy
+
+"""
+import threading
+import warnings
+import logging
+
+
+from ... import tensor, placeholder, target as _target
+
+from .task import create
+from .topi_integration import TaskExtractEnv
+
+logger = logging.getLogger('autotvm')
+
+
+def serialize_args(args):
+    """serialize arguments of a topi function to a hashable tuple.
+
+    Parameters
+    ----------
+    args: list of hashable or Tensor
+    """
+    ret = []
+    for t in args:
+        if isinstance(t, tensor.Tensor):
+            ret.append(('TENSOR', get_const_tuple(t.shape), t.dtype))
+        else:
+            ret.append(t)
+    return tuple(ret)
+
+
+def deserialize_args(args):
+    """The inverse function of :code:`serialize_args`.
+
+    Parameters
+    ----------
+    args: list of hashable or Tensor
+    """
+    ret = []
+    for t in args:
+        if isinstance(t, tuple) and t[0] == 'TENSOR':
+            ret.append(placeholder(shape=t[1], dtype=t[2]))
+        else:
+            ret.append(t)
+    return ret
+
+
+def extract_from_program(func, params, ops, target, target_host=None):
+    """ Extract tuning tasks from a relay program.
+
+    This function collects tuning tasks by building the program
+    with a "tracing" target and tracing all the calls to topi.
+
+    Parameters
+    ----------
+    func: relay.expr.Function
+        The func to tune
+    params: dict of str to numpy array
+        The associated parameters of the program
+    ops: List of relay op
+        List of relay ops to be tuned
+    dtype: str or dict of str to str
+        The input types to the program
+    target: tvm.target.Target
+        The compilation target
+    target_host: tvm.target.Target
+        The host compilation target
+
+    Returns
+    -------
+    task: Array of autotvm.task.Task
+        collected tasks
+    """
+    env = TaskExtractEnv.get()
+    import tvm.relay.op
+    from tvm import relay
+    import topi
+
+    # NOTE: To add more ops, you only need to change the following lists
+    # relay op -> topi compute
+    OP2TOPI = {
+        tvm.relay.op.nn.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw,
+                                 topi.nn.group_conv2d_nchw],
+        tvm.relay.op.nn.conv2d_transpose: [topi.nn.conv2d_transpose_nchw],
+        tvm.relay.op.nn.dense: [topi.nn.dense],
+    }
+
+    topi_funcs = []
+    for op_name in ops:
+        if op_name in OP2TOPI:
+            topi_funcs.extend(OP2TOPI[op_name])
+        else:
+            warnings.warn("Op %s is not tunable, ignored" % op_name)
+
+    # run compiler to collect all TOPI calls during compilation
+    env.reset(topi_funcs)
+
+    # disable logger temporarily
+    old_state = logger.disabled
+    logger.disabled = True
+
+    # use a "tracing" target to do a fake compile for collecting topi calls
+    tracing_target = _target.create("llvm -device=tracing")
+    relay.backend.compile_engine.get().clear()
+    # wrap build call in thread to avoid multiprocessing problems
+    build_thread = threading.Thread(target=relay.build, args=(func,
+                                                              tracing_target,
+                                                              target_host,
+                                                              params))
+    build_thread.start()
+    build_thread.join()
+    logger.disabled = old_state
+
+    # create tasks for target
+    tasks = []
+    for task_name, args in env.get_tasks():
+        tasks.append(create(task_name, args,
+                            target=target, target_host=target_host,
+                            template_key='direct'))
+
+    return tasks
+
+
+def extract_from_multiple_program(funcs, params, ops, target, target_host=None):
+    """ Extract tuning tasks from multiple relay programs.
+
+    This function is the multiple program version of extract_from_program
+
+    Parameters
+    ----------
+    funcs: List of relay.expr.Function
+        The list of functions to tune
+    params: List of dict of str to numpy array
+        The associated parameters of the programs
+    ops: List of relay op
+        List of relay ops to be tuned
+    target: tvm.target.Target
+        The compilation target
+    target_host: tvm.target.Target
+        The host compilation target
+
+    Returns
+    -------
+    task: Array of autotvm.task.Task
+        collected tasks
+    """
+    env = TaskExtractEnv.get()
+    import tvm.relay.op
+    from tvm import relay
+    import topi
+
+    # NOTE: To add more ops, you only need to change the following lists
+    # relay op -> topi compute
+    OP2TOPI = {
+        tvm.relay.op.nn.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw,
+                                 topi.nn.group_conv2d_nchw],
+        tvm.relay.op.nn.conv2d_transpose: [topi.nn.conv2d_transpose_nchw],
+        tvm.relay.op.nn.dense: [topi.nn.dense],
+    }
+
+    topi_funcs = []
+    for op_name in ops:
+        if op_name in OP2TOPI:
+            topi_funcs.extend(OP2TOPI[op_name])
+        else:
+            warnings.warn("Op %s is not tunable, ignored" % op_name)
+
+    # run compiler to collect all TOPI calls during compilation
+    env.reset(topi_funcs)
+
+    # disable logger temporarily
+    old_state = logger.disabled
+    logger.disabled = True
+
+    # use a "tracing" target to do a fake compile for collecting topi calls
+    tracing_target = _target.create("llvm -device=tracing")
+
+    for func, param in zip(funcs, params):
+        # wrap build call in thread to avoid multiprocessing problems
+        build_thread = threading.Thread(target=relay.build, args=(func,
+                                                                  tracing_target,
+                                                                  target_host,
+                                                                  params))
+        build_thread.start()
+        build_thread.join()
+
+    logger.disabled = old_state
+
+    # create tasks for target
+    tasks = []
+    for task_name, args in env.get_tasks():
+        tasks.append(create(task_name, args,
+                            target=target, target_host=target_host,
+                            template_key='direct'))
+
+    return tasks
diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py
index ea823c6f2760..3fb02c6190cf 100644
--- a/python/tvm/autotvm/task/space.py
+++ b/python/tvm/autotvm/task/space.py
@@ -1,5 +1,5 @@
 # pylint: disable=too-few-public-methods,invalid-name,unused-argument,arguments-differ
-# pylint: disable=consider-using-enumerate
+# pylint: disable=consider-using-enumerate,too-many-lines
 """
 Template configuration space.
 
@@ -423,7 +423,7 @@ def __init__(self, axes, policy, **kwargs):
         elif policy == 'locate_cache':
             self.num_axis = len(axes)
             num_anchor = kwargs["num_anchor"]
-            self.anns = list(itertools.combinations(np.arange(self.num_axis), num_anchor))
+            self.anns = list(itertools.combinations(range(self.num_axis), num_anchor))
             self.entities = [AnnotateEntity(x) for x in self.anns]
         else:  # none, vec, unroll, try_vec, try_unroll, try_vec_unroll, ...
             anns = policy.replace('try', 'none').split('_')
@@ -567,15 +567,16 @@ class ConfigSpace(object):
     """
     def __init__(self):
         # private dict to provide sugar
-        self.space_map = OrderedDict()  # name -> space
+        self.space_map = OrderedDict()    # name -> space
         self._collect = True
         self._length = None
-        self._entity_map = OrderedDict()
+        self._entity_map = OrderedDict()  # name -> entity
         self._constraints = []
         self.errors = []
         self.template_key = None
         self.code_hash = None
         self.flop = 0
+        self.is_fallback = False
 
     @staticmethod
     def axis(var):
@@ -607,6 +608,15 @@ def define_split(self, name, axis, policy='all', **kwargs):
             If is 'candidate', try listed candidate.
         kwargs: dict
             extra arguments for policy
+            see examples below for how to use filter
+
+        Examples
+        --------
+        >>> # use custom candidates
+        >>> cfg.define_split('tile_x', x, policy='candidate', candidate=[[1, 4, 4], [4, 1, 4]])
+
+        >>> # use a filter that only accepts the split scheme whose inner most tile is less then 4
+        >>> cfg.define_split('tile_y', y, policy='all', filter=lambda x: x.size[-1] <= 4)
         """
         axes = [axis]
         return self._add_new_transform(SplitSpace, name, axes, policy, **kwargs)
@@ -889,3 +899,114 @@ def from_json_dict(json_dict):
     def __repr__(self):
         return "%s,%s,%s,%d" % (str(self._entity_map)[12:-1], self.template_key,
                                 self.code_hash, self.index)
+
+
+class FallbackConfigEntity(ConfigSpace):
+    """The config entity created to support fallback"""
+
+    def __init__(self):
+        super(FallbackConfigEntity, self).__init__()
+        self.is_fallback = True
+
+    def fallback_split(self, name, constraints):
+        """Fallback a split knob
+
+        Parameters
+        ----------
+        name: str
+            name of the knob
+        constraints: List of int
+            The maximum tile size for every dimension. Value `-1` means no constraint.
+
+        Examples
+        --------
+        If you use cfg.define_split('tile_0', 128, num_outputs=3),
+        Then cfg.fallback_split('tile_0', [-1, 8, 4]) will give you cfg['tile_0'].size = [4, 8, 4]
+
+        If you use cfg.define_split('tile_0', 49, num_outputs=3),
+        Then cfg.fallback_split('tile_0', [-1, 8, 4]) will give you cfg['tile_0'].size = [7, 7, 1]
+        """
+        space = self.space_map[name]
+        assert isinstance(space, SplitSpace)
+        assert len(constraints) == space.num_outputs
+
+        # '-1' means no constraint
+        constraints = [x if x != -1 else 1e10 for x in constraints]
+
+        entity = self._entity_map[name]
+        now = space.product
+
+        for i in reversed(range(space.num_outputs)):
+            factors = get_factors(now)
+
+            find = len(factors) - 1
+            for j, f in enumerate(factors):
+                if f > constraints[i]:
+                    find = j - 1
+                    break
+
+            if find >= 0:
+                entity.size[i] = factors[find]
+                now //= factors[find]
+            else:
+                raise RuntimeError("Cannot find feasible fallback split entity for node: " + name)
+
+    def fallback_with_reference_log(self, ref_log):
+        """A data driven fallback mechanism.
+        We use tuned parameters from TopHub as reference data.
+        For an unseen shape, we find the most similar tuned one from TopHub and
+        mimic its parameters.
+
+        Parameters
+        ----------
+        ref_log: List of (MeasureInput, MeasureResult)
+            The reference log
+        """
+        knob_names = [x for x in self.space_map.keys() if
+                      isinstance(self.space_map[x], SplitSpace)]
+
+        # find best match config in reference data by matching tiling factors
+        factor_list = []
+        for knob_name in knob_names:
+            factor_list.append(get_factors(self.space_map[knob_name].product))
+
+        best_match_cfg = None
+        best_match_score = 0
+        for inp, _ in ref_log:
+            match_score = 0
+            for i, knob_name in enumerate(knob_names):
+                factors = get_factors(int(np.prod(inp.config[knob_name].size)))
+                match_score += (float(len(set(factor_list[i]).intersection(factors))) /
+                                len(factor_list[i]))
+
+                if match_score > best_match_score:
+                    best_match_score, best_match_cfg = match_score, inp.config
+
+        if best_match_cfg is None:
+            return
+
+        # mimic its tiling strategy
+        for knob_name in knob_names:
+            constraint = list(best_match_cfg[knob_name].size)
+            constraint[0] = -1
+            self.fallback_split(knob_name, constraint)
+
+        # copy other knobs
+        for knob_name in self.space_map.keys():
+            if not isinstance(self.space_map[knob_name], SplitSpace):
+                self._entity_map[knob_name] = best_match_cfg[knob_name]
+
+    def __setitem__(self, name, entity):
+        """set the entity(knob) of by name
+
+        Parameters
+        ----------
+        name: str
+            name of the entity
+        entity: SplitEntity, ReorderEntity, AnnotateEntity, OtherOptionEntity
+            value of the entity
+        """
+        self._entity_map[name] = entity
+
+    def __repr__(self):
+        return "%s,%s,%s" % (str(self._entity_map)[12:-1], self.template_key, self.code_hash)
diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py
index 7a386f1f9e67..22a15143b96e 100644
--- a/python/tvm/autotvm/task/task.py
+++ b/python/tvm/autotvm/task/task.py
@@ -182,7 +182,7 @@ def create(func_name, args, target, target_host=None, template_key=None):
 
     return ret
 
-def args_to_workload(x):
+def args_to_workload(x, topi_compute_func=None):
     """Convert argument list to hashable workload tuple.
     This function will convert list to tuple, tvm node to python value and
     flatten tvm.tensor.Tensor to a tuple
@@ -191,6 +191,8 @@ def args_to_workload(x):
     ----------
     x: primitive hashable types or tensor.Tensor
         The original value
+    topi_compute_func: topi compute function
+        The function name will be added as first element of the workload tuple
 
     Returns
     -------
@@ -198,18 +200,19 @@ def args_to_workload(x):
         The hashable value
     """
     if isinstance(x, tensor.Tensor):
-        return get_const_tuple(x.shape) + (x.dtype, )
+        workload = get_const_tuple(x.shape) + (x.dtype, )
     elif isinstance(x, (tuple, list, container.Array)):
-        return tuple([args_to_workload(a) for a in x])
+        workload = tuple([args_to_workload(a) for a in x])
     elif isinstance(x, (str, int, float, np.int, np.float)):
-        return x
+        workload = x
     elif isinstance(x, (expr.StringImm, expr.IntImm, expr.FloatImm)):
-        return x.value
+        workload = x.value
     elif x is None:
-        return None
+        workload = 0
     else:
         raise RuntimeError('Do not support type "%s" in argument. Consider to use'
                            'primitive types only' % type(x))
+    return (get_func_name(topi_compute_func), ) + workload  if topi_compute_func else workload
 
 def template(func):
     """
@@ -368,7 +371,7 @@ def traverse(ops):
                 pass
             else:
                 raise FlopCalculationError("Only support tvm.compute currently. "
-                                           "Other ops like tvm.scan is not supported")
+                                           "Other ops like tvm.scan/tvm.extern is not supported")
         return ret
 
     try:
diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py
index 012ca4a214e9..412d7ae0e40b 100644
--- a/python/tvm/autotvm/task/topi_integration.py
+++ b/python/tvm/autotvm/task/topi_integration.py
@@ -1,4 +1,4 @@
-# pylint: disable=unused-variable,invalid-name
+# pylint: disable=unused-variable,invalid-name,unused-argument
 """
 Decorators for registering tunable templates to TOPI.
 
@@ -11,17 +11,202 @@
 See tvm/topi/python/topi/arm_cpu/depthwise_conv2d.py for example usage.
 """
 
-from ... import _api_internal, tensor
-
-from ..util import get_func_name
-from .task import args_to_workload, dispatcher
+from ... import _api_internal, tensor, placeholder, create_schedule
 
+from .task import args_to_workload, dispatcher, register
+from ..util import get_const_tuple
 
 # A table that records all registered dispatcher for all targets
 _REGISTED_DISPATHCER = {
 }
 
 
+def serialize_args(args):
+    """serialize arguments of a topi function to a hashable tuple.
+
+    Parameters
+    ----------
+    args: list of hashable or Tensor
+    """
+    ret = []
+    for t in args:
+        if isinstance(t, tensor.Tensor):
+            ret.append(('TENSOR', get_const_tuple(t.shape), t.dtype))
+        else:
+            ret.append(t)
+    return tuple(ret)
+
+
+def deserialize_args(args):
+    """The inverse function of :code:`serialize_args`.
+
+    Parameters
+    ----------
+    args: list of hashable or Tensor
+    """
+    ret = []
+    for t in args:
+        if isinstance(t, tuple) and t[0] == 'TENSOR':
+            ret.append(placeholder(shape=t[1], dtype=t[2]))
+        else:
+            ret.append(t)
+    return ret
+
+
+# Task extractor for nnvm graph, relay program
+class TaskExtractEnv:
+    """Global environment for extracting tuning tasks from nnvm graph"""
+    current = None
+
+    def __init__(self):
+        import topi
+
+        # topi compute -> autotvm task name
+        self.topi_to_task = {
+            topi.nn.conv2d: "topi_nn_conv2d",
+            topi.nn.depthwise_conv2d_nchw: "topi_nn_depthwise_conv2d_nchw",
+            topi.nn.group_conv2d_nchw: "topi_nn_group_conv2d_nchw",
+            topi.nn.conv2d_transpose_nchw: "topi_nn_conv2d_transpose_nchw",
+            topi.nn.dense: "topi_nn_dense",
+        }
+
+        self.topi_to_schedule = {
+            topi.nn.conv2d: [topi.generic.schedule_conv2d_nchw,
+                             topi.generic.schedule_conv2d_nhwc],
+            topi.nn.depthwise_conv2d_nchw: [topi.generic.schedule_depthwise_conv2d_nchw,
+                                            topi.generic.schedule_depthwise_conv2d_nhwc],
+            topi.nn.group_conv2d_nchw: [topi.generic.schedule_group_conv2d_nchw],
+            topi.nn.conv2d_transpose_nchw: [topi.generic.schedule_conv2d_transpose_nchw],
+            topi.nn.dense: [topi.generic.schedule_dense],
+        }
+
+        self._register_tracing()
+        self._register_topi_task()
+        self.task_collection = []
+        self.wanted_topi_funcs = list(self.topi_to_task.keys())
+
+    def _register_tracing(self):
+        """Register tracing function to track the topi function call"""
+        # register topi compute for "tracing" target
+        for topi_compute in self.topi_to_task:
+            def _local_scope(compute_func):
+                """start a scope to hold the local function in for loop"""
+
+                @compute_func.register("tracing", )
+                def _tracing_topi_compute(*args, **kwargs):
+                    assert not kwargs, "Do not support extracting tuning tasks when" \
+                                       "kwargs is used in TOPI function call." \
+                                       "Please modify it to use only positional args."
+
+                    if compute_func in self.wanted_topi_funcs:  # record this call
+                        key = (self.topi_to_task[compute_func], serialize_args(args))
+                        if key not in self.task_collection:
+                            self.task_collection.append(key)
+
+                    return compute_func.fdefault(*args)
+            _local_scope(topi_compute)
+
+        # register topi schedule for "tracing" target
+        for topi_compute in self.topi_to_task:
+            for topi_schedule in self.topi_to_schedule[topi_compute]:
+                def _local_scope_(schedule_func):
+                    """start a scope to hold the local function in for loop"""
+
+                    @schedule_func.register("tracing", )
+                    def _tracing_topi_compute(outs):
+                        outs = [outs] if isinstance(outs, tensor.Tensor) else outs
+                        return create_schedule([x.op for x in outs])
+                _local_scope_(topi_schedule)
+
+    def _register_topi_task(self):
+        """register tuning wrapper for topi function"""
+        import topi
+
+        # Tuning wrapper for topi functions
+        @register("topi_nn_conv2d")
+        def _topi_nn_conv2d(*args, **kwargs):
+            assert not kwargs, "Do not support kwargs in template function call"
+            args = deserialize_args(args)
+            A, W = args[:2]
+            layout = args[-2]
+            assert layout == 'NCHW', "only support NCHW currently"
+            C = topi.nn.conv2d(*args, **kwargs)
+            s = topi.generic.schedule_conv2d_nchw([C])
+            return s, [A, W, C]
+
+        @register("topi_nn_depthwise_conv2d_nchw")
+        def _topi_nn_depthwise_conv2d_nchw(*args, **kwargs):
+            assert not kwargs, "Do not support kwargs in template function call"
+            args = deserialize_args(args)
+            A, W = args[:2]
+            C = topi.nn.depthwise_conv2d_nchw(*args, **kwargs)
+            s = topi.generic.schedule_depthwise_conv2d_nchw([C])
+            return s, [A, W, C]
+
+        @register("topi_nn_group_conv2d_nchw")
+        def _topi_nn_group_conv2d_nchw(*args, **kwargs):
+            assert not kwargs, "Do not support kwargs in template function call"
+            args = deserialize_args(args)
+            A, W = args[:2]
+            C = topi.nn.group_conv2d_nchw(*args, **kwargs)
+            s = topi.generic.schedule_group_conv2d_nchw([C])
+            return s, [A, W, C]
+
+        @register("topi_nn_conv2d_transpose_nchw")
+        def _topi_nn_conv2d_transpose_nchw(*args, **kwargs):
+            assert not kwargs, "Do not support kwargs in template function call"
+            args = deserialize_args(args)
+            A, W = args[:2]
+            C = topi.nn.conv2d_transpose_nchw(*args, **kwargs)
+            s = topi.generic.schedule_conv2d_transpose_nchw([C])
+            return s, [A, W, C]
+
+        @register("topi_nn_dense")
+        def _topi_nn_dense(*args, **kwargs):
+            assert not kwargs, "Do not support kwargs in template function call"
+            args = deserialize_args(args)
+            data, weight, bias = args
+            C = topi.nn.dense(*args, **kwargs)
+            s = topi.generic.schedule_dense([C])
+            if bias is not None:
+                return s, [data, weight, bias, C]
+            return s, [data, weight, C]
+
+    def reset(self, wanted_topi_funcs):
+        """Reset task collections
+
+        Parameters
+        ----------
+        wanted_topi_funcs: List of function
+            The topi function to be extracted
+        """
+        self.task_collection = []
+        self.wanted_topi_funcs = wanted_topi_funcs
+
+    def get_tasks(self):
+        """Get collected tasks
+
+        Returns
+        -------
+        tasks: List of tuple(name, args)
+            A list of tasks extracted from the nnvm graph
+        """
+        return self.task_collection
+
+    @staticmethod
+    def get():
+        """Get the single instance of TaskExtractEnv
+
+        Returns
+        -------
+        env: TaskExtractEnv
+            The single instance of TaskExtractEnv
+        """
+        if not TaskExtractEnv.current:
+            TaskExtractEnv.current = TaskExtractEnv()
+        return TaskExtractEnv.current
+
+
 def register_topi_compute(topi_compute, target_keys, template_keys, func=None):
     """Register a tunable template for a topi compute function.
 
@@ -55,20 +240,18 @@ def register_topi_compute(topi_compute, target_keys, template_keys, func=None):
     --------
     See tvm/topi/python/topi/arm_cpu/depthwise_conv2d.py for example usage.
     """
-    fname = get_func_name(topi_compute)
-
     def _decorator(f):
         targets = [target_keys] if isinstance(target_keys, str) else target_keys
         for target_key in targets:
             if target_key not in _REGISTED_DISPATHCER:
                 _REGISTED_DISPATHCER[target_key] = {}
-            if topi_compute not in _REGISTED_DISPATHCER:
+            if topi_compute not in _REGISTED_DISPATHCER[target_key]:
                 @topi_compute.register(target_key)
                 @dispatcher
                 def config_dispatcher(*args, **kwargs):
                     """override topi call as a config dispatcher"""
                     assert not kwargs, "Do not support kwargs in template function call"
-                    return (fname, ) + args_to_workload(args)
+                    return args_to_workload(args, topi_compute)
                 _REGISTED_DISPATHCER[target_key][topi_compute] = config_dispatcher
 
             config_dispatcher = _REGISTED_DISPATHCER[target_key][topi_compute]
@@ -88,7 +271,7 @@ def template_call(cfg, *args, **kwargs):
                 attrs = {}
                 for k, v in node.op.attrs.items():
                     attrs[k] = v
-                attrs['workload'] = (fname, ) + args_to_workload(args)
+                attrs['workload'] = args_to_workload(args, topi_compute)
                 if isinstance(op, tensor.ComputeOp):
                     op = _api_internal._ComputeOp(
                         op.name, op.tag, attrs, op.axis, op.body)
@@ -153,7 +336,7 @@ def _decorator(f):
             if topi_schedule not in _REGISTED_DISPATHCER[target_key]:
                 @topi_schedule.register(target_key)
                 @dispatcher
-                def config_dispatcher(outs):
+                def config_dispatcher(outs, *args, **kwargs):
                     """override topi call as a workload dispatcher"""
                     def traverse(tensors):
                         """traverse all ops to find attached workload"""
@@ -179,11 +362,11 @@ def traverse(tensors):
             config_dispatcher = _REGISTED_DISPATHCER[target_key][topi_schedule]
 
             @config_dispatcher.register(template_keys)
-            def template_call(cfg, outs):
+            def template_call(cfg, outs, *args, **kwargs):
                 """call the schedule func"""
                 if f == topi_schedule.fdefault:
-                    return f(outs)
-                return f(cfg, outs)
+                    return f(outs, *args, **kwargs)
+                return f(cfg, outs, *args, **kwargs)
 
         return f
 
diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py
index e11bb7a4fc92..1d9684442a51 100644
--- a/python/tvm/autotvm/tophub.py
+++ b/python/tvm/autotvm/tophub.py
@@ -2,62 +2,86 @@
 TopHub: Tensor Operator Hub
 To get the best performance, we typically need auto-tuning for the specific devices.
 TVM releases pre-tuned parameters in TopHub for some common networks and hardware targets.
-TVM will download these parameters for you when you create the target for the first time.
+TVM will download these parameters for you when you call nnvm.compiler.build_module .
 """
 # pylint: disable=invalid-name
 
 import logging
 import os
-import json
 import sys
 
 from .task import ApplyHistoryBest
 from .. import target as _target
-from ..contrib.util import tempdir
 from ..contrib.download import download
+from .record import load_from_file
 
+# root path to store TopHub files
 AUTOTVM_TOPHUB_ROOT_PATH = os.path.join(os.path.expanduser('~'), ".tvm", "tophub")
 
+# the version of each package
+PACKAGE_VERSION = {
+    'arm_cpu': "v0.04",
+    'llvm':    "v0.03",
+
+    'cuda':    "v0.04",
+    'rocm':    "v0.02",
+    'opencl':  "v0.02",
+    'mali':    "v0.04",
+
+    'vta':     "v0.04",
+}
+
 logger = logging.getLogger('autotvm')
 
 def _alias(name):
     """convert alias for some packages"""
     table = {
         'vtacpu': 'vta',
+
+        'metal': 'opencl',
+        'vulkan': 'opencl',
+        'nvptx': 'cuda',
     }
     return table.get(name, name)
 
 
-def context(target, extra_files=None, allow_fallback=False):
+def context(target, extra_files=None):
     """Return the dispatch context with pre-tuned parameters.
-    The corresponding downloaded *.log files under tophub root path will be loaded.
+    This function will load the corresponding *.log files in AUTOTVM_TOPHUB_ROOT_PATH.
+    If cannot find them, it will download them from TopHub github repo.
     Users can also add their own files in argument `extra_files`.
 
     Parameters
     ----------
-    target: Target
+    target: Target or List of Target
         The compilation target
     extra_files: list of str, optional
         Extra log files to load
-    allow_fallback: bool
-        Whether allow to use a fallback configuration if cannot find
-        tuned result.
     """
-    rootpath = AUTOTVM_TOPHUB_ROOT_PATH
-    best_context = ApplyHistoryBest([], allow_fallback=allow_fallback)
+    best_context = ApplyHistoryBest([])
+
+    targets = target if isinstance(target, (list, tuple)) else [target]
 
-    if isinstance(target, str):
-        target = _target.create(target)
+    for tgt in targets:
+        if isinstance(tgt, str):
+            tgt = _target.create(tgt)
 
-    big_target = str(target).split()[0]
-    if os.path.isfile(os.path.join(rootpath, big_target + ".log")):
-        best_context.load(os.path.join(rootpath, big_target + ".log"))
+        possible_names = []
+        for opt in tgt.options:
+            if opt.startswith("-device"):
+                device = _alias(opt[8:])
+                possible_names.append(device)
+        possible_names.append(tgt.target_name)
 
-    for opt in target.options:
-        if opt.startswith("-device"):
-            model = _alias(opt[8:])
-            if os.path.isfile(os.path.join(rootpath, model) + ".log"):
-                best_context.load(os.path.join(rootpath, model) + ".log")
+        all_packages = list(PACKAGE_VERSION.keys())
+        for name in possible_names:
+            name = _alias(name)
+            if name in all_packages:
+                check_backend(name)
+
+                filename = "%s_%s.log" % (name, PACKAGE_VERSION[name])
+                best_context.load(os.path.join(AUTOTVM_TOPHUB_ROOT_PATH, filename))
+                break   # only load one file to avoid some fallback template mismatch problem
 
     if extra_files:
         for filename in extra_files:
@@ -66,12 +90,39 @@ def context(target, extra_files=None, allow_fallback=False):
     return best_context
 
 
-def download_package(backend):
-    """Download pre-tuned parameters of operators for a backend
+def check_backend(backend):
+    """Check whether have pre-tuned parameters of the certain target.
+    If not, will download it.
 
     Parameters
     ----------
     backend: str
+        The name of backend.
+    """
+    backend = _alias(backend)
+    assert backend in PACKAGE_VERSION, 'Cannot find backend "%s" in TopHub' % backend
+
+    version = PACKAGE_VERSION[backend]
+    package_name = "%s_%s.log" % (backend, version)
+    if os.path.isfile(os.path.join(AUTOTVM_TOPHUB_ROOT_PATH, package_name)):
+        return
+
+    if sys.version_info >= (3,):
+        import urllib.request as urllib2
+    else:
+        import urllib2
+    try:
+        download_package(package_name)
+    except urllib2.URLError as e:
+        logging.warning("Failed to download tophub package for %s: %s", backend, e)
+
+
+def download_package(package_name):
+    """Download pre-tuned parameters of operators for a backend
+
+    Parameters
+    ----------
+    package_name: str
         The name of package
     """
     rootpath = AUTOTVM_TOPHUB_ROOT_PATH
@@ -84,54 +135,54 @@ def download_package(backend):
             if not os.path.isdir(path):
                 os.mkdir(path)
 
-    backend = _alias(backend)
-    logger.info("Download pre-tuned parameters for %s", backend)
-    download("https://raw.githubusercontent.com/uwsaml/tvm-distro/master/tophub/%s.log" % backend,
-             os.path.join(rootpath, backend + ".log"), True, verbose=0)
+    logger.info("Download pre-tuned parameters package %s", package_name)
+    download("https://raw.githubusercontent.com/uwsaml/tvm-distro/master/tophub/%s"
+             % package_name, os.path.join(rootpath, package_name), True, verbose=0)
 
 
-def check_package(backend):
-    """Check whether have pre-tuned parameters of the certain target.
-    If not, will download it.
+# global cache for load_reference_log
+REFERENCE_LOG_CACHE = {}
+
+def load_reference_log(backend, model, workload_name, template_key):
+    """ Load reference log from TopHub to support fallback in template.
+    Template will use these reference logs to choose fallback config.
 
     Parameters
     ----------
     backend: str
-        The name of package
+        The backend name
+    model: str
+        The name of the model
+    workload_name: str
+        The name of the workload. (The first item in the workload tuple)
+    template_key: str
+        The template key
     """
-    backend = _alias(backend)
-
-    if os.path.isfile(os.path.join(AUTOTVM_TOPHUB_ROOT_PATH, backend + ".log")):
-        return
 
-    if sys.version_info >= (3,):
-        import urllib.request as urllib2
-    else:
-        import urllib2
-    try:
-        download_package(backend)
-    except urllib2.URLError:
-        logging.warning("Failed to download tophub package for %s", backend)
-
-
-def list_packages():
-    """List all available pre-tuned op parameters for targets
-
-    Returns
-    -------
-    ret: List
-        All available packets
-    """
-    path = tempdir()
-    filename = path.relpath("info.json")
-    logger.info("Download meta info for pre-tuned parameters")
-    download("https://raw.githubusercontent.com/uwsaml/tvm-distro/master/tophub/info.json",
-             filename, True, verbose=0)
-
-    with open(filename, "r") as fin:
-        text = "".join(fin.readlines())
-    info = json.loads(text)
-    keys = list(info.keys())
-    keys.sort()
-
-    return [(k, info[k]) for k in keys]
+    backend = _alias(backend)
+    version = PACKAGE_VERSION[backend]
+    package_name = "%s_%s.log" % (backend, version)
+    filename = os.path.join(AUTOTVM_TOPHUB_ROOT_PATH, package_name)
+
+    global REFERENCE_LOG_CACHE
+    key = (backend, model, workload_name, template_key)
+
+    if key not in REFERENCE_LOG_CACHE:
+        tmp = []
+        if os.path.isfile(os.path.join(AUTOTVM_TOPHUB_ROOT_PATH, package_name)):
+            find = False
+            inp = None
+            for inp, res in load_from_file(filename):
+                if model == inp.target.model:
+                    find = True
+                    break
+            if not find and inp:
+                model = inp.target.model
+
+            for inp, res in load_from_file(filename):
+                if (model == inp.target.model and inp.task.workload[0] == workload_name and
+                        inp.config.template_key == template_key):
+                    tmp.append((inp, res))
+        REFERENCE_LOG_CACHE[key] = tmp
+
+    return REFERENCE_LOG_CACHE[key]
diff --git a/python/tvm/autotvm/tuner/callback.py b/python/tvm/autotvm/tuner/callback.py
index 15d5ac1c9689..6f66871f671c 100644
--- a/python/tvm/autotvm/tuner/callback.py
+++ b/python/tvm/autotvm/tuner/callback.py
@@ -101,11 +101,17 @@ def __init__(self):
             self.total = total
 
         def __del__(self):
-            sys.stdout.write(' Done.\n')
+            if logger.level < logging.DEBUG:  # only print progress bar in non-debug mode
+                sys.stdout.write(' Done.\n')
 
     ctx = _Context()
     tic = time.time()
 
+    if logger.level < logging.DEBUG:  # only print progress bar in non-debug mode
+        sys.stdout.write('\r%s Current/Best: %7.2f/%7.2f GFLOPS | Progress: (%d/%d) '
+                         '| %.2f s' % (prefix, 0, 0, 0, total, time.time() - tic))
+        sys.stdout.flush()
+
     def _callback(tuner, inputs, results):
         ctx.ct += len(inputs)
 
diff --git a/python/tvm/autotvm/tuner/ga_tuner.py b/python/tvm/autotvm/tuner/ga_tuner.py
index 916bd4ee68c6..1afaca73ebb6 100644
--- a/python/tvm/autotvm/tuner/ga_tuner.py
+++ b/python/tvm/autotvm/tuner/ga_tuner.py
@@ -22,7 +22,7 @@ class GATuner(Tuner):
     mutation_prob: float
         probability of mutation of a knob in a gene
     """
-    def __init__(self, task, pop_size, elite_num=3, mutation_prob=0.1):
+    def __init__(self, task, pop_size=100, elite_num=3, mutation_prob=0.1):
         super(GATuner, self).__init__(task)
 
         # algorithm configurations
@@ -47,6 +47,7 @@ def __init__(self, task, pop_size, elite_num=3, mutation_prob=0.1):
 
         # random initialization
         self.pop_size = min(self.pop_size, len(self.space))
+        self.elite_num = min(self.pop_size, self.elite_num)
         for _ in range(self.pop_size):
             tmp_gene = point2knob(np.random.randint(len(self.space)), self.dims)
             while knob2point(tmp_gene, self.dims) in self.visited:
@@ -70,9 +71,9 @@ def update(self, inputs, results):
                 y = inp.task.flop / np.mean(res.costs)
                 self.scores.append(y)
             else:
-                self.scores.append(0)
+                self.scores.append(0.0)
 
-        if len(self.scores) >= len(self.genes):
+        if len(self.scores) >= len(self.genes) and len(self.visited) < len(self.space):
             genes = self.genes + self.elites
             scores = np.array(self.scores[:len(self.genes)] + self.elite_scores)
 
@@ -85,6 +86,7 @@ def update(self, inputs, results):
 
             # cross over
             indices = np.arange(len(genes))
+            scores += 1e-8
             scores /= np.max(scores)
             probs = scores / np.sum(scores)
             tmp_genes = []
diff --git a/python/tvm/autotvm/tuner/model_based_tuner.py b/python/tvm/autotvm/tuner/model_based_tuner.py
index d1c1b16d3181..62fc57f2e869 100644
--- a/python/tvm/autotvm/tuner/model_based_tuner.py
+++ b/python/tvm/autotvm/tuner/model_based_tuner.py
@@ -8,7 +8,7 @@
 import numpy as np
 
 from .tuner import Tuner
-
+from ..env import GLOBAL_SCOPE
 
 class FeatureCache(object):
     """Feature cache manager for cache sharing between different cost models"""
@@ -119,11 +119,9 @@ def load_basemodel(self, base_model):
         """
         raise NotImplementedError()
 
-    def clone_new(self):
-        """Clone a new model with the same parameters.
-        This function will only copy hyperparameters of the tuner, not all the trained model
-
-        This is used for deriving a base model conveniently
+    def spawn_base_model(self):
+        """Clone a base model with the same parameters.
+        The base model is used to fit history data in transfer learning.
 
         Returns
         -------
@@ -221,7 +219,9 @@ def next_batch(self, batch_size):
                     break
                 self.trial_pt += 1
 
-            if self.trial_pt >= len(self.trials):  # trial list is empty, choose randomly
+            if self.trial_pt >= len(self.trials) - int(0.05 * self.plan_size):
+                # if the trial list is empty or
+                # the tuner is doing the last 5% trials (e-greedy), choose randomly
                 index = np.random.randint(len(self.space))
                 while index in self.visited:
                     index = np.random.randint(len(self.space))
@@ -264,18 +264,16 @@ def update(self, inputs, results):
             self.train_ct += 1
 
     def load_history(self, data_set):
-        # filter data, only pick the data with a same task
-        data = []
-        for inp, res in data_set:
-            if inp.task.name == self.task.name and \
-                            inp.config.template_key == self.task.config_space.template_key:
-                data.append((inp, res))
-        if not data:
-            return
+        # set in_tuning as True to make the feature extraction consistent
+        GLOBAL_SCOPE.in_tuning = True
 
         # fit base model
-        base_model = self.cost_model.clone_new()
-        base_model.fit_log(data, self.plan_size)
+        base_model = self.cost_model.spawn_base_model()
+        success = base_model.fit_log(data_set, self.plan_size)
+
+        if not success:
+            GLOBAL_SCOPE.in_tuning = False
+            return
 
         # use base model to select initial points
         if not self.trials:
@@ -285,6 +283,7 @@ def load_history(self, data_set):
             self.trial_pt = 0
 
         self.cost_model.load_basemodel(base_model)
+        GLOBAL_SCOPE.in_tuning = False
 
     def has_next(self):
         return len(self.visited) < len(self.space)
diff --git a/python/tvm/autotvm/tuner/sa_model_optimizer.py b/python/tvm/autotvm/tuner/sa_model_optimizer.py
index 6e1c373c113f..77c7e919593b 100644
--- a/python/tvm/autotvm/tuner/sa_model_optimizer.py
+++ b/python/tvm/autotvm/tuner/sa_model_optimizer.py
@@ -87,7 +87,7 @@ def find_maximums(self, model, num, exclusive):
 
             new_scores = model.predict(new_points)
 
-            ac_prob = np.exp((new_scores - scores) / t)
+            ac_prob = np.exp(np.minimum((new_scores - scores) / (t + 1e-5), 1))
             ac_index = np.random.random(len(ac_prob)) < ac_prob
 
             points[ac_index] = new_points[ac_index]
diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py
index 5d1fc1507e58..abd7ec4fad0b 100644
--- a/python/tvm/autotvm/tuner/tuner.py
+++ b/python/tvm/autotvm/tuner/tuner.py
@@ -31,6 +31,11 @@ def __init__(self, task, **kwargs):
         self.best_measure_pair = None
         self.best_iter = 0
 
+        # time to leave
+        self.ttl = None
+        self.n_trial = None
+        self.early_stopping = None
+
     def has_next(self):
         """Whether has next untried config in the space
 
@@ -76,7 +81,7 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=()):
         measure_option: dict
             The options for how to measure generated code.
             You should use the return value ot autotvm.measure_option for this argument.
-        early_stopping: int
+        early_stopping: int, optional
             Early stop the tuning when not finding better configs in this number of trials
         callbacks: List of callable
             A list of callback functions. The signature of callback function is
@@ -85,8 +90,11 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=()):
             every measurement pair. See autotvm/tuner/callback.py for some examples.
         """
         measure_batch = create_measure_batch(self.task, measure_option)
-        parallel_num = getattr(measure_batch, 'parallel_num', 1)
+        n_parallel = getattr(measure_batch, 'n_parallel', 1)
         early_stopping = early_stopping or 1e9
+        self.n_trial = n_trial
+        self.early_stopping = early_stopping
+
         old_level = logger.level
 
         GLOBAL_SCOPE.in_tuning = True
@@ -95,7 +103,7 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=()):
             if not self.has_next():
                 break
 
-            configs = self.next_batch(min(parallel_num, n_trial - i))
+            configs = self.next_batch(min(n_parallel, n_trial - i))
 
             inputs = [MeasureInput(self.task.target, self.task, config) for config in configs]
             results = measure_batch(inputs)
@@ -121,17 +129,18 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=()):
                              res, config)
 
             i += len(results)
+            self.ttl = min(early_stopping + self.best_iter, n_trial) - i
 
             self.update(inputs, results)
-
             for callback in callbacks:
                 callback(self, inputs, results)
 
-            if i > self.best_iter + early_stopping:
+            if i >= self.best_iter + early_stopping:
                 logger.debug("Early stopped. Best iter: %d.", self.best_iter)
                 break
 
-            if error_ct > 50:
+            if error_ct > 150:
+                logging.basicConfig()
                 logger.warning("Too many errors happen in the tuning. Now is in debug mode")
                 logger.setLevel(logging.DEBUG)
             else:
diff --git a/python/tvm/autotvm/tuner/xgboost_cost_model.py b/python/tvm/autotvm/tuner/xgboost_cost_model.py
index 178e92476752..a725a1eeabed 100644
--- a/python/tvm/autotvm/tuner/xgboost_cost_model.py
+++ b/python/tvm/autotvm/tuner/xgboost_cost_model.py
@@ -31,8 +31,12 @@ class XGBoostCostModel(CostModel):
         If is 'curve', use sampled curve feature (relation feature).
 
         Note on choosing feature type:
-        For single task tuning, 'itervar' and 'knob' is good.
+        For single task tuning, 'itervar' and 'knob' are good.
                                 'itervar' is more accurate but 'knob' is much faster.
+                                There are some constraints on 'itervar', if you meet
+                                problems with feature extraction when using 'itervar',
+                                you can swith to 'knob'.
+
         For cross-shape tuning (e.g. many convolutions with different shapes),
                                'itervar' and 'curve' has better transferability,
                                'knob' is faster.
@@ -46,8 +50,11 @@ class XGBoostCostModel(CostModel):
         The number of threads.
     log_interval: int, optional
         If is not none, the cost model will print training log every `log_interval` iterations.
+    upper_model: XGBoostCostModel, optional
+        The upper model used in transfer learning
     """
-    def __init__(self, task, feature_type, loss_type, num_threads=None, log_interval=25):
+    def __init__(self, task, feature_type, loss_type, num_threads=None, log_interval=25,
+                 upper_model=None):
         super(XGBoostCostModel, self).__init__()
 
         if xgb is None:
@@ -109,35 +116,51 @@ def __init__(self, task, feature_type, loss_type, num_threads=None, log_interval
         else:
             raise RuntimeError("Invalid feature type " + feature_type)
 
-        self.feature_cache = FeatureCache()
+        if upper_model:  # share a same feature cache with upper model
+            self.feature_cache = upper_model.feature_cache
+        else:
+            self.feature_cache = FeatureCache()
+        self.upper_model = upper_model
         self.feature_extra_ct = 0
         self.pool = None
         self.base_model = None
-        self.upper_model = None
 
         self._sample_size = 0
+        self._reset_pool(self.space, self.target, self.task)
 
-        self._reset_pool()
+    def _reset_pool(self, space, target, task):
+        """reset processing pool for feature extraction"""
+
+        if self.upper_model:  # base model will reuse upper model's pool,
+            self.upper_model._reset_pool(space, target, task)
+            return
+
+        self._close_pool()
 
-    def _reset_pool(self):
-        # reset processing pool for feature extraction
-        if self.pool:
-            self.pool.terminate()
-            self.pool.join()
-            del self.pool
         # use global variable to pass common arguments
         global _extract_space, _extract_target, _extract_task
-        _extract_space = self.space
-        _extract_target = self.target
-        _extract_task = self.task
+        _extract_space = space
+        _extract_target = target
+        _extract_task = task
         self.pool = multiprocessing.Pool(self.num_threads)
 
+    def _close_pool(self):
+        if self.pool:
+            self.pool.terminate()
+            self.pool.join()
+            self.pool = None
+
+    def _get_pool(self):
+        if self.upper_model:
+            return self.upper_model._get_pool()
+        return self.pool
+
     def _base_model_discount(self):
-        return 1.0 / (2 ** (self._sample_size / 50.0))
+        return 1.0 / (2 ** (self._sample_size / 64.0))
 
     def fit(self, xs, ys, plan_size):
         tic = time.time()
-        self._reset_pool()
+        self._reset_pool(self.space, self.target, self.task)
 
         x_train = self._get_feature(xs)
         y_train = np.array(ys)
@@ -150,8 +173,12 @@ def fit(self, xs, ys, plan_size):
         self._sample_size = len(x_train)
 
         if self.base_model:
-            dtrain.set_base_margin(self._base_model_discount() *
-                                   self.base_model.predict(xs, output_margin=True))
+            discount = self._base_model_discount()
+            if discount < 0.05:  # discard base model
+                self.base_model.upper_model = None
+                self.base_model = None
+            else:
+                dtrain.set_base_margin(discount * self.base_model.predict(xs, output_margin=True))
 
         self.bst = xgb.train(self.xgb_params, dtrain,
                              num_boost_round=8000,
@@ -172,11 +199,19 @@ def fit(self, xs, ys, plan_size):
 
     def fit_log(self, records, plan_size):
         tic = time.time()
-        self._reset_pool()
 
-        args = list(records)
-        logger.debug("XGB load %d entries from history log file", len(args))
+        # filter data, only pick the data with a same task
+        data = []
+        for inp, res in records:
+            if inp.task.name == self.task.name and \
+                            inp.config.template_key == self.task.config_space.template_key:
+                data.append((inp, res))
+
+        logger.debug("XGB load %d entries from history log file", len(data))
 
+        # extract feature
+        self._reset_pool(self.space, self.target, self.task)
+        pool = self._get_pool()
         if self.fea_type == 'itervar':
             feature_extract_func = _extract_itervar_feature_log
         elif self.fea_type == 'knob':
@@ -185,10 +220,21 @@ def fit_log(self, records, plan_size):
             feature_extract_func = _extract_curve_feature_log
         else:
             raise RuntimeError("Invalid feature type: " + self.fea_type)
-        res = self.pool.map(feature_extract_func, args)
-        xs, ys = zip(*res)
-        xs, ys = np.array(xs), np.array(ys)
+        res = pool.map(feature_extract_func, data)
+
+        # filter out feature with different shapes
+        fea_len = len(self._get_feature([0])[0])
+
+        xs, ys = [], []
+        for x, y in res:
+            if len(x) == fea_len:
+                xs.append(x)
+                ys.append(y)
 
+        if len(xs) < 500:  # no enough samples
+            return False
+
+        xs, ys = np.array(xs), np.array(ys)
         x_train = xs
         y_train = ys
         y_max = np.max(y_train)
@@ -212,6 +258,8 @@ def fit_log(self, records, plan_size):
 
         logger.debug("XGB train: %.2f\tobs: %d", time.time() - tic, len(xs))
 
+        return True
+
     def predict(self, xs, output_margin=False):
         feas = self._get_feature(xs)
         dtest = xgb.DMatrix(feas)
@@ -224,20 +272,12 @@ def predict(self, xs, output_margin=False):
 
     def load_basemodel(self, base_model):
         self.base_model = base_model
-        if isinstance(base_model, XGBoostCostModel):
-            # share feature cache
-            base_model.feature_cache = self.feature_cache
-
-            # close thread pool
-            if base_model.pool:
-                base_model.pool.terminate()
-                base_model.pool.join()
-                del base_model.pool
-            self.base_model.upper_model = self
-
-    def clone_new(self):
+        self.base_model._close_pool()
+        self.base_model.upper_model = self
+
+    def spawn_base_model(self):
         return XGBoostCostModel(self.task, self.fea_type, self.loss_type,
-                                self.num_threads, self.log_interval)
+                                self.num_threads, self.log_interval, self)
 
     def _get_feature(self, indexes):
         """get features for indexes, run extraction if we do not have cache for them"""
@@ -251,7 +291,7 @@ def _get_feature(self, indexes):
         need_extract = [x for x in indexes if x not in fea_cache]
 
         if need_extract:
-            pool = self.pool if self.upper_model is None else self.upper_model.pool
+            pool = self._get_pool()
             feas = pool.map(self.feature_extract_func, need_extract)
             for i, fea in zip(need_extract, feas):
                 fea_cache[i] = fea
@@ -261,6 +301,9 @@ def _get_feature(self, indexes):
             ret[i, :] = fea_cache[ii]
         return ret
 
+    def __del__(self):
+        self._close_pool()
+
 
 _extract_space = None
 _extract_target = None
diff --git a/python/tvm/autotvm/tuner/xgboost_tuner.py b/python/tvm/autotvm/tuner/xgboost_tuner.py
index 237ac4e19ab1..886c82a4d749 100644
--- a/python/tvm/autotvm/tuner/xgboost_tuner.py
+++ b/python/tvm/autotvm/tuner/xgboost_tuner.py
@@ -20,8 +20,12 @@ class XGBTuner(ModelBasedTuner):
         If is 'curve', use sampled curve feature (relation feature).
 
         Note on choosing feature type:
-        For single task tuning, 'itervar' and 'knob' is good.
+        For single task tuning, 'itervar' and 'knob' are good.
                                 'itervar' is more accurate but 'knob' is much faster.
+                                There are some constraints on 'itervar', if you meet
+                                problems with feature extraction when using 'itervar',
+                                you can swith to 'knob'.
+
         For cross-shape tuning (e.g. many convolutions with different shapes),
                                'itervar' and 'curve' has better transferability,
                                'knob' is faster.
@@ -32,8 +36,7 @@ class XGBTuner(ModelBasedTuner):
         If is 'rank', use pairwise rank loss to train cost model.
                      The cost model predicts relative rank score.
     num_threads: int, optional
-        The number of threads.
-    optimizer: str or ModelOptimizer, optional
+        The number of threads.  optimizer: str or ModelOptimizer, optional
         If is 'sa', use a default simulated annealing optimizer.
         Otherwise it should be a ModelOptimizer object.
     diversity_filter_ratio: int or float, optional
@@ -45,7 +48,7 @@ class XGBTuner(ModelBasedTuner):
         If is 0, output nothing.
         Otherwise, output debug information every `verbose` iterations.
     """
-    def __init__(self, task, plan_size=32,
+    def __init__(self, task, plan_size=64,
                  feature_type='itervar', loss_type='rank', num_threads=None,
                  optimizer='sa', diversity_filter_ratio=None, log_interval=50):
         cost_model = XGBoostCostModel(task,
@@ -62,3 +65,9 @@ def __init__(self, task, plan_size=32,
 
         super(XGBTuner, self).__init__(task, cost_model, optimizer,
                                        plan_size, diversity_filter_ratio)
+
+    def tune(self, *args, **kwargs):  # pylint: disable=arguments-differ
+        super(XGBTuner, self).tune(*args, **kwargs)
+
+        # manually close pool to avoid multiprocessing issues
+        self.cost_model._close_pool()
diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py
index 777654af6619..6117a963ae3a 100755
--- a/python/tvm/build_module.py
+++ b/python/tvm/build_module.py
@@ -5,8 +5,8 @@
 """
 from __future__ import absolute_import as _abs
 import warnings
-import types
 
+from ._ffi.function import Function
 from ._ffi.node import NodeBase, register_node
 from . import api
 from . import _api_internal
@@ -69,7 +69,7 @@ def recover():
             vset[k] = v
         for k, v in vset.items():
             self._recover_list.append(recover)
-            vset[k] = self.decorate(v) if isinstance(v, types.FunctionType) else v
+            vset[k] = self.decorate(v) if isinstance(v, Function) else v
 
     def decorate_custompass(self, custom_pass):
         """decorate given list of custom passes, and return decorated passes"""
@@ -125,7 +125,8 @@ class BuildConfig(NodeBase):
         "data_alignment": -1,
         "restricted_func": True,
         "double_buffer_split_loop": 1,
-        "dump_pass_ir": False
+        "dump_pass_ir": False,
+        "instrument_bound_checkers": False
     }
     _dump_ir = DumpIR()
 
@@ -299,7 +300,7 @@ def lower(sch,
 
     Parameters
     ----------
-    sch : tvm.Schedule
+    sch : tvm.schedule.Schedule
         The schedule to be builded
 
     args : list of Buffer or Tensor or Var
@@ -340,16 +341,11 @@ def lower(sch,
         bounds = schedule.InferBound(sch)
         stmt = schedule.ScheduleOps(sch, bounds)
         stmt = ir_pass.InjectPrefetch(stmt)
-    else:
-        #So far there is no op for hybrid script, so a plain ir body is given
-        if not isinstance(sch, _stmt.Stmt):
-            raise ValueError("sch should be either a Schedule or a Stmt")
-        stmt = sch
 
     for f in lower_phase0:
         stmt = f(stmt)
     # Phase 1
-    stmt = ir_pass.StorageFlatten(stmt, binds, 64)
+    stmt = ir_pass.StorageFlatten(stmt, binds, 64, cfg.instrument_bound_checkers)
     stmt = ir_pass.CanonicalSimplify(stmt)
     for f in lower_phase1:
         stmt = f(stmt)
@@ -368,89 +364,46 @@ def lower(sch,
         cfg.unroll_explicit)
     for f in lower_phase2:
         stmt = f(stmt)
-    # Phase 2
+    # Phase 3
     stmt = ir_pass.Simplify(stmt)
     stmt = ir_pass.LowerStorageAccessInfo(stmt)
     stmt = ir_pass.RemoveNoOp(stmt)
     stmt = ir_pass.RewriteUnsafeSelect(stmt)
     for f in lower_phase3:
         stmt = f(stmt)
+    # Instrument BoundCheckers
+    if cfg.instrument_bound_checkers:
+        stmt = ir_pass.InstrumentBoundCheckers(stmt)
     if simple_mode:
         return stmt
     return ir_pass.MakeAPI(stmt, name, arg_list, 0, cfg.restricted_func)
 
-def build(sch,
-          args=None,
-          target=None,
-          target_host=None,
-          name="default_function",
-          binds=None):
-    """Build a function with arguments as signiture.
+
+def _build_for_device(flist, target, target_host):
+    """Build the lowered functions for a device with the given compilation
+    target.
 
     Parameters
     ----------
-    sch : tvm.Schedule, or LoweredFunc
-        The schedule to be builded
-
-    args : list of Buffer or Tensor or Var, optional
-        The argument lists to the function.
+    flist : list of LoweredFunc
+        The schedule to be built.
 
-    target : str or :any:`tvm.target.Target`, optional
+    target : str or :any:`tvm.target.Target`
         The target and option of the compilation.
 
-    target_host : str or :any:`tvm.target.Target` optional
-        Host compilation target, if target is device.
-        When TVM compiles device specific program such as CUDA,
-        we also need host(CPU) side code to interact with the driver
-        setup the dimensions and parameters correctly.
-        target_host is used to specify the host side codegen target.
-        By default, llvm is used if it is enabled,
-        otherwise a stackvm intepreter is used.
-
-    name : str, optional
-        The name of result function.
-
-    binds : dict, optional
-        Dictionary that maps the binding of symbolic buffer to Tensor.
-        By default, a new buffer is created for each tensor in the argument.
+    target_host : str or :any:`tvm.target.Target`
+        The host compilation target.
 
     Returns
     -------
-    f : Function, or pair of functions
-       The result function.
+    fhost : list of LoweredFunc
+        A list of lowered functions for the host.
 
-    Note
-    ----
-    See the note on :any:`tvm.target` on target string format.
+    mdev : tvm.module
+        A module that contains device code.
     """
-    if isinstance(sch, schedule.Schedule):
-        if args is None:
-            raise ValueError("args must be given for build from schedule")
-        flist = lower(sch, args,
-                      name=name,
-                      binds=binds)
-        if isinstance(flist, container.LoweredFunc):
-            flist = [flist]
-    elif isinstance(sch, container.LoweredFunc):
-        if args:
-            raise ValueError("args must be done when build from LoweredFunc")
-        flist = [sch]
-    elif isinstance(sch, (list, tuple, container.Array)):
-        flist = sch
-    else:
-        raise ValueError("sch have to be Schedule, LoweredFunc or list of LoweredFunc")
-    fname_set = set()
-    for x in flist:
-        if not isinstance(x, container.LoweredFunc):
-            raise ValueError("sch have to be Schedule, LoweredFunc or list of LoweredFunc")
-        if x.name in fname_set:
-            raise ValueError("Duplicate function name %s" % x.name)
-        fname_set.add(x.name)
-
-    target = _target.current_target() if target is None else target
-    target = _target.create(target) if target else _target.create("llvm")
+    target = _target.create(target)
     device_type = ndarray.context(target.target_name, 0).device_type
-
     fhost = []
     fdevice = []
     for func in flist:
@@ -482,25 +435,162 @@ def build(sch,
 
     if "gpu" in target.keys and not fdevice:
         warnings.warn(
-            "Specified target %s, but cannot find device code, did you do bind?" % target)
+            "Specified target %s, but cannot find device code, did you do "
+            "bind?" % target)
 
     fhost = [ir_pass.BindDeviceType(x, device_type) for x in fhost]
     fhost = [ir_pass.LowerTVMBuiltin(x) for x in fhost]
 
-    if not target_host:
-        if device_type == ndarray.cpu(0).device_type:
-            target_host = target
-            assert not fdevice
-        else:
-            target_host = "llvm" if module.enabled("llvm") else "stackvm"
+    if device_type == ndarray.cpu(0).device_type and target_host == target:
+        assert not fdevice
+
     target_host = _target.create(target_host)
-    target_device = target
-    fdevice = [ir_pass.LowerIntrin(x, target_device.target_name) for x in fdevice]
+    fdevice = [ir_pass.LowerIntrin(x, target.target_name) for x in fdevice]
     fhost = [ir_pass.LowerIntrin(x, target_host.target_name) for x in fhost]
     fhost = [ir_pass.CombineContextCall(x) for x in fhost]
-    mhost = codegen.build_module(fhost, str(target_host))
+    mdev = codegen.build_module(fdevice, str(target)) if fdevice else None
+
+    return fhost, mdev
+
+
+def build(inputs,
+          args=None,
+          target=None,
+          target_host=None,
+          name="default_function",
+          binds=None):
+    """Build a function with arguments as signature. Code will be generated
+    for devices coupled with target information.
 
-    if fdevice:
-        mdev = codegen.build_module(fdevice, str(target_device))
-        mhost.import_module(mdev)
+    Parameters
+    ----------
+    inputs : tvm.Schedule, LoweredFunc, or dict of target to LoweredFunc list
+        The schedule to be built
+
+    args : list of Buffer or Tensor or Var, optional
+        The argument lists to the function.
+
+    target : str or :any:`tvm.target.Target`, optional
+        The target and option of the compilation.
+
+    target_host : str or :any:`tvm.target.Target` optional
+        Host compilation target, if target is device.
+        When TVM compiles device specific program such as CUDA,
+        we also need host(CPU) side code to interact with the driver
+        setup the dimensions and parameters correctly.
+        target_host is used to specify the host side codegen target.
+        By default, llvm is used if it is enabled,
+        otherwise a stackvm intepreter is used.
+
+    name : str, optional
+        The name of result function.
+
+    binds : dict, optional
+        Dictionary that maps the binding of symbolic buffer to Tensor.
+        By default, a new buffer is created for each tensor in the argument.
+
+    Returns
+    -------
+    ret : tvm.module
+        A module that combines both host and device code.
+
+    Examples
+    ________
+    There are two typical example uses of this function depending on the type
+    of the argument `inputs`:
+    1. it is a list of lowered functions:
+
+    .. code-block:: python
+
+        n = 2
+        A = tvm.placeholder((n,), name='A')
+        B = tvm.placeholder((n,), name='B')
+        C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
+        s = tvm.create_schedule(C.op)
+        f = tvm.lower(s, [A, B, C], name="test_add")
+        m = tvm.build(f, target="llvm")
+
+    2. it is a dict of compilation target to list of lowered functions:
+
+    .. code-block:: python
+
+        n = 2
+        A = tvm.placeholder((n,), name='A')
+        B = tvm.placeholder((n,), name='B')
+        C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
+        s1 = tvm.create_schedule(C.op)
+        s2 = topi.cpp.cuda.schedule_injective("cuda", [C])
+        f1 = tvm.lower(s1, [A, B, C], name="test_add1")
+        f2 = tvm.lower(s2, [A, B, C], name="test_add2")
+        m = tvm.build({"llvm": [f1], "cuda": [f2]}, target_host="llvm")
+
+    Note
+    ----
+    See the note on :any:`tvm.target` on target string format.
+    """
+    if isinstance(inputs, schedule.Schedule):
+        if args is None:
+            raise ValueError("args must be given for build from schedule")
+        flist = lower(inputs, args,
+                      name=name,
+                      binds=binds)
+        if isinstance(flist, container.LoweredFunc):
+            flist = [flist]
+    elif isinstance(inputs, container.LoweredFunc):
+        if args:
+            raise ValueError("args must be done when build from LoweredFunc.")
+        flist = [inputs]
+    elif isinstance(inputs, (list, tuple, container.Array)):
+        flist = inputs
+    elif not isinstance(inputs, (dict, container.Map)):
+        raise ValueError("inputs must be Schedule, LoweredFunc, list of "
+                         "LoweredFunc, or dict of target to list of "
+                         "LoweredFunc.")
+
+    if not isinstance(inputs, (dict, container.Map)):
+        target = _target.current_target() if target is None else target
+        target = target if target else "llvm"
+        target_flist = {target: flist}
+    else:
+        target_flist = inputs
+
+    for tar, flist in target_flist.items():
+        if not isinstance(tar, (str, _target.Target)):
+            raise ValueError("The key of inputs must be str or "
+                             "_target.Target when inputs is dict.")
+        fname_set = set()
+        for x in flist:
+            if not isinstance(x, container.LoweredFunc):
+                raise ValueError("inputs must be Schedule, LoweredFunc, list "
+                                 "of LoweredFunc, or dict of str to list of "
+                                 "LoweredFunc.")
+            if x.name in fname_set:
+                raise ValueError("Duplicate function name %s" % x.name)
+            fname_set.add(x.name)
+
+    if not target_host:
+        for tar, _ in target_flist.items():
+            tar = _target.create(tar)
+            device_type = ndarray.context(tar.target_name, 0).device_type
+            if device_type == ndarray.cpu(0).device_type:
+                target_host = tar
+                break
+    if not target_host:
+        target_host = "llvm" if module.enabled("llvm") else "stackvm"
+
+    fhost_all = []
+    device_modules = []
+    for tar, flist in target_flist.items():
+        fhost, mdev = _build_for_device(flist, tar, target_host)
+        # Save the current lowered functions of the host and the device module.
+        fhost_all += fhost
+        device_modules.append(mdev)
+
+    # Generate a unified host module.
+    mhost = codegen.build_module(fhost_all, str(target_host))
+
+    # Import all modules.
+    for mdev in device_modules:
+        if mdev:
+            mhost.import_module(mdev)
     return mhost
diff --git a/python/tvm/container.py b/python/tvm/container.py
index 27e533113926..ba30255f650a 100644
--- a/python/tvm/container.py
+++ b/python/tvm/container.py
@@ -17,16 +17,37 @@ def __getitem__(self, i):
             start = i.start if i.start is not None else 0
             stop = i.stop if i.stop is not None else len(self)
             step = i.step if i.step is not None else 1
+            if start < 0:
+                start += len(self)
+            if stop < 0:
+                stop += len(self)
             return [self[idx] for idx in range(start, stop, step)]
 
-        if i >= len(self):
-            raise IndexError("array index out of range")
+        if i < -len(self) or i >= len(self):
+            raise IndexError("Array index out of range. Array size: {}, got index {}"
+                             .format(len(self), i))
+        if i < 0:
+            i += len(self)
         return _api_internal._ArrayGetItem(self, i)
 
     def __len__(self):
         return _api_internal._ArraySize(self)
 
 
+@register_node
+class EnvFunc(NodeBase):
+    """Environment function.
+
+    This is a global function object that can be serialized by its name.
+    """
+    def __call__(self, *args):
+        return _api_internal._EnvFuncCall(self, *args)
+
+    @property
+    def func(self):
+        return _api_internal._EnvFuncGetPackedFunc(self)
+
+
 @register_node
 class Map(NodeBase):
     """Map container of TVM.
diff --git a/python/tvm/contrib/clang.py b/python/tvm/contrib/clang.py
index 19508160d42d..3e8ad663c58f 100644
--- a/python/tvm/contrib/clang.py
+++ b/python/tvm/contrib/clang.py
@@ -31,6 +31,7 @@ def find_clang(required=True):
     if hasattr(codegen, "llvm_version_major"):
         cc_list += ["clang-%d.0" % codegen.llvm_version_major()]
     cc_list += ["clang"]
+    cc_list += ["clang.exe"]
     valid_list = [util.which(x) for x in cc_list]
     valid_list = [x for x in valid_list if x]
     if not valid_list and required:
diff --git a/python/tvm/contrib/debugger/__init__.py b/python/tvm/contrib/debugger/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/python/tvm/contrib/debugger/debug_result.py b/python/tvm/contrib/debugger/debug_result.py
new file mode 100644
index 000000000000..101af6887c47
--- /dev/null
+++ b/python/tvm/contrib/debugger/debug_result.py
@@ -0,0 +1,205 @@
+"""Graph debug results dumping class."""
+import os
+import json
+import tvm
+
+GRAPH_DUMP_FILE_NAME = '_tvmdbg_graph_dump.json'
+
+class DebugResult(object):
+    """Graph debug data module.
+
+    Data dump module manage all the debug data formatting.
+    Output data and input graphs are formatted and dumped to file.
+    Frontend read these data and graph for visualization.
+
+    Parameters
+    ----------
+    graph_json : str
+        The graph to be deployed in json format output by nnvm graph. Each operator (tvm_op)
+        in the graph will have a one to one mapping with the symbol in libmod which is used
+        to construct a "PackedFunc" .
+
+    dump_path : str
+        Output data path is read/provided from frontend
+    """
+
+    def __init__(self, graph_json, dump_path):
+        self._dump_path = dump_path
+        self._output_tensor_list = []
+        self._time_list = []
+        self._parse_graph(graph_json)
+        # dump the json information
+        self.dump_graph_json(graph_json)
+
+    def _parse_graph(self, graph_json):
+        """Parse and extract the NNVM graph and update the nodes, shapes and dltype.
+
+        Parameters
+        ----------
+        graph_json : str or graph class
+           The graph to be deployed in json format output by nnvm graph.
+        """
+        json_obj = json.loads(graph_json)
+        self._nodes_list = json_obj['nodes']
+        self._shapes_list = json_obj['attrs']['shape']
+        self._dtype_list = json_obj['attrs']['dltype']
+        self._update_graph_json()
+
+    def _update_graph_json(self):
+        """update the nodes_list with name, shape and data type,
+        for temporarily storing the output.
+        """
+
+        nodes_len = len(self._nodes_list)
+        for i in range(nodes_len):
+            node = self._nodes_list[i]
+            input_list = []
+            for input_node in node['inputs']:
+                input_list.append(self._nodes_list[input_node[0]]['name'])
+            node['inputs'] = input_list
+            dtype = str("type: " + self._dtype_list[1][i])
+            if 'attrs' not in node:
+                node['attrs'] = {}
+                node['op'] = "param"
+            else:
+                node['op'] = node['attrs']['func_name']
+            node['attrs'].update({"T": dtype})
+            node['shape'] = self._shapes_list[1][i]
+
+    def _cleanup_tensors(self):
+        """Remove the tensor dump file (graph wont be removed)
+        """
+        for filename in os.listdir(self._dump_path):
+            if os.path.isfile(filename) and not filename.endswith(".json"):
+                os.remove(filename)
+
+    def get_graph_nodes(self):
+        """Return the nodes list
+        """
+        return self._nodes_list
+
+    def get_graph_node_shapes(self):
+        """Return the nodes shapes list
+        """
+        return self._shapes_list
+
+    def get_graph_node_output_num(self, node):
+        """Return the number of outputs of a node
+        """
+        return 1 if node['op'] == 'param' else int(node['attrs']['num_outputs'])
+
+    def get_graph_node_dtypes(self):
+        """Return the nodes dtype list
+        """
+        return self._dtype_list
+
+    def get_output_tensors(self):
+        """Dump the outputs to a temporary folder, the tensors are in numpy format
+        """
+        eid = 0
+        order = 0
+        output_tensors = {}
+        for node, time in zip(self._nodes_list, self._time_list):
+            num_outputs = self.get_graph_node_output_num(node)
+            for j in range(num_outputs):
+                order += time[0]
+                key = node['name'] + "_" + str(j)
+                output_tensors[key] = self._output_tensor_list[eid]
+                eid += 1
+        return output_tensors
+
+    def dump_output_tensor(self):
+        """Dump the outputs to a temporary folder, the tensors are in numpy format
+        """
+        #cleanup existing tensors before dumping
+        self._cleanup_tensors()
+        eid = 0
+        order = 0
+        output_tensors = {}
+        for node, time in zip(self._nodes_list, self._time_list):
+            num_outputs = self.get_graph_node_output_num(node)
+            for j in range(num_outputs):
+                order += time[0]
+                key = node['name'] + "_" + str(j) + "__" + str(order)
+                output_tensors[key] = self._output_tensor_list[eid]
+                eid += 1
+
+        with open(os.path.join(self._dump_path, "output_tensors.params"), "wb") as param_f:
+            param_f.write(save_tensors(output_tensors))
+
+    def dump_graph_json(self, graph):
+        """Dump json formatted graph.
+
+        Parameters
+        ----------
+        graph : json format
+            json formatted NNVM graph contain list of each node's
+            name, shape and type.
+        """
+        graph_dump_file_name = GRAPH_DUMP_FILE_NAME
+        with open(os.path.join(self._dump_path, graph_dump_file_name), 'w') as outfile:
+            json.dump(graph, outfile, indent=4, sort_keys=False)
+
+    def display_debug_result(self):
+        """Displays the debugger result"
+        """
+        header = ["Node Name", "Ops", "Time(us)", "Time(%)", "Start Time", \
+                    "End Time", "Shape", "Inputs", "Outputs"]
+        lines = ["---------", "---", "--------", "-------", "----------", \
+                    "--------", "-----", "------", "-------"]
+        eid = 0
+        data = []
+        total_time = sum(time[0] for time in self._time_list)
+        for node, time in zip(self._nodes_list, self._time_list):
+            num_outputs = self.get_graph_node_output_num(node)
+            for j in range(num_outputs):
+                op = node['op']
+                if node['op'] == 'param':
+                    eid += 1
+                    continue
+                name = node['name']
+                shape = str(self._output_tensor_list[eid].shape)
+                time_us = round(time[0] * 1000000, 2)
+                time_percent = round(((time[0] / total_time) * 100), 2)
+                inputs = str(node['attrs']['num_inputs'])
+                outputs = str(node['attrs']['num_outputs'])
+                node_data = [name, op, time_us, time_percent, str(time[1]), str(time[2]), \
+                             shape, inputs, outputs]
+                data.append(node_data)
+                eid += 1
+        fmt = ""
+        for i, _ in enumerate(header):
+            max_len = len(header[i])
+            for j, _ in enumerate(data):
+                item_len = len(str(data[j][i]))
+                if item_len > max_len:
+                    max_len = item_len
+            fmt = fmt + "{:<" + str(max_len + 2) + "}"
+        print(fmt.format(*header))
+        print(fmt.format(*lines))
+        for row in data:
+            print(fmt.format(*row))
+
+def save_tensors(params):
+    """Save parameter dictionary to binary bytes.
+
+    The result binary bytes can be loaded by the
+    GraphModule with API "load_params".
+
+    Parameters
+    ----------
+    params : dict of str to NDArray
+        The parameter dictionary.
+
+    Returns
+    -------
+    param_bytes: bytearray
+        Serialized parameters.
+    """
+    _save_tensors = tvm.get_global_func("_save_param_dict")
+
+    args = []
+    for k, v in params.items():
+        args.append(k)
+        args.append(tvm.nd.array(v))
+    return _save_tensors(*args)
diff --git a/python/tvm/contrib/debugger/debug_runtime.py b/python/tvm/contrib/debugger/debug_runtime.py
new file mode 100644
index 000000000000..d38ee6cf7982
--- /dev/null
+++ b/python/tvm/contrib/debugger/debug_runtime.py
@@ -0,0 +1,224 @@
+"""Graph debug runtime executes TVM debug packed functions."""
+
+import os
+import tempfile
+import shutil
+from datetime import datetime
+from tvm._ffi.base import string_types
+from tvm._ffi.function import get_global_func
+from tvm.contrib import graph_runtime
+from tvm.rpc import base as rpc_base
+from . import debug_result
+
+_DUMP_ROOT_PREFIX = "tvmdbg_"
+_DUMP_PATH_PREFIX = "_tvmdbg_"
+
+def create(graph_json_str, libmod, ctx, dump_root=None):
+    """Create a runtime executor module given a graph and module.
+
+    Parameters
+    ----------
+    graph_json_str : str or graph class
+        The graph to be deployed in json format output by nnvm graph.
+        The graph can only contain one operator(tvm_op) that
+        points to the name of PackedFunc in the libmod.
+
+    libmod : tvm.Module
+        The module of the corresponding function.
+
+    ctx : TVMContext
+        The context to deploy the module, can be local or remote.
+
+    dump_root : str
+        To select which folder the outputs should be kept.
+        None will make a temp folder in /tmp/tvmdbg<rand_string> and does the dumping
+    Returns
+    -------
+    graph_module : GraphModuleDebug
+        Debug Runtime graph module that can be used to execute the graph.
+    """
+    if not isinstance(graph_json_str, string_types):
+        try:
+            graph_json_str = graph_json_str._tvm_graph_json()
+        except AttributeError:
+            raise ValueError("Type %s is not supported" % type(graph_json_str))
+    try:
+        fcreate = get_global_func("tvm.graph_runtime_debug.create")
+    except ValueError:
+        raise ValueError("Please set '(USE_GRAPH_RUNTIME_DEBUG ON)' in " \
+                         "config.cmake and rebuild TVM to enable debug mode")
+
+    ctx, num_rpc_ctx, device_type_id = graph_runtime.get_device_ctx(libmod, ctx)
+    if num_rpc_ctx == len(ctx):
+        libmod = rpc_base._ModuleHandle(libmod)
+        try:
+            fcreate = ctx[0]._rpc_sess.get_function("tvm.graph_runtime_debug.remote_create")
+        except ValueError:
+            raise ValueError("Please set '(USE_GRAPH_RUNTIME_DEBUG ON)' in " \
+                             "config.cmake and rebuild TVM to enable debug mode")
+    func_obj = fcreate(graph_json_str, libmod, *device_type_id)
+    return GraphModuleDebug(func_obj, ctx, graph_json_str, dump_root)
+
+
+class GraphModuleDebug(graph_runtime.GraphModule):
+    """Graph debug runtime module.
+
+    This is a debug wrapper over the TVM runtime.
+    Runtime interfaces are wrapped with debug functionalities.
+    Manage the debug framework to format the debug data and
+    trigger the user interfaces.
+
+    Parameters
+    ----------
+    module : Module
+        The interal tvm module that holds the actual graph functions.
+
+    ctx : TVMContext
+        The context this module is under.
+
+    graph_json_str : str or graph class
+        Content of graph json file in string format
+
+    dump_root : str
+        To select which folder the outputs should be kept.
+        None will make a temp folder in /tmp/tvmdbg<rand_string> and does the dumping
+    """
+    def __init__(self, module, ctx, graph_json_str, dump_root):
+        self._dump_root = dump_root
+        self._dump_path = None
+        self._debug_run = module["debug_run"]
+        self._get_output_by_layer = module["get_output_by_layer"]
+        graph_runtime.GraphModule.__init__(self, module)
+        self._create_debug_env(graph_json_str, ctx)
+
+    def _format_context(self, ctx):
+        return str(ctx[0]).upper().replace("(", ":").replace(")", "")
+
+    def _ensure_dir(self, directory):
+        """Create a directory if not exists
+
+        Parameters
+        ----------
+
+        directory : str
+            File path to create
+        """
+        if not os.path.exists(directory):
+            os.makedirs(directory, 0o700)
+
+    def _get_dump_path(self, ctx):
+        """Make the graph and tensor dump folder and return the path.
+
+        Parameters
+        ----------
+        ctx : TVMContext
+            The context this module is under.
+
+        Returns
+        -------
+        path : str
+            Directory path where the graph and node outputs will be stored.
+        """
+        # save to file
+        folder_name = _DUMP_PATH_PREFIX + "ctx_"
+        folder_name = folder_name + ctx.replace(":", "_")
+        path = os.path.join(self._dump_root, folder_name)
+        self._ensure_dir(path)
+        return path
+
+    def _remove_dump_root(self):
+        if os.path.isdir(self._dump_root):
+            shutil.rmtree(self._dump_root)
+
+    def _create_debug_env(self, graph_json, ctx):
+        """Create UI wrapper framework to handle multiple UI frontends for tvmdbg
+
+        Parameters
+        ----------
+        graph_json : json format
+            json formatted NNVM graph contain list of each node's name, shape and type.
+
+        nodes_list : list
+            List of all the nodes presented in the graph
+
+        ctx : TVMContext
+            The context this module is under.
+        """
+        # make the dump folder if not given
+        if not self._dump_root:
+            self._dump_root = tempfile.mkdtemp(prefix=_DUMP_ROOT_PREFIX)
+
+        # format the context
+        ctx = self._format_context(ctx)
+
+        # updates the dumping directories
+        self._dump_path = self._get_dump_path(ctx)
+
+        # init the debug dumping environment
+        self.debug_datum = debug_result.DebugResult(graph_json, self._dump_path)
+
+    def _run_debug(self):
+        """Execute the node spcified with index will be executed.
+        Each debug output will be copied to the buffer
+        Time consumed for each execuion will be set as debug output.
+
+        """
+
+        for i, node in enumerate(self.debug_datum.get_graph_nodes()):
+            start_time = datetime.now().time()
+            time_stamp = self._debug_run(i)
+            end_time = datetime.now().time()
+            self.debug_datum._time_list.append([time_stamp, start_time, end_time])
+            num_outputs = self.debug_datum.get_graph_node_output_num(node)
+            for j in range(num_outputs):
+                out_tensor = self._get_output_by_layer(i, j)
+                self.debug_datum._output_tensor_list.append(out_tensor)
+
+    def debug_get_output(self, node, out):
+        """Run graph upto node and get the output to out
+
+        Parameters
+        ----------
+        node : int / str
+            The node index or name
+
+        out : NDArray
+            The output array container
+        """
+        ret = None
+        if isinstance(node, str):
+            output_tensors = self.debug_datum.get_output_tensors()
+            try:
+                ret = output_tensors[node]
+            except:
+                node_list = output_tensors.keys()
+                raise RuntimeError("Node " + node + " not found, available nodes are: "
+                                   + str(node_list) + ".")
+        elif isinstance(node, int):
+            output_tensors = self.debug_datum._output_tensor_list
+            ret = output_tensors[node]
+        else:
+            raise RuntimeError("Require node index or name only.")
+        return ret
+
+    def run(self, **input_dict):
+        """Run forward execution of the graph with debug
+
+        Parameters
+        ----------
+        input_dict : dict of str to NDArray
+            List of input values to be feed to
+        """
+        if input_dict:
+            self.set_input(**input_dict)
+
+        # Step 1. Execute the graph
+        self._run_debug()
+        # Step 2. Dump the output tensors to the dump folder
+        self.debug_datum.dump_output_tensor()
+        # Step 3. Display the collected information
+        self.debug_datum.display_debug_result()
+
+    def exit(self):
+        """Exits the dump folder and all its contents"""
+        self._remove_dump_root()
diff --git a/python/tvm/contrib/dlpack.py b/python/tvm/contrib/dlpack.py
new file mode 100644
index 000000000000..11db29f98b3e
--- /dev/null
+++ b/python/tvm/contrib/dlpack.py
@@ -0,0 +1,43 @@
+"""Wrapping functions to bridge frameworks with DLPack support to TVM"""
+from .. import ndarray
+
+def convert_func(tvm_func, tensor_type, to_dlpack_func):
+    """Convert a tvm function into one that accepts a tensor from another
+       framework, provided the other framework supports DLPACK
+
+    Parameters
+    ----------
+    tvm_func: Function
+        Built tvm function operating on arrays
+
+    tensor_type: Type
+        Type of the tensors of the target framework
+
+    to_dlpack_func: Function
+        Function to convert the source tensors to DLPACK
+    """
+    assert callable(tvm_func)
+
+    def _wrapper(*args):
+        args = tuple(ndarray.from_dlpack(to_dlpack_func(arg))\
+            if isinstance(arg, tensor_type) else arg for arg in args)
+        return tvm_func(*args)
+
+    return _wrapper
+
+def to_pytorch_func(tvm_func):
+    """Convert a tvm function into one that accepts PyTorch tensors
+
+    Parameters
+    ----------
+    tvm_func: Function
+        Built tvm function operating on arrays
+
+    Returns
+    -------
+    wrapped_func: Function
+        Wrapped tvm function that operates on PyTorch tensors
+    """
+    import torch
+    import torch.utils.dlpack
+    return convert_func(tvm_func, torch.Tensor, torch.utils.dlpack.to_dlpack)
diff --git a/python/tvm/contrib/download.py b/python/tvm/contrib/download.py
index 434216a2652c..0dcbb56ad663 100644
--- a/python/tvm/contrib/download.py
+++ b/python/tvm/contrib/download.py
@@ -64,8 +64,8 @@ def _download_progress(count, block_size, total_size):
         progress_size = int(count * block_size)
         speed = int(progress_size / (1024 * duration))
         percent = min(int(count * block_size * 100 / total_size), 100)
-        sys.stdout.write("\r...%d%%, %d MB, %d KB/s, %d seconds passed" %
-                         (percent, progress_size / (1024 * 1024), speed, duration))
+        sys.stdout.write("\r...%d%%, %.2f MB, %d KB/s, %d seconds passed" %
+                         (percent, progress_size / (1024.0 * 1024), speed, duration))
         sys.stdout.flush()
 
     if sys.version_info >= (3,):
diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py
index 9ce9dd602fa3..0d62a04a5571 100644
--- a/python/tvm/contrib/graph_runtime.py
+++ b/python/tvm/contrib/graph_runtime.py
@@ -1,26 +1,26 @@
 """Minimum graph runtime that executes graph containing TVM PackedFunc."""
+import numpy as np
+
 from .._ffi.base import string_types
 from .._ffi.function import get_global_func
+from .._ffi.runtime_ctypes import TVMContext
 from ..rpc import base as rpc_base
-from .. import ndarray as nd
-
 
 def create(graph_json_str, libmod, ctx):
     """Create a runtime executor module given a graph and module.
-
     Parameters
     ----------
     graph_json_str : str or graph class
         The graph to be deployed in json format output by nnvm graph.
         The graph can only contain one operator(tvm_op) that
         points to the name of PackedFunc in the libmod.
-
     libmod : tvm.Module
         The module of the corresponding function
-
-    ctx : TVMContext
-        The context to deploy the module, can be local or remote.
-
+    ctx : TVMContext or list of TVMContext
+        The context to deploy the module. It can be local or remote when there
+        is only one TVMContext. Otherwise, the first context in the list will
+        be used as this purpose. All context should be given for heterogeneous
+        execution.
     Returns
     -------
     graph_module : GraphModule
@@ -31,17 +31,60 @@ def create(graph_json_str, libmod, ctx):
             graph_json_str = graph_json_str._tvm_graph_json()
         except AttributeError:
             raise ValueError("Type %s is not supported" % type(graph_json_str))
-    device_type = ctx.device_type
-    device_id = ctx.device_id
-    if device_type >= rpc_base.RPC_SESS_MASK:
-        assert libmod.type_key == "rpc"
-        assert rpc_base._SessTableIndex(libmod) == ctx._rpc_sess._tbl_index
+
+    ctx, num_rpc_ctx, device_type_id = get_device_ctx(libmod, ctx)
+
+    if num_rpc_ctx == len(ctx):
         hmod = rpc_base._ModuleHandle(libmod)
-        fcreate = ctx._rpc_sess.get_function("tvm.graph_runtime.remote_create")
-        device_type = device_type % rpc_base.RPC_SESS_MASK
-        return GraphModule(fcreate(graph_json_str, hmod, device_type, device_id), ctx)
+        fcreate = ctx[0]._rpc_sess.get_function("tvm.graph_runtime.remote_create")
+        return GraphModule(fcreate(graph_json_str, hmod, *device_type_id))
+
     fcreate = get_global_func("tvm.graph_runtime.create")
-    return GraphModule(fcreate(graph_json_str, libmod, device_type, device_id), ctx)
+    return GraphModule(fcreate(graph_json_str, libmod, *device_type_id))
+
+def get_device_ctx(libmod, ctx):
+    """Parse and validate all the device context(s).
+    Parameters
+    ----------
+    libmod : tvm.Module
+        The module of the corresponding function
+    ctx : TVMContext or list of TVMContext
+    Returns
+    -------
+    ctx : list of TVMContext
+    num_rpc_ctx : Number of rpc contexts
+    device_type_id : List of device type and device id
+    """
+
+    if isinstance(ctx, TVMContext):
+        ctx = [ctx]
+    elif not isinstance(ctx, (list, tuple)):
+        raise ValueError("ctx has to be the type of TVMContext or a list of "
+                         "TVMCTVMContext")
+    for cur_ctx in ctx:
+        if not isinstance(cur_ctx, TVMContext):
+            raise ValueError("ctx has to be the type of TVMContext or a list "
+                             "of TVMContext")
+
+    # device_type_id[0], device_type_id[1] are used as the primary/fallback
+    # context type and id. All other ones are used as device context for
+    # heterogeneous execution.
+    num_rpc_ctx = 0
+    device_type_id = []
+    for cur_ctx in ctx:
+        device_type = cur_ctx.device_type
+        if device_type >= rpc_base.RPC_SESS_MASK:
+            assert libmod.type_key == "rpc"
+            assert rpc_base._SessTableIndex(
+                libmod) == cur_ctx._rpc_sess._tbl_index
+            num_rpc_ctx += 1
+            device_type = cur_ctx.device_type % rpc_base.RPC_SESS_MASK
+        device_type_id.append(device_type)
+        device_type_id.append(cur_ctx.device_id)
+
+    if 0 < num_rpc_ctx < len(ctx):
+        raise ValueError("Either all or none of the contexts should be rpc.")
+    return ctx, num_rpc_ctx, device_type_id
 
 
 class GraphModule(object):
@@ -56,29 +99,20 @@ class GraphModule(object):
     module : Module
         The interal tvm module that holds the actual graph functions.
 
-    ctx : TVMContext
-        The context this module is under
-
     Attributes
     ----------
     module : Module
         The interal tvm module that holds the actual graph functions.
-
-    ctx : TVMContext
-        The context this module is under
     """
-    def __init__(self, module, ctx):
+
+    def __init__(self, module):
         self.module = module
         self._set_input = module["set_input"]
         self._run = module["run"]
         self._get_output = module["get_output"]
         self._get_input = module["get_input"]
-        try:
-            self._debug_get_output = module["debug_get_output"]
-        except AttributeError:
-            pass
+        self._get_num_outputs = module["get_num_outputs"]
         self._load_params = module["load_params"]
-        self.ctx = ctx
 
     def set_input(self, key=None, value=None, **params):
         """Set inputs to the module via kwargs
@@ -94,11 +128,15 @@ def set_input(self, key=None, value=None, **params):
         params : dict of str to NDArray
            Additonal arguments
         """
-        if key:
-            self._set_input(key, nd.array(value, ctx=self.ctx))
-        for k, v in params.items():
-            self._set_input(k, nd.array(v, ctx=self.ctx))
-        return self
+        if key is not None:
+            self._get_input(key).copyfrom(value)
+
+        if params:
+            # upload big arrays first to avoid memory issue in rpc mode
+            keys = list(params.keys())
+            keys.sort(key=lambda x: -np.prod(params[x].shape))
+            for k in keys:
+                self._get_input(k).copyfrom(params[k])
 
     def run(self, **input_dict):
         """Run forward execution of the graph
@@ -112,7 +150,17 @@ def run(self, **input_dict):
             self.set_input(**input_dict)
         self._run()
 
-    def get_input(self, index, out):
+    def get_num_outputs(self):
+        """Get the number of outputs from the graph
+
+        Returns
+        -------
+        count : int
+            The number of outputs.
+        """
+        return self._get_num_outputs()
+
+    def get_input(self, index, out=None):
         """Get index-th input to out
 
         Parameters
@@ -123,10 +171,13 @@ def get_input(self, index, out):
         out : NDArray
             The output array container
         """
-        self._get_input(index, out)
-        return out
+        if out:
+            self._get_input(index).copyto(out)
+            return out
+
+        return self._get_input(index)
 
-    def get_output(self, index, out):
+    def get_output(self, index, out=None):
         """Get index-th output to out
 
         Parameters
@@ -137,8 +188,11 @@ def get_output(self, index, out):
         out : NDArray
             The output array container
         """
-        self._get_output(index, out)
-        return out
+        if out:
+            self._get_output(index, out)
+            return out
+
+        return self._get_output(index)
 
     def debug_get_output(self, node, out):
         """Run graph upto node and get the output to out
@@ -151,11 +205,8 @@ def debug_get_output(self, node, out):
         out : NDArray
             The output array container
         """
-        if hasattr(self, '_debug_get_output'):
-            self._debug_get_output(node, out)
-        else:
-            raise RuntimeError("Please compile runtime with USE_GRAPH_RUNTIME_DEBUG = 0")
-        return out
+        raise NotImplementedError(
+            "Please use debugger.debug_runtime as graph_runtime instead.")
 
     def load_params(self, params_bytes):
         """Load parameters from serialized byte array of parameter dict.
diff --git a/python/tvm/contrib/mps.py b/python/tvm/contrib/mps.py
index 43b3b9fb48db..86532f72153c 100644
--- a/python/tvm/contrib/mps.py
+++ b/python/tvm/contrib/mps.py
@@ -1,4 +1,4 @@
-"""External function interface to MPS libraroes."""
+"""External function interface to MPS libraries."""
 from __future__ import absolute_import as _abs
 from .. import api as _api
 from .. import intrin as _intrin
diff --git a/python/tvm/contrib/nnpack.py b/python/tvm/contrib/nnpack.py
index d6587df26229..9fd0e7ed2cba 100644
--- a/python/tvm/contrib/nnpack.py
+++ b/python/tvm/contrib/nnpack.py
@@ -1,20 +1,15 @@
-"""External function interface to NNPACK libraroes."""
+"""External function interface to NNPACK libraries."""
 from __future__ import absolute_import as _abs
 
 from .. import api as _api
 from .. import intrin as _intrin
 from .._ffi.function import _init_api
 
-def config(nthreads):
-    """Configure the nnpack library.
-
-    Parameters
-    ----------
-    nthreads : int
-        The threads number of nnpack thread pool, must be a nonnegative.
-
+def is_available():
+    """Check whether NNPACK is available, that is, `nnp_initialize()`
+    returns `nnp_status_success`.
     """
-    _Config(nthreads)
+    return _initialize() == 0
 
 def fully_connected_inference(lhs, rhs, nthreads=1):
     """Create an extern op that compute fully connected of 1D tensor lhs and
@@ -39,38 +34,32 @@ def fully_connected_inference(lhs, rhs, nthreads=1):
             "tvm.contrib.nnpack.fully_connected_inference",
             ins[0], ins[1], outs[0], nthreads), name="C")
 
-def fully_connected_output(lhs, rhs, nthreads=1):
-    """Create an extern op that compute fully connected of 2D tensor lhs and
-    2D tensor rhs with nnpack.
 
-    Parameters
-    ----------
-    lhs : Tensor
-        lhs 2D matrix input[batch_size][input_channels] of FP32 elements
-    rhs : Tensor
-        lhs 2D matrix kernel[output_channels][input_channels] of FP32 elements
+class ConvolutionAlgorithm:
+    AUTO = 0
+    FFT_8x8 = 1
+    FFT_16x16 = 2
+    WT_8x8 = 3
+    IMPLICIT_GEMM = 4
+    DIRECT = 5
+    WT_8x8_FP16 = 6
 
-    Returns
-    -------
-    C : Tensor
-        lhs 2D array out[batch_size][output_channels] of FP32 elements.
-    """
-    n = lhs.shape[0]
-    m = rhs.shape[0]
-    return _api.extern(
-        (n, m), [lhs, rhs],
-        lambda ins, outs: _intrin.call_packed(
-            "tvm.contrib.nnpack.fully_connected_output",
-            ins[0], ins[1], outs[0], nthreads), name="C")
 
-def convolution_inference(data, kernel, bias, padding, stride, nthreads=1):
-    """Create an extern op to do inference convolution of 3D tensor data and
+class ConvolutionTransformStrategy:
+    COMPUTE = 1
+    PRECOMPUTE = 2
+
+
+def convolution_inference(
+        data, kernel, bias, padding, stride, nthreads=1,
+        algorithm=ConvolutionAlgorithm.AUTO):
+    """Create an extern op to do inference convolution of 4D tensor data and
     4D tensor kernel and 1D tensor bias with nnpack.
 
     Parameters
     ----------
     data : Tensor
-        data 3D tensor input[input_channels][input_height][input_width] of
+        data 4D tensor input[batch][input_channels][input_height][input_width] of
         FP32 elements.
     kernel : Tensor
         kernel 4D tensor kernel[output_channels][input_channels][kernel_height]
@@ -88,60 +77,107 @@ def convolution_inference(data, kernel, bias, padding, stride, nthreads=1):
     Returns
     -------
     output : Tensor
-        output 3D tensor output[output_channels][output_height][output_width]
+        output 4D tensor output[batch][output_channels][output_height][output_width]
         of FP32 elements.
     """
 
     assert isinstance(padding, list) and len(padding) == 4
     assert isinstance(stride, list) and len(stride) == 2
-    _, input_height, input_width = data.shape
+    batch, _, input_height, input_width = data.shape
     output_channels, _, kernel_height, kernel_width = kernel.shape
     output_height = (input_height + padding[0] + padding[1] - kernel_height) / stride[0] + 1
     output_width = (input_width + padding[0] + padding[1] - kernel_width) / stride[1] + 1
 
     return _api.extern(
-        (output_channels, output_height, output_width), [data, kernel, bias],
+        (batch, output_channels, output_height, output_width),
+        [data, kernel, bias] if bias is not None else [data, kernel],
         lambda ins, outs: _intrin.call_packed(
-            "tvm.contrib.nnpack.convolution_inference", ins[0], ins[1], ins[2],
+            "tvm.contrib.nnpack.convolution_inference",
+            ins[0],
+            ins[1],
+            ins[2] if bias is not None else 0,
             outs[0], padding[0], padding[1], padding[2], padding[3],
-            stride[0], stride[1], nthreads), name="C")
+            stride[0], stride[1], nthreads, algorithm), name="C")
 
-def convolution_output(data, kernel, bias, padding, nthreads=1):
-    """Create an extern op to compute convolution of 4D tensor data and
-    4D tensor kernel and 1D tensor bias with nnpack.
+def convolution_inference_without_weight_transform(
+        data, transformed_kernel, bias, padding, stride, nthreads=1,
+        algorithm=ConvolutionAlgorithm.AUTO):
+    """Create an extern op to do inference convolution of 4D tensor data and
+    4D pre-transformed tensor kernel and 1D tensor bias with nnpack.
 
     Parameters
     ----------
     data : Tensor
-        data 4D tensor input[batch_size][input_channels][input_height]
-        [input_width] of FP32 elements.
-    kernel : Tensor
-        kernel 4D tensor kernel[output_channels][input_channels][kernel_height]
-        [kernel_width] of FP32 elements.
+        data 4D tensor input[batch][input_channels][input_height][input_width] of
+        FP32 elements.
+    transformed_kernel : Tensor
+        transformed_kernel 4D tensor kernel[output_channels][input_channels][tile]
+        [tile] of FP32 elements.
     bias : Tensor
         bias 1D array bias[output_channels][input_channels][kernel_height]
         [kernel_width] of FP32 elements.
     padding : list
         padding A 4-dim list of [pad_top, pad_bottom, pad_left, pad_right],
         which indicates the padding around the feature map.
+    stride : list
+        stride A 2-dim list of [stride_height, stride_width], which indicates
+        the stride.
 
     Returns
     -------
     output : Tensor
-        output 4D tensor output[batch_size][output_channels][output_height]
-        [output_width] of FP32 elements.
+        output 4D tensor output[batch][output_channels][output_height][output_width]
+        of FP32 elements.
     """
 
+    assert algorithm in (ConvolutionAlgorithm.WT_8x8,
+                         ConvolutionAlgorithm.WT_8x8_FP16)
     assert isinstance(padding, list) and len(padding) == 4
+    assert isinstance(stride, list) and len(stride) == 2
     batch, _, input_height, input_width = data.shape
-    output_channels, _, kernel_height, kernel_width = kernel.shape
-    output_height = (input_height + padding[0] + padding[1] - kernel_height) + 1
-    output_width = (input_width + padding[0] + padding[1] - kernel_width) + 1
+    output_channels, _, _, _ = transformed_kernel.shape
+    kernel_height, kernel_width = (3, 3)
+    output_height = (input_height + padding[0] + padding[1] - kernel_height) / stride[0] + 1
+    output_width = (input_width + padding[0] + padding[1] - kernel_width) / stride[1] + 1
+
+    return _api.extern(
+        (batch, output_channels, output_height, output_width),
+        [data, transformed_kernel, bias] if bias is not None else [data, transformed_kernel],
+        lambda ins, outs: _intrin.call_packed(
+            "tvm.contrib.nnpack.convolution_inference_without_weight_transform",
+            ins[0],
+            ins[1],
+            ins[2] if bias is not None else 0,
+            outs[0], padding[0], padding[1], padding[2], padding[3],
+            stride[0], stride[1], nthreads, algorithm), name="C")
+
+def convolution_inference_weight_transform(
+        kernel, nthreads=1,
+        algorithm=ConvolutionAlgorithm.AUTO):
+    """Create an extern op to do inference convolution of 3D tensor data and
+    4D tensor kernel and 1D tensor bias with nnpack.
+
+    Parameters
+    ----------
+    kernel : Tensor
+        kernel 4D tensor kernel[output_channels][input_channels][kernel_height]
+        [kernel_width] of FP32 elements.
+
+    Returns
+    -------
+    output : Tensor
+        output 4D tensor output[output_channels][input_channels][tile][tile]
+        of FP32 elements.
+    """
+    assert algorithm in (ConvolutionAlgorithm.WT_8x8, ConvolutionAlgorithm.WT_8x8_FP16)
+    output_channels, input_channels, _, _ = kernel.shape
 
+    transform_tile_size = 8
     return _api.extern(
-        (batch, output_channels, output_height, output_width), [data, kernel, bias],
+        (output_channels, input_channels, transform_tile_size, transform_tile_size),
+        [kernel],
         lambda ins, outs: _intrin.call_packed(
-            "tvm.contrib.nnpack.convolution_output", ins[0], ins[1], ins[2],
-            outs[0], padding[0], padding[1], padding[2], padding[3], nthreads), name="C")
+            "tvm.contrib.nnpack.convolution_inference_weight_transform",
+            ins[0], outs[0], nthreads, algorithm), name="transform_kernel")
 
 _init_api("tvm.contrib.nnpack")
diff --git a/python/tvm/contrib/nvcc.py b/python/tvm/contrib/nvcc.py
index 1b7bb840127d..21cc4844087c 100644
--- a/python/tvm/contrib/nvcc.py
+++ b/python/tvm/contrib/nvcc.py
@@ -28,7 +28,7 @@ def compile_cuda(code,
     arch : str
         The architecture
 
-    options : str
+    options : str or list of str
         The additional options
 
     path_target : str, optional
@@ -59,10 +59,16 @@ def compile_cuda(code,
     cmd = ["nvcc"]
     cmd += ["--%s" % target, "-O3"]
     cmd += ["-arch", arch]
-    cmd += ["-o", file_target]
 
     if options:
-        cmd += options
+        if isinstance(options, str):
+            cmd += [options]
+        elif isinstance(options, list):
+            cmd += options
+        else:
+            raise ValueError("options must be str or list of str")
+
+    cmd += ["-o", file_target]
     cmd += [temp_code]
 
     proc = subprocess.Popen(
@@ -97,7 +103,7 @@ def find_cuda_path():
     (out, _) = proc.communicate()
     out = py_str(out)
     if proc.returncode == 0:
-        return os.path.abspath(os.path.join(str(out).strip(), "../.."))
+        return os.path.realpath(os.path.join(str(out).strip(), "../.."))
     cuda_path = "/usr/local/cuda"
     if os.path.exists(os.path.join(cuda_path, "bin/nvcc")):
         return cuda_path
diff --git a/python/tvm/contrib/rocm.py b/python/tvm/contrib/rocm.py
index 10cfaed83e68..172d081ff96a 100644
--- a/python/tvm/contrib/rocm.py
+++ b/python/tvm/contrib/rocm.py
@@ -1,6 +1,6 @@
 """Utility for ROCm backend"""
 import subprocess
-from os.path import join
+from os.path import join, exists
 from . import util
 from .._ffi.base import py_str
 from ..api import register_func, convert
@@ -79,4 +79,5 @@ def callback_rocm_bitcode_path(rocdl_dir="/opt/rocm/lib/"):
         "oclc_unsafe_math_off.amdgcn.bc",
         "oclc_unsafe_math_on.amdgcn.bc"
     ]
-    return convert([join(rocdl_dir, bitcode) for bitcode in bitcode_files])
+    paths = [join(rocdl_dir, bitcode) for bitcode in bitcode_files]
+    return convert([path for path in paths if exists(path)])
diff --git a/python/tvm/contrib/sdaccel.py b/python/tvm/contrib/sdaccel.py
index 0f89911dbdad..6bc246ff1751 100644
--- a/python/tvm/contrib/sdaccel.py
+++ b/python/tvm/contrib/sdaccel.py
@@ -36,7 +36,7 @@ def compile_vhls(kernel_info, device_name):
         platform = os.environ.get("XCL_PLATFORM", os.environ.get("AWS_PLATFORM"))
 
     if platform is None:
-        raise RuntimeError("No Xlinx device specified.")
+        raise RuntimeError("No Xilinx device specified.")
 
     tmp_xo_files = []
     for funcname, code  in kernel_info:
diff --git a/python/tvm/contrib/sparse.py b/python/tvm/contrib/sparse.py
new file mode 100644
index 000000000000..523039912aa9
--- /dev/null
+++ b/python/tvm/contrib/sparse.py
@@ -0,0 +1,163 @@
+"""Tensor and Operation class for computation declaration."""
+# pylint: disable=invalid-name
+from __future__ import absolute_import as _abs
+import numpy as _np
+from .. import expr as _expr
+from .. import api as _api
+from .. import tensor as _tensor
+from .. import ndarray as _nd
+
+float32 = "float32"
+itype = 'int32'
+
+class CSRNDArray(object):
+    """Sparse tensor object in CSR format."""
+    def __init__(self, arg1, ctx=None, shape=None):
+        """Construct a sparse matrix in CSR format.
+
+        Parameters
+        ----------
+        arg1 : numpy.ndarray or a tuple with (data, indices, indptr)
+            The corresponding a dense numpy array,
+            or a tuple for constructing a sparse matrix directly.
+
+        ctx: tvm.TVMContext
+            The corresponding context.
+
+        shape : tuple of int
+            The shape of the array
+        """
+        if isinstance(arg1, tuple):
+            assert len(arg1) == 3
+            self.data, self.indices, self.indptr = arg1
+            self.shape = shape
+        elif isinstance(arg1, _np.ndarray):
+            source_array = arg1
+            ridx, cidx = _np.nonzero(source_array)
+            data = source_array[ridx, cidx]
+            self.data = _nd.array(data, ctx)
+            indices = _np.nonzero(source_array)[1].astype(itype)
+            self.indices = _nd.array(indices, ctx)
+            indptr = [0]+_np.apply_along_axis(_np.count_nonzero, axis=1, arr=source_array).tolist()
+            indptr = _np.cumsum(_np.array(indptr, itype)).astype(itype)
+            self.indptr = _nd.array(indptr, ctx)
+            self.shape = source_array.shape
+        else:
+            raise RuntimeError("Construct CSRNDArray with either a tuple (data, indices, indptr) "
+                               "or a numpy.array, can't handle type %s." % (type(arg1),))
+        self.stype = 'csr'
+        self.dtype = self.data.dtype
+        assert self.shape is not None
+        assert isinstance(self.data, _nd.NDArray)
+        assert isinstance(self.indices, _nd.NDArray)
+        assert str(self.indices.dtype) == 'int32' or \
+            str(self.indices.dtype) == 'int64', str(self.indices.dtype)
+        assert isinstance(self.indptr, _nd.NDArray)
+        assert str(self.indptr.dtype) == 'int32' or \
+            str(self.indptr.dtype) == 'int64', str(self.indptr.dtype)
+
+    def asnumpy(self):
+        """Construct a full matrix and convert it to numpy array."""
+        full = _np.zeros(self.shape, self.dtype)
+        ridx = _np.diff(self.indptr.asnumpy())
+        ridx = _np.hstack((_np.ones((v,), itype)*i for i, v in enumerate(ridx)))
+        full[ridx, self.indices.asnumpy().astype(itype)] = self.data.asnumpy()
+        return full
+
+def array(source_array, ctx=None, shape=None, stype='csr'):
+    """Construct a sparse NDArray from numpy.ndarray"""
+    ret = None
+    if stype == 'csr':
+        ret = CSRNDArray(source_array, shape=shape, ctx=ctx)
+    else:
+        raise NotImplementedError('stype=%s is not supported yet.' % (stype,))
+    return ret
+
+class SparsePlaceholderOp(object):
+    """Placeholder class for sparse tensor representations."""
+    def __init__(self, shape, nonzeros, dtype, name):
+        # pylint: disable=unused-argument
+        """Contructing a bare bone structure for a sparse matrix
+
+        Parameters
+        ----------
+        shape: Tuple of Expr
+            The shape of the tensor
+
+        nonzeros: int
+            The number of non-zero values
+
+        dtype: str, optional
+            The data type of the tensor
+
+        name: str, optional
+            The name hint of the tensor
+        """
+        self.shape = shape
+        self.dtype = dtype
+        self.name = name
+        self.stype = 'unknown'
+
+class CSRPlaceholderOp(SparsePlaceholderOp):
+    """Placeholder class for CSR based sparse tensor representation."""
+    def __init__(self, shape, nonzeros, dtype, name):
+        """Contructing a bare bone structure for a csr_matrix
+
+        Parameters
+        ----------
+        shape: Tuple of Expr
+            The shape of the tensor
+
+        nonzeros: int
+            The number of non-zero values
+
+        dtype: str, optional
+            The data type of the tensor
+
+        name: str, optional
+            The name hint of the tensor
+        """
+        SparsePlaceholderOp.__init__(self, shape, nonzeros, dtype, name)
+        self.stype = 'csr'
+        self.data = _api.placeholder((nonzeros,), dtype=dtype, name=self.name+'_data')
+        self.indices = _api.placeholder((nonzeros,), dtype=itype, name=self.name+'_indices')
+        self.indptr = _api.placeholder((self.shape[0]+1,), dtype=itype, name=self.name+'_indptr')
+        assert isinstance(self.data, _tensor.Tensor)
+        assert isinstance(self.indices, _tensor.Tensor)
+        assert isinstance(self.indptr, _tensor.Tensor)
+
+def placeholder(shape, nonzeros=None, dtype=None, name="placeholder", stype=None):
+    """Construct an empty sparse tensor object.
+
+    Parameters
+    ----------
+    shape: Tuple of Expr
+        The shape of the tensor
+
+    nonzeros: int
+        The number of non-zero values
+
+    dtype: str, optional
+        The data type of the tensor
+
+    name: str, optional
+        The name hint of the tensor
+
+    stype: str, optional
+        The name storage type of the sparse tensor (e.g. csr, coo, ell)
+
+    Returns
+    -------
+    tensor: SparsePlaceholderOp
+        The created sparse tensor placeholder
+    """
+    shape = (shape,) if isinstance(shape, _expr.Expr) else shape
+    nonzeros = 0 if nonzeros is None else nonzeros
+    dtype = float32 if dtype is None else dtype
+    stype = 'csr' if stype is None else stype
+    ret = None
+    if stype == 'csr':
+        ret = CSRPlaceholderOp(shape=shape, nonzeros=nonzeros, dtype=dtype, name=name)
+    else:
+        raise NotImplementedError('stype=%s is not supported yet.' % (stype,))
+    return ret
diff --git a/python/tvm/contrib/verilog.py b/python/tvm/contrib/verilog.py
index 22b8fe1722d4..358366684fa4 100644
--- a/python/tvm/contrib/verilog.py
+++ b/python/tvm/contrib/verilog.py
@@ -111,7 +111,7 @@ def __getattr__(self, name):
 
 
 def _find_vpi_path():
-    curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+    curr_path = os.path.dirname(os.path.realpath(os.path.expanduser(__file__)))
     api_path = os.path.join(curr_path, '../../../lib/')
     vpi_path = [curr_path, api_path]
     vpi_path = [os.path.join(p, 'tvm_vpi.vpi') for p in vpi_path]
@@ -123,7 +123,7 @@ def _find_vpi_path():
 
 def search_path():
     """Get the search directory."""
-    curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+    curr_path = os.path.dirname(os.path.realpath(os.path.expanduser(__file__)))
     ver_path = [os.path.join(curr_path, '../../../verilog/')]
     ver_path += [os.path.join(curr_path, '../../../tests/verilog/unittest/')]
     ver_path += [os.path.join(curr_path, '../../../tests/verilog/integration/')]
diff --git a/python/tvm/contrib/xcode.py b/python/tvm/contrib/xcode.py
index 63fbad2a58cf..186df3f130e9 100644
--- a/python/tvm/contrib/xcode.py
+++ b/python/tvm/contrib/xcode.py
@@ -206,9 +206,9 @@ def popen_test_rpc(host,
     if "TVM_IOS_RPC_ROOT" in os.environ:
         rpc_root = os.environ["TVM_IOS_RPC_ROOT"]
     else:
-        curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+        curr_path = os.path.dirname(os.path.realpath(os.path.expanduser(__file__)))
         rpc_root = os.path.join(curr_path, "../../../apps/ios_rpc")
-    proj_path = os.path.abspath(os.path.join(rpc_root, "tvmrpc.xcodeproj"))
+    proj_path = os.path.realpath(os.path.join(rpc_root, "tvmrpc.xcodeproj"))
     if not os.path.exists(proj_path):
         raise RuntimeError("Cannot find tvmrpc.xcodeproj in %s," +
                            (" please set env TVM_IOS_RPC_ROOT correctly" % rpc_root))
diff --git a/python/tvm/exec/autotvm_log_editor.py b/python/tvm/exec/autotvm_log_editor.py
index c524fb5dc785..458b6eff0e44 100644
--- a/python/tvm/exec/autotvm_log_editor.py
+++ b/python/tvm/exec/autotvm_log_editor.py
@@ -10,9 +10,9 @@
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument("--act", type=str, choices=['pick-best'],
+    parser.add_argument("--act", type=str, choices=['pick-best'], required=True,
                         help="The action")
-    parser.add_argument("--i", type=str, help="The input file or directory")
+    parser.add_argument("--i", type=str, help="The input file or directory", required=True)
     parser.add_argument("--o", type=str, help="The output file")
 
     args = parser.parse_args()
diff --git a/python/tvm/exec/rpc_proxy.py b/python/tvm/exec/rpc_proxy.py
index 678023a10550..363609c81de4 100644
--- a/python/tvm/exec/rpc_proxy.py
+++ b/python/tvm/exec/rpc_proxy.py
@@ -12,7 +12,7 @@
 
 def find_example_resource():
     """Find resource examples."""
-    curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+    curr_path = os.path.dirname(os.path.realpath(os.path.expanduser(__file__)))
     base_path = os.path.join(curr_path, "../../../")
     index_page = os.path.join(base_path, "web/example_rpc.html")
     js_files = [
diff --git a/python/tvm/exec/tophub.py b/python/tvm/exec/tophub.py
deleted file mode 100644
index 9dd951a52701..000000000000
--- a/python/tvm/exec/tophub.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# pylint: disable=invalid-name
-"""Download pre-tuned parameters of ops"""
-
-import argparse
-import logging
-
-from ..autotvm.tophub import list_packages, download_package
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--download", type=str, nargs='+',
-                        help="Target to download. Use 'all' to download for all targets")
-    parser.add_argument("-l", "--list", action='store_true', help="List available packages")
-    args = parser.parse_args()
-
-    logging.basicConfig(level=logging.INFO)
-
-    if args.list:
-        info = list_packages()
-        print("\n%-20s %-20s" % ("Target", "Size"))
-        print("-" * 41)
-        for target, info in info:
-            print("%-20s %-20s" % (target, "%.2f MB" % (info['size']/1000000)))
-
-    if args.download:
-        info = list_packages()
-        all_targets = [x[0] for x in info]
-        if 'all' in args.download:
-            targets = all_targets
-        else:
-            targets = args.download
-
-        for t in targets:
-            if t not in all_targets:
-                print("Warning : cannot find tuned parameters of " + t + ". (ignored)")
-            download_package(t)
diff --git a/python/tvm/expr.py b/python/tvm/expr.py
index 8bf46b7eee62..bdb253d21582 100644
--- a/python/tvm/expr.py
+++ b/python/tvm/expr.py
@@ -60,7 +60,7 @@ def __rfloordiv__(self, other):
         return self.__rdiv__(other)
 
     def __mod__(self, other):
-        return _make.Mod(self, other)
+        return _make._OpMod(self, other)
 
     def __neg__(self):
         neg_one = _api_internal._const(-1, self.dtype)
@@ -85,10 +85,10 @@ def __invert__(self):
         return _make.Call(self.dtype, "bitwise_not", [self], Call.PureIntrinsic, None, 0)
 
     def __lt__(self, other):
-        return _make.LT(self, other)
+        return _make._OpLT(self, other)
 
     def __le__(self, other):
-        return _make.LE(self, other)
+        return _make._OpLE(self, other)
 
     def __eq__(self, other):
         return EqualOp(self, other)
@@ -97,10 +97,10 @@ def __ne__(self, other):
         return NotEqualOp(self, other)
 
     def __gt__(self, other):
-        return _make.GT(self, other)
+        return _make._OpGT(self, other)
 
     def __ge__(self, other):
-        return _make.GE(self, other)
+        return _make._OpGE(self, other)
 
     def __nonzero__(self):
         raise ValueError("Cannot use and / or / not operator to Expr, hint: " +
@@ -122,7 +122,7 @@ def equal(self, other):
         ret : Expr
             The equality expression.
         """
-        return _make.EQ(self, other)
+        return _make._OpEQ(self, other)
 
     def astype(self, dtype):
         """Cast the expression to other type.
@@ -169,7 +169,7 @@ def __bool__(self):
 
     def asnode(self):
         """Convert node."""
-        return _make.EQ(self.a, self.b)
+        return _make._OpEQ(self.a, self.b)
 
 
 class NotEqualOp(NodeGeneric, ExprOp):
@@ -201,7 +201,7 @@ def __bool__(self):
 
     def asnode(self):
         """Convert node."""
-        return _make.NE(self.a, self.b)
+        return _make._OpNE(self.a, self.b)
 
 
 class Expr(ExprOp, NodeBase):
@@ -225,127 +225,548 @@ class LogicalExpr(Expr):
 
 @register_node("Variable")
 class Var(Expr):
-    """Symbolic variable."""
-    pass
+    """Symbolic variable.
+
+    Parameters
+    ----------
+    name : str
+        The name
+
+    dtype : int
+        The data type
+    """
+    def __init__(self, name, dtype):
+        self.__init_handle_by_constructor__(
+            _api_internal._Var, name, dtype)
+
 
 @register_node
 class Reduce(Expr):
-    pass
+    """Reduce node.
+
+    Parameters
+    ----------
+    combiner : CommReducer
+        The combiner.
+
+    src : list of Expr
+        The source expression.
+
+    rdom : list of IterVar
+        The iteration domain
+
+    condition : Expr
+        The reduce condition.
+
+    value_index : int
+        The value index.
+    """
+    def __init__(self, combiner, src, rdom, condition, value_index):
+        self.__init_handle_by_constructor__(
+            _make.Reduce, combiner, src, rdom,
+            condition, value_index)
+
 
 @register_node
 class FloatImm(ConstExpr):
-    pass
+    """Float constant.
+
+    Parameters
+    ----------
+    dtype : str
+        The data type
+
+    value : float
+        The constant value.
+    """
+    def __init__(self, dtype, value):
+        self.__init_handle_by_constructor__(
+            _make.FloatImm, dtype, value)
 
 @register_node
 class IntImm(ConstExpr):
-    pass
+    """Int constant.
+
+    Parameters
+    ----------
+    dtype : str
+        The data type
+
+    value : int
+        The constant value.
+    """
+    def __init__(self, dtype, value):
+        self.__init_handle_by_constructor__(
+            _make.IntImm, dtype, value)
+
+    def __int__(self):
+        return self.value
+
 
 @register_node
 class UIntImm(ConstExpr):
-    pass
+    """UInt constant.
+
+    Parameters
+    ----------
+    dtype : str
+        The data type
+
+    value : int
+        The constant value.
+    """
+    def __init__(self, dtype, value):
+        self.__init_handle_by_constructor__(
+            _make.UIntImm, dtype, value)
+
 
 @register_node
 class StringImm(ConstExpr):
-    pass
+    """String constant.
+
+    Parameters
+    ----------
+    value : str
+        The value of the function.
+    """
+    def __init__(self, value):
+        self.__init_handle_by_constructor__(
+            _make.StringImm, value)
+
 
 @register_node
 class Cast(Expr):
-    pass
+    """Cast expression.
+
+    Parameters
+    ----------
+    dtype : str
+        The data type
+
+    value : Expr
+        The value of the function.
+    """
+    def __init__(self, dtype, value):
+        self.__init_handle_by_constructor__(
+            _make.Cast, dtype, value)
+
 
 @register_node
 class Add(BinaryOpExpr):
-    pass
+    """Add node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.Add, a, b)
+
 
 @register_node
 class Sub(BinaryOpExpr):
-    pass
+    """Sub node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.Sub, a, b)
+
 
 @register_node
 class Mul(BinaryOpExpr):
-    pass
+    """Mul node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.Mul, a, b)
+
 
 @register_node
 class Div(BinaryOpExpr):
-    pass
+    """Div node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.Div, a, b)
+
 
 @register_node
 class Mod(BinaryOpExpr):
-    pass
+    """Mod node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.Mod, a, b)
+
 
 @register_node
 class Min(BinaryOpExpr):
-    pass
+    """Min node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.Min, a, b)
+
 
 @register_node
 class Max(BinaryOpExpr):
-    pass
+    """Max node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.Max, a, b)
+
 
 @register_node
 class EQ(CmpExpr):
-    pass
+    """EQ node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.EQ, a, b)
+
 
 @register_node
 class NE(CmpExpr):
-    pass
+    """NE node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.NE, a, b)
+
 
 @register_node
 class LT(CmpExpr):
-    pass
+    """LT node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.LT, a, b)
+
 
 @register_node
 class LE(CmpExpr):
-    pass
+    """LE node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.LE, a, b)
+
 
 @register_node
 class GT(CmpExpr):
-    pass
+    """GT node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.GT, a, b)
+
 
 @register_node
 class GE(CmpExpr):
-    pass
+    """GE node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.GE, a, b)
+
 
 @register_node
 class And(LogicalExpr):
-    pass
+    """And node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.And, a, b)
+
 
 @register_node
 class Or(LogicalExpr):
-    pass
+    """Or node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.Or, a, b)
+
 
 @register_node
 class Not(LogicalExpr):
-    pass
+    """Not node.
+
+    Parameters
+    ----------
+    a : Expr
+        The input value
+    """
+    def __init__(self, a):
+        self.__init_handle_by_constructor__(
+            _make.Not, a)
+
 
 @register_node
 class Select(Expr):
-    pass
+    """Select node.
+
+    Parameters
+    ----------
+    condition : Expr
+        The condition expression.
+
+    true_value : Expr
+        The value to take when condition is true.
+
+    false_value : Expr
+        The value to take when condition is false.
+    """
+    def __init__(self, condition, true_value, false_value):
+        self.__init_handle_by_constructor__(
+            _make.Select, condition, true_value, false_value)
+
 
 @register_node
 class Load(Expr):
-    pass
+    """Load node.
+
+    Parameters
+    ----------
+    dtype : str
+        The data type.
+
+    buffer_var : Var
+        The buffer variable in the load expression.
+
+    index : Expr
+        The index in the load.
+
+    predicate : Expr
+        The load predicate.
+    """
+    def __init__(self, dtype, buffer_var, index, predicate):
+        self.__init_handle_by_constructor__(
+            _make.Load, dtype, buffer_var, index, predicate)
+
 
 @register_node
 class Ramp(Expr):
-    pass
+    """Ramp node.
+
+    Parameters
+    ----------
+    base : Expr
+        The base expression.
+
+    stride : ramp stride
+        The stride of the ramp.
+
+    lanes : int
+        The lanes of the expression.
+    """
+    def __init__(self, base, stride, lanes):
+        self.__init_handle_by_constructor__(
+            _make.Ramp, base, stride, lanes)
+
 
 @register_node
 class Broadcast(Expr):
-    pass
+    """Broadcast node.
+
+    Parameters
+    ----------
+    value : Expr
+        The value of the expression.
+
+    lanes : int
+        The lanes of the expression.
+    """
+    def __init__(self, value, lanes):
+        self.__init_handle_by_constructor__(
+            _make.Broadcast, value, lanes)
+
 
 @register_node
 class Shuffle(Expr):
-    pass
+    """Shuffle node.
+
+    Parameters
+    ----------
+    vectors : Array of Expr
+        The vectors
+
+    indices : Array of indices
+        The indices
+    """
+    def __init__(self, vectors, indices):
+        self.__init_handle_by_constructor__(
+            _make.Shuffle, vectors, indices)
+
 
 @register_node
 class Call(Expr):
+    """Call node.
+
+    Parameters
+    ----------
+    dtype : str
+        The return data type
+
+    name : str
+        The name of the function
+
+    args : list of Expr
+        The input arguments to the call
+
+    call_type : int
+        The type of the call
+
+    func : Operation, optional
+        Operation if call_type is Halide
+
+    value_index : int
+        The output value index
+    """
     Extern = 0
     ExternCPlusPlus = 1
     PureExtern = 2
     Halide = 3
     Intrinsic = 4
     PureIntrinsic = 5
+    def __init__(self, dtype, name, args, call_type, func, value_index):
+        self.__init_handle_by_constructor__(
+            _make.Call, dtype, name, args, call_type, func, value_index)
 
 
 @register_node
 class Let(Expr):
-    pass
+    """Let node.
+
+    Parameters
+    ----------
+    var : Var
+        The variable in the binding.
+
+    value : Expr
+        The value in to be binded.
+
+    body : Expr
+        The body expression.
+    """
+    def __init__(self, var, value, body):
+        self.__init_handle_by_constructor__(
+            _make.Let, var, value, body)
diff --git a/python/tvm/generic.py b/python/tvm/generic.py
index 2926f73d5a02..ab1a80d3f612 100644
--- a/python/tvm/generic.py
+++ b/python/tvm/generic.py
@@ -24,7 +24,7 @@ def add(lhs, rhs):
     op : tvm.Expr
         The result Expr of add operaton.
     """
-    return _make.Add(lhs, rhs)
+    return _make._OpAdd(lhs, rhs)
 
 
 def subtract(lhs, rhs):
@@ -42,7 +42,7 @@ def subtract(lhs, rhs):
     op : tvm.Expr
         The result Expr of subtract operaton.
     """
-    return _make.Sub(lhs, rhs)
+    return _make._OpSub(lhs, rhs)
 
 
 def multiply(lhs, rhs):
@@ -60,7 +60,7 @@ def multiply(lhs, rhs):
     op : tvm.Expr
         The result Expr of multiply operaton.
     """
-    return _make.Mul(lhs, rhs)
+    return _make._OpMul(lhs, rhs)
 
 
 def divide(lhs, rhs):
@@ -78,7 +78,7 @@ def divide(lhs, rhs):
     op : tvm.Expr
         The result Expr of divide operaton.
     """
-    return _make.Div(lhs, rhs)
+    return _make._OpDiv(lhs, rhs)
 
 
 def cast(src, dtype):
diff --git a/python/tvm/hybrid/__init__.py b/python/tvm/hybrid/__init__.py
index e0a39c562f0f..6c137490c38e 100644
--- a/python/tvm/hybrid/__init__.py
+++ b/python/tvm/hybrid/__init__.py
@@ -7,4 +7,5 @@
 2. Developers can build HalideIR by writing Python code.
 """
 
-from .api import script, parse
+from .api import script
+from .parser import parse_python
diff --git a/python/tvm/hybrid/api.py b/python/tvm/hybrid/api.py
index 48e192d4ba39..d43217ca5dfc 100644
--- a/python/tvm/hybrid/api.py
+++ b/python/tvm/hybrid/api.py
@@ -1,9 +1,12 @@
 """APIs of lowering the Python subset to HalideIR"""
 from __future__ import absolute_import as _abs
 
-import types
 from .._ffi.base import decorate
+from .. import _api_internal as _tvm_internal
+from ..tensor import Tensor
+
 from .parser import parse_python
+from .util import _pruned_source
 
 
 def script(pyfunc):
@@ -17,40 +20,24 @@ def script(pyfunc):
     hybrid_func : function
         A decorated hybrid script function.
     """
-    def wrapped_func(func, *args, **kwargs):
+    def wrapped_func(func, *args, **kwargs): #pylint: disable=missing-docstring
         from .util import _enter_hybrid_runtime, _restore_runtime, _is_tvm_arg_types
         if _is_tvm_arg_types(args):
-            return parse(func, args)
+            src = _pruned_source(func)
+            parser = parse_python(src, func.__globals__, args)
+
+            input_tensors = []
+            for i in args:
+                if isinstance(i, Tensor):
+                    input_tensors.append(i)
+            op = _tvm_internal._HybridOp(parser.func_name, "HybridOp", None, input_tensors,
+                                         parser.outputs, parser.parsed_body)
+            res = [op.output(i) for i in range(len(parser.outputs))]
+            return res[0] if len(res) == 1 else res
 
         intersect = _enter_hybrid_runtime(func)
         value = func(*args, **kwargs)
         _restore_runtime(func, intersect)
         return value
-    return decorate(pyfunc, wrapped_func)
-
-
-def parse(func, args):
-    """Parse a subset of Python to HalideIR
-
-    Parameters
-    ----------
-    func : str or types.FunctionType
-        If it is a string, parse the source code
-        If it is a function, parse the function
 
-    args : list of Buffer or Tensor or Var
-        The argument lists to the function.
-        Leave it None if no buffer is related to the function to be parsed
-
-    Returns
-    -------
-    root : Stmt
-        The result Halide IR and the parser class instance.
-    """
-    from .util import _pruned_source
-    if isinstance(func, str):
-        src = func
-    else:
-        assert isinstance(func, types.FunctionType)
-        src = _pruned_source(func)
-    return parse_python(src, args)
+    return decorate(pyfunc, wrapped_func)
diff --git a/python/tvm/hybrid/calls.py b/python/tvm/hybrid/calls.py
new file mode 100644
index 000000000000..730b56f58bd2
--- /dev/null
+++ b/python/tvm/hybrid/calls.py
@@ -0,0 +1,92 @@
+"""Intrinsics of TVM-Python Hybrid Script for Python compilation time
+semantic support."""
+
+from .. import api as _api
+from .. import expr as _expr
+from .. import make as _make
+from ..container import Array
+from .. import ir_pass
+from ..stmt import For
+from .util import _internal_assert
+
+#pylint: disable=redefined-builtin
+
+LOOP_INTRIN = {
+    'range'    : For.Serial,
+    'unroll'   : For.Unrolled,
+    'parallel' : For.Parallel,
+    'vectorize': For.Vectorized,
+}
+
+def _range(annotation, args):
+    """Handling TVM loop types"""
+    n = len(args)
+    if n == 1:
+        low, ext = _api.const(0, dtype='int32'), args[0]
+    else:
+        _internal_assert(n == 2, "A loop intrinsic should only have 1 or 2 arguments!")
+        low, ext = args[0], args[1]
+    if not ir_pass.Equal(low, _api.const(0, dtype='int32')):
+        ext = ext - low
+    for_type = LOOP_INTRIN[annotation]
+    iter_var = None
+    return iter_var, low, ext, for_type
+
+
+range = unroll = vectorize = parallel = _range #pylint: disable=invalid-name
+
+
+def bind(func_id, args):
+    """Handling TVM thread binding"""
+    _internal_assert(func_id == "bind", "This function cannot be directly invoked!")
+    _internal_assert(len(args) == 2, "A loop bind should only have 2 arguments!")
+    _internal_assert(isinstance(args[0], str), \
+                     "A loop bind's first argument should be a string!")
+    iter_var = _api.thread_axis(args[0])
+    low, ext = _api.const(0), args[1]
+    for_type = None
+    return iter_var, low, ext, for_type
+
+
+def _math_intrin(func_id, args):
+    from .. import intrin
+    return getattr(intrin, func_id)(*args)
+
+sqrt = log = exp = tanh = sigmoid = power = popcount = _math_intrin #pylint: disable=invalid-name
+
+
+def _min_max(func_id, args):
+    _internal_assert(len(args) == 2, "Max/Min function should have 2 elements")
+    return getattr(_make, func_id.title())(args[0], args[1])
+
+
+min = max = _min_max #pylint: disable=invalid-name
+
+
+def _allocate_tensor(func_id, args):
+    """Handling TVM tensor allocation.
+    You may refer hybrid.intrin.allocate for more details."""
+    n = len(args)
+    _internal_assert(isinstance(_api.convert(args[0]), Array), \
+                     "allocate's first argument should be a tuple of shape!")
+    shape = args[0]
+    for i in shape:
+        _internal_assert(isinstance(i, _expr.Expr), "The shape should be an expression")
+    if n > 1:
+        _internal_assert(isinstance(args[1], str),
+                         "The data type should be an str")
+        _internal_assert(args[1].startswith('int') or args[1].startswith('float'), \
+                         "The data type should be either int or float!")
+        dtype = args[1]
+    else:
+        dtype = 'float32'
+    if n > 2:
+        _internal_assert(isinstance(args[2], str), \
+                         "The data scope should be an string")
+        _internal_assert(func_id != 'output_tensor', "Output tensor cannot specify scope")
+        scope = args[2]
+    else:
+        scope = 'global' if func_id != 'output_tensor' else 'output'
+    return (shape, dtype, scope)
+
+output_tensor = allocate = _allocate_tensor #pylint: disable=invalid-name
diff --git a/python/tvm/hybrid/intrin.py b/python/tvm/hybrid/intrin.py
index b3fb64579b60..48e92a8bf5ac 100644
--- a/python/tvm/hybrid/intrin.py
+++ b/python/tvm/hybrid/intrin.py
@@ -1,7 +1,6 @@
-"""Intrinsics of TVM-Python Hybrid Script for Python runtime"""
+"""Intrinsics of TVM-Python Hybrid Script for Python emulation runtime"""
 
 import numpy
-from ..stmt import For
 
 class _range(object):
     """Base class of the loop ranges in hybrid script"""
@@ -48,6 +47,7 @@ def allocate(shape, dtype='float32', scope='global'): #pylint: disable=unused-ar
     """
     return numpy.zeros(shape).astype(dtype)
 
+output_tensor = allocate #pylint: disable=invalid-name
 
 def popcount(x):
     """
@@ -87,28 +87,17 @@ def sigmoid(x):
 
 
 HYBRID_GLOBALS = {
-    'unroll'    : unroll,
-    'vectorize' : vectorize,
-    'parallel'  : parallel,
-    'allocate'  : allocate,
-    'bind'      : bind,
-    'sqrt'      : numpy.sqrt,
-    'log'       : numpy.log,
-    'tanh'      : numpy.tanh,
-    'power'     : numpy.power,
-    'exp'       : numpy.exp,
-    'sigmoid'   : sigmoid,
-    'popcount'  : popcount
+    'unroll'       : unroll,
+    'vectorize'    : vectorize,
+    'parallel'     : parallel,
+    'allocate'     : allocate,
+    'output_tensor': output_tensor,
+    'bind'         : bind,
+    'sqrt'         : numpy.sqrt,
+    'log'          : numpy.log,
+    'tanh'         : numpy.tanh,
+    'power'        : numpy.power,
+    'exp'          : numpy.exp,
+    'sigmoid'      : sigmoid,
+    'popcount'     : popcount
 }
-
-
-LOOP_INTRIN = {
-    'range'    : For.Serial,
-    'unroll'   : For.Unrolled,
-    'parallel' : For.Parallel,
-    'vectorize': For.Vectorized,
-    'bind'     : None
-}
-
-
-MATH_INTRIN = ['sqrt', 'log', 'exp', 'tanh', 'sigmoid', 'power', 'popcount']
diff --git a/python/tvm/hybrid/parser.py b/python/tvm/hybrid/parser.py
index 1e532367a321..26b0e141d0db 100644
--- a/python/tvm/hybrid/parser.py
+++ b/python/tvm/hybrid/parser.py
@@ -2,23 +2,26 @@
 
 import ast
 import operator
+import logging
 import sys
-from .util import make_nop, halide_imm_types
-from .intrin import LOOP_INTRIN, MATH_INTRIN
+from .util import _internal_assert
+from . import calls
+from . import util
 from .var_decl import determine_variable_usage
-from ..api import thread_axis
+from ..api import all as _all
+from ..api import any as _any
+from ..tensor import Tensor, Operation
 from .. import expr as _expr
 from .. import make as _make
-from .. import intrin
 from .. import api  as _api
 from .. import ir_pass as _ir_pass
 
 def list_to_block(visit, lst):
     """Convert a list of Python IR nodes to HalideIR Block"""
-    lst = list(map(visit, lst))
-    lst = [stmt for stmt in lst if not _ir_pass.Equal(stmt, make_nop())]
+    lst = [visit(stmt) for stmt in lst if not util.is_docstring(stmt)]
+    lst = [stmt for stmt in lst if not _ir_pass.Equal(stmt, util.make_nop())]
     if not lst:
-        return make_nop()
+        return util.make_nop()
     if len(lst) == 1:
         return lst[0]
     body = lst[0]
@@ -32,20 +35,23 @@ class HybridParser(ast.NodeVisitor):
 
 
     _binop_maker = {
-        ast.Add   : operator.add,
-        ast.Sub   : operator.sub,
-        ast.Mult  : operator.mul,
-        ast.Div   : operator.div if sys.version_info[0] == 2 else operator.truediv,
-        ast.Mod   : operator.mod,
-        ast.BitOr : operator.or_,
-        ast.BitAnd: operator.and_,
-        ast.BitXor: operator.xor,
-        ast.Gt    : operator.gt,
-        ast.GtE   : operator.ge,
-        ast.Lt    : operator.lt,
-        ast.LtE   : operator.le,
-        ast.Eq    : operator.eq,
-        ast.NotEq : operator.ne,
+        ast.Add     : operator.add,
+        ast.Sub     : operator.sub,
+        ast.Mult    : operator.mul,
+        ast.Div     : operator.div if sys.version_info[0] == 2 else operator.truediv,
+        ast.FloorDiv: operator.div if sys.version_info[0] == 2 else operator.truediv,
+        ast.Mod     : operator.mod,
+        ast.BitOr   : operator.or_,
+        ast.BitAnd  : operator.and_,
+        ast.BitXor  : operator.xor,
+        ast.Gt      : operator.gt,
+        ast.GtE     : operator.ge,
+        ast.Lt      : operator.lt,
+        ast.LtE     : operator.le,
+        ast.Eq      : operator.eq,
+        ast.NotEq   : operator.ne,
+        ast.And   : _all,
+        ast.Or    : _any,
     }
 
 
@@ -56,7 +62,7 @@ class HybridParser(ast.NodeVisitor):
     }
 
 
-    def __init__(self, args, usage, func_name=None):
+    def __init__(self, args, usage, symbols, func_name=None):
         """
         Parameters
         ----------
@@ -72,66 +78,84 @@ def __init__(self, args, usage, func_name=None):
             The name of the function to be lowered; if not provided,
             the compiler will use the name in the AST
         """
-        self.args = args[:]
+        self.args = list(args)
         self.usage = usage.copy()
         self._args = {} # Dict maps arg name to actual arg instance (either a var or a buffer)
-        self.var_buffers = {} # Buffers formed by mutatble variables
-        self.alloc_buffers = {} # Buffers formed by allocate instructions
+        self.alloc_buffers = {} # Buffers formed by explicit allocate instructions
         self.loops_above = {} # State variable that indicates loop levels above the current node
-        self.var_consts = {} # Variables that are determined as readonly in previous stage
+        self.variables = {} # The status of defined variables
         self.func_name = func_name # The name of the function to be lowered
-        self.iter_axis = []
+        self.outputs = [] # Output tensors' name
+        self.side_effect = set() # Tensors with side effects
+        self.parsed_body = None # The parsed HalideIR body
+        self.returned = False # If this function has a valid return
+        self.symbols = symbols # The global context
 
 
     def wrap_up_realize(self, node, body):
         """Wrap up all the variables which will no longer be used"""
+        pop_buf = []
+        pop_var = []
         for key, val in self.usage.items():
-            if key in self.var_consts.keys():
-                continue
             _, level, _ = val
-            if level == node:
-                if key in self.var_buffers.keys():
-                    _buf = self.var_buffers[key]
-                    _scope = 'global'
-                else:
-                    _buf, _scope = self.alloc_buffers[key]
-                _domain = [_make.range_by_min_extent(0, i) for i in _buf.shape]
-                _dtype = _buf.dtype
-                _true = _api.convert(True)
-                body = _make.Realize(_buf.op, 0, _dtype, _domain, _true, body)
-                body = _make.AttrStmt(_buf.op, 'realize_scope', _api.convert(_scope), body)
+            if level != node:
+                continue
+            if key in self._args.keys():
+                continue
+            if key in self.alloc_buffers.keys():
+                _buf, _scope = self.alloc_buffers[key]
+                if _scope == 'output':
+                    continue
+                pop_buf.append(key)
+            else:
+                _internal_assert(key in self.variables.keys(),
+                                 "Key should be either in one of args, buffers, and vars")
+                if not isinstance(self.variables[key], tuple):
+                    continue
+                _buf, _scope = self.variables[key]
+                pop_var.append(key)
+            _domain = [_make.range_by_min_extent(0, i) for i in _buf.shape]
+            _dtype = _buf.dtype
+            _true = _api.convert(True)
+            body = _make.Realize(_buf.op, 0, _dtype, _domain, _true, body)
+            body = _make.AttrStmt(_buf.op, 'realize_scope', _api.convert(_scope), body)
+
+        for elem in pop_buf:
+            self.alloc_buffers.pop(elem)
+        for elem in pop_var:
+            self.variables.pop(elem)
         return body
 
 
-    def _get_buffer_from_id(self, s):
-        if s not in self._args.keys() and s not in self.alloc_buffers.keys():
-            raise ValueError("This %s is expected to be in argument list or allocated buffer!" % s)
-        if s in self._args.keys() and s in self.alloc_buffers.keys():
-            raise ValueError("%s, a buffer cannot be both argument and allocated!" % s)
+    def _get_buffer_from_id(self, s, for_provide=False):
+        _internal_assert((s in self._args.keys()) + (s in self.alloc_buffers.keys()) == 1,
+                         "This %s is expected to be in either \
+                          argument list or allocated buffer!" % s)
         if s in self._args.keys():
+            if for_provide:
+                self.side_effect.add(self._args[s])
             return self._args[s]
         return self.alloc_buffers[s][0]
 
 
-
     #pylint: disable=invalid-name, missing-docstring
     def visit_Module(self, node):
-        if len(node.body) != 1:
-            raise ValueError("Only one-function source code can be fed to this parser!")
+        _internal_assert(len(node.body) == 1, \
+                         "Only one-function source code can be fed to this parser!")
         return self.visit(node.body[0])
 
 
     def visit_FunctionDef(self, node):
-        if len(node.args.args) != len(self.args):
-            raise ValueError("The number of arguments passed to the function\
-                should be the same as it is defined!")
+        _internal_assert(len(node.args.args) == len(self.args), \
+                         "The number of arguments passed to the \
+                         function should be the same as it is defined!")
+        if self.func_name is None:
+            self.func_name = node.name
         for idx, arg in enumerate(node.args.args):
             _attr = 'id' if sys.version_info[0] < 3 else 'arg' # To make py2 and 3 compatible
             self._args[getattr(arg, _attr)] = self.args[idx]
         res = list_to_block(self.visit, node.body)
         res = self.wrap_up_realize(node, res)
-        if self.func_name is None:
-            self.func_name = node.name
         return res
 
 
@@ -140,100 +164,126 @@ def visit_Expr(self, node):
 
 
     def visit_Name(self, node):
-        _id = node.id
-        if _id in self._args.keys() and isinstance(self._args[_id], _expr.Var):
-            return self._args[_id]
-        elif _id in self.loops_above.keys():
-            return self.loops_above[_id]
-        if _id in self._args.keys():
-            raise ValueError("This id %s should be handled in visit_Subscript!" % _id)
-        if _id  not in self.usage.keys():
-            raise ValueError("This id %s is expected to be a defined variable!" % _id)
-        # Buffer
-        if _id in self.var_buffers.keys():
-            _buf = self.var_buffers[_id]
-            return _make.Call(_buf.dtype, _id, [_api.const(0)], _expr.Call.Halide, _buf.op, 0)
-        # Compilation time constant
-        if _id not in self.var_consts.keys():
-            raise ValueError("This id %s is expected to a compilation time constant!" % _id)
-        return self.var_consts[_id]
+        name = node.id
+        if name in self.loops_above.keys():
+            return self.loops_above[name]
+        elif name in self.variables.keys():
+            res = self.variables[name]
+            if isinstance(res, tuple):
+                buf = res[0]
+                if isinstance(node.ctx, ast.Load):
+                    return _make.Call(buf.dtype, buf.name, [_api.const(0)], \
+                                      _expr.Call.Halide, buf.op, buf.value_index)
+                return buf, [_api.const(0)]
+            if isinstance(node.ctx, ast.Load):
+                return res
+            return None
+        buf = self._get_buffer_from_id(name)
+        return buf
 
 
     def visit_Num(self, node):
         return _api.const(node.n)
 
 
+    def visit_AugAssign(self, node):
+        buf = self.visit(node.target)
+        rhs = self.visit(node.value)
+        if isinstance(buf, tuple):
+            _internal_assert(len(buf) == 2, "LHS is supposed to be (buf, args)!")
+            buf, args = buf
+        else:
+            args = [_api.const(0)]
+        _internal_assert(isinstance(buf, Tensor), "LHS is supposed to be Tensor!")
+
+        read = _make.Call(buf.dtype, buf.name, args, _expr.Call.Halide, buf.op, buf.value_index)
+        value = HybridParser._binop_maker[type(node.op)](read, rhs)
+
+        return _make.Provide(buf.op, 0, value, args)
+
+
     def visit_Assign(self, node):
-        if len(node.targets) != 1:
-            raise ValueError("So far only one-valued assignment is supported!")
-        lhs = node.targets[0]
         rhs = self.visit(node.value)
+        if isinstance(rhs, Operation):
+            rmap = {}
+            _internal_assert(len(node.targets) == rhs.num_outputs, \
+                             "Unable to detuple the outs to targets")
+            for i in range(rhs.num_outputs):
+                _internal_assert(isinstance(node.targets[i], ast.Name),
+                                 "You should bind a pure name to the tensors")
+                self.alloc_buffers[node.targets[i].id] = (rhs.output(i), 'global')
+                rmap[rhs.outputs[i].op] = rhs.output(i)
+            return util.replace_io(rhs.body, rmap)
+
+        _internal_assert(len(node.targets) == 1, "So far only one-valued assignment is supported!")
+        lhs = node.targets[0]
         if isinstance(rhs, _expr.Expr):
             rhs = _ir_pass.Simplify(rhs)
         if isinstance(lhs, ast.Name):
             #TODO: support defined intermediate buffer later
             lhs_ = lhs
             lhs = lhs.id
-            if lhs in self.loops_above.keys():
-                raise ValueError("You CAN NEVER overwrite a loop variable!")
+            _internal_assert(lhs not in self.loops_above.keys(), \
+                             "Loop variable cannot be overwritten!")
             decl, _, rw = self.usage[lhs]
             if decl == lhs_:
-                if lhs in self.var_consts.keys():
-                    raise ValueError("BUG: A constant cannot be overwritten!")
-                if lhs in self.var_buffers.keys() or lhs in self.alloc_buffers.keys():
-                    raise ValueError("BUG: This value should not be defined before this point!")
+                _internal_assert(lhs not in self.variables.keys() and
+                                 lhs not in self.alloc_buffers.keys(), \
+                                 "This value should not be defined before this point!")
                 if isinstance(rhs, tuple):
                     shape, dtype, scope = rhs
                     ph = _api.placeholder(shape, dtype=dtype, name=lhs)
                     self.alloc_buffers[lhs] = (ph, scope)
-                    return make_nop()
-                if isinstance(rhs, halide_imm_types) and ast.Store not in rw:
-                    self.var_consts[lhs] = rhs
+                    if scope == 'output':
+                        self.outputs.append(lhs)
+                    return util.make_nop()
+                if isinstance(rhs, util.halide_imm_types) and ast.Store not in rw:
+                    self.variables[lhs] = rhs
                 else:
-                    self.var_buffers[lhs] = _api.placeholder((1, ), dtype=rhs.dtype, name=lhs)
-            if lhs in self.var_consts.keys():
-                return make_nop()
-            else:
-                if lhs not in self.var_buffers.keys():
-                    raise ValueError("BUG: This variable should be defined before!")
-                tgt = self.var_buffers[lhs]
-                return _make.Provide(tgt.op, 0, rhs, [_api.const(0, dtype=rhs.dtype)])
+                    ph = _api.placeholder((1, ), dtype=rhs.dtype, name=lhs)
+                    self.variables[lhs] = (ph, 'global')
+            lhs = self.visit(lhs_)
+            if lhs is not None:
+                buf, args = lhs
+                return _make.Provide(buf.op, 0, rhs, args)
+            return util.make_nop()
         else:
-            lhs = self.visit(lhs)
-            if not isinstance(lhs, _expr.Call):
-                raise ValueError("An array access's LHS is expected to be a expr.Call!")
-            #TODO: support slice later
-            buf = self._get_buffer_from_id(lhs.name)
-            return _make.Provide(buf.op, 0, rhs, lhs.args)
+            lhs, args = self.visit(lhs)
+            _internal_assert(isinstance(lhs, Tensor), \
+                             "An array access's LHS is expected to be a expr.Call!")
+            res = _make.Provide(lhs.op, lhs.value_index, rhs, args)
+            return res
 
 
     def visit_Index(self, node):
         if isinstance(node.value, ast.Tuple):
-            return [self.visit(i) for i in node.value.elts]
+            return self.visit(node.value)
         return [self.visit(node.value)]
 
 
+    def visit_Attribute(self, node):
+        _internal_assert(isinstance(node.value, ast.Name), \
+                         "For atrribute access, only both names are supported so far!")
+        buf = self._get_buffer_from_id(node.value.id)
+        return getattr(buf, node.attr)
+
+
     def visit_Subscript(self, node):
         args = self.visit(node.slice)
         if isinstance(node.value, ast.Name):
-            array = node.value.id
-            _buf = self._get_buffer_from_id(array)
-            return _make.Call(_buf.dtype, array, args, _expr.Call.Halide, _buf.op, 0)
-        elif isinstance(node.value, ast.Attribute):
-            if not isinstance(node.value.value, ast.Name):
-                raise ValueError("The root of array access is expect to be a id!")
-            if node.value.attr != "shape":
-                raise ValueError("Attribute access so far only 'shape' is supported!")
-            if len(args) != 1:
-                raise ValueError("For 'shape' access the argument should be only one!")
-            args = args[0]
-            #TODO: maybe support non-constant value later?
-            if not isinstance(args, (_expr.IntImm, _expr.UIntImm)):
-                raise ValueError("So far only constant shape access supported!")
-            buf = self._get_buffer_from_id(node.value.value.id)
-            return buf.shape[args.value]
-        else:
-            raise ValueError("Not supported yet!")
+            buf = self.visit(node.value)
+            if isinstance(node.ctx, ast.Load):
+                return _make.Call(buf.dtype, buf.name, args, \
+                                  _expr.Call.Halide, buf.op, buf.value_index)
+            return buf, args
+
+        shape = self.visit(node.value)
+        _internal_assert(len(args) == 1, "For 'shape' access the argument should be only one!")
+        args = args[0]
+        #TODO: maybe support non-constant value later?
+        _internal_assert(isinstance(args, (_expr.IntImm, _expr.UIntImm)), \
+                         "So far only constant shape access supported!")
+        return shape[args.value]
 
 
     def visit_With(self, node):
@@ -241,14 +291,11 @@ def visit_With(self, node):
             context = node.context_expr
             option = node.optional_vars
         else:
-            if len(node.items) != 1:
-                raise ValueError("Only one with element is supported so far!")
+            _internal_assert(len(node.items) == 1, "Only one with element is supported so far!")
             context = node.items[0].context_expr
             option = node.items[0].optional_vars
-        if not isinstance(context, ast.Call):
-            raise ValueError("The object must be a Python function call!")
-        if not isinstance(option, ast.Name):
-            raise ValueError("The object after 'as' must be an id!")
+        _internal_assert(isinstance(context, ast.Call), "The object must be a Python func call!")
+        _internal_assert(isinstance(option, ast.Name), "The object after 'as' must be an id!")
         self.annotation[option.id] = context.func.id
         return list_to_block(self.visit, node.body)
 
@@ -259,7 +306,7 @@ def visit_If(self, node):
         if node.orelse:
             else_body = list_to_block(self.visit, node.orelse)
         else:
-            else_body = make_nop()
+            else_body = util.make_nop()
         return _make.IfThenElse(cond, if_body, else_body)
 
 
@@ -271,13 +318,28 @@ def visit_IfExp(self, node):
 
 
     def visit_Compare(self, node):
-        lhs = self.visit(node.left)
-        if len(node.ops) != 1:
-            raise ValueError("Only one compare op is supported!")
-        if len(node.comparators) != 1:
-            raise ValueError("Only one comparator is supported!")
-        rhs = self.visit(node.comparators[0])
-        return HybridParser._binop_maker[type(node.ops[0])](lhs, rhs)
+        _internal_assert(len(node.ops) == len(node.comparators),
+                         "#compare ops != #comparators")
+        ops = [self.visit(node.left)]
+        ops += [self.visit(i) for i in node.comparators]
+        res = []
+        for i in range(len(node.ops)):
+            lhs = ops[i]
+            rhs = ops[i + 1]
+            res.append(HybridParser._binop_maker[type(node.ops[i])](lhs, rhs))
+        return _all(*res)
+
+
+    def visit_BoolOp(self, node):
+        n = len(node.values)
+        if n == 1:
+            _internal_assert(isinstance(node.op, ast.Not), \
+                             "Unary is supposed to be not!")
+            return operator.not_(self.visit(node.values[0]))
+        _internal_assert(isinstance(node.op, (ast.And, ast.Or)), \
+                         "Binary is supposed to be and/or!")
+        values = [self.visit(i) for i in node.values]
+        return HybridParser._binop_maker[type(node.op)](*values)
 
 
     def visit_UnaryOp(self, node):
@@ -293,93 +355,96 @@ def visit_BinOp(self, node):
 
     def visit_Call(self, node):
         # Yet, no function pointer supported
-        if not isinstance(node.func, ast.Name):
-            raise ValueError("Only id-function function call is supported so far!")
+        _internal_assert(isinstance(node.func, ast.Name), \
+                         "Only id-function function call is supported so far!")
+
         func_id = node.func.id
-        n = len(node.args)
-        if func_id in LOOP_INTRIN.keys() and func_id != 'bind':
-            if n == 1:
-                low, ext = _api.const(0, dtype='int32'), self.visit(node.args[0])
-            else:
-                if n != 2:
-                    raise ValueError("A loop intrinsic should only have 1 or 2 arguments!")
-                low, ext = self.visit(node.args[0]), self.visit(node.args[1])
-            if not _ir_pass.Equal(low, _api.const(0, dtype='int32')):
-                ext = ext - low
-            for_type = LOOP_INTRIN[func_id]
-            iter_var = None
-            return iter_var, low, ext, for_type
-        elif func_id == 'bind':
-            if n != 2:
-                raise ValueError("A loop bind should only have 2 arguments!")
-            if not isinstance(node.args[0], ast.Str):
-                raise ValueError("A loop bind's first argument should be a string!")
-            _vn = node.args[0].s
-            iter_var = thread_axis(node.args[0].s)
-            low, ext = _api.const(0, dtype='int32'), self.visit(node.args[1])
-            for_type = None
-            return iter_var, low, ext, for_type
-        elif func_id in MATH_INTRIN:
-            return getattr(intrin, func_id)(*[self.visit(arg) for arg in node.args])
-        elif func_id == 'allocate':
-            if not isinstance(node.args[0], ast.Tuple):
-                raise ValueError("allocate's first argument should be a tuple of shape!")
-            shape = tuple(self.visit(i) for i in node.args[0].elts)
-            for i in shape:
-                if not isinstance(i, _expr.Expr):
-                    raise ValueError("The shape should be an expression")
-            if n > 1:
-                if not isinstance(node.args[1], ast.Str):
-                    raise ValueError("The data type should be an string")
-                dtype = node.args[1].s
-            else:
-                dtype = 'float32'
-            if n > 2:
-                if not isinstance(node.args[2], ast.Str):
-                    raise ValueError("The data type should be an string")
-                scope = node.args[2].s
-            else:
-                scope = 'global'
-            return (shape, dtype, scope)
-        elif func_id == 'max' or func_id == 'min':
-            if n != 2:
-                raise ValueError("Max/Min function should have 2 elements")
-            a, b = self.visit(node.args[0]), self.visit(node.args[1])
-            return getattr(_make, func_id.title())(a, b)
-        else:
-            raise ValueError("Function call not supported yet!")
+        args = [self.visit(i) for i in node.args]
+        try:
+            return getattr(calls, func_id)(func_id, args)
+        except AttributeError:
+            _internal_assert(func_id in self.symbols.keys(), \
+                             "The function called is not in the context either!")
+            outs = self.symbols[func_id](*args)
+            op = outs.op if isinstance(outs, Tensor) else outs[0].op
+            return op
 
 
     def visit_For(self, node):
         iter_var, low, ext, for_type = self.visit(node.iter)
-        if not isinstance(node.target, ast.Name):
-            raise ValueError("The loop iterator should be a variable!")
+        _internal_assert(isinstance(node.target, ast.Name), \
+                         "The loop iterator should be a variable!")
         _name = node.target.id
         if iter_var is None:
-            if for_type is None:
-                raise ValueError("The loop bind function parse error!")
+            _internal_assert(for_type is not None, "The loop bind function parse error!")
             offset = iter_var = _api.var(_name)
-            if not _ir_pass.Equal(low, _api.const(0, dtype='int32')):
+            if not _ir_pass.Equal(low, _api.const(0)):
                 offset = iter_var + low
             self.loops_above[_name] = offset
         else:
-            if for_type is not None:
-                raise ValueError("The loop iterating function parse error!")
+            _internal_assert(for_type is None, "The loop iterating function parse error!")
             self.loops_above[_name] = iter_var.var
         _body = list_to_block(self.visit, node.body)
         _body = self.wrap_up_realize(node, _body)
         if for_type is None:
             res = _make.AttrStmt(iter_var, 'thread_extent', ext, _body)
         else:
-            res = _make.For(iter_var, _api.const(0, dtype='int32'), ext, for_type, 0, _body)
+            res = _make.For(iter_var, _api.const(0), ext, for_type, 0, _body)
         self.loops_above.pop(_name)
         return res
 
 
-def parse_python(src, args):
-    """The helper function of calling the AST visitor"""
+    def visit_Return(self, node):
+        _internal_assert(not self.loops_above, "Return should not be in a loop body!")
+        ids = []
+        if isinstance(node.value, ast.Name):
+            ids.append(node.value.id)
+        else:
+            _internal_assert(isinstance(node.value, ast.Tuple), \
+                             "You should return either a single tensor or a tuple")
+            for i in node.value.elts:
+                _internal_assert(isinstance(i, ast.Name), "What do you return?")
+                ids.append(i.id)
+        _internal_assert(len(set(ids)) == len(ids), "Duplicated tensors in the return tuples")
+        if len(ids) < len(self.outputs):
+            logging.log(logging.CRITICAL, '[Warning] Not all the output buffers returned!')
+        self.outputs = [self.alloc_buffers[i][0] for i in ids]
+        self.returned = True
+        return util.make_nop()
+
+
+    def visit_Tuple(self, node):
+        return tuple(self.visit(i) for i in node.elts)
+
+
+    def visit_Str(self, node):
+        return node.s
+
+
+def parse_python(src, symbols, args):
+    """The helper function of calling the AST visitor
+
+    Parameters
+    ----------
+    src : str
+        The source code of the function to be parsed.
+
+    src : str
+        The symbol list of the global context of the function.
+
+    args : list of Tensors or Vars
+        The argument lists to the function.
+        It is NOT encouraged to write a function without arguments.
+        It is NOT encouraged to write a function with side effect.
+
+    Returns
+    -------
+    root : Stmt
+        The result Halide IR and the parser class instance.
+    """
     root = ast.parse(src)
-    var_usage = determine_variable_usage(root, args)
-    parser = HybridParser(args, var_usage)
-    halide_ir = parser.visit(root)
-    return halide_ir
+    var_usage = determine_variable_usage(root, args, symbols)
+    parser = HybridParser(args, var_usage, symbols)
+    parser.parsed_body = parser.visit(root)
+    _internal_assert(parser.returned, 'No valid return found in the function body!')
+    return parser
diff --git a/python/tvm/hybrid/util.py b/python/tvm/hybrid/util.py
index 43d26e859560..aa86d55a6fcf 100644
--- a/python/tvm/hybrid/util.py
+++ b/python/tvm/hybrid/util.py
@@ -1,20 +1,29 @@
 """Internal utilities for parsing Python subset to HalideIR"""
 
+import ast
 import inspect
+import logging
+import sys
 import numpy
 from .intrin import HYBRID_GLOBALS
 from .._ffi.base import numeric_types
 from .. import api as _api
 from .. import make as _make
 from .. import expr as _expr
+from .. import stmt as _stmt
 from ..tensor import Tensor
 
 
 #pylint: disable=invalid-name
 np_arg_types = tuple(list(numeric_types) + [numpy.ndarray])
-tvm_arg_types = (Tensor, _expr.Var)
+tvm_arg_types = (Tensor, _expr.Var, _expr.ConstExpr)
 halide_imm_types = (_expr.IntImm, _expr.FloatImm, _expr.UIntImm)
 
+def _internal_assert(cond, err):
+    """Simplify the code segment like if not XXX then raise an error"""
+    if not cond:
+        raise ValueError(err)
+
 
 # Useful constants. In avoid of runtime dependences, we use function calls to return them.
 def make_nop():
@@ -22,12 +31,24 @@ def make_nop():
     return _make.Evaluate(_api.const(0, dtype='int32'))
 
 
+def is_docstring(node):
+    """Checks if a Python AST node is a docstring"""
+    return isinstance(node, ast.Expr) and isinstance(node.value, ast.Str)
+
+
 def _pruned_source(func):
     """Prune source code's extra leading spaces"""
-    lines = inspect.getsource(func).split('\n')
-    leading_space = len(lines[0]) - len(lines[0].lstrip(' '))
-    lines = [line[leading_space:] for line in lines]
-    return '\n'.join(lines)
+    try:
+        lines = inspect.getsource(func).split('\n')
+        leading_space = len(lines[0]) - len(lines[0].lstrip(' '))
+        lines = [line[leading_space:] for line in lines]
+        return '\n'.join(lines)
+    except IOError as err:
+        if sys.version_info[0] == 2 and str(err) == 'could not get source code':
+            logging.log(logging.CRITICAL, \
+                        'This module is not fully operated under Python2... ' \
+                        'Please move to Python3!')
+            raise err
 
 
 def _is_tvm_arg_types(args):
@@ -35,14 +56,16 @@ def _is_tvm_arg_types(args):
     If neither is true, raise a value error."""
     if isinstance(args[0], tvm_arg_types):
         for elem in args[1:]:
-            if not isinstance(elem, tvm_arg_types):
-                raise ValueError("Expect a Var or Tensor instance but % get!" % str(type(elem)))
+            _internal_assert(isinstance(elem, tvm_arg_types),
+                             "Expecting a Var, Tensor or ConstExpr instance but %s get!" \
+                             % str(type(elem)))
         return True
-    if not isinstance(args[0], np_arg_types):
-        raise ValueError("Expect a numpy type but % get!" % str(type(args[0])))
+
+    _internal_assert(isinstance(args[0], np_arg_types), \
+                     "Expect a numpy type but %s get!" % str(type(args[0])))
     for elem in args[1:]:
-        if not isinstance(elem, np_arg_types):
-            raise ValueError("Expect a numpy type but % get!" % str(type(elem)))
+        _internal_assert(isinstance(elem, np_arg_types), \
+                         "Expect a numpy type but %s get!" % str(type(elem)))
     return False
 
 
@@ -64,3 +87,20 @@ def _restore_runtime(func, intersect):
         _globals.pop(elem)
     for k, v in intersect:
         _globals[k] = v
+
+
+def replace_io(body, rmap):
+    """Replacing tensors usage according to the dict given"""
+    from .. import ir_pass
+
+    def replace(op):
+        if isinstance(op, _stmt.Provide) and op.func in rmap.keys():
+            buf = rmap[op.func]
+            return _make.Provide(buf.op, op.value_index, op.value, op.args)
+        elif isinstance(op, _expr.Call) and  op.func in rmap.keys():
+            buf = rmap[op.func]
+            return _make.Call(buf.dtype, buf.name, op.args, \
+                              _expr.Call.Halide, buf.op, buf.value_index)
+        return None
+
+    return ir_pass.IRTransform(body, None, replace, ['Provide', 'Call'])
diff --git a/python/tvm/hybrid/var_decl.py b/python/tvm/hybrid/var_decl.py
index df38bac1acba..eb893a7f22a1 100644
--- a/python/tvm/hybrid/var_decl.py
+++ b/python/tvm/hybrid/var_decl.py
@@ -3,23 +3,26 @@
 import ast
 import sys
 from .intrin import HYBRID_GLOBALS
+from .util import _internal_assert
 
 
 class PyVariableUsage(ast.NodeVisitor):
     """The vistor class to determine the declaration, r/w status, and last use of each variable"""
     #pylint: disable=invalid-name
     #pylint: disable=missing-docstring
-    def __init__(self, args):
+    def __init__(self, args, symbols):
         self.status = {}
         self.scope_level = []
         self._args = {}
         self.args = args
+        self.aug_assign_ = False
+        self.symbols = symbols
 
 
     def visit_FunctionDef(self, node):
         self.scope_level.append(node)
-        if len(node.args.args) != len(self.args):
-            raise ValueError('#arguments passed should be the same as #arguments defined')
+        _internal_assert(len(node.args.args) == len(self.args), \
+                '#arguments passed should be the same as #arguments defined')
         for idx, arg in enumerate(node.args.args):
             _attr = 'id' if sys.version_info[0] < 3 else 'arg' # To make py2 and 3 compatible
             self._args[getattr(arg, _attr)] = self.args[idx]
@@ -28,8 +31,8 @@ def visit_FunctionDef(self, node):
 
 
     def visit_For(self, node):
-        if not isinstance(node.target, ast.Name):
-            raise ValueError("For's iterator should be an id")
+        _internal_assert(isinstance(node.target, ast.Name), \
+                "For's iterator should be an id")
         self.visit(node.iter)
         self.scope_level.append(node)
         for i in node.body:
@@ -39,15 +42,22 @@ def visit_For(self, node):
 
     def visit_Call(self, node):
         #No function pointer supported so far
-        if not isinstance(node.func, ast.Name):
-            raise ValueError("Function call should be an id")
+        _internal_assert(isinstance(node.func, ast.Name), "Function call should be an id")
         func_id = node.func.id
-        if func_id not in list(HYBRID_GLOBALS.keys()) + ['range', 'max', 'min']:
-            raise ValueError("Function call id not in intrinsics' list")
+        _internal_assert(func_id in list(HYBRID_GLOBALS.keys()) + \
+                         ['range', 'max', 'min'] + \
+                         list(self.symbols.keys()), \
+                         "Function call id not in intrinsics' list")
         for elem in node.args:
             self.visit(elem)
 
 
+    def visit_AugAssign(self, node):
+        self.aug_assign_ = True
+        self.generic_visit(node)
+        self.aug_assign_ = False
+
+
     def visit_Name(self, node):
         # If it is from the argument list or loop variable, we do not worry about it!
         if node.id in self._args.keys():
@@ -56,21 +66,25 @@ def visit_Name(self, node):
         if node.id in fors:
             return
         # The loop variable cannot be overwritten when iteration
-        if isinstance(node.ctx, ast.Store) and node.id in fors:
-            raise ValueError("Iter var cannot be overwritten")
+        _internal_assert(not isinstance(node.ctx, ast.Store) or node.id not in fors, \
+                         "Iter var cannot be overwritten")
 
         if node.id not in self.status.keys():
-            if not isinstance(node.ctx, ast.Store):
-                raise ValueError('In Python, "first store" indicates "declaration"')
+            _internal_assert(isinstance(node.ctx, ast.Store), \
+                             'Undeclared variable %s' % node.id)
+            if self.aug_assign_:
+                raise ValueError('"First store" cannot be an AugAssign')
             self.status[node.id] = (node, self.scope_level[-1], set())
         else:
             decl, loop, usage = self.status[node.id]
             usage.add(type(node.ctx))
+            _internal_assert(loop in self.scope_level,
+                             "%s is used out of the scope it is defined!" % node.id)
             self.status[node.id] = (decl, loop, usage)
 
 
-def determine_variable_usage(root, args):
+def determine_variable_usage(root, args, symbols):
     """The helper function for calling the dedicated visitor."""
-    visitor = PyVariableUsage(args)
+    visitor = PyVariableUsage(args, symbols)
     visitor.visit(root)
     return visitor.status
diff --git a/python/tvm/intrin.py b/python/tvm/intrin.py
index 30da873b5dcf..cd9a108c546a 100644
--- a/python/tvm/intrin.py
+++ b/python/tvm/intrin.py
@@ -376,6 +376,22 @@ def popcount(x):
     """
     return call_pure_intrin(x.dtype, "popcount", x)
 
+def fmod(x, y):
+    """Return the remainder of x divided by y with the same sign as x.
+
+    Parameters
+    ----------
+    x : Expr
+        Input argument.
+    y : Expr
+        Input argument.
+
+    Returns
+    -------
+    z : Expr
+        The result.
+    """
+    return call_pure_intrin(x.dtype, "fmod", x, y)
 
 # Intrinsic rule related code
 def register_intrin_rule(target, intrin, f=None, override=False):
@@ -476,6 +492,3 @@ def _rule_float_direct(op):
 register_intrin_rule("opencl", "exp", _rule_float_direct, override=True)
 # default pattern for exp
 register_intrin_rule("default", "exp", _rule_float_suffix, override=True)
-
-# default pattern for sigmoid
-register_intrin_rule("default", "sigmoid", lambda op: 1.0 / (1.0 + exp(-op.args[0])))
diff --git a/python/tvm/make.py b/python/tvm/make.py
index 49f698f4f663..6238fd7f1789 100644
--- a/python/tvm/make.py
+++ b/python/tvm/make.py
@@ -6,9 +6,10 @@
 Each api is a PackedFunc that can be called in a positional argument manner.
 You can use make function to build the IR node.
 """
+from __future__ import absolute_import as _abs
 from ._ffi.function import _init_api
 from ._ffi.runtime_ctypes import TVMType
-from . import stmt as _stmt
+
 
 def range_by_min_extent(min_value, extent):
     """Construct a Range by min and extent.
@@ -71,6 +72,17 @@ def node(type_key, **kwargs):
     **kwargs : dict
         The fields of the node.
 
+    Returns
+    -------
+    node : Node
+        The corresponding DSL Node
+
+    Note
+    ----
+    If the created node is instance of AttrsNode, then
+    the creator function will also run bound checks and
+    default value setup as supported by Attrs.
+
     Example
     -------
     The following code constructs a IntImm object
@@ -87,44 +99,4 @@ def node(type_key, **kwargs):
     return _Node(*args)
 
 
-def stmt_seq(*args):
-    """Make sequence of statements
-
-    Parameters
-    ----------
-    args : list of Expr or Var
-        List of statements to be combined as sequence.
-
-    Returns
-    -------
-    stmt : Stmt
-        The combined statement.
-    """
-    ret = None
-    for value in args:
-        if not isinstance(value, _stmt.Stmt):
-            value = Evaluate(value)
-        ret = value if ret is None else Block(ret, value)
-    return ret if ret else Evaluate(0)
-
-
-def stmt_list(stmt):
-    """Make list of stmt from blocks.
-
-    Parameters
-    ----------
-    stmt : A block statement
-
-    Returns
-    -------
-    stmt_list : list of Stmt
-         The unpacked list of statements
-    """
-    if isinstance(stmt, _stmt.Block):
-        return stmt_list(stmt.first) + stmt_list(stmt.rest)
-    elif isinstance(stmt, _stmt.ProducerConsumer):
-        return stmt_list(stmt.body)
-    return [stmt]
-
-
 _init_api("tvm.make")
diff --git a/python/tvm/module.py b/python/tvm/module.py
index 1b83c9b26243..79a1fab45570 100644
--- a/python/tvm/module.py
+++ b/python/tvm/module.py
@@ -6,6 +6,7 @@
 
 from ._ffi.function import ModuleBase, _set_class_module
 from ._ffi.function import _init_api
+from ._ffi.libinfo import find_include_path
 from .contrib import cc as _cc, tar as _tar, util as _util
 
 ProfileResult = namedtuple("ProfileResult", ["mean", "results"])
@@ -87,24 +88,31 @@ def export_library(self,
             If fcompile has attribute object_format, will compile host library
             to that format. Otherwise, will use default format "o".
 
-        kwargs : dict, optiona;
+        kwargs : dict, optional
             Additional arguments passed to fcompile
         """
-        if self.type_key == "stacktvm":
-            raise ValueError("Module[%s]: export_library requires llvm module,"
-                             " did you build with LLVM enabled?" % self.type_key)
-
-        if self.type_key != "llvm":
-            raise ValueError("Module[%s]: Only llvm support export shared" % self.type_key)
+        if self.type_key == "stackvm":
+            if not file_name.endswith(".stackvm"):
+                raise ValueError("Module[%s]: can only be saved as stackvm format."
+                                 "did you build with LLVM enabled?" % self.type_key)
+            self.save(file_name)
+            return
+
+        if not (self.type_key == "llvm" or self.type_key == "c"):
+            raise ValueError("Module[%s]: Only llvm and c support export shared" % self.type_key)
         temp = _util.tempdir()
         if fcompile is not None and hasattr(fcompile, "object_format"):
             object_format = fcompile.object_format
         else:
-            object_format = "o"
+            if self.type_key == "llvm":
+                object_format = "o"
+            else:
+                assert self.type_key == "c"
+                object_format = "cc"
         path_obj = temp.relpath("lib." + object_format)
         self.save(path_obj)
         files = [path_obj]
-        is_system_lib = self.get_function("__tvm_is_system_module")()
+        is_system_lib = self.type_key == "llvm" and self.get_function("__tvm_is_system_module")()
         if self.imported_modules:
             path_cc = temp.relpath("devc.cc")
             with open(path_cc, "w") as f:
@@ -115,6 +123,8 @@ def export_library(self,
                 fcompile = _tar.tar
             else:
                 fcompile = _cc.create_shared
+        if self.type_key == "c":
+            kwargs.update({'options': ["-I" + path for path in find_include_path()]})
         fcompile(file_name, files, **kwargs)
 
     def time_evaluator(self, func_name, ctx, number, repeat=1):
diff --git a/python/tvm/ndarray.py b/python/tvm/ndarray.py
index 18e958973d94..448e5f6d8bdb 100644
--- a/python/tvm/ndarray.py
+++ b/python/tvm/ndarray.py
@@ -8,7 +8,7 @@
 import numpy as _np
 
 from ._ffi.ndarray import TVMContext, TVMType, NDArrayBase
-from ._ffi.ndarray import context, empty
+from ._ffi.ndarray import context, empty, from_dlpack
 from ._ffi.ndarray import _set_class_ndarray
 from ._ffi.ndarray import register_extension, free_extension_handle
 
diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py
new file mode 100644
index 000000000000..572589921dcf
--- /dev/null
+++ b/python/tvm/relay/__init__.py
@@ -0,0 +1,68 @@
+# pylint: disable=wildcard-import, redefined-builtin, invalid-name
+"""The Relay IR namespace containing the IR definition and compiler."""
+from __future__ import absolute_import
+from ..api import register_func
+from . import base
+from . import ty
+from . import expr
+from . import expr_functor
+from . import module
+from . import ir_pass
+from .build_module import build, build_config, create_executor
+from . import parser
+from . import debug
+
+# Root operators
+from .op import Op
+from .op.reduce import *
+from .op.tensor import *
+from .op.transform import *
+from . import nn
+from . import vision
+from . import image
+from . import frontend
+from . import backend
+
+from .scope_builder import ScopeBuilder
+
+# Span
+Span = base.Span
+
+# Env
+Module = module.Module
+
+# Type
+Type = ty.Type
+TupleType = ty.TupleType
+TensorType = ty.TensorType
+Kind = ty.Kind
+TypeVar = ty.TypeVar
+TypeConstraint = ty.TypeConstraint
+FuncType = ty.FuncType
+TypeRelation = ty.TypeRelation
+IncompleteType = ty.IncompleteType
+scalar_type = ty.scalar_type
+
+# Expr
+Expr = expr.Expr
+Constant = expr.Constant
+Tuple = expr.Tuple
+Var = expr.Var
+GlobalVar = expr.GlobalVar
+Function = expr.Function
+Call = expr.Call
+Let = expr.Let
+If = expr.If
+TupleGetItem = expr.TupleGetItem
+
+# ExprFunctor
+ExprFunctor = expr_functor.ExprFunctor
+ExprMutator = expr_functor.ExprMutator
+
+# helper functions
+var = expr.var
+const = expr.const
+bind = expr.bind
+
+# Parser
+fromtext = parser.fromtext
diff --git a/python/tvm/relay/_expr.py b/python/tvm/relay/_expr.py
new file mode 100644
index 000000000000..1a27c4efc410
--- /dev/null
+++ b/python/tvm/relay/_expr.py
@@ -0,0 +1,5 @@
+# pylint: disable=no-else-return, unidiomatic-typecheck, undefined-variable
+"""The interface of expr function exposed from C++."""
+from tvm._ffi.function import _init_api
+
+_init_api("relay._expr", __name__)
diff --git a/python/tvm/relay/_ir_pass.py b/python/tvm/relay/_ir_pass.py
new file mode 100644
index 000000000000..61fdcfa38c2f
--- /dev/null
+++ b/python/tvm/relay/_ir_pass.py
@@ -0,0 +1,5 @@
+"""FFI exposing the Relay type inference and checking."""
+
+from tvm._ffi.function import _init_api
+
+_init_api("relay._ir_pass", __name__)
diff --git a/python/tvm/relay/_ir_pass.pyi b/python/tvm/relay/_ir_pass.pyi
new file mode 100644
index 000000000000..6bf4e2dac871
--- /dev/null
+++ b/python/tvm/relay/_ir_pass.pyi
@@ -0,0 +1,8 @@
+from .env import Module
+from . import ir
+
+def check_expr(env: Module, expr: ir.Expr) -> ir.Type: ...
+def generalize(env: Module, expr: ir.Expr) -> ir.Expr: ...
+def _get_checked_type(expr: ir.Expr) -> ir.Type: ...
+def well_formed(expr: ir.Expr) -> bool: ...
+def dead_code_elimination(expr: ir.Expr) -> ir.Expr: ...
diff --git a/python/tvm/relay/_make.py b/python/tvm/relay/_make.py
new file mode 100644
index 000000000000..20a582e76d6a
--- /dev/null
+++ b/python/tvm/relay/_make.py
@@ -0,0 +1,9 @@
+"""
+The constructors for all Relay AST nodes exposed from C++.
+
+This module includes MyPy type signatures for all of the
+exposed modules.
+"""
+from .._ffi.function import _init_api
+
+_init_api("relay._make", __name__)
diff --git a/python/tvm/relay/_module.py b/python/tvm/relay/_module.py
new file mode 100644
index 000000000000..b6e74c451915
--- /dev/null
+++ b/python/tvm/relay/_module.py
@@ -0,0 +1,5 @@
+# pylint: disable=no-else-return, unidiomatic-typecheck, undefined-variable
+"""The interface to the Module exposed from C++."""
+from tvm._ffi.function import _init_api
+
+_init_api("relay._module", __name__)
diff --git a/python/tvm/relay/_module.pyi b/python/tvm/relay/_module.pyi
new file mode 100644
index 000000000000..de3aabefba4c
--- /dev/null
+++ b/python/tvm/relay/_module.pyi
@@ -0,0 +1,5 @@
+from typing import Union, Tuple, Dict, List
+from relay.ir import GlobalId, OperatorId, Item, NodeBase, Span, FileId
+from relay.ir import ShapeExtension, Operator, Defn
+
+class Module(NodeBase): ...
diff --git a/python/tvm/relay/_parser.py b/python/tvm/relay/_parser.py
new file mode 100644
index 000000000000..f64c635dd4ff
--- /dev/null
+++ b/python/tvm/relay/_parser.py
@@ -0,0 +1,425 @@
+
+# pylint: disable=invalid-name, unused-import
+"""A parser for Relay's text format."""
+from __future__ import absolute_import
+
+import sys
+
+from collections import deque
+from typing import TypeVar, Deque, Tuple, Optional, Union, NamedTuple, List, Callable, Any
+
+from . import module
+from . import expr
+from . import ty
+from . import op
+
+class ParseError(Exception):
+    """Exception type for parse errors."""
+
+    def __init__(self, message):
+        # type: (str) -> None
+        super(ParseError, self).__init__()
+        self.message = message
+
+PYTHON_VERSION = sys.version_info.major
+try:
+    if PYTHON_VERSION == 2:
+        from .grammar.py2.RelayVisitor import RelayVisitor
+        from .grammar.py2.RelayParser import RelayParser
+        from .grammar.py2.RelayLexer import RelayLexer
+    else:
+        from .grammar.py3.RelayVisitor import RelayVisitor
+        from .grammar.py3.RelayParser import RelayParser
+        from .grammar.py3.RelayLexer import RelayLexer
+except ImportError:
+    raise ParseError("Couldn't find ANTLR parser. Try building with USE_ANTLR=ON.")
+
+try:
+    from antlr4 import ParserRuleContext, InputStream, CommonTokenStream
+    from antlr4.tree.Tree import TerminalNode
+except ImportError:
+    raise ParseError("Couldn't find ANTLR runtime." +
+                     "Try running `pip{} install antlr4-python{}-runtime`."
+                     .format(PYTHON_VERSION, PYTHON_VERSION))
+
+BINARY_OPS = {
+    RelayParser.MUL: op.multiply,
+    RelayParser.DIV: op.divide,
+    RelayParser.ADD: op.add,
+    RelayParser.SUB: op.subtract,
+    RelayParser.LT:  op.less,
+    RelayParser.GT:  op.greater,
+    RelayParser.LE:  op.less_equal,
+    RelayParser.GE:  op.greater_equal,
+    RelayParser.EQ:  op.equal,
+    RelayParser.NE:  op.not_equal,
+}
+
+TYPE_PREFIXES = [
+    "int",
+    "uint",
+    "float",
+    "bool",
+]
+
+T = TypeVar("T")
+Scope = Deque[Tuple[str, T]]
+Scopes = Deque[Scope[T]]
+
+def lookup(scopes, name):
+    # type: (Scopes[T], str) -> Optional[T]
+    """Look up `name` in `scopes`."""
+
+    for scope in scopes:
+        for key, val in scope:
+            if key == name:
+                return val
+    return None
+
+# TODO(@jmp): Use https://stackoverflow.com/q/13889941
+# to figure out how to get ANTLR4 to be more unhappy about syntax errors
+class ParseTreeToRelayIR(RelayVisitor):
+    """Parse Relay text format into Relay IR."""
+
+    def __init__(self):
+        # type: () -> None
+        self.module = module.Module({})   # type: module.Module
+
+        # Adding an empty scope allows naked lets without pain.
+        self.var_scopes = deque([deque()]) # type: Scopes[expr.Var]
+        self.type_param_scopes = deque([deque()]) # type: Scopes[ty.TypeVar]
+
+        super(ParseTreeToRelayIR, self).__init__()
+
+    def enter_var_scope(self):
+        # type: () -> None
+        """Enter a new Var scope so it can be popped off later."""
+
+        self.var_scopes.appendleft(deque())
+
+    def exit_var_scope(self):
+        # type: () -> Scope[expr.Var]
+        """Pop off the current Var scope and return it."""
+
+        return self.var_scopes.popleft()
+
+    def mk_var(self, name, type_):
+        # type: (str, ty.Type) -> expr.Var
+        """Create a new Var and add it to the Var scope."""
+
+        var = expr.Var(name, type_)
+        self.var_scopes[0].appendleft((name, var))
+        return var
+
+    def enter_type_param_scope(self):
+        # type: () -> None
+        """Enter a new TypeVar scope so it can be popped off later."""
+
+        self.type_param_scopes.appendleft(deque())
+
+    def exit_type_param_scope(self):
+        # type: () -> Scope[ty.TypeVar]
+        """Pop off the current TypeVar scope and return it."""
+
+        return self.type_param_scopes.popleft()
+
+    def mk_typ(self, name, kind):
+        # (str, ty.Kind) -> ty.TypeVar
+        """Create a new TypeVar and add it to the TypeVar scope."""
+
+        typ = ty.TypeVar(name, kind)
+        self.type_param_scopes[0].appendleft((name, typ))
+        return typ
+
+    def visitTerminal(self, node):
+        # type: (TerminalNode) -> Union[expr.Expr, int, float]
+        """Visit lexer tokens that aren't ignored or visited by other functions."""
+
+        node_type = node.getSymbol().type
+        node_text = node.getText()
+
+        # variables
+        if node_type == RelayLexer.GLOBAL_VAR:
+            return expr.GlobalVar(node_text[1:])
+        elif node_type == RelayLexer.LOCAL_VAR:
+            name = node_text[1:]
+            var = lookup(self.var_scopes, name)
+            if var is None:
+                raise ParseError("Couldn't resolve `{}`.".format(name))
+
+            return var
+
+        # data types
+        elif node_type == RelayLexer.INT:
+            return int(node_text)
+        elif node_type == RelayLexer.FLOAT:
+            return float(node_text)
+        elif node_type == RelayLexer.BOOL_LIT:
+            if node_text == "True":
+                return True
+            elif node_text == "False":
+                return False
+            else:
+                raise ParseError("Unrecognized BOOL_LIT: `{}`".format(node_text))
+
+        else:
+            raise ParseError("todo: {}".format(node_text))
+
+    def visit_list(self, ctx_list):
+        # type: (List[ParserRuleContext]) -> List[Any]
+        """"Visit a list of contexts."""
+
+        return [self.visit(ctx) for ctx in ctx_list]
+
+    def getType_(self, ctx):
+        # type: (Optional[RelayParser.Type_Context]) -> Optional[ty.Type]
+        """Return a (possibly None) Relay type."""
+
+        if ctx is None:
+            return None
+
+        return self.visit(ctx)
+
+    def visitProg(self, ctx):
+        # type: (RelayParser.ProgContext) -> Union[expr.Expr, env.Environment]
+        if ctx.defn():
+            self.visit_list(ctx.defn())
+            return self.module
+
+        return self.visit(ctx.expr())
+
+    # Exprs
+
+    def visitOpIdent(self, ctx):
+        # type: (RelayParser.OpIdentContext) -> op.Op
+        return op.get(ctx.CNAME().getText())
+
+    # pass through
+    def visitParens(self, ctx):
+        # type: (RelayParser.ParensContext) -> expr.Expr
+        return self.visit(ctx.expr())
+
+    # pass through
+    def visitBody(self, ctx):
+        # type: (RelayParser.BodyContext) -> expr.Expr
+        return self.visit(ctx.expr())
+
+    def visitScalarFloat(self, ctx):
+        # type: (RelayParser.ScalarFloatContext) -> expr.Constant
+        return expr.const(self.visit(ctx.FLOAT()))
+
+    def visitScalarInt(self, ctx):
+        # type: (RelayParser.ScalarIntContext) -> expr.Constant
+        return expr.const(self.visit(ctx.INT()))
+
+    def visitScalarBool(self, ctx):
+        # type: (RelayParser.ScalarBoolContext) -> expr.Constant
+        return expr.const(self.visit(ctx.BOOL_LIT()))
+
+    def visitNeg(self, ctx):
+        # type: (RelayParser.NegContext) -> Union[expr.Constant, expr.Call]
+        val = self.visit(ctx.expr())
+        if isinstance(val, expr.Constant) and val.data.asnumpy().ndim == 0:
+            # fold Neg in for scalars
+            return expr.const(-val.data.asnumpy().item())
+
+        return op.negative(val)
+
+    def visitTuple(self, ctx):
+        # type: (RelayParser.TupleContext) -> expr.Tuple
+        tup = self.visit_list(ctx.expr())
+        return expr.Tuple(tup)
+
+    # Currently doesn't support mutable sequencing.
+    def visitSeq(self, ctx):
+        # type: (RelayParser.SeqContext) -> expr.Let
+        """Desugar various sequence constructs to Relay Let nodes."""
+        if ctx.MUT() is not None:
+            raise ParseError("Mutation is currently unsupported.")
+
+        if ctx.var() is None or ctx.var().ident() is None:
+            # anonymous identity
+            ident = "_"
+            type_ = None
+        else:
+            local_var = ctx.var().ident().LOCAL_VAR()
+            if local_var is None:
+                raise ParseError('Only local ids may be used in `let`s.')
+            ident = local_var.getText()[1:]
+            type_ = self.getType_(ctx.var().type_())
+
+        var = self.mk_var(ident, type_)
+
+        self.enter_var_scope()
+        value = self.visit(ctx.expr(0))
+        self.exit_var_scope()
+
+        body = self.visit(ctx.expr(1))
+
+        return expr.Let(var, value, body)
+
+    def visitBinOp(self, ctx):
+        # type: (RelayParser.BinOpContext) -> expr.Call
+        """Desugar binary operators."""
+        arg0, arg1 = self.visit_list(ctx.expr())
+        relay_op = BINARY_OPS.get(ctx.op.type)
+
+        if relay_op is None:
+            raise ParseError("Unimplemented binary op.")
+
+        return relay_op(arg0, arg1)
+
+    def visitVar(self, ctx):
+        # type: (RelayParser.VarContext) -> expr.Var
+        ident = ctx.ident().LOCAL_VAR()
+
+        if ident is None:
+            raise ParseError('Only local ids may be used in params.')
+
+        type_ = self.getType_(ctx.type_())
+
+        return self.mk_var(ident.getText()[1:], type_)
+
+    def visitVarList(self, ctx):
+        # type: (RelayParser.VarListContext) -> List[expr.Var]
+        return self.visit_list(ctx.var())
+
+    def mk_func(self, ctx):
+        # type: (Union[RelayParser.FuncContext, RelayParser.DefnContext]) -> Function
+        """Construct a function from either a Func or Defn."""
+
+        # Enter var scope early to put params in scope.
+        self.enter_var_scope()
+        # Capture type params in params.
+        self.enter_type_param_scope()
+        var_list = self.visit(ctx.varList())
+        ret_type = self.getType_(ctx.type_())
+
+        type_params = list(self.exit_type_param_scope())
+        if type_params:
+            _, type_params = zip(*type_params)
+
+        body = self.visit(ctx.body())
+        self.exit_var_scope()
+
+        return expr.Function(var_list, body, ret_type, type_params) # type: ignore
+
+    def visitFunc(self, ctx):
+        # type: (RelayParser.FuncContext) -> expr.Function
+        return self.mk_func(ctx)
+
+    def visitDefn(self, ctx):
+        # type: (RelayParser.DefnContext) -> None
+        ident = ctx.ident().GLOBAL_VAR()
+        if ident is None:
+            raise ParseError('Only global ids may be used in `def`s.')
+        ident = expr.GlobalVar(ident.getText()[1:])
+
+        self.module[ident] = self.mk_func(ctx)
+
+    def visitCall(self, ctx):
+        # type: (RelayParser.CallContext) -> expr.Call
+        visited_exprs = self.visit_list(ctx.expr())
+
+        func = visited_exprs[0]
+        args = visited_exprs[1:]
+
+        return expr.Call(func, args, None, None)
+
+    def visitIfElse(self, ctx):
+        # type: (RelayParser.IfElseContext) -> expr.If
+        """Construct a Relay If node. Creates a new scope for each branch."""
+        cond = self.visit(ctx.expr())
+
+        self.enter_var_scope()
+        true_branch = self.visit(ctx.body(0))
+        self.exit_var_scope()
+
+        self.enter_var_scope()
+        false_branch = self.visit(ctx.body(1))
+        self.exit_var_scope()
+
+        return expr.If(cond, true_branch, false_branch)
+
+    # Types
+
+    # pylint: disable=unused-argument
+    def visitIncompleteType(self, ctx):
+        # type (RelayParser.IncompleteTypeContext) -> None:
+        return None
+
+    def visitIdentType(self, ctx):
+        # type: (RelayParser.IdentTypeContext) -> Union[ty.TensorType, str]
+        ident_type = ctx.CNAME().getText()
+
+        # look through all type prefixes for a match
+        for type_prefix in TYPE_PREFIXES:
+            if ident_type.startswith(type_prefix):
+                return ty.scalar_type(ident_type)
+
+        raise ParseError("Unknown builtin type: {}".format(ident_type))
+
+    # def visitCallType(self, ctx):
+    #     # type: (RelayParser.CallTypeContext) -> Union[expr.Expr, ty.TensorType]
+    #     ident_type = ctx.identType().CNAME().getText()
+
+    #     args = self.visit_list(ctx.type_())
+
+    #     if not args:
+    #         raise ParseError("Type-level functions must have arguments!")
+
+    #     func_type = TYPE_FUNCS.get(ident_type)(args)
+
+    #     if func_type is None:
+    #         raise ParseError("Unknown type-level function: `{}`".format(ident_type))
+    #     else:
+    #         return func_type
+
+    def visitParensShape(self, ctx):
+        # type: (RelayParser.ParensShapeContext) -> int
+        return self.visit(ctx.shape())
+
+    def visitShapeSeq(self, ctx):
+        # type: (RelayParser.ShapeSeqContext) -> List[int]
+        return self.visit_list(ctx.shape())
+
+    def visitTensorType(self, ctx):
+        # type: (RelayParser.TensorTypeContext) -> ty.TensorType
+        """Create a simple tensor type. No generics."""
+
+        shape = self.visit(ctx.shapeSeq())
+        dtype = self.visit(ctx.type_())
+
+        if not isinstance(dtype, ty.TensorType):
+            raise ParseError("Expected dtype to be a Relay base type.")
+
+        dtype = dtype.dtype
+
+        return ty.TensorType(shape, dtype)
+
+    def visitTupleType(self, ctx):
+        # type: (RelayParser.TupleTypeContext) -> ty.TupleType
+        return ty.TupleType(self.visit_list(ctx.type_()))
+
+    def visitFuncType(self, ctx):
+        # type: (RelayParser.FuncTypeContext) -> ty.FuncType
+        types = self.visit_list(ctx.type_())
+
+        arg_types = types[:-1]
+        ret_type = types[-1]
+
+        return ty.FuncType(arg_types, ret_type, [], None)
+
+def make_parser(data):
+    # type: (str) -> RelayParser
+    """Construct a RelayParser a given data stream."""
+    input_stream = InputStream(data)
+    lexer = RelayLexer(input_stream)
+    token_stream = CommonTokenStream(lexer)
+    return RelayParser(token_stream)
+
+def fromtext(data):
+    # type: (str) -> Union[expr.Expr, env.Environment]
+    """Parse a Relay program."""
+    tree = make_parser(data).prog()
+    return ParseTreeToRelayIR().visit(tree)
diff --git a/python/tvm/relay/backend/__init__.py b/python/tvm/relay/backend/__init__.py
new file mode 100644
index 000000000000..158989e9bf2f
--- /dev/null
+++ b/python/tvm/relay/backend/__init__.py
@@ -0,0 +1,2 @@
+"""Backend codege modules for relay."""
+from . import compile_engine
diff --git a/python/tvm/relay/backend/_backend.py b/python/tvm/relay/backend/_backend.py
new file mode 100644
index 000000000000..a51cc8072aac
--- /dev/null
+++ b/python/tvm/relay/backend/_backend.py
@@ -0,0 +1,85 @@
+"""The interface of expr function exposed from C++."""
+from __future__ import absolute_import
+
+import logging
+from ... import build_module as _build
+from ... import container as _container
+from ..._ffi.function import _init_api, register_func
+
+
+@register_func("relay.backend.lower")
+def lower(sch, inputs, func_name, source_func):
+    """Backend function for lowering.
+
+    Parameters
+    ----------
+    sch : tvm.Schedule
+        The schedule.
+
+    inputs : List[tvm.Tensor]
+        The inputs to the function.
+
+    func_name : str
+        The name of the function.
+
+    source-func : tvm.relay.Function
+        The source function to be lowered.
+
+    Returns
+    -------
+    lowered_funcs : List[tvm.LoweredFunc]
+        The result of lowering.
+    """
+    import traceback
+    # pylint: disable=broad-except
+    try:
+        f = _build.lower(sch, inputs, name=func_name)
+        logging.debug("lower function %s", func_name)
+        logging.debug("%s", _build.lower(sch, inputs, simple_mode=True))
+    except Exception:
+        msg = traceback.format_exc()
+        msg += "Error during compile function\n"
+        msg += "-----------------------------\n"
+        msg += source_func.astext()
+        raise RuntimeError(msg)
+    return f if isinstance(
+        f, (_container.Array, tuple, list)) else [f]
+
+
+@register_func("relay.backend.build")
+def build(funcs, target, target_host=None):
+    """Backend build function.
+
+    Parameters
+    ----------
+    funcs : List[tvm.LoweredFunc]
+         The list of lowered functions.
+
+
+    target : tvm.Target
+         The target to run the code on.
+
+    target_host : tvm.Target
+         The host target.
+
+    Returns
+    -------
+    module : tvm.Module
+         The runtime module.
+    """
+    if target_host == "":
+        target_host = None
+    return _build.build(funcs, target=target, target_host=target_host)
+
+
+@register_func("relay._tensor_value_repr")
+def _tensor_value_repr(tvalue):
+    return str(tvalue.data.asnumpy())
+
+
+@register_func("relay._constant_repr")
+def _tensor_constant_repr(tvalue):
+    return str(tvalue.data.asnumpy())
+
+
+_init_api("relay.backend", __name__)
diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py
new file mode 100644
index 000000000000..1f7ab18677c4
--- /dev/null
+++ b/python/tvm/relay/backend/compile_engine.py
@@ -0,0 +1,152 @@
+"""Backend code generation engine."""
+from __future__ import absolute_import
+
+from ..base import register_relay_node, NodeBase
+from ... import target as _target
+from .. import expr as _expr
+from . import _backend
+
+@register_relay_node
+class CachedFunc(NodeBase):
+    """Low-level tensor function to back a relay primitive function.
+    """
+    pass
+
+
+@register_relay_node
+class CCacheKey(NodeBase):
+    """Key in the CompileEngine.
+
+    Parameters
+    ----------
+    source_func : tvm.relay.Function
+        The source function.
+
+    target : tvm.Target
+        The target we want to run the function on.
+    """
+    def __init__(self, source_func, target):
+        self.__init_handle_by_constructor__(
+            _backend._make_CCacheKey, source_func, target)
+
+
+@register_relay_node
+class CCacheValue(NodeBase):
+    """Value in the CompileEngine, including usage statistics.
+    """
+    pass
+
+
+def _get_cache_key(source_func, target):
+    if isinstance(source_func, _expr.Function):
+        if isinstance(target, str):
+            target = _target.create(target)
+            if not target:
+                raise ValueError("Need target when source_func is a Function")
+        return CCacheKey(source_func, target)
+    if not isinstance(source_func, CCacheKey):
+        raise TypeError("Expect source_func to be CCacheKey")
+    return source_func
+
+
+@register_relay_node
+class CompileEngine(NodeBase):
+    """CompileEngine to get lowered code.
+    """
+    def __init__(self):
+        raise RuntimeError("Cannot construct a CompileEngine")
+
+    def lower(self, source_func, target=None):
+        """Lower a source_func to a CachedFunc.
+
+        Parameters
+        ----------
+        source_func : Union[tvm.relay.Function, CCacheKey]
+            The source relay function.
+
+        target : tvm.Target
+            The target platform.
+
+        Returns
+        -------
+        cached_func: CachedFunc
+            The result of lowering.
+        """
+        # pylint: disable=broad-except
+        try:
+            key = _get_cache_key(source_func, target)
+            return _backend._CompileEngineLower(self, key)
+        except Exception:
+            import traceback
+            msg = traceback.format_exc()
+            msg += "Error during compile func\n"
+            msg += "--------------------------\n"
+            msg += source_func.astext(show_meta_data=False)
+            msg += "--------------------------\n"
+            raise RuntimeError(msg)
+
+    def jit(self, source_func, target=None):
+        """JIT a source_func to a tvm.Function.
+
+        Parameters
+        ----------
+        source_func : Union[tvm.relay.Function, CCacheKey]
+            The source relay function.
+
+        target : tvm.Target
+            The target platform.
+
+        Returns
+        -------
+        cached_func: CachedFunc
+            The result of lowering.
+        """
+        key = _get_cache_key(source_func, target)
+        return _backend._CompileEngineJIT(self, key)
+
+    def clear(self):
+        """clear the existing cached functions"""
+        _backend._CompileEngineClear(self)
+
+    def items(self):
+        """List items in the cache.
+
+        Returns
+        -------
+        item_list : List[Tuple[CCacheKey, CCacheValue]]
+            The list of items.
+        """
+        res = _backend._CompileEngineListItems(self)
+        assert len(res) % 2 == 0
+        return [(res[2*i], res[2*i+1]) for i in range(len(res) // 2)]
+
+    def dump(self):
+        """Return a string representation of engine dump.
+
+        Returns
+        -------
+        dump : str
+            The dumped string representation
+        """
+        items = self.items()
+        res = "====================================\n"
+        res += "CompilerEngine dump, %d items cached\n" % len(items)
+        for k, v in items:
+            res += "------------------------------------\n"
+            res += "target={}\n".format(k.target)
+            res += "use_count={}\n".format(v.use_count)
+            res += "func_name={}\n".format(v.cached_func.func_name)
+            res += k.source_func.astext() + "\n"
+        res += "===================================\n"
+        return res
+
+
+def get():
+    """Get the global compile engine.
+
+    Returns
+    -------
+    engine : tvm.relay.backend.CompileEngine
+        The compile engine.
+    """
+    return _backend._CompileEngineGlobal()
diff --git a/python/tvm/relay/backend/graph_runtime_codegen.py b/python/tvm/relay/backend/graph_runtime_codegen.py
new file mode 100644
index 000000000000..91d09973ea8f
--- /dev/null
+++ b/python/tvm/relay/backend/graph_runtime_codegen.py
@@ -0,0 +1,360 @@
+"""
+A compiler from a Relay expression to TVM's graph runtime.
+
+The compiler is built from a few pieces.
+
+First we define a compiler from a single Relay expression to the
+graph langauge. We require the expression to be a function.
+The function's parameters correpond to the placeholder/inputs
+and model parameters found in the computation graph representation.
+The body of the function represents the computation graph.
+
+The compiler's output is a program in the graph language, which is composed of
+graph langauge is composed of Node, NodeRef, InputNode, OpNode.
+This "little language" represents programs in TVM's graph format.
+
+To connect to the graph runtime, we use a printer that converts our graph format
+into TVM's JSON format. The resulting string can be loaded by
+contrib.graph_runtime or any other TVM runtime comptatible system.
+"""
+
+from __future__ import absolute_import
+import json
+import attr
+from . import _backend
+from . import compile_engine
+from ..op import Op
+from ..expr import Function, GlobalVar
+from ..expr_functor import ExprFunctor
+from ..ty import TupleType, TensorType
+
+
+@attr.s
+class NodeRef(object):
+    """A reference to a node, used for constructing the graph."""
+    ident = attr.ib()
+    index = attr.ib(default=0)
+    version = attr.ib(default=0)
+
+    def to_json(self):
+        return [self.ident, self.index, self.version]
+
+
+@attr.s
+class Node(object):
+    """The base class for nodes in the TVM runtime system graph input."""
+    name = attr.ib()
+    attrs = attr.ib()
+
+    def to_json(self):
+        raise Exception("Abstract method, please implement me.")
+
+
+@attr.s
+class InputNode(Node):
+    """An input node in the TVM runtime system graph input."""
+    name = attr.ib()
+    attrs = attr.ib()
+
+    def to_json(self):
+        return {
+            "op": "null",
+            "name": self.name,
+            "inputs": []
+        }
+
+
+@attr.s
+class OpNode(Node):
+    """An operator node in the TVM runtime system"s graph input."""
+    op_name = attr.ib()
+    inputs = attr.ib()
+    op_attrs = attr.ib()
+    num_outputs = attr.ib(default=1)
+
+    def to_json(self):
+        attrs = dict.copy(self.op_attrs)
+        # Extend ops with extra info.
+        attrs["func_name"] = self.op_name
+        attrs["flatten_data"] = "0"
+        attrs["num_inputs"] = str(len(self.inputs))
+        attrs["num_outputs"] = str(self.num_outputs)
+
+        return {
+            "op": "tvm_op",
+            "name": self.name,
+            "attrs": attrs,
+            "inputs": self.inputs
+        }
+
+
+def shape_to_json(shape):
+    """Convert symbolic shape to json compatible forma."""
+    return [sh.value for sh in shape]
+
+
+class GraphRuntimeCodegen(ExprFunctor):
+    """The compiler from Relay to the TVM runtime system."""
+    nodes = attr.ib()
+    var_map = attr.ib()
+
+    def __init__(self, mod, target):
+        ExprFunctor.__init__(self)
+        self.mod = mod
+        self.target = target
+        self.nodes = []
+        self.var_map = {}
+        self.params = {}
+        self.storage_map = None
+        self.compile_engine = compile_engine.get()
+        self.lowered_funcs = set()
+        self._name_map = {}
+
+    def add_node(self, node, expr):
+        """
+        Add a node to the graph.
+
+        Parameters
+        ----------
+        node: Node
+            The node to add to the graph.
+
+        expr: tvm.relay.Expr
+            The corresponding expression.
+
+        Returns
+        -------
+        node_ref: Union[NodeRef, List[NodeRef]]
+            A reference to the node.
+        """
+        checked_type = expr.checked_type
+        # setup storage ids
+        assert expr in self.storage_map
+        node.attrs["storage_id"] = [
+            x.value for x in self.storage_map[expr]
+        ]
+
+        node_id = len(self.nodes)
+        self.nodes.append(node)
+        # Tuple return value, flatten as tuple
+        if isinstance(checked_type, TupleType):
+            ret = []
+            shape = []
+            dtype = []
+            for i, typ in enumerate(checked_type.fields):
+                if not isinstance(typ, TensorType):
+                    raise RuntimeError("type %s not supported" % typ)
+                ret.append(NodeRef(node_id, i))
+                shape.append(shape_to_json(typ.shape))
+                dtype.append(typ.dtype)
+            node.attrs["shape"] = shape
+            node.attrs["dtype"] = dtype
+            assert isinstance(node, OpNode)
+            node.num_outputs = len(checked_type.fields)
+            return tuple(ret)
+        # Normal tensor return type
+        if not isinstance(checked_type, TensorType):
+            raise RuntimeError("type %s not supported" % checked_type)
+        node.attrs["shape"] = [shape_to_json(checked_type.shape)]
+        node.attrs["dtype"] = [checked_type.dtype]
+        node.num_outputs = 1
+        return NodeRef(node_id, 0)
+
+    def visit_tuple(self, vtuple):
+        fields = []
+        for field in vtuple.fields:
+            ref = self.visit(field)
+            assert isinstance(ref, NodeRef)
+            fields.append(ref)
+        return tuple(fields)
+
+    def visit_tuple_getitem(self, op):
+        vtuple = self.visit(op.tuple_value)
+        assert isinstance(vtuple, tuple)
+        return vtuple[op.index]
+
+    def visit_constant(self, op):
+        index = len(self.params)
+        name = "p%d" % index
+        self.params[name] = op.data
+        node = InputNode(name, {})
+        return self.add_node(node, op)
+
+    def visit_function(self, _):
+        raise RuntimeError("function not supported")
+
+    def visit_if(self, _):
+        raise RuntimeError("if not supported")
+
+    def visit_global_var(self, _):
+        raise RuntimeError()
+
+    def visit_let(self, let):
+        """
+        Visit the let binding, by first traversing its value,
+        then setting the metadata on the returned NodeRef.
+
+        Finally visit the body, and return the NodeRef corresponding
+        to it.
+
+        Parameters
+        ----------
+        let: tvm.relay.Expr
+            The let binding to transform.
+
+        Returns
+        -------
+        ref: NodeRef
+            The node reference to the body.
+        """
+        assert let.var not in self.var_map
+        self.var_map[let.var] = self.visit(let.value)
+        return self.visit(let.body)
+
+    def visit_var(self, rvar):
+        return self.var_map[rvar]
+
+    def visit_call(self, call):
+        """Transform a ::tvm.relay.Call into an operator in the TVM graph."""
+        if isinstance(call.op, Op):
+            raise Exception(
+                "Operators should be transformed away; try applying" +
+                "the fuse_ops transformation to the expression.")
+        elif isinstance(call.op, GlobalVar):
+            func = self.mod[call.op]
+        elif isinstance(call.op, Function):
+            func = call.op
+        else:
+            raise Exception(
+                "TVM runtime does not support calls to {0}".format(type(call.op)))
+        if int(func.attrs.Primitive) != 1:
+            raise Exception(
+                "TVM only support calls to primitive functions " +
+                "(i.e functions composed of fusable operator invocations)")
+
+        cached_func = self.compile_engine.lower(func, self.target)
+        for loweredf in cached_func.funcs:
+            self.lowered_funcs.add(loweredf)
+
+        inputs = []
+        # flatten tuple in the call.
+        for arg in call.args:
+            res = self.visit(arg)
+            if isinstance(arg.checked_type, TupleType):
+                assert isinstance(res, tuple)
+                inputs += res
+            else:
+                inputs.append(res)
+
+        inputs = [x.to_json() for x in inputs]
+        op_name = cached_func.func_name
+        op_node = OpNode(self._get_unique_name(op_name), {},
+                         op_name, inputs, {})
+        return self.add_node(op_node, call)
+
+    def visit_op(self, _):
+        raise Exception("can not compile op in non-eta expanded form")
+
+    def _get_json(self):
+        """
+        Convert the sequence of nodes stored by the compiler into the
+        TVM graph runtime format.
+
+        Returns
+        -------
+        graph_json : str
+            The generated JSON as a string.
+        """
+        nodes = []
+        # First we compute "nodes" field.
+        for node in self.nodes:
+            nodes.append(node.to_json())
+
+        arg_nodes = []
+        # Compute "arg_nodes" and "heads" fields.
+        for i, node in enumerate(self.nodes):
+            if isinstance(node, InputNode):
+                arg_nodes.append(i)
+
+        heads = self.heads
+        heads = heads if isinstance(heads, tuple) else [heads]
+        heads = [x.to_json() for x in heads]
+
+        # Compute "node_row_ptr" and entry attributes.
+        num_entry = 0
+        shapes = []
+        storage_ids = []
+        dltypes = []
+        node_row_ptr = [0]
+        for node in self.nodes:
+            assert node.num_outputs == len(node.attrs["shape"])
+            shapes += node.attrs["shape"]
+            dltypes += node.attrs["dtype"]
+            storage_ids += node.attrs["storage_id"]
+            num_entry += node.num_outputs
+            node_row_ptr.append(num_entry)
+
+        # Compute "attrs" field.
+        attrs = {}
+        attrs["shape"] = ["list_shape", shapes]
+        attrs["storage_id"] = ["list_int", storage_ids]
+        attrs["dltype"] = ["list_str", dltypes]
+
+        json_dict = {
+            "nodes": nodes,
+            "arg_nodes": arg_nodes,
+            "heads": heads,
+            "attrs": attrs,
+            "node_row_ptr":  node_row_ptr
+        }
+
+        return json.dumps(json_dict, indent=2)
+
+    def debug_dump_memory_plan(self, func):
+        """Debug function to dump memory plan."""
+        def _annotate(expr):
+            if expr in self.storage_map:
+                return str(self.storage_map[expr])
+            return ""
+        return func.astext(show_meta_data=False, annotate=_annotate)
+
+    def codegen(self, func):
+        """Compile a single function into a graph.
+
+        Parameters
+        ----------
+        func: tvm.relay.Expr
+            The function to compile.
+
+        Returns
+        -------
+        graph_json : str
+            The graph json that can be consumed by runtime.
+
+        lowered_funcs : List[tvm.LoweredFunc]
+            The lowered functions.
+
+        params : Dict[str, tvm.nd.NDArray]
+            Additional constant parameters.
+        """
+        self.storage_map = _backend.GraphPlanMemory(func)
+        # First we convert all the parameters into input nodes.
+        for param in func.params:
+            node = InputNode(param.name_hint, {})
+            self.var_map[param] = self.add_node(
+                node, param)
+
+        # Then we compile the body into a graph which can depend
+        # on input variables.
+        self.heads = self.visit(func.body)
+        graph_json = self._get_json()
+        lowered_funcs = list(self.lowered_funcs)
+        return graph_json, lowered_funcs, self.params
+
+    def _get_unique_name(self, name):
+        if name not in self._name_map:
+            self._name_map[name] = 1
+            return name
+        index = self._name_map[name]
+        self._name_map[name] += 1
+        return self._get_unique_name(name + str(index))
diff --git a/python/tvm/relay/backend/interpreter.py b/python/tvm/relay/backend/interpreter.py
new file mode 100644
index 000000000000..ff6cf6aa1d5c
--- /dev/null
+++ b/python/tvm/relay/backend/interpreter.py
@@ -0,0 +1,184 @@
+#pylint: disable=no-else-return
+"""The Python interface to the Relay reference interpreter."""
+from __future__ import absolute_import
+
+import numpy as np
+
+from . import _backend
+from .. import _make, ir_pass
+from ... import register_func, nd
+from ..base import NodeBase, register_relay_node
+from ..expr import Call, Constant, GlobalVar, Function, const
+from ..scope_builder import ScopeBuilder
+
+class Value(NodeBase):
+    """Base class of all values.
+    """
+    @staticmethod
+    @register_func("relay.from_scalar")
+    def from_scalar(value, dtype=None):
+        """Convert a Python scalar to a Relay scalar."""
+        return TensorValue(const(value, dtype).data)
+
+
+@register_relay_node
+class TupleValue(Value):
+    """A tuple value produced by the interpreter."""
+    def __init__(self, *fields):
+        self.__init_handle_by_constructor__(
+            _make.TupleValue, fields)
+
+    def __getitem__(self, field_no):
+        return self.fields[field_no]
+
+
+@register_relay_node
+class Closure(Value):
+    """A closure produced by the interpreter."""
+    pass
+
+
+@register_relay_node
+class TensorValue(Value):
+    """A Tensor value produced by the interpreter."""
+
+    def __init__(self, data):
+        """Allocate a new TensorValue and copy the data from `array` into
+           the new array.
+        """
+        if isinstance(data, np.ndarray):
+            data = nd.array(data)
+
+        self.__init_handle_by_constructor__(
+            _make.TensorValue, data)
+
+    def asnumpy(self):
+        """Convert a Relay TensorValue into a numpy.ndarray."""
+        return self.data.asnumpy()
+
+    def __eq__(self, other):
+        return self.data == other.data
+
+
+def _arg_to_ast(arg):
+    if isinstance(arg, TensorValue):
+        return Constant(arg.data.copyto(_nd.cpu(0)))
+    elif isinstance(arg, np.ndarray):
+        return Constant(nd.array(arg))
+    elif isinstance(arg, Constant):
+        return arg
+    else:
+        return const(arg)
+
+
+class Executor(object):
+    """An abstract interface for executing Relay programs."""
+    def _make_executor(self, _):
+        """
+        Construct a Python function that implements the evaluation
+        of expression.
+
+        Parameters
+        ----------
+        expr: relay.Expr
+            The Relay expression to execute.
+
+        Returns
+        -------
+        executor: function,
+            A Python function which implements the behavior of `expr`.
+        """
+        raise NotImplementedError()
+
+    def evaluate(self, expr, binds=None):
+        """
+        Evaluate a Relay expression on the executor.
+
+        Parameters
+        ----------
+        expr: tvm.relay.Expr
+            The expression to evaluate.
+
+        binds: Map[tvm.relay.Var, tvm.relay.Expr]
+            Additional binding of free variable.
+
+        Returns
+        -------
+        val : Union[function, Value]
+            The evaluation result.
+        """
+        if binds:
+            scope_builder = ScopeBuilder()
+            for key, value in binds.items():
+                scope_builder.let(key, _arg_to_ast(value))
+            scope_builder.ret(expr)
+            expr = scope_builder.get()
+
+        if isinstance(expr, Function):
+            assert not ir_pass.free_vars(expr)
+
+        if isinstance(expr, (Function, GlobalVar)):
+            return self._make_executor(expr)
+
+        # normal expression evaluated by running a function.
+        func = Function([], expr)
+        return self._make_executor(func)()
+
+
+class Interpreter(Executor):
+    """
+    Simple interpreter interface.
+
+    Parameters
+    ----------
+    mod : tvm.relay.Module
+        The module to support the execution.
+
+    ctx : tvm.TVMContext
+        The runtime context to run the code on.
+
+    target : tvm.Target
+        The target option to build the function.
+    """
+    def __init__(self, mod, ctx, target):
+        self.mod = mod
+        self.ctx = ctx
+        self.target = target
+        self._intrp = _backend.CreateInterpreter(mod, ctx, target)
+
+    def optimize(self, expr):
+        """Optimize an expr.
+
+        Parameters
+        ----------
+        expr : Expr
+            The expression to be optimized.
+
+        Returns
+        -------
+        opt_expr : Expr
+            The optimized expression.
+        """
+        # TODO: We need to move this optimization code into the optimizer/pass manager
+        ck_expr = ir_pass.infer_type(expr, mod=self.mod)
+        fused_expr = ir_pass.fuse_ops(ck_expr)
+        ck_fused = ir_pass.infer_type(fused_expr, mod=self.mod)
+        return ck_fused
+
+    def _make_executor(self, expr):
+        def _interp_wrapper(*args):
+            relay_args = []
+            for arg in args:
+                relay_args.append(_arg_to_ast(arg))
+
+            if isinstance(expr, GlobalVar):
+                func = self.mod[expr]
+                func = self.optimize(func)
+                self.mod._add(expr, func, True)
+                opt_expr = Call(expr, relay_args)
+                return self._intrp(opt_expr)
+            else:
+                call = Call(expr, relay_args)
+                opt_expr = self.optimize(call)
+                return self._intrp(opt_expr)
+        return _interp_wrapper
diff --git a/python/tvm/relay/base.py b/python/tvm/relay/base.py
new file mode 100644
index 000000000000..c50013b199ac
--- /dev/null
+++ b/python/tvm/relay/base.py
@@ -0,0 +1,81 @@
+# pylint: disable=no-else-return, unidiomatic-typecheck
+"""The base node types for the Relay language."""
+from __future__ import absolute_import as _abs
+from .._ffi.node import NodeBase, register_node as _register_tvm_node
+from . import _make
+from . import _expr
+
+NodeBase = NodeBase
+
+def register_relay_node(type_key=None):
+    """Register a Relay node type.
+
+    Parameters
+    ----------
+    type_key : str or cls
+        The type key of the node.
+    """
+    if not isinstance(type_key, str):
+        return _register_tvm_node(
+            "relay." + type_key.__name__)(type_key)
+    return _register_tvm_node(type_key)
+
+
+def register_relay_attr_node(type_key=None):
+    """Register a Relay attribute node.
+
+    Parameters
+    ----------
+    type_key : str or cls
+        The type key of the node.
+    """
+    if not isinstance(type_key, str):
+        return _register_tvm_node(
+            "relay.attrs." + type_key.__name__)(type_key)
+    return _register_tvm_node(type_key)
+
+
+class RelayNode(NodeBase):
+    """Base class of all Relay nodes."""
+    def astext(self, show_meta_data=True, annotate=None):
+        """Get the text format of the expression.
+
+        Parameters
+        ----------
+        show_meta_data : bool
+            Whether to include meta data section in the text
+            if there is meta data.
+
+        annotate: Optional[relay.Expr->str]
+            Optional annotate function to provide additional
+            information in the comment block.
+
+        Note
+        ----
+        The metadata section is necessary to fully parse the text format.
+        However, it can contain dumps that are big (e.g constant weights)a,
+        so it can be helpful to skip printing the meta data section.
+
+        Returns
+        -------
+        text : str
+            The text format of the expression.
+        """
+        return _expr.RelayPrint(self, show_meta_data, annotate)
+
+
+@register_relay_node
+class Span(RelayNode):
+    """Specifies a location in a source program."""
+
+    def __init__(self, source, lineno, col_offset):
+        self.__init_handle_by_constructor__(_make.Span, source, lineno, col_offset)
+
+
+@register_relay_node
+class Id(NodeBase):
+    """Unique identifier(name) used in Var.
+       Guaranteed to be stable across all passes.
+    """
+    def __init__(self):
+        raise RuntimeError("Cannot directly construct Id")
diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py
new file mode 100644
index 000000000000..5b05bc44551a
--- /dev/null
+++ b/python/tvm/relay/build_module.py
@@ -0,0 +1,307 @@
+"""
+Construct the necessary state for the TVM graph runtime
+from a Relay expression.
+"""
+from ..build_module import build as _tvm_build_module
+from .. import nd as _nd, target as _target, autotvm
+from ..contrib import graph_runtime as _graph_rt
+from . import ir_pass
+from . import expr
+from .backend import interpreter as _interpreter
+from .backend import graph_runtime_codegen as _graph_gen
+
+# List of optimization pass and level when switch on
+OPT_PASS_LEVEL = {
+    "SimplifyInference": 0,
+    "OpFusion": 1,
+    "FoldConstant": 2,
+    "CombineParallelConv2D": 3,
+    "FoldScaleAxis": 3,
+    "AlterOpLayout": 3,
+}
+
+class BuildConfig(object):
+    """Configuration scope to set a build config option.
+
+    Parameters
+    ----------
+    kwargs
+        Keyword arguments of configurations to set.
+    """
+    current = None
+    defaults = {
+        "opt_level": 2,
+        "add_pass": None,
+    }
+    def __init__(self, **kwargs):
+        self._old_scope = None
+        for k, _ in kwargs.items():
+            if k not in BuildConfig.defaults:
+                raise ValueError(
+                    "invalid argument %s, candidates are %s" % (k, BuildConfig.defaults.keys()))
+        self._attr = kwargs
+
+    def __getattr__(self, name):
+        if name not in self._attr:
+            return BuildConfig.defaults[name]
+        return self._attr[name]
+
+    def __enter__(self):
+        # pylint: disable=protected-access
+        self._old_scope = BuildConfig.current
+        attr = BuildConfig.current._attr.copy()
+        attr.update(self._attr)
+        self._attr = attr
+        BuildConfig.current = self
+        return self
+
+    def __exit__(self, ptype, value, trace):
+        assert self._old_scope
+        BuildConfig.current = self._old_scope
+
+    def pass_enabled(self, pass_name):
+        """Get whether pass is enabled.
+
+        Parameters
+        ----------
+        pass_name : str
+            The optimization pass name
+
+        Returns
+        -------
+        enabled : bool
+            Whether pass is enabled.
+        """
+        if self.add_pass and pass_name in self.add_pass:
+            return True
+        return self.opt_level >= OPT_PASS_LEVEL[pass_name]
+
+
+BuildConfig.current = BuildConfig()
+
+
+def build_config(**kwargs):
+    """Configure the build behavior by setting config variables.
+
+    Parameters
+    ----------
+    opt_level: int, default=2
+        Optimization level. See OPT_PASS_LEVEL for level of each pass.
+
+    add_pass: set of str
+        Optimization pass to be added regardless of optimization level.
+
+    Returns
+    -------
+    config: BuildConfig
+        The build configuration
+    """
+    return BuildConfig(**kwargs)
+
+
+def _bind_params_by_name(func, params):
+    """Bind parameters of function by its name."""
+    name_dict = {}
+    for arg in func.params:
+        name = arg.name_hint
+        if name in name_dict:
+            name_dict[name] = None
+        else:
+            name_dict[name] = arg
+    bind_dict = {}
+    for k, v in params.items():
+        if k not in name_dict:
+            continue
+        arg = name_dict[k]
+        if arg is None:
+            raise ValueError("Multiple args in the function have name %s" % k)
+        bind_dict[arg] = expr.const(v)
+    return expr.bind(func, bind_dict)
+
+
+def optimize(func, params=None):
+    """Perform target invariant optimizations.
+
+    Parameters
+    ----------
+    func : tvm.relay.Function
+        The input to optimization.
+
+    params : Optional[Dict[str, tvm.nd.NDArray]]
+        Input parameters to the graph that do not change
+        during inference time. used for constant folding.
+
+    Returns
+    -------
+    opt_func : tvm.relay.Function
+        The optimized version of the function.
+    """
+    cfg = BuildConfig.current
+
+    # bind expressions
+    if params:
+        func = _bind_params_by_name(func, params)
+
+    if cfg.pass_enabled("SimplifyInference"):
+        func = ir_pass.infer_type(func)
+        func = ir_pass.simplify_inference(func)
+
+    if cfg.pass_enabled("CombineParallelConv2D"):
+        func = ir_pass.infer_type(func)
+        func = ir_pass.combine_parallel_conv2d(func)
+
+    if cfg.pass_enabled("FoldConstant"):
+        func = ir_pass.fold_constant(func)
+
+    if cfg.pass_enabled("FoldScaleAxis"):
+        func = ir_pass.infer_type(func)
+        func = ir_pass.backward_fold_scale_axis(func)
+        func = ir_pass.infer_type(func)
+        func = ir_pass.forward_fold_scale_axis(func)
+        func = ir_pass.fold_constant(func)
+
+    if cfg.pass_enabled("AlterOpLayout"):
+        func = ir_pass.infer_type(func)
+        func = ir_pass.canonicalize_ops(func)
+        func = ir_pass.infer_type(func)
+        func = ir_pass.alter_op_layout(func)
+
+    return func
+
+
+def build(func,
+          target=None,
+          target_host=None,
+          params=None):
+    """Build a function to run on TVM graph runtime.
+
+    Parameters
+    ----------
+    func: relay.Function
+        The function to build.
+
+    target : str or :any:`tvm.target.Target`, optional
+        The build target
+
+    target_host : str or :any:`tvm.target.Target` optional
+        Host compilation target, if target is device.
+        When TVM compiles device specific program such as CUDA,
+        we also need host(CPU) side code to interact with the driver
+        setup the dimensions and parameters correctly.
+        target_host is used to specify the host side codegen target.
+        By default, llvm is used if it is enabled,
+        otherwise a stackvm intepreter is used.
+
+    params : dict of str to NDArray
+        Input parameters to the graph that do not change
+        during inference time. Used for constant folding.
+
+    Returns
+    -------
+    graph_json : str
+        The json string that can be accepted by graph runtime.
+
+    mod : tvm.Module
+        The module containing necessary libraries.
+
+    params : dict
+        The parameters of the final graph.
+    """
+    target = target if target else _target.current_target()
+    if target is None:
+        raise ValueError("Target is not set in env or passed as argument.")
+    target = _target.create(target)
+
+    # If current dispatch context is fallback context (the default root context),
+    # then load pre-tuned parameters from TopHub
+    if isinstance(autotvm.DispatchContext.current, autotvm.FallbackContext):
+        tophub_context = autotvm.tophub.context(target)
+    else:
+        tophub_context = autotvm.util.EmptyContext()
+
+    cfg = BuildConfig.current
+
+    with tophub_context:
+        func = optimize(func, params)
+        # Fuse ops before running code gen
+        func = ir_pass.infer_type(func)
+        func = ir_pass.fuse_ops(func, cfg.opt_level)
+        # Graph code generation
+        func = ir_pass.infer_type(func)
+        graph_gen = _graph_gen.GraphRuntimeCodegen(mod=None, target=target)
+        graph_json, lowered_funcs, params = graph_gen.codegen(func)
+        mod = _tvm_build_module(lowered_funcs, target=target, target_host=target_host)
+    return graph_json, mod, params
+
+
+class GraphExecutor(_interpreter.Executor):
+    """Wrapper around Executor interface.
+
+    This executor is used for debug and testing purpoes.
+
+    Parameters
+    ----------
+    mod : :py:class:`~tvm.relay.module.Module`
+        The module to support the execution.
+
+    ctx : :py:class:`TVMContext`
+        The runtime context to run the code on.
+
+    target : :py:class:`Target`
+        The target option to build the function.
+    """
+    def __init__(self, mod, ctx, target):
+        self.mod = mod
+        self.ctx = ctx
+        self.target = target
+
+    def _make_executor(self, func):
+        graph_json, mod, params = build(func, target=self.target)
+        gmodule = _graph_rt.create(graph_json, mod, self.ctx)
+        if params:
+            gmodule.set_input(*params)
+        def _graph_wrapper(*args):
+            # Create map of inputs.
+            for i, arg in enumerate(args):
+                gmodule.set_input(i, arg)
+            # Run the module, and fetch the output.
+            gmodule.run()
+            # make a copy so multiple invocation won't hurt perf.
+            return gmodule.get_output(0).copyto(_nd.cpu(0))
+
+        return _graph_wrapper
+
+
+def create_executor(kind="debug",
+                    mod=None,
+                    ctx=None,
+                    target="llvm"):
+    """Factory function to create an executor.
+
+    Parameters
+    ----------
+    kind : str
+        The type of executor
+
+    mod : :py:class:`~tvm.relay.module.Module`
+        The Relay module containing collection of functions
+
+    ctx : :py:class:`tvm.TVMContext`
+        The context to execute the code.
+
+    target : :py:class:`tvm.Target`
+        The corresponding context
+    """
+    if ctx is not None:
+        assert ctx.device_type == _nd.context(str(target), 0).device_type
+    else:
+        ctx = _nd.context(str(target), 0)
+
+    if isinstance(target, str):
+        target = _target.create(target)
+    if kind == "debug":
+        return _interpreter.Interpreter(mod, ctx, target)
+    elif kind == "graph":
+        return GraphExecutor(mod, ctx, target)
+    else:
+        raise RuntimeError("unknown mode {0}".format(mode))
diff --git a/python/tvm/relay/debug.py b/python/tvm/relay/debug.py
new file mode 100644
index 000000000000..00ad7b4401b0
--- /dev/null
+++ b/python/tvm/relay/debug.py
@@ -0,0 +1,25 @@
+# pylint: disable=wildcard-import, redefined-builtin, invalid-name
+"""The Relay IR namespace containing the IR definition and compiler."""
+from __future__ import absolute_import
+from .base import NodeBase, register_relay_node
+from ..api import register_func
+
+@register_relay_node
+class InterpreterState(NodeBase):
+    pass
+
+# pylint: disable=unused-argument
+def _debugger_init(expr, stack):
+    import pdb
+    pdb.set_trace()
+
+# pylint: disable=unused-argument
+@register_func("relay.debug")
+def _debug(*args):
+    _, _, _, ist = args
+    print("Relay Debugger")
+    print("  You can manipulate the expression under evaluation with the name `expr`.")
+    print("  You can manipulate the call stack with the name `stack`.")
+    print("--------------")
+    print("--------------")
+    _debugger_init(ist.current_expr, ist.stack)
diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py
new file mode 100644
index 000000000000..b96111083bce
--- /dev/null
+++ b/python/tvm/relay/expr.py
@@ -0,0 +1,500 @@
+# pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name
+"""The expression nodes of Relay."""
+from __future__ import absolute_import
+from numbers import Number as _Number
+
+import numpy as _np
+from .base import RelayNode, register_relay_node
+from . import _make
+from . import _expr
+from . import ty as _ty
+from .._ffi import base as _base
+from .. import nd as _nd
+from .. import convert
+
+# will be registered afterwards
+_op_make = None
+
+class Expr(RelayNode):
+    """The base type for all Relay expressions."""
+    @property
+    def checked_type(self):
+        """Get the checked type of tvm.relay.Expr.
+
+        Returns
+        -------
+        checked_type : tvm.relay.Type
+            The checked type.
+        """
+        ret = self._checked_type_
+        if ret is None:
+            raise ValueError("The type checker has not populated"
+                             " the checked_type for this node")
+        return ret
+
+    def astype(self, dtype):
+        """Cast the content type of the current data to dtype.
+
+        Parameters
+        ----------
+        dtype : str
+            The target data type.
+
+        Note
+        ----
+        This function only works for TensorType Exprs.
+
+        Returns
+        -------
+        result : tvm.relay.Expr
+            The result expression.
+        """
+        return _make.cast(self, dtype)
+
+    def __add__(self, other):
+        if isinstance(other, Expr):
+            return _op_make.add(self, other)
+        elif isinstance(other, _Number):
+            raise TypeError('convert "%s" with `const` first' % str(other))
+        else:
+            raise TypeError("type %s not supported" % str(type(other)))
+
+    def __radd__(self, other):
+        return self.__add__(other)
+
+    def __sub__(self, other):
+        if isinstance(other, Expr):
+            return _op_make.subtract(self, other)
+        elif isinstance(other, _Number):
+            raise TypeError('convert "%s" with `const` first' % str(other))
+        else:
+            raise TypeError("type %s not supported" % str(type(other)))
+
+    def __rsub__(self, other):
+        if isinstance(other, _Number):
+            raise TypeError('convert "%s" with `const` first' % str(other))
+        else:
+            raise TypeError("type %s not supported" % str(type(other)))
+
+    def __mul__(self, other):
+        if isinstance(other, Expr):
+            return _op_make.multiply(self, other)
+        elif isinstance(other, _Number):
+            raise TypeError('convert "%s" with `const` first' % str(other))
+        else:
+            raise TypeError("type %s not supported" % str(type(other)))
+
+    def __rmul__(self, other):
+        return self.__mul__(other)
+
+    def __div__(self, other):
+        if isinstance(other, Expr):
+            return _op_make.divide(self, other)
+        elif isinstance(other, _Number):
+            raise TypeError('convert "%s" with `const` first' % str(other))
+        else:
+            raise TypeError("type %s not supported" % str(type(other)))
+
+    def __rdiv__(self, other):
+        if isinstance(other, _Number):
+            raise TypeError('convert "%s" with `const` first' % str(other))
+        else:
+            raise TypeError("type %s not supported" % str(type(other)))
+
+    def __truediv__(self, other):
+        return self.__div__(other)
+
+    def __rtruediv__(self, other):
+        return self.__rdiv__(other)
+
+
+@register_relay_node
+class Constant(Expr):
+    """A constant expression in Relay.
+
+    Parameters
+    ----------
+    data : tvm.nd.NDArray
+        The data content of the constant expression.
+    """
+    def __init__(self, data):
+        self.__init_handle_by_constructor__(_make.Constant, data)
+
+
+@register_relay_node
+class Tuple(Expr):
+    """Tuple expression that groups several fields together.
+
+    Parameters
+    ----------
+    fields : List[tvm.relay.Expr]
+        The fields in the tuple.
+    """
+    def __init__(self, fields):
+        self.__init_handle_by_constructor__(_make.Tuple, fields)
+
+    def __getitem__(self, index):
+        if index >= len(self):
+            raise IndexError("Tuple index out of range")
+        return self.fields[index]
+
+    def __len__(self):
+        return len(self.fields)
+
+    def astype(self, _):
+        raise TypeError("astype cannot be used on tuple")
+
+
+@register_relay_node
+class Var(Expr):
+    """A local variable in Relay.
+
+    Local variable can be used to declare input
+    arguments to a function, or intermediate variables.
+
+    Parameters
+    ----------
+    name_hint: str
+        The name of the variable.
+        This name only acts as a hint, and is not used
+        for equality.
+
+    type_annotation: tvm.relay.Type, optional
+        The type annotation on the variable.
+    """
+    def __init__(self, name_hint, type_annotation=None):
+        self.__init_handle_by_constructor__(
+            _make.Var, name_hint, type_annotation)
+
+    @property
+    def name_hint(self):
+        """Get name hint of the current var."""
+        name = self.vid.name_hint
+        return name
+
+
+@register_relay_node
+class GlobalVar(Expr):
+    """A global variable in Tvm.Relay.
+
+    GlobalVar is used to refer to the global functions
+    stored in the module.
+
+    Parameters
+    ----------
+    name_hint: str
+        The name of the variable.
+    """
+    def __init__(self, name_hint):
+        self.__init_handle_by_constructor__(_make.GlobalVar, name_hint)
+
+    def __call__(self, *args):
+        """Invoke the gobal function.
+
+        Parameters
+        ----------
+        args: List[relay.Expr]
+            Arguments.
+        """
+        return Call(self, args, None, None)
+
+
+@register_relay_node
+class Function(Expr):
+    """A function declaration expression.
+
+    Parameters
+    ----------
+    params: List[tvm.relay.Var]
+        List of input parameters to the function.
+
+    body: tvm.relay.Expr
+        The body of the function.
+
+    ret_type: Optional[tvm.relay.Type]
+        The return type annotation of the function.
+
+    type_params: Optional[List[tvm.relay.TypeParam]]
+        The additional type parameters, this is only
+        used in advanced usecase of template functions.
+    """
+    def __init__(self,
+                 params,
+                 body,
+                 ret_type=None,
+                 type_params=None,
+                 attrs=None):
+        if type_params is None:
+            type_params = convert([])
+
+        self.__init_handle_by_constructor__(
+            _make.Function, params, body, ret_type, type_params, attrs)
+
+    def __call__(self, *args):
+        """Invoke the gobal function.
+
+        Parameters
+        ----------
+        args: List[relay.Expr]
+            Arguments.
+        """
+        return Call(self, args, None, None)
+
+
+@register_relay_node
+class Call(Expr):
+    """Function call node in Relay.
+
+    Call node corresponds the operator application node
+    in computational graph terminology.
+
+    Parameters
+    ----------
+    op: tvm.relay.Op or any tvm.relay.Expr with function type.
+        The operation to be called.
+
+    args: List[tvm.relay.Expr]
+        The arguments to the call.
+
+    attrs: Optional[tvm.Attrs]
+        Attributes to the call, can be None
+
+    type_args: Optional[List[tvm.relay.Type]]
+        The additional type arguments, this is only
+        used in advanced usecase of template functions.
+    """
+    def __init__(self, op, args, attrs=None, type_args=None):
+        if not type_args:
+            type_args = []
+        self.__init_handle_by_constructor__(
+            _make.Call, op, args, attrs, type_args)
+
+
+@register_relay_node
+class Let(Expr):
+    """Let variable binding expression.
+
+    Parameters
+    ----------
+    variable: tvm.relay.Var
+        The local variable to be bound.
+
+    value: tvm.relay.Expr
+        The value to be bound.
+
+    body: tvm.relay.Expr
+        The body of the let binding.
+    """
+    def __init__(self, variable, value, body):
+        self.__init_handle_by_constructor__(
+            _make.Let, variable, value, body)
+
+
+@register_relay_node
+class If(Expr):
+    """A conditional expression in Relay.
+
+    Parameters
+    ----------
+    cond: tvm.relay.Expr
+        The condition.
+
+    true_branch: tvm.relay.Expr
+        The expression evaluated when condition is true.
+
+    false_branch: tvm.relay.Expr
+        The expression evaluated when condition is false.
+    """
+    def __init__(self, cond, true_branch, false_branch):
+        self.__init_handle_by_constructor__(
+            _make.If, cond, true_branch, false_branch)
+
+
+@register_relay_node
+class TupleGetItem(Expr):
+    """Get index-th item from a tuple.
+
+    Parameters
+    ----------
+    tuple_value: tvm.relay.Expr
+        The input tuple expression.
+
+    index: int
+        The index.
+    """
+    def __init__(self, tuple_value, index):
+        self.__init_handle_by_constructor__(
+            _make.TupleGetItem, tuple_value, index)
+
+
+class TempExpr(Expr):
+    """Baseclass of all TempExpr.
+
+    TempExprs are pass specific expression that can be
+    useful to define intermediate result in the
+    rewriting pass such as layout or type transformation.
+    """
+    def realize(self):
+        """Convert the expression to a normal(non-temp) Expr.
+
+        Returns
+        -------
+        The corresponding normal expression.
+        """
+        return _expr.TempExprRealize(self)
+
+
+class TupleWrapper(object):
+    """TupleWrapper.
+
+    This class is a Python wrapper for a Relay tuple of known size.
+    It allows for accessing the fields of the Relay tuple as though
+    it were a Python tuple.
+
+    Parameters
+    ----------
+    tuple_value: tvm.relay.Expr
+        The input tuple
+
+    size: int
+        The size of the tuple.
+    """
+    def __init__(self, tuple_value, size):
+        self.tuple_value = tuple_value
+        self.size = size
+
+    def astuple(self):
+        """Returns the underlying Relay tuple if this wrapper is passed
+        as an argument to an FFI function."""
+        return self.tuple_value
+
+    def astext(self):
+        """Get the text format of the tuple expression.
+
+        Returns
+        -------
+        text : str
+            The text format of the tuple expression.
+        """
+        return self.tuple_value.astext()
+
+    def __getitem__(self, index):
+        if index >= len(self):
+            raise IndexError("Tuple index out of range")
+        return TupleGetItem(self.tuple_value, index)
+
+    def __len__(self):
+        return self.size
+
+    def __repr__(self):
+        return ("TupleWrapper(" + self.tuple_value.__repr__() +
+                ", " + str(self.size) + ")")
+
+    def astype(self, _):
+        raise TypeError("astype cannot be used on tuple")
+
+
+def var(name_hint,
+        type_annotation=None,
+        shape=None,
+        dtype="float32"):
+    """Create a new tvm.relay.Var.
+
+    This is a simple wrapper function that allows specify
+    shape and dtype directly.
+
+    Parameters
+    ----------
+    name_hint: str
+        The name of the variable.
+        This name only acts as a hint, and is not used
+        for equality.
+
+    type_annotation: Optional[tvm.relay.Type, str]
+        The type annotation on the variable.
+        When type_annotation is a str, we will create a scalar variable.
+
+    shape: Optional[List[tvm.Expr]]
+        The shape of the tensor type.
+
+    dtype: str, optional
+        The data type of the tensor.
+
+    Examples
+    --------
+    .. code-block:: python
+
+      # The following 4 lines are equivalent to each other
+      x = tvm.relay.Var("x", tvm.relay.TensorType([1, 2]))
+      x = tvm.relay.var("x", tvm.relay.TensorType([1, 2]))
+      x = tvm.relay.var("x", shape=[1, 2])
+      x = tvm.relay.var("x", shape=[1, 2], dtype="float32")
+
+      # The following 2 lines are equivalent to each other.
+      y = tvm.relay.var("x", "float32")
+      y = tvm.relay.var("x", shape=(), dtype="float32")
+    """
+    if type_annotation is not None and shape is not None:
+        raise ValueError("Can only specify either type_annotation or shape.")
+    if shape is not None:
+        type_annotation = _ty.TensorType(shape, dtype)
+    elif isinstance(type_annotation, str):
+        type_annotation = _ty.TensorType((), type_annotation)
+    return Var(name_hint, type_annotation)
+
+
+def const(value, dtype=None):
+    """Create a constant value.
+
+    Parameters
+    ----------
+    value: Union[bool, int, float, numpy.ndarray, tvm.nd.NDArray]
+        The constant value.
+
+    dtype: str, optional
+        The data type of the value.
+
+    Note
+    ----
+    When dtype is None, we use the following rule:
+
+    - int maps to "int32"
+    - float maps to "float32"
+    - bool maps to "bool"
+    - other using the same default rule as numpy.
+    """
+    if isinstance(value, (_base.numeric_types, (bool, list))):
+        value = _np.array(value, dtype=dtype)
+        # convert default to int32 and float32
+        if dtype is None:
+            if value.dtype == "float64":
+                value = value.astype("float32")
+            elif value.dtype == "int64":
+                value = value.astype("int32")
+    if isinstance(value, (_np.ndarray, _np.generic)):
+        value = _nd.array(value)
+
+    if not isinstance(value, _nd.NDArray):
+        raise ValueError("value has to be scalar or NDArray")
+    return Constant(value)
+
+
+def bind(expr, binds):
+    """Bind an free variables in expr or function arguments.
+
+    We can bind parameters expr if it is a function.
+
+    Parameters
+    ----------
+    expr : tvm.relay.Expr
+        The input expression.
+
+    binds : Union[Map[tvm.relay.Var, tvm.relay.Expr], Map[str, tvm.relay.Expr]]
+        The specific bindings.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The expression or function after binding.
+    """
+    return _expr.Bind(expr, binds)
diff --git a/python/tvm/relay/expr.pyi b/python/tvm/relay/expr.pyi
new file mode 100644
index 000000000000..bc2e5115df0d
--- /dev/null
+++ b/python/tvm/relay/expr.pyi
@@ -0,0 +1,114 @@
+from typing import List
+import tvm
+from .base import Span, NodeBase
+from .ty import Type, TypeParam
+from ._ir_pass import _get_checked_type
+
+
+class Expr(NodeBase):
+    def checked_type(self):
+        ...
+
+    def __call__(self, *args):
+        ...
+
+
+class Constant(Expr):
+    data = ...  # type: tvm.nd.NDArray
+
+    def __init__(self, data):
+        # type: (tvm.nd.NDArray) -> None
+        ...
+
+
+class Tuple(Expr):
+    fields = ...  # type: List[Expr]
+
+    def __init__(self, fields):
+        # type: (List[Expr]) -> None
+        ...
+
+
+class Var(Expr):
+    """A local variable in Relay."""
+    name_hint = ...  # type: str
+
+    def __init__(self, name_hint):
+        # type: (str) -> None
+        ...
+
+
+class GlobalVar(Expr):
+    name_hint = ...  # type: str
+
+    def __init__(self, name_hint):
+        # type: (str) -> None
+        ...
+
+
+class Param(Expr):
+    var = ...  # type: Var
+    type = ...  # type: Type
+
+    def __init__(self, var, ty):
+        # type: (Var, Type) -> None
+        ...
+
+
+class Function(Expr):
+    """A function in Relay, see tvm/relay/expr.h for more details."""
+    type_params = ...  # type: List[TypeParam]
+    params = ...  # type: List[Param]
+    ret_type = ...  # type: Type
+    body = ...  # type: Expr
+
+    def __init__(self,
+                 params,  # type: List[Param],
+                 ret_type,  # type: Type,
+                 body,  # type: Expr,
+                 type_params=None,  # type: List[TypeParam]
+                 ):
+        # type: (...) -> None
+        ...
+
+
+@register_relay_node
+class Call(Expr):
+    """A function call in Relay, see tvm/relay/expr.h for more details."""
+    op = ...  # type: Expr
+    args = ...  # type: List[Expr]
+    # todo(@jroesch): add attrs. revise attrs type in __init__
+
+    def __init__(self, op, args, attrs=None, ty_args=None):
+        # type: (Expr, List[Expr], Optional[List[Any]], Optional[List[Type]]) -> None
+        if not ty_args:
+            ty_args = []
+
+        self.__init_handle_by_constructor__(
+            _make.Call, op, args, attrs, ty_args)
+
+
+@register_relay_node
+class Let(Expr):
+    """A variable bindings in Relay, see tvm/relay/expr.h for more details."""
+    var = ...  # type: Var
+    value = ...  # type: Expr
+    body = ...  # type: Expr
+    value_type = ...  # type: Type
+
+    def __init__(self, var, value, body, value_type):
+        # type: (Var, Expr, Expr, Type) -> None
+        ...
+
+
+@register_relay_node
+class If(Expr):
+    """A conditional expression in Relay, see tvm/relay/expr.h for more details."""
+    cond = ...  # type: Expr
+    true_value = ...  # type: Expr
+    false_value = ...  # type: Expr
+    span = ...  # type: Span
+
+    def __init__(self, cond, true_value, false_value):
+        # type: (Expr, Expr, Expr) -> None
+        ...
\ No newline at end of file
diff --git a/python/tvm/relay/expr_functor.py b/python/tvm/relay/expr_functor.py
new file mode 100644
index 000000000000..eafe5f09309f
--- /dev/null
+++ b/python/tvm/relay/expr_functor.py
@@ -0,0 +1,155 @@
+# pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name
+"""The expression functor of Relay."""
+
+from .expr import Function, Call, Let, Var, GlobalVar, If, Tuple, TupleGetItem, Constant
+from .op import Op
+
+class ExprFunctor:
+    """
+    An abstract visitor defined over Expr.
+
+    Defines the default dispatch over expressions, and
+    implements memoization.
+    """
+    def __init__(self):
+        self.memo_map = {}
+
+    # pylint: disable=no-else-return
+    def visit(self, expr):
+        """Apply the visitor to an expression."""
+        found = self.memo_map.get(expr)
+        if found:
+            return found
+
+        if isinstance(expr, Function):
+            res = self.visit_function(expr)
+        elif isinstance(expr, Call):
+            res = self.visit_call(expr)
+        elif isinstance(expr, Let):
+            res = self.visit_let(expr)
+        elif isinstance(expr, Var):
+            res = self.visit_var(expr)
+        elif isinstance(expr, GlobalVar):
+            res = self.visit_global_var(expr)
+        elif isinstance(expr, If):
+            res = self.visit_if(expr)
+        elif isinstance(expr, Tuple):
+            res = self.visit_tuple(expr)
+        elif isinstance(expr, TupleGetItem):
+            res = self.visit_tuple_getitem(expr)
+        elif isinstance(expr, Constant):
+            res = self.visit_constant(expr)
+        elif isinstance(expr, Op):
+            res = self.visit_op(expr)
+        else:
+            raise Exception("warning unhandled case: {0}".format(type(expr)))
+
+        self.memo_map[expr] = res
+
+        return res
+
+    def visit_function(self, _):
+        raise NotImplementedError()
+
+    def visit_let(self, _):
+        raise NotImplementedError()
+
+    def visit_call(self, _):
+        raise NotImplementedError()
+
+    def visit_var(self, _):
+        raise NotImplementedError()
+
+    def visit_type(self, typ):
+        return typ
+
+    def visit_if(self, _):
+        raise NotImplementedError()
+
+    def visit_tuple(self, _):
+        raise NotImplementedError()
+
+    def visit_tuple_getitem(self, _):
+        raise NotImplementedError()
+
+    def visit_global_var(self, _):
+        raise NotImplementedError()
+
+    def visit_op(self, _):
+        raise NotImplementedError()
+
+    def visit_constant(self, _):
+        raise NotImplementedError()
+
+
+class ExprMutator(ExprFunctor):
+    """
+    A functional visitor over Expr.
+
+    The default behavior recursively traverses the AST
+    and reconstructs the AST.
+    """
+    def visit_function(self, fn):
+        new_body = self.visit(fn.body)
+        return Function(
+            list(fn.params),
+            new_body,
+            fn.ret_type,
+            fn.type_params,
+            fn.attrs)
+
+    def visit_let(self, let):
+        new_var = self.visit(let.var)
+        new_val = self.visit(let.value)
+        new_body = self.visit(let.body)
+        return Let(new_var, new_val, new_body)
+
+    def visit_call(self, call):
+        new_fn = self.visit(call.op)
+        new_args = [self.visit(arg) for arg in call.args]
+        return Call(new_fn, new_args, call.attrs)
+
+    def visit_var(self, rvar):
+        return rvar
+
+    def visit_global_id(self, global_var):
+        return global_var
+
+    def visit_if(self, ite):
+        return If(
+            self.visit(ite.guard),
+            self.visit(ite.true_b),
+            self.visit(ite.false_b))
+
+    def visit_tuple(self, tup):
+        return Tuple([self.visit(field) for field in tup.fields])
+
+    def visit_tuple_getitem(self, op):
+        tuple_value = self.visit(op.tuple_value)
+        if not tuple_value.same_as(op.tuple_value):
+            return TupleGetItem(tuple_value, op.index)
+        return op
+
+    def visit_global_var(self, gvar):
+        return gvar
+
+    def visit_op(self, op):
+        return op
+
+    def visit_constant(self, const):
+        return const
+
+    def visit_constructor(self, con):
+        return con
+
+    def visit_match(self, m):
+        return Match(self.visit(m.data), [Clause(c.lhs, self.visit(c.rhs)) for c in m.pattern])
+
+    def visit_ref_new(self, r):
+        return RefNew(self.visit(r.value))
+
+    def visit_ref_write(self, r):
+        return RefWrite(self.visit(r.ref), self.visit(r.value))
+
+    def visit_ref_read(self, r):
+        return RefRead(self.visit(r.ref))
diff --git a/python/tvm/relay/frontend/__init__.py b/python/tvm/relay/frontend/__init__.py
new file mode 100644
index 000000000000..2d01174a0d96
--- /dev/null
+++ b/python/tvm/relay/frontend/__init__.py
@@ -0,0 +1,10 @@
+"""
+Frontends for constructing Relay programs.
+
+Contains the model importers currently defined
+for Relay.
+"""
+
+from __future__ import absolute_import
+
+from .mxnet import from_mxnet
diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py
new file mode 100644
index 000000000000..20598400ce21
--- /dev/null
+++ b/python/tvm/relay/frontend/common.py
@@ -0,0 +1,206 @@
+"""Common utilities"""
+from __future__ import absolute_import as _abs
+
+
+class RequiredAttr(object):
+    """Dummpy class to represent required attr"""
+    pass
+
+
+class StrAttrsDict(object):
+    """Helper class to parse attrs stored as Dict[str, str].
+
+    Parameters
+    ----------
+    attrs : Dict[str, str]
+        The attributes to be used.
+    """
+    def __init__(self, attrs):
+        self.attrs = attrs
+
+    def get_float(self, key, default=RequiredAttr()):
+        """Get float attribute
+
+        Parameters
+        ----------
+        key : str
+            The attribute key
+
+        default : float
+            The default value.
+
+        Returns
+        -------
+        value : The result
+        """
+        if key in self.attrs:
+            return float(self.attrs[key])
+        if isinstance(default, RequiredAttr):
+            raise AttributeError("Required attribute {} not found.".format(key))
+        return default
+
+    def get_int(self, key, default=RequiredAttr()):
+        """Get int attribute
+
+        Parameters
+        ----------
+        key : str
+            The attribute key
+
+        default : float
+            The default value.
+
+        Returns
+        -------
+        value : The result
+        """
+        if key in self.attrs:
+            val = self.attrs[key]
+            if val == "None":
+                return None
+            return int(val)
+        if isinstance(default, RequiredAttr):
+            raise AttributeError("Required attribute {} not found.".format(key))
+        return default
+
+    def get_str(self, key, default=RequiredAttr()):
+        """Get str attribute
+
+        Parameters
+        ----------
+        key : str
+            The attribute key
+
+        default : float
+            The default value.
+
+        Returns
+        -------
+        value : The result
+        """
+        if key in self.attrs:
+            return self.attrs[key]
+        if isinstance(default, RequiredAttr):
+            raise AttributeError("Required attribute {} not found.".format(key))
+        return default
+
+    def get_int_tuple(self, key, default=RequiredAttr()):
+        """Get int tuple attribute
+
+        Parameters
+        ----------
+        key : str
+            The attribute key
+
+        default : float
+            The default value.
+
+        Returns
+        -------
+        value : The result
+        """
+        if key in self.attrs:
+            tshape = self.attrs[key]
+            return tuple(int(x.strip()) for x in tshape.strip('()[]').split(','))
+        if isinstance(default, RequiredAttr):
+            raise AttributeError("Required attribute {} not found.".format(key))
+        return default
+
+    def get_float_tuple(self, key, default=RequiredAttr()):
+        """Get float tuple attribute
+
+        Parameters
+        ----------
+        key : str
+            The attribute key
+
+        default : float
+            The default value.
+
+        Returns
+        -------
+        value : The result
+        """
+
+        if key in self.attrs:
+            tshape = self.attrs[key]
+            return tuple(float(x.strip()) for x in
+                         tshape.strip('()[]').split(','))
+        if isinstance(default, RequiredAttr):
+            raise AttributeError("Required attribute {} not found.".format(key))
+        return default
+
+    def get_tuple_tuple_int(self, key, default=RequiredAttr()):
+        """Get int list attribute
+
+        Parameters
+        ----------
+        key : str
+            The attribute key
+
+        default : float
+            The default value.
+
+        Returns
+        -------
+        value : The result
+        """
+        if key in self.attrs:
+            value = self.attrs[key]
+            seq = []
+            for tup in value.strip('()').split('),'):
+                tup = tup.strip('[]()')
+                els = [int(x.strip('( ')) for x in tup.split(',')]
+                seq.append(tuple(els))
+
+            return tuple(seq)
+
+        if isinstance(default, RequiredAttr):
+            raise AttributeError("Required attribute {} not found.".format(key))
+        return default
+
+    def get_int_list(self, key, default=RequiredAttr()):
+        """Get int list attribute
+
+        Parameters
+        ----------
+        key : str
+            The attribute key
+
+        default : float
+            The default value.
+
+        Returns
+        -------
+        value : The result
+        """
+        if key in self.attrs:
+            tshape = self.attrs[key]
+            return tuple(int(x.strip()) for x in tshape.strip('[]()').split(','))
+        if isinstance(default, RequiredAttr):
+            raise AttributeError("Required attribute {} not found.".format(key))
+        return default
+
+
+
+    def get_bool(self, key, default=RequiredAttr()):
+        """Get bool tuple attribute
+
+        Parameters
+        ----------
+        key : str
+            The attribute key
+
+        default : float
+            The default value.
+
+        Returns
+        -------
+        value : The result
+        """
+        if key in self.attrs:
+            val = self.attrs[key]
+            return val.strip().lower() in ['true', '1', 't', 'y', 'yes']
+        if isinstance(default, RequiredAttr):
+            raise AttributeError("Required attribute {} not found.".format(key))
+        return default
diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py
new file mode 100644
index 000000000000..7bffbd4f499e
--- /dev/null
+++ b/python/tvm/relay/frontend/mxnet.py
@@ -0,0 +1,510 @@
+# pylint: disable=invalid-name, import-self, len-as-condition
+"""MXNet symbol frontend."""
+from __future__ import absolute_import as _abs
+
+import json
+from .. import ir_pass
+from .. import expr as _expr
+from .. import op as _op
+from ... import nd as _nd
+from .common import StrAttrsDict
+from .nnvm_common import _rename, _binop_scalar, _rbinop_scalar, _reduce
+from .nnvm_common import _arg_reduce, _init_op, _softmax_op, _cast
+from .nnvm_common import _clip, _transpose, _upsampling
+from .nnvm_common import _elemwise_sum, _reshape
+from .nnvm_common import _warn_not_used
+
+__all__ = ['from_mxnet']
+
+def _mx_fully_connected(inputs, attrs):
+    import mxnet as mx
+    units = attrs.get_int("num_hidden")
+    use_bias = not attrs.get_bool("no_bias", False)
+    try:
+        _ = mx.sym.FullyConnected(mx.sym.var("x"), num_hidden=1, flatten=True)
+        has_flatten = True
+    except mx.base.MXNetError:
+        # no flatten attribute in old mxnet
+        has_flatten = False
+    use_flatten = attrs.get_bool("flatten", True)
+    if has_flatten and use_flatten:
+        inputs[0] = _op.nn.batch_flatten(inputs[0])
+    res = _op.nn.dense(inputs[0], inputs[1], units=units)
+    if use_bias:
+        assert len(inputs) == 3
+        res = _op.nn.bias_add(res, inputs[2])
+    return res
+
+
+def _get_channel_axis(layout, op_name):
+    if layout == "NCHW":
+        return 1
+    elif layout == "NHWC":
+        return 3
+    raise RuntimeError("layout: {} is not supported in {}".format(layout, op_name))
+
+
+def _mx_activations(inputs, attrs):
+    act_type = attrs.get_str("act_type")
+    assert len(inputs) == 1
+    if act_type == "sigmoid":
+        return _op.sigmoid(inputs[0])
+    elif act_type == "tanh":
+        return _op.tanh(inputs[0])
+    elif act_type == "relu":
+        return _op.nn.relu(inputs[0])
+    elif act_type == "softrelu":
+        def _stable_softrelu(x):
+            # log(1 + exp(-abs(x))) + relu(x)
+            one = _expr.const(1, dtype="float32")
+            exp_neg_abs_x = _op.exp(_op.negative(_op.abs(x)))
+            return _op.add(_op.log(_op.add(one, exp_neg_abs_x)),
+                           _op.nn.relu(x))
+        return _stable_softrelu(inputs[0])
+    raise RuntimeError("Do not support act_type: {}".format(act_type))
+
+
+def _mx_conv2d(inputs, attrs):
+    kernel_size = attrs.get_int_tuple("kernel")
+    if len(kernel_size) != 2:
+        raise RuntimeError("non-2d kernel is not supported in conv2d")
+    data_layout = attrs.get_str("layout", "NCHW")
+    channel_axis = _get_channel_axis(data_layout, "conv2d")
+
+    if "kernel_layout" in attrs.attrs:
+        weight_layout = attrs.get_str("kernel_layout")
+    else:
+        weight_layout = "HWIO" if data_layout == "NHWC" else "OIHW"
+
+    new_attrs = {}
+    new_attrs["channels"] = attrs.get_int("num_filter")
+    new_attrs["kernel_size"] = kernel_size
+    new_attrs["strides"] = attrs.get_int_tuple("stride", (1, 1))
+    new_attrs["padding"] = attrs.get_int_tuple("pad", (0, 0))
+    new_attrs["dilation"] = attrs.get_int_tuple("dilate", (1, 1))
+    new_attrs["groups"] = attrs.get_int("num_group", 1)
+    new_attrs["data_layout"] = data_layout
+    new_attrs["weight_layout"] = weight_layout
+    use_bias = not attrs.get_bool("no_bias", False)
+    res = _op.nn.conv2d(inputs[0], inputs[1], **new_attrs)
+    if use_bias:
+        assert len(inputs) == 3
+        res = _op.nn.bias_add(res, inputs[2], axis=channel_axis)
+    return res
+
+
+def _mx_conv2d_transpose(inputs, attrs):
+    if "target_shape" in attrs.attrs:
+        raise RuntimeError("target_shape is not supported in conv2d_transpose")
+    kernel_size = attrs.get_int_tuple("kernel")
+    if len(kernel_size) != 2:
+        raise RuntimeError("non-2d kernel is not supported in conv2d")
+    data_layout = attrs.get_str("layout", "NCHW")
+    channel_axis = _get_channel_axis(data_layout, "conv2d_transpose")
+
+    if "kernel_layout" in attrs.attrs:
+        weight_layout = attrs.get_str("kernel_layout")
+    else:
+        weight_layout = "HWIO" if data_layout == "NHWC" else "OIHW"
+
+    new_attrs = {}
+    new_attrs["channels"] = attrs.get_int("num_filter")
+    new_attrs["kernel_size"] = kernel_size
+    new_attrs["strides"] = attrs.get_int_tuple("stride", (1, 1))
+    new_attrs["output_padding"] = attrs.get_int_tuple("adj", (0, 0))
+    new_attrs["padding"] = attrs.get_int_tuple("pad", (0, 0))
+    new_attrs["dilation"] = attrs.get_int_tuple("dilate", (1, 1))
+    new_attrs["groups"] = attrs.get_int("num_group", 1)
+    new_attrs["data_layout"] = data_layout
+    new_attrs["weight_layout"] = weight_layout
+    use_bias = not attrs.get_bool("no_bias", False)
+    res = _op.nn.conv2d_transpose(inputs[0], inputs[1], **new_attrs)
+
+    if use_bias:
+        assert len(inputs) == 3
+        res = _op.nn.bias_add(res, inputs[2], axis=channel_axis)
+    return res
+
+
+def _mx_pooling(inputs, attrs):
+    global_pool = attrs.get_bool("global_pool", False)
+    pool_type = attrs.get_str("pool_type")
+
+    def _pool2d(new_op, is_avg):
+        kernel_size = attrs.get_int_tuple("kernel")
+        if len(kernel_size) != 2:
+            raise RuntimeError("non-2d kernel is not supported in pool2d")
+        new_attrs = {}
+        new_attrs["pool_size"] = kernel_size
+        new_attrs["strides"] = attrs.get_int_tuple("stride", (1, 1))
+        new_attrs["padding"] = attrs.get_int_tuple("pad", (0, 0))
+        new_attrs["ceil_mode"] = (attrs.get_str("pooling_convention", "valid") == "full")
+        if is_avg:
+            new_attrs["count_include_pad"] = attrs.get_bool("count_include_pad", True)
+        return new_op(inputs[0], **new_attrs)
+
+    if pool_type == "max":
+        if global_pool:
+            return _op.nn.global_max_pool2d(inputs[0])
+        return _pool2d(_op.nn.max_pool2d, False)
+    elif pool_type == "avg":
+        if global_pool:
+            return _op.nn.global_avg_pool2d(inputs[0])
+        return _pool2d(_op.nn.avg_pool2d, True)
+    raise RuntimeError("Do not support pool_type:{}".format(pool_type))
+
+
+def _mx_dropout(inputs, attrs):
+    rate = attrs.get_float("p", 0.5)
+    return _op.nn.dropout(inputs[0], rate=rate)
+
+
+def _mx_batch_norm(inputs, attrs):
+    if attrs.get_bool("output_mean_var", False):
+        raise RuntimeError("batch_norm do not support output_mean_var")
+    if attrs.get_bool("use_global_stats", False):
+        _warn_not_used("use_global_stats", "batch_norm")
+    new_attrs = {}
+    new_attrs["axis"] = attrs.get_int("axis", 1)
+    new_attrs["epsilon"] = attrs.get_float("eps", 0.001)
+    new_attrs["center"] = True
+    new_attrs["scale"] = not attrs.get_bool("fix_gamma", False)
+    return _op.nn.batch_norm(*inputs, **new_attrs)
+
+
+def _mx_split(inputs, attrs):
+    axis = attrs.get_int("axis", 1)
+    new_attrs = {}
+    new_attrs["indices_or_sections"] = attrs.get_int("num_outputs")
+    new_attrs["axis"] = axis
+    res = _op.split(inputs[0], **new_attrs)
+    if attrs.get_bool("squeeze_axis", False):
+        return tuple([_op.squeeze(x, axis=[axis]) for x in res])
+    return res
+
+
+def _mx_softmax_activation(inputs, attrs):
+    mode = attrs.get_str("mode", "instance")
+    axis = 0 if mode == "instance" else 1
+    return _op.nn.softmax(inputs[0], axis=axis)
+
+
+def _mx_softmax_output(inputs, attrs):
+    if attrs.get_bool("multi_output", False):
+        return _op.nn.softmax(inputs[0], axis=1)
+    return _op.nn.softmax(inputs[0])
+
+
+def _mx_concat(inputs, attrs):
+    axis = attrs.get_int("dim", 1)
+    return _op.concatenate(tuple(inputs), axis=axis)
+
+
+def _mx_expand_dims(inputs, attrs):
+    axis = attrs.get_int("axis")
+    return _op.expand_dims(inputs[0], axis=axis)
+
+
+def _mx_leaky_relu(inputs, attrs):
+    act_type = attrs.get_str("act_type")
+    if act_type == "leaky":
+        return _op.nn.leaky_relu(inputs[0], alpha=attrs.get_float("slope", 0.25))
+    elif act_type == "prelu":
+        assert len(inputs) == 2
+        return _op.nn.prelu(*inputs)
+    elif act_type == "elu":
+        # -slope * relu(1-exp(x)) + relu(x)
+        slope = attrs.get_float("slope", 0.25)
+        one = _expr.const(1, dtype="float32")
+        x = inputs[0]
+        mslope = _op.nn.relu(_op.subtract(one, _op.exp(x)))
+        mslope = _op.multiply(mslope, _expr.const(-slope, dtype="float32"))
+        return _op.add(mslope, _op.nn.relu(x))
+    elif act_type == "rrelu":
+        # NOTE this is only converted for inference.
+        lower_bound = attrs.get_float("lower_bound")
+        upper_bound = attrs.get_float("upper_bound")
+        alpha = (lower_bound + upper_bound) / 2.0
+        return _op.nn.leaky_relu(inputs[0], alpha=alpha)
+    raise RuntimeError("act_type: {} is not supported".format(act_type))
+
+
+def _mx_lrn(inputs, attrs):
+    new_attrs = {}
+    new_attrs["alpha"] = attrs.get_float("alpha", 0.0001)
+    new_attrs["beta"] = attrs.get_float("beta", 0.75)
+    new_attrs["bias"] = attrs.get_float("knorm", 2)
+    # NCHW format and normalization along channel axis
+    new_attrs["axis"] = 1
+    new_attrs["size"] = attrs.get_int("nsize")
+    assert len(inputs) == 1
+    return _op.nn.lrn(inputs[0], **new_attrs)
+
+
+def _mx_multibox_prior(inputs, attrs):
+    new_attrs = {}
+    new_attrs["sizes"] = attrs.get_float_tuple("sizes", (1.0, ))
+    new_attrs["steps"] = attrs.get_float_tuple("steps", (-1.0, -1.0))
+    new_attrs["offsets"] = attrs.get_float_tuple("offsets", (0.5, 0.5))
+    new_attrs["ratios"] = attrs.get_float_tuple("ratios", (1.0, ))
+    new_attrs["clip"] = attrs.get_bool("clip", False)
+    return _op.vision.multibox_prior(inputs[0], **new_attrs)
+
+
+def _mx_multibox_detection(inputs, attrs):
+    new_attrs0 = {}
+    new_attrs0["clip"] = attrs.get_bool("clip", True)
+    new_attrs0["threshold"] = attrs.get_float("threshold", 0.01)
+    new_attrs0["variances"] = attrs.get_float_tuple("variances", (0.1, 0.1,
+                                                                  0.2, 0.2))
+
+    new_attrs1 = {}
+    new_attrs1["overlap_threshold"] = attrs.get_float("nms_threshold", 0.5)
+    new_attrs1["force_suppress"] = attrs.get_bool("force_suppress", False)
+    new_attrs1["topk"] = attrs.get_int("nms_topk", -1)
+
+    ret = _op.vision.multibox_transform_loc(inputs[0], inputs[1],
+                                            inputs[2], **new_attrs0)
+    return _op.vision.nms(ret[0], ret[1], **new_attrs1)
+
+
+# Note: due to attribute conversion constraint
+# ops in the identity set must be attribute free
+_identity_list = [
+    "log",
+    "exp",
+    "sigmoid",
+    "tanh",
+    "exp",
+    "negative",
+    "reshape_like",
+    "slice_like",
+    "zeros_like",
+    "ones_like",
+]
+
+_convert_map = {
+    "_copy"         : _rename(_op.copy),
+    "relu"          : _rename(_op.nn.relu),
+    "broadcast_add" : _rename(_op.add),
+    "broadcast_sub" : _rename(_op.subtract),
+    "broadcast_mul" : _rename(_op.multiply),
+    "broadcast_div" : _rename(_op.divide),
+    "elemwise_add"  : _rename(_op.add),
+    "elemwise_sub"  : _rename(_op.subtract),
+    "elemwise_mul"  : _rename(_op.multiply),
+    "elemwise_div"  : _rename(_op.divide),
+    "flatten"       : _rename(_op.nn.batch_flatten),
+    "Flatten"       : _rename(_op.nn.batch_flatten),
+    "_plus_scalar"  : _binop_scalar(_op.add),
+    "__add_scalar__": _binop_scalar(_op.add),
+    "__sub_scalar__": _binop_scalar(_op.subtract),
+    "_minus_scalar" : _binop_scalar(_op.subtract),
+    "__mul_scalar__": _binop_scalar(_op.multiply),
+    "_mul_scalar"   : _binop_scalar(_op.multiply),
+    "__div_scalar__": _binop_scalar(_op.divide),
+    "_div_scalar"   : _binop_scalar(_op.divide),
+    "__pow_scalar__": _binop_scalar(_op.power),
+    "_rminus_scalar": _rbinop_scalar(_op.subtract),
+    "__rsub_scalar__": _rbinop_scalar(_op.subtract),
+    "_rdiv_scalar"  : _rbinop_scalar(_op.divide),
+    "__rdiv_scalar__"  : _rbinop_scalar(_op.divide),
+    "__rpow_scalar__": _rbinop_scalar(_op.power),
+    # reduction ops
+    "max"           : _reduce(_op.max),
+    "min"           : _reduce(_op.min),
+    "sum"           : _reduce(_op.sum),
+    "max_axis"      : _reduce(_op.max),
+    "min_axis"      : _reduce(_op.min),
+    "sum_axis"      : _reduce(_op.sum),
+    "argmax"        : _arg_reduce(_op.argmax),
+    "argmin"        : _arg_reduce(_op.argmin),
+    # init ops
+    "_ones"         : _init_op(_op.ones),
+    "_zeros"        : _init_op(_op.zeros),
+    # softmax
+    "softmax"       : _softmax_op(_op.nn.softmax),
+    "log_softmax"   : _softmax_op(_op.nn.log_softmax),
+    "Softmax"       : _softmax_op(_op.nn.softmax),
+    # per op specialization
+    "Reshape"       : _reshape,
+    "reshape"       : _reshape,
+    "Cast"          : _cast,
+    "clip"          : _clip,
+    "transpose"     : _transpose,
+    "UpSampling"    : _upsampling,
+    "add_n"         : _elemwise_sum,
+    # MXNet specific implementations
+    "FullyConnected": _mx_fully_connected,
+    "Activation"    : _mx_activations,
+    "Convolution"   : _mx_conv2d,
+    "Convolution_v1": _mx_conv2d,
+    "Deconvolution" : _mx_conv2d_transpose,
+    "Pooling"       : _mx_pooling,
+    "Pooling_v1"    : _mx_pooling,
+    "Dropout"       : _mx_dropout,
+    "BatchNorm"     : _mx_batch_norm,
+    "BatchNorm_v1"  : _mx_batch_norm,
+    "LRN"           : _mx_lrn,
+    "SliceChannel"  : _mx_split,
+    "split"         : _mx_split,
+    "expand_dims"   : _mx_expand_dims,
+    "Concat"        : _mx_concat,
+    "concat"        : _mx_concat,
+    "LeakyReLU"     : _mx_leaky_relu,
+    "SoftmaxOutput" : _mx_softmax_output,
+    "SoftmaxActivation" : _mx_softmax_activation,
+    # vision
+    "_contrib_MultiBoxPrior" : _mx_multibox_prior,
+    "_contrib_MultiBoxDetection" : _mx_multibox_detection,
+    # List of missing operators that are present in NNVMv1
+    # TODO(tvm-tvm): support all operators.
+    #
+    # "broadcast_to",
+    # "gather_nd",
+    # "Crop"          : _crop_like,
+
+}
+
+# set identity list
+_convert_map.update({k : _rename(k) for k in _identity_list})
+
+
+def _from_mxnet_impl(symbol, shape_dict, dtype_info):
+    """Convert mxnet symbol to compatible relay Function.
+
+    Reconstruct a relay Function by traversing the mxnet symbol.
+
+    Parameters
+    ----------
+    symbol : mxnet.sym.Symbol
+        Incompatible symbol from mxnet.
+        The op_name and attrs inside are not always compatible.
+
+    shape_dict : dict
+        Known parameter shapes
+
+    dtype_info : dict or str.
+        Known parameter dtypes
+
+    Returns:
+    -------
+    func : tvm.relay.Function
+        Converted relay Function
+    """
+    assert symbol is not None
+    jgraph = json.loads(symbol.tojson())
+    jnodes = jgraph["nodes"]
+    node_map = {}
+
+    for nid, node in enumerate(jnodes):
+        children = [node_map[e[0]][e[1]] for e in node["inputs"]]
+        attrs = StrAttrsDict(node.get("attrs", {}))
+        node_name = node["name"]
+        op_name = node["op"]
+        if op_name == "null":
+            shape = shape_dict[node_name] if node_name in shape_dict else None
+            if isinstance(dtype_info, dict):
+                dtype = dtype_info[node_name] if node_name in dtype_info else "float32"
+            else:
+                dtype = dtype_info
+            node_map[nid] = [_expr.var(node_name, shape=shape, dtype=dtype)]
+        elif op_name in _convert_map:
+            res = _convert_map[op_name](children, attrs)
+            if isinstance(res, (_expr.TupleWrapper, tuple, list)):
+                pass
+            elif isinstance(res, _expr.Expr):
+                res = [res]
+            else:
+                raise RuntimeError("unexpected type %s" % type(res))
+            node_map[nid] = res
+        else:
+            raise RuntimeError("{} is not supported in relay frontend".format(op_name))
+
+    outputs = [node_map[e[0]][e[1]] for e in jgraph["heads"]]
+    outputs = outputs[0] if len(outputs) == 1 else _expr.Tuple(outputs)
+    func = _expr.Function(ir_pass.free_vars(outputs), outputs)
+    return func
+
+
+def _update_shape_dtype(shape, dtype, params):
+    """Update shape dtype given params information"""
+    shape = {} if shape is None else shape
+    if not params:
+        return shape, dtype
+    shape = shape.copy()
+    shape.update({k : v.shape for k, v in params.items()})
+    if isinstance(dtype, str):
+        for k, v in params.items():
+            if v.dtype != dtype:
+                raise ValueError(
+                    "%s: dtype not expected %s vs %s" % (k, dtype, v.dtype))
+    else:
+        dtype = dtype.copy()
+        dtype.update({k : str(v.dtype) for k, v in params.items()})
+    return shape, dtype
+
+
+def from_mxnet(symbol,
+               shape=None,
+               dtype="float32",
+               arg_params=None,
+               aux_params=None):
+    """Convert from MXNet"s model into compatible relay Function.
+
+    Parameters
+    ----------
+    symbol : mxnet.Symbol or mxnet.gluon.HybridBlock
+        MXNet symbol.
+
+    shape : dict of str to tuple, optional
+        The input shape to the graph
+
+    dtype : str or dict of str to str
+        The input types to the graph
+
+    arg_params : dict of str to mx.NDArray
+        The argument parameters in mxnet
+
+    aux_params : dict of str to mx.NDArray
+        The auxiliary parameters in mxnet
+
+    Returns
+    -------
+    sym : tvm.relay.Function
+        Compatible relay Function
+
+    params : dict of str to tvm.NDArray
+        The parameter dict to be used by nnvm
+    """
+    try:
+        import mxnet as mx
+    except ImportError as e:
+        raise ImportError("{}. MXNet is required to parse symbols.".format(e))
+
+    if isinstance(symbol, mx.sym.Symbol):
+        params = {}
+        arg_params = arg_params if arg_params else {}
+        aux_params = aux_params if aux_params else {}
+        for k, v in arg_params.items():
+            params[k] = _nd.array(v.asnumpy())
+        for k, v in aux_params.items():
+            params[k] = _nd.array(v.asnumpy())
+        shape, dtype = _update_shape_dtype(shape, dtype, params)
+        sym = _from_mxnet_impl(symbol, shape, dtype)
+    elif isinstance(symbol, mx.gluon.HybridBlock):
+        if arg_params is not None or aux_params is not None:
+            raise ValueError("arg_params and aux_params ae not used when importing HybridBlock")
+        params = {}
+        for k, v in symbol.collect_params().items():
+            params[k] = _nd.array(v.data().asnumpy())
+        data = mx.sym.Variable("data")
+        sym = symbol(data)
+        shape, dtype = _update_shape_dtype(shape, dtype, params)
+        sym = _from_mxnet_impl(sym, shape, dtype)
+    elif isinstance(symbol, mx.gluon.Block):
+        raise NotImplementedError("Only Hybrid Blocks are supported now.")
+    else:
+        msg = "mxnet.Symbol or gluon.HybridBlock expected, got {}".format(type(symbol))
+        raise ValueError(msg)
+    return sym, params
diff --git a/python/tvm/relay/frontend/nnvm_common.py b/python/tvm/relay/frontend/nnvm_common.py
new file mode 100644
index 000000000000..17502dbaa090
--- /dev/null
+++ b/python/tvm/relay/frontend/nnvm_common.py
@@ -0,0 +1,132 @@
+# pylint: disable=invalid-name, import-self, len-as-condition
+"""Utility functions common to NNVM and MxNet conversion."""
+from __future__ import absolute_import as _abs
+
+from .. import expr as _expr
+from .. import op as _op
+
+def _get_relay_op(op_name):
+    op = _op
+    for path in op_name.split("."):
+        op = getattr(op, path)
+    if not op:
+        raise RuntimeError("Unable to map op_name {} to relay".format(op_name))
+    return op
+
+
+def _warn_not_used(attr, op='nnvm'):
+    import warnings
+    err = "{} is ignored in {}.".format(attr, op)
+    warnings.warn(err)
+
+
+def _rename(new_op):
+    if isinstance(new_op, str):
+        new_op = _get_relay_op(new_op)
+    # attrs are ignored.
+    def impl(inputs, _, _dtype='float32'):
+        return new_op(*inputs)
+    return impl
+
+
+def _reshape(inputs, attrs):
+    if attrs.get_bool("reverse", False):
+        raise RuntimeError("reshape do not support option reverse")
+    shape = attrs.get_int_tuple("shape")
+    return _op.reshape(inputs[0], newshape=shape)
+
+
+def _init_op(new_op):
+    """Init ops like zeros/ones"""
+    def _impl(inputs, attrs):
+        assert len(inputs) == 0
+        shape = attrs.get_int_tuple("shape")
+        dtype = attrs.get_str("dtype", "float32")
+        return new_op(shape=shape, dtype=dtype)
+    return _impl
+
+
+def _softmax_op(new_op):
+    """softmax/log_softmax"""
+    def _impl(inputs, attrs):
+        assert len(inputs) == 1
+        axis = attrs.get_int("axis", -1)
+        return new_op(inputs[0], axis=axis)
+    return _impl
+
+
+def _reduce(new_op):
+    """Reduction ops like sum/min/max"""
+    def _impl(inputs, attrs):
+        assert len(inputs) == 1
+        axis = attrs.get_int_tuple("axis", [])
+        keepdims = attrs.get_bool("keepdims", False)
+        # use None for reduce over all axis.
+        axis = None if len(axis) == 0 else axis
+        return new_op(inputs[0], axis=axis, keepdims=keepdims)
+    return _impl
+
+
+def _arg_reduce(new_op):
+    """Arg Reduction ops like argmin/argmax"""
+    def _impl(inputs, attrs):
+        assert len(inputs) == 1
+        axis = attrs.get_int("axis", None)
+        keepdims = attrs.get_bool("keepdims", False)
+        res = new_op(inputs[0], axis=[axis], keepdims=keepdims)
+        # cast to dtype.
+        res = res.astype("float32")
+        return res
+    return _impl
+
+
+def _cast(inputs, attrs):
+    """Type cast"""
+    dtype = attrs.get_str("dtype")
+    return inputs[0].astype(dtype=dtype)
+
+
+def _clip(inputs, attrs):
+    a_min = attrs.get_float("a_min")
+    a_max = attrs.get_float("a_max")
+    return _op.clip(inputs[0], a_min=a_min, a_max=a_max)
+
+
+def _transpose(inputs, attrs):
+    axes = attrs.get_int_tuple("axes", None)
+    # translate default case
+    axes = None if len(axes) == 0 else axes
+    return _op.transpose(inputs[0], axes=axes)
+
+
+def _upsampling(inputs, attrs):
+    scale = attrs.get_int("scale")
+    return _op.nn.upsampling(inputs[0], scale=scale)
+
+
+def _elemwise_sum(inputs, _):
+    assert len(inputs) > 0
+    res = inputs[0]
+    for x in inputs[1:]:
+        res = _op.add(res, x)
+    return res
+
+
+def _binop_scalar(new_op):
+    def _impl(inputs, attrs):
+        assert len(inputs) == 1
+        scalar = attrs.get_float("scalar")
+        # Note: binary scalar only works for float op for now
+        scalar = _expr.const(scalar, dtype="float32")
+        return new_op(inputs[0], scalar)
+    return _impl
+
+
+def _rbinop_scalar(new_op):
+    def _impl(inputs, attrs):
+        assert len(inputs) == 1
+        scalar = attrs.get_float("scalar")
+        # Note: binary scalar only works for float op for now
+        scalar = _expr.const(scalar, dtype="float32")
+        return new_op(scalar, inputs[0])
+    return _impl
diff --git a/python/tvm/relay/grammar/.gitignore b/python/tvm/relay/grammar/.gitignore
new file mode 100644
index 000000000000..cffe35e1a41a
--- /dev/null
+++ b/python/tvm/relay/grammar/.gitignore
@@ -0,0 +1 @@
+/.antlr/
diff --git a/python/tvm/relay/grammar/Relay.g4 b/python/tvm/relay/grammar/Relay.g4
new file mode 100644
index 000000000000..c74a42c97e77
--- /dev/null
+++ b/python/tvm/relay/grammar/Relay.g4
@@ -0,0 +1,146 @@
+grammar Relay;
+
+// Lexing
+// comments
+WS : [ \t\n\r]+ -> skip ;
+LINE_COMMENT : '//' .*? '\n' -> skip ;
+COMMENT : '/*' .*? '*/' -> skip ;
+
+// operators
+MUL: '*' ;
+DIV: '/' ;
+ADD: '+' ;
+SUB: '-' ;
+LT: '<' ;
+GT: '>' ;
+LE: '<=' ;
+GE: '>=' ;
+EQ: '==' ;
+NE: '!=' ;
+
+opIdent: CNAME ;
+GLOBAL_VAR: '@' CNAME ;
+LOCAL_VAR: '%' CNAME ;
+
+MUT: 'mut' ;
+
+BOOL_LIT
+  : 'True'
+  | 'False'
+  ;
+
+// non-negative floats
+FLOAT
+  : INT '.' INT EXP? // 1.35, 1.35E-9, 0.3, 4.5
+  | INT EXP // 1e10 3e4
+  ;
+
+// non-negative ints
+INT: DIGIT+ ;
+fragment EXP: [eE] [+\-]? INT ; // \- since - means "range" inside [...]
+
+CNAME: ('_'|LETTER) ('_'|LETTER|DIGIT)* ;
+fragment LETTER: [a-zA-Z] ;
+fragment DIGIT: [0-9] ;
+
+// Parsing
+
+// A Relay program is a list of global definitions or an expression.
+prog: (defn* | expr) EOF ;
+
+// option: 'set' ident BOOL_LIT ;
+
+expr
+  // operators
+  : '(' expr ')'                              # parens
+  | '-' expr                                  # neg
+  | expr op=('*'|'/') expr                    # binOp
+  | expr op=('+'|'-') expr                    # binOp
+  | expr op=('<'|'>'|'<='|'>=') expr          # binOp
+  | expr op=('=='|'!=') expr                  # binOp
+
+  // function definition and application
+  | expr '(' (expr (',' expr)*)? ')'          # call
+  | func                                      # funcExpr
+
+  // tuples and tensors
+  | '(' ')'                                   # tuple
+  | '(' expr ',' ')'                          # tuple
+  | '(' expr (',' expr)+ ')'                  # tuple
+  | '[' (expr (',' expr)*)? ']'               # tensor
+
+  | 'if' '(' expr ')' body 'else' body        # ifElse
+
+  // sequencing
+  | 'let' MUT? var '=' expr ';' expr          # seq
+  | 'let' MUT? var '=' '{' expr '}' ';' expr  # seq
+  // sugar for let %_ = expr; expr
+  | expr ';' expr                             # seq
+
+  // mutable update
+  // | ident '=' expr                            # writeRef
+  // | expr '^'                                  # readRef
+
+  | ident                                     # identExpr
+  | scalar                                    # scalarExpr
+  // | expr '.' INT                              # project
+  // | 'debug'                                   # debug
+  ;
+
+func: 'fn'        varList ('->' type_)? body ;
+defn: 'def' ident varList ('->' type_)? body ;
+
+varList: '(' (var (',' var)*)? ')' ;
+var: ident (':' type_)? ;
+
+// TODO(@jmp): for improved type annotations
+// returnAnno: (ident ':')? type_ ;
+
+// relations: 'where' relation (',' relation)* ;
+// relation: ident '(' (type_ (',' type_)*)? ')' ;
+
+type_
+  : '(' ')'                                         # tupleType
+  | '(' type_ ',' ')'                               # tupleType
+  | '(' type_ (',' type_)+ ')'                      # tupleType
+  | identType                                       # identTypeType
+  | 'Tensor' '[' shapeSeq ',' type_ ']'             # tensorType
+  // currently unused
+  // | identType '[' (type_ (',' type_)*)? ']'         # callType
+  | 'fn' '(' (type_ (',' type_)*)? ')' '->' type_   # funcType
+  | '_'                                             # incompleteType
+  | INT                                             # intType
+  ;
+
+shapeSeq
+  : '(' ')'
+  | '(' shape ',' ')'
+  | '(' shape (',' shape)+ ')'
+  ;
+
+shape
+  : '(' shape ')'                   # parensShape
+  // | type_ op=('*'|'/') type_        # binOpType
+  // | type_ op=('+'|'-') type_        # binOpType
+  | INT                             # intShape
+  ;
+
+identType: CNAME ;
+// Int8, Int16, Int32, Int64
+// UInt8, UInt16, UInt32, UInt64
+// Float16, Float32, Float64
+// Bool
+
+body: '{' expr '}' ;
+
+scalar
+  : FLOAT    # scalarFloat
+  | INT      # scalarInt
+  | BOOL_LIT # scalarBool
+  ;
+
+ident
+  : opIdent
+  | GLOBAL_VAR
+  | LOCAL_VAR
+  ;
diff --git a/python/tvm/relay/grammar/__init__.py b/python/tvm/relay/grammar/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/python/tvm/relay/grammar/py2/.gitignore b/python/tvm/relay/grammar/py2/.gitignore
new file mode 100644
index 000000000000..d677ff551940
--- /dev/null
+++ b/python/tvm/relay/grammar/py2/.gitignore
@@ -0,0 +1 @@
+Relay*
diff --git a/python/tvm/relay/grammar/py2/__init__.py b/python/tvm/relay/grammar/py2/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/python/tvm/relay/grammar/py3/.gitignore b/python/tvm/relay/grammar/py3/.gitignore
new file mode 100644
index 000000000000..d677ff551940
--- /dev/null
+++ b/python/tvm/relay/grammar/py3/.gitignore
@@ -0,0 +1 @@
+Relay*
diff --git a/python/tvm/relay/grammar/py3/__init__.py b/python/tvm/relay/grammar/py3/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/python/tvm/relay/image.py b/python/tvm/relay/image.py
new file mode 100644
index 000000000000..90bb87d71c2e
--- /dev/null
+++ b/python/tvm/relay/image.py
@@ -0,0 +1,4 @@
+# pylint: disable=wildcard-import, unused-import, unused-wildcard-import
+"""Image network related operators."""
+# Re-export in a specific file name so that autodoc can pick it up
+from .op.image import *
diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py
new file mode 100644
index 000000000000..53fa59cd053d
--- /dev/null
+++ b/python/tvm/relay/ir_pass.py
@@ -0,0 +1,359 @@
+# pylint: disable=no-else-return
+# pylint: disable=unidiomatic-typecheck
+"""The set of passes for Relay.
+
+Exposes an interface for configuring the passes and
+scripting them in Python.
+"""
+from . import _ir_pass
+from . import _make
+from .expr import Expr
+from .ty import Type
+
+def post_order_visit(expr, fvisit):
+    """Recursively visit the ir in post DFS order node,
+    apply fvisit. Each node is guaranteed to be visited
+    only once.
+
+    Parameters
+    ----------
+    expr : tvm.relay.Expr
+        The input expression.
+    fvisit : function
+        The visitor function to be applied.
+    """
+    return _ir_pass.post_order_visit(expr, fvisit)
+
+def infer_type(expr, mod=None):
+    """Infer the type of expr under the context of mod.
+
+    Parameters
+    ----------
+    expr: tvm.relay.Expr
+        The input expression.
+
+    mod: Optional[tvm.relay.Module]
+        The global module.
+
+
+    Returns
+    -------
+    checked_expr : tvm.relay.Expr
+        The checked expression.
+    """
+    return _ir_pass.infer_type(expr, mod)
+
+
+def backward_fold_scale_axis(expr):
+    """Backward fold axis scaling into weights of conv2d/dense.
+
+    Parameters
+    ----------
+    expr : tvm.relay.Expr
+        The input expression, we expect that expr's types
+        should be fully inferred by infer_type.
+
+    Returns
+    -------
+    folded_expr : tvm.relay.Expr
+        The folded expression after transformation.
+
+    Note
+    ----
+    It is recommended to call backward_fold_scale_axis
+    before using forward_fold_scale_axis.
+    As backward folding targets common conv-bn pattern.
+    """
+    return _ir_pass.backward_fold_scale_axis(expr)
+
+
+def forward_fold_scale_axis(expr):
+    """Fold the scaling of axis into weights of conv2d/dense.
+
+    Parameters
+    ----------
+    expr : tvm.relay.Expr
+        The input expression, we expect that expr's types
+        should be fully inferred by infer_type.
+
+    Returns
+    -------
+    folded_expr : tvm.relay.Expr
+        The folded expression after transformation.
+
+    Note
+    ----
+    It is recommended to call backward_fold_scale_axis
+    before using forward_fold_scale_axis.
+    As backward folding targets common conv-bn pattern.
+    """
+    return _ir_pass.forward_fold_scale_axis(expr)
+
+
+def well_formed(expr):
+    """Check that each Var is only bound once (well formed).
+
+    Parameters
+    ----------
+    expr: tvm.relay.Expr
+        The input expression
+
+    Returns
+    -------
+    well_form : bool
+        Whether the input expression is well formed
+    """
+    return _ir_pass.well_formed(expr)
+
+
+def check_kind(t, mod=None):
+    """Check that the type is well kinded.
+    For example, this mean type cannot has tensor of tensor, or is a tuple type of 2 shapes.
+
+    Parameters
+    ----------
+    t: tvm.relay.Type
+        The type to check
+
+    mod: tvm.relay.Module, optional
+        The global module
+
+    Returns
+    -------
+    well_kinded : bool
+        whether the input type is well kinded.
+
+    Examples
+    --------
+    .. code:: python
+
+        assert not check_kind(relay.TupleType([relay.TypeParam('tp1', relay.Kind.Shape)]))
+        assert check_kind(relay.TupleType([relay.TypeParam('tp1', relay.Kind.Type)]))
+    """
+    if mod is not None:
+        return _ir_pass.check_kind(t, mod)
+    else:
+        return _ir_pass.check_kind(t)
+
+
+def free_vars(expr):
+    """Get free Vars from expression expr in Post DFS order.
+
+    Parameters
+    ----------
+    expr: tvm.relay.Expr
+        The input expression
+
+    Returns
+    -------
+    free : List[tvm.relay.Var]
+        The list of free variables in post DFS order.
+
+    Note
+    ----
+    The fact that Vars are post-DFS ordred are useful in
+    neural networks: usually this means weights of previous
+    are ordered first.
+    """
+    return _ir_pass.free_vars(expr)
+
+
+def free_type_vars(expr):
+    """Get free type variables from expression/type e
+
+    Parameters
+    ----------
+    expr: Union[tvm.relay.Expr,tvm.relay.Type]
+        The input expression/type
+
+    Returns
+    -------
+    free : List[tvm.relay.TypeParam]
+        The list of free type variables
+    """
+    return _ir_pass.free_type_vars(expr)
+
+
+def simplify_inference(expr):
+    """ Simplify the data-flow graph for inference phase.
+
+    Parameters
+    ----------
+    e: tvm.relay.Expr
+        The input Expression
+
+    Returns
+    -------
+    result: tvm.relay.Expr
+        An expression which is semantically equal to the input expression,
+        but with some simplification
+    """
+    return _ir_pass.simplify_inference(expr)
+
+
+def canonicalize_ops(expr):
+    """ Canonicalize special operators to basic operators.
+    This can simplify latter analysis. (e.g. Expand bias_add to expand_dims and broadcast_add.)
+
+    Parameters
+    ----------
+    e: tvm.relay.Expr
+        The input Expression
+
+    Returns
+    -------
+    result: tvm.relay.Expr
+        An expression without bias_add
+    """
+    return _ir_pass.canonicalize_ops(expr)
+
+
+def dead_code_elimination(expr):
+    """ Remove expressions which does not effect the program result (dead code).
+
+    Parameters
+    ----------
+    e: tvm.relay.Expr
+        The input Expression
+
+    Returns
+    -------
+    result: tvm.relay.Expr
+        An expression which is semantically equal to the input expression,
+        but with dead code removed.
+    """
+    return _ir_pass.dead_code_elimination(expr)
+
+
+def alpha_equal(lhs, rhs):
+    """Compare two Relay expr for structural equivalence (alpha equivalence).
+
+    Parameters
+    ----------
+    lhs: tvm.relay.Expr
+        One of the input Expression.
+
+    rhs: tvm.relay.Expr
+        One of the input Expression.
+
+    Returns
+    -------
+    result: bool
+        True iff lhs is alpha equal to rhs.
+    """
+    return bool(_make._alpha_equal(lhs, rhs))
+
+
+def graph_equal(lhs, rhs):
+    """Compare two Relay expr for data-flow equivalence.
+    The difference between this and alpha-equality is that
+    variables are not expected to match between lhs and rhs;
+    they are treated as sources and are mapped between each other.
+
+    Parameters
+    ----------
+    lhs: tvm.relay.Expr
+      One of the input Expression.
+
+    rhs: tvm.relay.Expr
+      One of the input Expression.
+
+    Returns
+    -------
+    result: bool
+      True iff lhs is data-flow equivalent to rhs.
+    """
+    return bool(_make._graph_equal(lhs, rhs))
+
+
+def structural_hash(value):
+    """Hash a Relay expression structurally.
+
+    Parameters
+    ----------
+    expr: tvm.relay.Expr or tvm.relay.Type
+      The expression to hash.
+
+    Returns
+    -------
+    result: int
+      The hash value
+    """
+    if isinstance(value, Expr):
+        return int(_ir_pass._expr_hash(value))
+    elif isinstance(value, Type):
+        return int(_ir_pass._type_hash(value))
+    else:
+        msg = ("found value of type {0} expected" +
+               "relay.Expr or relay.Type").format(type(value))
+        raise TypeError(msg)
+
+
+def fold_constant(expr):
+    """Fold the constant expression in expr.
+
+    Parameters
+    ----------
+    expr : tvm.relay.Expr
+        The input expression.
+
+    Returns
+    -------
+    transformed_expr : tvm.relay.Expr
+        The transformed expression.
+    """
+    return _ir_pass.FoldConstant(expr)
+
+
+def fuse_ops(expr, opt_level=1):
+    """Fuse operators in expr together.
+
+    Parameters
+    ----------
+    expr : tvm.relay.Expr
+        The input expression.
+
+    opt_level : int
+        The level of fuse optimization.
+
+    Returns
+    -------
+    transformed_expr : tvm.relay.Expr
+        Transformed expression, containing fused result.
+    """
+    return _ir_pass.FuseOps(expr, opt_level)
+
+
+def combine_parallel_conv2d(expr):
+    """Fold multiple conv2d into one.
+
+    Parameters
+    ----------
+    expr : tvm.relay.Expr
+        The input expression.
+
+    Returns
+    -------
+    transformed_expr : tvm.relay.Expr
+        Transformed expression
+    """
+    return _ir_pass.CombineParallelConv2D(expr)
+
+
+def alter_op_layout(expr):
+    """Alternate the layouts of operators or replace primitive operators with
+    other expressions.
+    This pass can be used for computing convolution in custom layouts or
+    other general weight pre-transformation.
+
+    Parameters
+    ----------
+    expr : tvm.relay.Expr
+        The input expression.
+
+    Returns
+    -------
+    transformed_expr : tvm.relay.Expr
+        Transformed expression with alternated layout.
+    """
+    return _ir_pass.AlterOpLayout(expr)
diff --git a/python/tvm/relay/module.py b/python/tvm/relay/module.py
new file mode 100644
index 000000000000..024c6baf7012
--- /dev/null
+++ b/python/tvm/relay/module.py
@@ -0,0 +1,102 @@
+# pylint: disable=no-else-return, unidiomatic-typecheck, undefined-variable, wildcard-import
+"""A global module storing everything needed to interpret or compile a Relay program."""
+from .base import register_relay_node, RelayNode
+from .._ffi import base as _base
+from . import _make
+from . import _module
+from . import expr as _expr
+
+
+@register_relay_node
+class Module(RelayNode):
+    """The global Relay module containing collection of functions.
+
+    Each global function is identified by an unique tvm.relay.GlobalVar.
+    tvm.relay.GlobalVar and Module is necessary in order to enable
+    recursions in function to avoid cyclic reference in the function.x
+
+    Parameters
+    ----------
+    functions : dict, optional.
+        Map of global var to Function
+    """
+    def __init__(self, functions=None):
+        if functions is None:
+            functions = {}
+        elif isinstance(functions, dict):
+            mapped_funcs = {}
+            for k, v in functions.items():
+                if isinstance(k, _base.string_types):
+                    k = _expr.GlobalVar(k)
+                if not isinstance(k, _expr.GlobalVar):
+                    raise TypeError("Expect functions to be Dict[GlobalVar, Function]")
+                mapped_funcs[k] = v
+            functions = mapped_funcs
+        self.__init_handle_by_constructor__(_make.Module, functions)
+
+    def __setitem__(self, var, func):
+        """Add a function to the module.
+
+        Parameters
+        ---------
+        var: GlobalVar
+            The global variable which names the function.
+
+        func: Function
+            The function.
+        """
+        return self._add(var, func)
+
+    def _add(self, var, func, update=False):
+        if isinstance(var, _base.string_types):
+            var = _expr.GlobalVar(var)
+        return _module.Module_Add(self, var, func, update)
+
+    def __getitem__(self, var):
+        """Lookup a global function by name or by variable.
+
+        Parameters
+        ----------
+        var: str or GlobalVar
+            The name or global variable.
+
+        Returns
+        -------
+            func: Function
+                The function referenced by :code:`var`.
+        """
+        if isinstance(var, _base.string_types):
+            return _module.Module_Lookup_str(self, var)
+        else:
+            return _module.Module_Lookup(self, var)
+
+    def update(self, other):
+        """Insert functions in another Module to current one.
+
+        Parameters
+        ----------
+        other: Module
+            The module to merge into the current Module.
+        """
+        if isinstance(other, dict):
+            other = Module(other)
+        return _module.Module_Update(self, other)
+
+    def get_global_var(self, name):
+        """Get a global variable in the function by name.
+
+        Parameters
+        ----------
+        name: str
+            The name of the global variable.
+
+        Returns
+        -------
+        global_var: GlobalVar
+            The global variable mapped to :code:`name`.
+
+        Raises
+        ------
+        tvm.TVMError if we cannot find corresponding global var.
+        """
+        return _module.Module_GetGlobalVar(self, name)
diff --git a/python/tvm/relay/nn.py b/python/tvm/relay/nn.py
new file mode 100644
index 000000000000..6f45aea8b544
--- /dev/null
+++ b/python/tvm/relay/nn.py
@@ -0,0 +1,4 @@
+# pylint: disable=wildcard-import, unused-import, unused-wildcard-import
+"""Neural network related operators."""
+# Re-export in a specific file name so that autodoc can pick it up
+from .op.nn import *
diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py
new file mode 100644
index 000000000000..63baa5128bb9
--- /dev/null
+++ b/python/tvm/relay/op/__init__.py
@@ -0,0 +1,31 @@
+#pylint: disable=wildcard-import, redefined-builtin
+"""Relay core operators."""
+# operator defs
+from .op import get, register, register_schedule, register_compute, register_alter_op_layout, \
+    Op
+from .op import debug
+
+# Operators
+from .reduce import *
+from .tensor import *
+from .transform import *
+from . import nn
+from . import image
+from . import vision
+from . import op_attrs
+
+
+# operator registry
+from . import _tensor
+from . import _transform
+from . import _reduce
+from ..expr import Expr
+from ..base import register_relay_node
+
+
+def _register_op_make():
+    from . import _make
+    from .. import expr
+    expr._op_make = _make
+
+_register_op_make()
diff --git a/python/tvm/relay/op/_make.py b/python/tvm/relay/op/_make.py
new file mode 100644
index 000000000000..79c86cbb0254
--- /dev/null
+++ b/python/tvm/relay/op/_make.py
@@ -0,0 +1,4 @@
+"""Constructor APIs"""
+from ..._ffi.function import _init_api
+
+_init_api("relay.op._make", __name__)
diff --git a/python/tvm/relay/op/_reduce.py b/python/tvm/relay/op/_reduce.py
new file mode 100644
index 000000000000..5c720256bbd6
--- /dev/null
+++ b/python/tvm/relay/op/_reduce.py
@@ -0,0 +1,20 @@
+"""Backend compiler related feature registration"""
+from __future__ import absolute_import
+
+import topi
+from . import op as _reg
+
+
+def _schedule_reduce(_, outs, target):
+    """Generic schedule for reduce"""
+    with target:
+        return topi.generic.schedule_reduce(outs)
+
+
+_reg.register_schedule("argmax", _schedule_reduce)
+_reg.register_schedule("argmin", _schedule_reduce)
+_reg.register_schedule("sum", _schedule_reduce)
+_reg.register_schedule("max", _schedule_reduce)
+_reg.register_schedule("min", _schedule_reduce)
+_reg.register_schedule("prod", _schedule_reduce)
+_reg.register_schedule("mean", _schedule_reduce)
diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py
new file mode 100644
index 000000000000..d1035ee047e5
--- /dev/null
+++ b/python/tvm/relay/op/_tensor.py
@@ -0,0 +1,82 @@
+#pylint: disable=invalid-name, unused-argument
+"""Backend compiler related feature registration"""
+from __future__ import absolute_import
+import topi
+from .op import register_compute, register_schedule, register_pattern
+from .op import schedule_injective, OpPattern
+
+
+schedule_broadcast = schedule_injective
+schedule_elemwise = schedule_injective
+
+register_schedule("log", schedule_broadcast)
+register_schedule("exp", schedule_broadcast)
+register_schedule("sqrt", schedule_broadcast)
+register_schedule("sigmoid", schedule_broadcast)
+register_schedule("floor", schedule_broadcast)
+register_schedule("ceil", schedule_broadcast)
+register_schedule("trunc", schedule_broadcast)
+register_schedule("round", schedule_broadcast)
+register_schedule("abs", schedule_broadcast)
+register_schedule("tanh", schedule_broadcast)
+register_schedule("negative", schedule_broadcast)
+register_schedule("copy", schedule_broadcast)
+
+register_schedule("add", schedule_broadcast)
+register_schedule("subtract", schedule_broadcast)
+register_schedule("multiply", schedule_broadcast)
+register_schedule("divide", schedule_broadcast)
+register_schedule("power", schedule_injective)
+register_schedule("mod", schedule_broadcast)
+register_schedule("equal", schedule_broadcast)
+register_schedule("not_equal", schedule_broadcast)
+register_schedule("less", schedule_broadcast)
+register_schedule("less_equal", schedule_broadcast)
+register_schedule("greater", schedule_broadcast)
+register_schedule("greater_equal", schedule_broadcast)
+register_schedule("maximum", schedule_injective)
+register_schedule("minimum", schedule_injective)
+register_schedule("right_shift", schedule_injective)
+register_schedule("left_shift", schedule_injective)
+
+# zeros
+@register_compute("zeros")
+def zeros_compute(attrs, inputs, output_type, target):
+    assert not inputs
+    return [topi.full(output_type.shape, output_type.dtype, 0.0)]
+
+register_schedule("zeros", schedule_broadcast)
+register_pattern("zeros", OpPattern.ELEMWISE)
+
+# zeros_like
+@register_compute("zeros_like")
+def zeros_like_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 1
+    return [topi.full_like(inputs[0], 0.0)]
+
+register_schedule("zeros_like", schedule_broadcast)
+
+# ones
+@register_compute("ones")
+def ones_compute(attrs, inputs, output_type, target):
+    assert not inputs
+    return [topi.full(output_type.shape, output_type.dtype, 1.0)]
+
+register_schedule("ones", schedule_broadcast)
+register_pattern("ones", OpPattern.ELEMWISE)
+
+# ones_like
+@register_compute("ones_like")
+def ones_like(attrs, inputs, output_type, target):
+    assert len(inputs) == 1
+    return [topi.full_like(inputs[0], 1.0)]
+
+register_schedule("ones_like", schedule_broadcast)
+
+# clip
+@register_compute("clip")
+def clip_compute(attrs, inputs, output_type, target):
+    assert len(inputs) == 1
+    return [topi.clip(inputs[0], attrs.a_min, attrs.a_max)]
+
+register_schedule("clip", schedule_elemwise)
diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py
new file mode 100644
index 000000000000..085a8ceed5d1
--- /dev/null
+++ b/python/tvm/relay/op/_transform.py
@@ -0,0 +1,40 @@
+"""Backend compiler related feature registration"""
+# pylint: disable=invalid-name,unused-argument
+from __future__ import absolute_import
+import topi
+from . import op as _reg
+from ._reduce import _schedule_reduce
+from .op import schedule_injective, OpPattern
+
+schedule_injective = _reg.schedule_injective
+schedule_broadcast = _reg.schedule_injective
+
+
+_reg.register_schedule("collapse_sum_like", _schedule_reduce)
+_reg.register_schedule("broadcast_to", schedule_broadcast)
+_reg.register_schedule("broadcast_to_like", schedule_broadcast)
+_reg.register_schedule("expand_dims", schedule_broadcast)
+_reg.register_schedule("squeeze", schedule_injective)
+_reg.register_schedule("reshape", schedule_injective)
+_reg.register_schedule("reshape_like", schedule_injective)
+_reg.register_schedule("full", schedule_injective)
+_reg.register_schedule("full_like", schedule_injective)
+_reg.register_schedule("cast", schedule_injective)
+_reg.register_schedule("strided_slice", schedule_injective)
+_reg.register_schedule("slice_like", schedule_injective)
+_reg.register_schedule("split", schedule_injective)
+_reg.register_schedule("take", schedule_injective)
+_reg.register_schedule("transpose", schedule_injective)
+_reg.register_schedule("where", schedule_broadcast)
+
+# layout_transform
+_reg.register_schedule("layout_transform", schedule_injective)
+_reg.register_pattern("layout_transform", OpPattern.INJECTIVE)
+
+# concatenate
+@_reg.register_compute("concatenate")
+def concatenate_compute(attrs, inputs, output_type, target):
+    return [topi.concatenate(inputs, axis=attrs.axis)]
+
+_reg.register_schedule("concatenate", schedule_injective)
+_reg.register_pattern("concatenate", OpPattern.INJECTIVE)
diff --git a/python/tvm/relay/op/image/__init__.py b/python/tvm/relay/op/image/__init__.py
new file mode 100644
index 000000000000..5fa5c01575e0
--- /dev/null
+++ b/python/tvm/relay/op/image/__init__.py
@@ -0,0 +1,5 @@
+# pylint: disable=wildcard-import
+"""Image network related operators."""
+from __future__ import absolute_import as _abs
+from .image import *
+from ._image import *
diff --git a/python/tvm/relay/op/image/_image.py b/python/tvm/relay/op/image/_image.py
new file mode 100644
index 000000000000..e44748372374
--- /dev/null
+++ b/python/tvm/relay/op/image/_image.py
@@ -0,0 +1,7 @@
+#pylint: disable=invalid-name, unused-argument
+"""Backend compiler related feature registration"""
+from __future__ import absolute_import
+from ..op import  register_schedule, schedule_injective
+
+# resize
+register_schedule("image.resize", schedule_injective)
diff --git a/python/tvm/relay/op/image/_make.py b/python/tvm/relay/op/image/_make.py
new file mode 100644
index 000000000000..1198258553fe
--- /dev/null
+++ b/python/tvm/relay/op/image/_make.py
@@ -0,0 +1,4 @@
+"""Constructor APIs"""
+from ...._ffi.function import _init_api
+
+_init_api("relay.op.image._make", __name__)
diff --git a/python/tvm/relay/op/image/image.py b/python/tvm/relay/op/image/image.py
new file mode 100644
index 000000000000..36c8dd5fa548
--- /dev/null
+++ b/python/tvm/relay/op/image/image.py
@@ -0,0 +1,42 @@
+"""Image operations."""
+from __future__ import absolute_import as _abs
+from . import _make
+
+def resize(data,
+           size,
+           layout="NCHW",
+           method="BILINEAR",
+           align_corners=False):
+    """Image resize operator.
+
+    This operator takes data as input and does 2D scaling to the given scale factor.
+    In the default case, where the data_layout is `NCHW`
+    with data of shape (n, c, h, w)
+    out will have a shape (n, c, size[0], size[1])
+
+    method indicates the algorithm to be used while calculating ghe out value
+    and method can be one of ("BILINEAR", "NEAREST_NEIGHBOR")
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    size: Tuple of Expr
+        The out size to which the image will be resized.
+
+    layout : str, optional
+        Layout of the input.
+
+    method : str, optional
+        Scale method to used [NEAREST_NEIGHBOR, BILINEAR].
+
+    align_corners : int, optional
+        Should be true to preserve the values at the corner pixels
+
+    Returns
+    -------
+    result: relay.Expr
+        The resized result.
+    """
+    return _make.resize(data, size, layout, method, align_corners)
diff --git a/python/tvm/relay/op/nn/__init__.py b/python/tvm/relay/op/nn/__init__.py
new file mode 100644
index 000000000000..0c2a0a4358c9
--- /dev/null
+++ b/python/tvm/relay/op/nn/__init__.py
@@ -0,0 +1,5 @@
+# pylint: disable=wildcard-import
+"""Neural network related operators."""
+from __future__ import absolute_import as _abs
+from .nn import *
+from . import _nn
diff --git a/python/tvm/relay/op/nn/_make.py b/python/tvm/relay/op/nn/_make.py
new file mode 100644
index 000000000000..c4922ea8ab04
--- /dev/null
+++ b/python/tvm/relay/op/nn/_make.py
@@ -0,0 +1,4 @@
+"""Constructor APIs"""
+from ...._ffi.function import _init_api
+
+_init_api("relay.op.nn._make", __name__)
diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py
new file mode 100644
index 000000000000..8180d8b31044
--- /dev/null
+++ b/python/tvm/relay/op/nn/_nn.py
@@ -0,0 +1,259 @@
+#pylint: disable=invalid-name, unused-argument
+"""Backend compiler related feature registration"""
+from __future__ import absolute_import
+
+import topi
+from topi.util import get_const_int, get_const_tuple
+from .. import op as reg
+from ..op import OpPattern, schedule_injective
+
+# relu
+reg.register_schedule("nn.relu", schedule_injective)
+reg.register_pattern("nn.relu", OpPattern.ELEMWISE)
+
+# softmax
+@reg.register_schedule("nn.softmax")
+def schedule_softmax(_, outputs, target):
+    """Schedule definition of softmax"""
+    with target:
+        return topi.generic.schedule_softmax(outputs)
+
+reg.register_pattern("nn.softmax", OpPattern.OPAQUE)
+
+schedule_broadcast = schedule_injective
+
+@reg.register_schedule("nn.log_softmax")
+def schedule_log_softmax(_, outputs, target):
+    """Schedule definition of log_softmax"""
+    with target:
+        return topi.generic.schedule_softmax(outputs)
+
+reg.register_pattern("nn.log_softmax", OpPattern.OPAQUE)
+
+
+# dense
+@reg.register_compute("nn.dense")
+def compute_dense(attrs, inputs, out_type, target):
+    """Compute definition of dense"""
+    return [topi.nn.dense(inputs[0], inputs[1])]
+
+@reg.register_schedule("nn.dense")
+def schedule_dense(attrs, outputs, target):
+    """Schedule definition of dense"""
+    with target:
+        return topi.generic.schedule_dense(outputs)
+
+reg.register_pattern("nn.dense", reg.OpPattern.OUT_ELEMWISE_FUSABLE)
+
+
+# conv2d
+@reg.register_compute("nn.conv2d")
+def compute_conv2d(attrs, inputs, out_type, target):
+    """Compute definition of conv2d"""
+    padding = get_const_tuple(attrs.padding)
+    strides = get_const_tuple(attrs.strides)
+    dilation = get_const_tuple(attrs.dilation)
+    groups = attrs.groups
+    layout = attrs.data_layout
+    weight_layout = attrs.weight_layout
+    out_dtype = attrs.out_dtype
+    out_dtype = (inputs[0].dtype if (out_dtype == "same" or out_dtype == "")
+                 else out_dtype)
+
+    assert layout in ["NCHW", "NHWC", "NCHW4c"]
+    (dilation_h, dilation_w) = dilation
+    if dilation_h < 1 or dilation_w < 1:
+        raise ValueError("dilation should be positive value")
+
+    if groups == 1:
+        out = topi.nn.conv2d(
+            inputs[0], inputs[1], strides, padding,
+            dilation, layout, out_dtype=out_dtype)
+    elif layout == "NCHW" and \
+         weight_layout == "OIHW" and \
+         get_const_int(inputs[1].shape[0]) == groups and \
+         get_const_int(inputs[1].shape[1]) == 1:
+        out = topi.nn.depthwise_conv2d_nchw(
+            inputs[0], inputs[1], strides, padding, dilation, out_dtype=out_dtype)
+    elif layout == "NHWC" and \
+         weight_layout == "HWOI" and\
+         get_const_int(inputs[1].shape[2]) == groups and \
+         get_const_int(inputs[1].shape[3]) == 1:
+        out = topi.nn.depthwise_conv2d_nhwc(
+            inputs[0], inputs[1], strides, padding, dilation, out_dtype=out_dtype)
+    else:
+        raise ValueError("not support arbitrary group number for now")
+    return [out]
+
+
+@reg.register_schedule("nn.conv2d")
+def schedule_conv2d(attrs, outs, target):
+    """Schedule definition of conv2d"""
+    groups = attrs.groups
+    layout = attrs.data_layout
+    kernel_layout = attrs.weight_layout
+    with target:
+        if groups == 1 and layout == "NCHW":
+            return topi.generic.schedule_conv2d_nchw(outs)
+        elif groups == 1 and layout == "NCHW4c":
+            return topi.generic.schedule_conv2d_nchw(outs)
+        elif groups == 1 and layout == "NHWC":
+            return topi.generic.schedule_conv2d_nhwc(outs)
+        elif groups != 1:
+            if layout == "NCHW":
+                # TODO(leyuan, merrymercy, Huyuwei): fold depthwise topi into conv2d.
+                return topi.generic.schedule_depthwise_conv2d_nchw(outs)
+            elif layout == "NHWC" and kernel_layout == "HWOI":
+                return topi.generic.schedule_depthwise_conv2d_nhwc(outs)
+    raise ValueError("No compatible schedule")
+
+
+@reg.register_alter_op_layout("nn.conv2d")
+def alter_op_layout_conv2d(attrs, inputs, tinfos):
+    """Alternate the layout of conv2d"""
+    return None
+
+reg.register_pattern("nn.conv2d", OpPattern.OUT_ELEMWISE_FUSABLE)
+
+
+# conv2d_transpose
+@reg.register_compute("nn.conv2d_transpose")
+def compute_conv2d_transpose(attrs, inputs, out_dtype, target):
+    """Compute definition of conv2d_transpose"""
+    padding = get_const_tuple(attrs.padding)
+    strides = get_const_tuple(attrs.strides)
+    dilation = get_const_tuple(attrs.dilation)
+    groups = attrs.groups
+    layout = attrs.data_layout
+    out_dtype = attrs.out_dtype
+    out_dtype = (inputs[0].dtype if (out_dtype == "same" or out_dtype == "")
+                 else out_dtype)
+    assert layout == "NCHW", "only support nchw for now"
+    assert dilation == (1, 1), "not support dilate now"
+    assert groups == 1, "only support groups == 1 for now"
+    out = topi.nn.conv2d_transpose_nchw(inputs[0], inputs[1], strides, padding, out_dtype)
+    output_padding = get_const_tuple(attrs.output_padding)
+    out = topi.nn.pad(out,
+                      [0, 0, 0, 0], [0, 0, output_padding[0], output_padding[1]])
+    return [out]
+
+@reg.register_schedule("nn.conv2d_transpose")
+def schedule_conv2d_transpose(attrs, outs, target):
+    """Schedule definition of conv2d_transpose"""
+    with target:
+        return topi.generic.schedule_conv2d_transpose_nchw(outs)
+
+reg.register_pattern("nn.conv2d_transpose", OpPattern.OUT_ELEMWISE_FUSABLE)
+
+# bias_add
+@reg.register_compute("nn.bias_add")
+def compute_bias_add(attrs, inputs, out_dtype, target):
+    """Compute definition of conv2d_transpose"""
+    axis = attrs.axis
+    bias = inputs[1]
+    data_ndim = len(inputs[0].shape)
+    if axis < 0:
+        axis = axis + data_ndim
+    num_newaxis = data_ndim - axis - 1
+
+    if num_newaxis:
+        bias = topi.expand_dims(bias, axis=1, num_newaxis=num_newaxis)
+    return [topi.add(inputs[0], bias)]
+
+reg.register_schedule("nn.bias_add", schedule_injective)
+reg.register_pattern("nn.bias_add", OpPattern.BROADCAST)
+
+
+# max_pool2d
+@reg.register_schedule("nn.max_pool2d")
+def schedule_max_pool2d(attrs, outs, target):
+    """Schedule definition of max_pool2d"""
+    layout = attrs.layout
+    with target:
+        return topi.generic.schedule_pool(outs, layout)
+
+reg.register_pattern("nn.max_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE)
+
+
+# avg_pool2d
+@reg.register_schedule("nn.avg_pool2d")
+def schedule_avg_pool2d(attrs, outs, target):
+    """Schedule definition of avg_pool2d"""
+    layout = attrs.layout
+    with target:
+        return topi.generic.schedule_pool(outs, layout)
+
+reg.register_pattern("nn.avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE)
+
+
+# global_max_pool2d
+@reg.register_schedule("nn.global_max_pool2d")
+def schedule_global_max_pool2d(_, outs, target):
+    """Schedule definition of global_max_pool2d"""
+    with target:
+        return topi.generic.schedule_global_pool(outs)
+
+reg.register_pattern("nn.global_max_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE)
+
+
+# global_avg_pool2d
+@reg.register_schedule("nn.global_avg_pool2d")
+def schedule_global_avg_pool2d(_, outs, target):
+    """Schedule definition of global_avg_pool2d"""
+    with target:
+        return topi.generic.schedule_global_pool(outs)
+
+reg.register_pattern("nn.global_avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE)
+
+# leaky_relu
+reg.register_schedule("nn.leaky_relu", schedule_broadcast)
+reg.register_pattern("nn.leaky_relu", OpPattern.ELEMWISE)
+
+# prelu
+reg.register_schedule("nn.prelu", schedule_broadcast)
+reg.register_pattern("nn.prelu", OpPattern.BROADCAST)
+
+# flatten
+reg.register_schedule("nn.batch_flatten", schedule_broadcast)
+reg.register_pattern("nn.batch_flatten", OpPattern.INJECTIVE)
+
+
+# lrn
+@reg.register_compute("nn.lrn")
+def compute_lrn(attrs, inputs, out_dtype, target):
+    """Compute definition of lrn"""
+    assert len(inputs) == 1
+    return [topi.nn.lrn(inputs[0], attrs.size, attrs.axis,
+                        attrs.alpha, attrs.beta, attrs.bias)]
+
+@reg.register_schedule("nn.lrn")
+def schedule_lrn(attrs, outs, target):
+    """Schedule definition of lrn"""
+    with target:
+        return topi.generic.schedule_lrn(outs)
+
+reg.register_pattern("nn.lrn", OpPattern.OPAQUE)
+
+
+# l2_normalize
+@reg.register_compute("nn.l2_normalize")
+def compute_l2_normalize(attrs, inputs, out_dtype, target):
+    """Compute definition of l2 normalize"""
+    return [topi.nn.l2_normalize(inputs[0], attrs.eps, attrs.axis)]
+
+@reg.register_schedule("nn.l2_normalize")
+def schedule_l2_normalize(attrs, outs, target):
+    """Schedule definition of l2 normalize"""
+    with target:
+        return topi.generic.schedule_l2_normalize(outs)
+
+reg.register_pattern("nn.l2_normalize", OpPattern.OUT_ELEMWISE_FUSABLE)
+
+# Upsampling
+reg.register_schedule("nn.upsampling", reg.schedule_injective)
+def schedule_upsampling(_, outs, target):
+    """Schedule definition of upsampling"""
+    with target:
+        return topi.generic.schedule_injective(outs)
+# pad
+reg.register_schedule("nn.pad", schedule_broadcast)
diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py
new file mode 100644
index 000000000000..63b1e206e72c
--- /dev/null
+++ b/python/tvm/relay/op/nn/nn.py
@@ -0,0 +1,767 @@
+"""Neural network operations."""
+from __future__ import absolute_import as _abs
+from ...expr import TupleWrapper
+from . import _make
+
+
+def conv2d(data,
+           weight,
+           strides=(1, 1),
+           padding=(0, 0),
+           dilation=(1, 1),
+           groups=1,
+           channels=None,
+           kernel_size=None,
+           data_layout="NCHW",
+           weight_layout="OIHW",
+           out_layout="",
+           out_dtype=""):
+    r"""2D convolution.
+
+    This operator takes the weight as the convolution kernel
+    and convolves it with data to produce an output.
+
+
+    In the default case, where the data_layout is `NCHW`
+    and weight_layout is `OIHW`, conv2d takes in
+    a data Tensor with shape `(batch_size, in_channels, height, width)`,
+    and a weight Tensor with shape `(channels, in_channels, kernel_size[0], kernel_size[1])`
+    to produce an output Tensor with the following rule:
+
+    .. math::
+
+        \mbox{out}[b, c, y, x] = \sum_{dy, dx, k}
+           \mbox{data}[b, k, \mbox{strides}[0] * y  + dy, \mbox{strides}[1] * x + dx] *
+           \mbox{weight}[c, k, dy, dx]
+
+    Padding and dilation are applied to data and weight respectively before the computation.
+    This operator accepts data layout specification.
+    Semantically, the operator will convert the layout to the canonical layout
+    (`NCHW` for data and `OIHW` for weight), perform the computation,
+    then convert to the out_layout.
+
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The input data to the operator.
+
+    weight : tvm.relay.Expr
+        The weight expressions.
+
+    strides : tuple of int, optional
+        The strides of convoltution.
+
+    padding : tuple of int, optional
+        The padding of convolution on both sides of inputs before convolution.
+
+    dilation : tuple of int, optional
+        Specifies the dilation rate to be used for dilated convolution.
+
+    groups : int, optional
+        Number of groups for grouped convolution.
+
+    channels : int, optional
+        Number of output channels of this convolution.
+
+    kernel_size : tuple of int, optional
+        The spatial of the convolution kernel.
+
+    data_layout : str, optional
+        Layout of the input.
+
+    weight_layout : str, optional
+        Layout of the weight.
+
+    out_layout : str, optional
+        Layout of the output, by default, out_layout is the same as data_layout
+
+    out_dtype : str, optional
+        Specifies the output data type for mixed precision conv2d.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
+    """
+    return _make.conv2d(data, weight, strides, padding, dilation,
+                        groups, channels, kernel_size, data_layout,
+                        weight_layout, out_layout, out_dtype)
+
+
+def conv2d_transpose(data,
+                     weight,
+                     strides=(1, 1),
+                     padding=(0, 0),
+                     dilation=(1, 1),
+                     groups=1,
+                     channels=None,
+                     kernel_size=None,
+                     data_layout="NCHW",
+                     weight_layout="OIHW",
+                     output_padding=(0, 0),
+                     out_dtype=""):
+    """Two dimensional trnasposed convolution operator.
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The input data to the operator.
+
+    weight : tvm.relay.Expr
+        The weight expressions.
+
+    strides : Tuple[int], optional
+        The strides of convoltution.
+
+    padding : Tuple[int], optional
+        The padding of convolution on both sides of inputs.
+
+    dilation : Tuple[int], optional
+        Specifies the dilation rate to be used for dilated convolution.
+
+    groups : int, optional
+        Number of groups for grouped convolution.
+
+    data_layout : str, optional
+        Layout of the input.
+
+    weight_layout : str, optional
+        Layout of the weight.
+
+    output_padding : Tuple[int], optional
+        Additional zero-padding to be added to one side of the output.
+
+    out_dtype : str, optional
+        Specifies the output data type for mixed precision conv2d.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
+    """
+    return _make.conv2d_transpose(data, weight, strides, padding, dilation,
+                                  groups, channels, kernel_size, data_layout,
+                                  weight_layout, output_padding, out_dtype)
+
+
+def softmax(data, axis=-1):
+    r"""Computes softmax.
+
+    .. math:: \text{softmax}(x)_i = \frac{exp(x_i)}{\sum_j exp(x_j)}
+
+    .. note::
+        This operator can be optimized away for inference.
+
+    Parameters
+    ----------
+    data: tvm.relay.Expr
+        The input data to the operator.
+
+    axis: int, optional
+        The axis to sum over when computing softmax
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
+    """
+    return _make.softmax(data, axis)
+
+
+def log_softmax(data, axis=-1):
+    r"""Computes log softmax.
+
+    .. math::
+
+        \text{log_softmax}(x)_i = \log \frac{exp(x_i)}{\sum_j exp(x_j)}
+
+    .. note::
+        This operator can be optimized away for inference.
+
+    Parameters
+    ----------
+    data: tvm.relay.Expr
+        The input data to the operator.
+
+    axis: int
+        The axis to sum over when computing softmax
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
+    """
+    return _make.log_softmax(data, axis)
+
+
+def max_pool2d(data,
+               pool_size=(1, 1),
+               strides=(1, 1),
+               padding=(0, 0),
+               layout="NCHW",
+               ceil_mode=False):
+    r"""2D maximum pooling operator.
+
+    This operator takes data as input and does 2D max value calculation
+    with in pool_size sized window by striding defined by stride
+
+
+    In the default case, where the data_layout is `NCHW`
+    a data Tensor with shape `(batch_size, in_channels, height, width)`,
+    to produce an output Tensor with the following rule:
+
+    with data of shape (b, c, h, w) and pool_size (kh, kw)
+
+    .. math::
+
+        \mbox{out}(b, c, y, x)  = \max_{m=0, \ldots, kh-1} \max_{n=0, \ldots, kw-1}
+             \mbox{data}(b, c, \mbox{stride}[0] * y + m, \mbox{stride}[1] * x + n)
+
+    Padding is applied to data before the computation.
+    ceil_mode is used to take ceil or floor while computing out shape.
+    This operator accepts data layout specification.
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The input data to the operator.
+
+    strides : tuple of int, optional
+        The strides of pooling.
+
+    padding : tuple of int, optional
+        The padding for pooling.
+
+    layout : str, optional
+        Layout of the input.
+
+    ceil_mode : bool, optional
+        To enable or disable ceil while pooling.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
+    """
+    return _make.max_pool2d(data, pool_size, strides, padding,
+                            layout, ceil_mode)
+
+def avg_pool2d(data,
+               pool_size=(1, 1),
+               strides=(1, 1),
+               padding=(0, 0),
+               layout="NCHW",
+               ceil_mode=False,
+               count_include_pad=False):
+    r"""2D average pooling operator.
+
+    This operator takes data as input and does 2D average value calculation
+    with in pool_size sized window by striding defined by stride
+
+
+    In the default case, where the data_layout is `NCHW`
+    a data Tensor with shape `(batch_size, in_channels, height, width)`,
+    to produce an output Tensor with the following rule:
+
+    with data of shape (b, c, h, w), pool_size (kh, kw)
+
+    .. math::
+
+        \mbox{out}(b, c, y, x)  = \frac{1}{kh * kw} \sum_{m=0}^{kh-1} \sum_{n=0}^{kw-1}
+             \mbox{data}(b, c, \mbox{stride}[0] * y + m, \mbox{stride}[1] * x + n)
+
+    Padding is applied to data before the computation.
+    ceil_mode is used to take ceil or floor while computing out shape.
+    count_include_pad indicates including or excluding padded input values in computation.
+    This operator accepts data layout specification.
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The input data to the operator.
+
+    strides : tuple of int, optional
+        The strides of pooling.
+
+    padding : tuple of int, optional
+        The padding for pooling.
+
+    layout : str, optional
+        Layout of the input.
+
+    ceil_mode : bool, optional
+        To enable or disable ceil while pooling.
+
+    count_include_pad : bool, optional
+        To include padding to compute the average.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
+    """
+    return _make.avg_pool2d(data, pool_size, strides, padding,
+                            layout, ceil_mode, count_include_pad)
+
+def global_max_pool2d(data,
+                      layout="NCHW"):
+    r"""2D global maximum pooling operator.
+
+    This operator takes data as input and does 2D max value calculation
+    across each window represented by WxH.
+
+
+    In the default case, where the data_layout is `NCHW`
+    a data Tensor with shape `(batch_size, in_channels, height, width)`,
+    to produce an output Tensor with the following rule:
+
+    with data of shape (b, c, h, w)
+
+    .. math::
+
+        \mbox{out}(b, c, 1, 1)  = \max_{m=0, \ldots, h} \max_{n=0, \ldots, w}
+             \mbox{data}(b, c, m, n)
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The input data to the operator.
+
+    layout : str, optional
+        Layout of the input.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
+    """
+    return _make.global_max_pool2d(data, layout)
+
+def global_avg_pool2d(data,
+                      layout="NCHW"):
+    r"""2D global average pooling operator.
+
+    This operator takes data as input and does 2D average value calculation
+    across each window represented by WxH.
+
+
+    In the default case, where the data_layout is `NCHW`
+    a data Tensor with shape `(batch_size, in_channels, height, width)`,
+    to produce an output Tensor with the following rule:
+
+    with data of shape (b, c, h, w)
+
+    .. math::
+
+        \mbox{out}(b, c, 1, 1)  = \frac{1}{h * w} \sum_{m=0}^{h-1} \sum_{n=0}^{w-1}
+             \mbox{data}(b, c, m, n)
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The input data to the operator.
+
+    layout : str, optional
+        Layout of the input.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
+    """
+    return _make.global_avg_pool2d(data, layout)
+
+
+def upsampling(data,
+               scale=1,
+               layout="NCHW",
+               method="NEAREST_NEIGHBOR"):
+    """Upsampling.
+
+    This operator takes data as input and does 2D scaling to the given scale factor.
+    In the default case, where the data_layout is `NCHW`
+    with data of shape (n, c, h, w)
+    out will have a shape (n, c, h*scale, w*scale)
+
+    method indicates the algorithm to be used while calculating ghe out value
+    and method can be one of ("BILINEAR", "NEAREST_NEIGHBOR")
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The input data to the operator.
+
+    scale : tvm.relay.Expr
+        The scale factor for upsampling.
+
+    layout : str, optional
+        Layout of the input.
+
+    method : str, optional
+        Scale method to used [NEAREST_NEIGHBOR, BILINEAR].
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
+    """
+    return _make.upsampling(data, scale, layout, method)
+
+
+def batch_flatten(data):
+    """BatchFlatten.
+
+    This operator flattens all the dimensions except for the batch dimension.
+    which results a 2D output.
+
+    For data with shape ``(d1, d2, ..., dk)``
+    batch_flatten(data) returns reshaped output of shape ``(d1, d2*...*dk)``.
+
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The input data to the operator.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The Flattened result.
+    """
+    return _make.batch_flatten(data)
+
+
+def bias_add(data, bias, axis=1):
+    """add_bias operator.
+
+    Add 1D bias to the axis of data.
+    This function is a special case of add which allows
+    inference of shape of the bias from data.
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The input data to the operator.
+
+    bias : tvm.relay.Expr
+        The bias to be added.
+
+    axis : int, optional
+        The axis to add the bias.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The final result.
+    """
+    return _make.bias_add(data, bias, axis)
+
+
+def dense(data, weight, units=None):
+    """Dense operator.
+    Applies a linear transformation
+
+    .. math::
+
+    `Y = X * W`
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The input data to the operator.
+
+    weight : tvm.relay.Expr
+        The weight expressions.
+
+    units : int, optional
+        Number of hidden units of the dense transformation.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
+    """
+    return _make.dense(data, weight, units)
+
+
+def relu(data):
+    """Rectified linear unit.
+
+    .. math::
+       out = max(x, 0)
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The input data
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
+    """
+    return _make.relu(data)
+
+
+def leaky_relu(data, alpha):
+    """This operator takes data as input and does Leaky version
+    of a Rectified Linear Unit.
+
+    .. math::
+
+        `y = x > 0 ? x : alpha * x`
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The input data to the operator.
+
+    alpha : float
+        Slope coefficient for the negative half axis.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
+    """
+    return _make.leaky_relu(data, alpha)
+
+
+def prelu(data, alpha, axis=1):
+    """This operator takes data as input and does Leaky version
+    of a Rectified Linear Unit.
+
+    .. math::
+
+        `y = x > 0 ? x : alpha * x`
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The input data to the operator.
+
+    alpha : tvm.relay.Expr
+        Slope coefficient for the negative half axis.
+
+    axis : int, optional
+        Specify which shape axis the channel is specified.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
+    """
+    return _make.prelu(data, alpha, axis)
+
+
+def pad(data,
+        pad_width,
+        pad_value=0.0):
+    r"""Padding
+
+    This operator takes in a tensor and pads each axis by the specified
+    widths using the specified value.
+
+    Parameters
+    ----------
+    data: tvm.relay.Expr
+        The input data to the operator
+    pad_width: tuple of <tuple of <int>>, required
+        Number of values padded to the edges of each axis, in the format
+        of ((before_1, after_1), ..., (before_N, after_N))
+    pad_value: float, optional, default=0.0
+        The value used for padding
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
+    """
+    return _make.pad(data, pad_width, pad_value)
+
+
+def lrn(data, size=5, axis=1, bias=2, alpha=.00001, beta=0.75):
+    """This operator takes data as input and does local response normalization.
+
+    Normalize the input in a local region across or within feature maps.
+    Each input value is divided by (data / (bias + (alpha * sum_data ^2 /size))^beta)
+    where n is the size of each local region, and the sum is taken over the region
+    centered at that value (zero padding is added where necessary).
+
+    .. math::
+        (data / (bias + (alpha * sum_data ^2 /size))^beta)
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The input data to the operator.
+
+    size : int, optional
+        The size of the local region to be considered for normalization.
+
+    axis : int, optional
+        Input data layout channel axis. Default value is 1 for NCHW format
+
+    bias : float, optional
+        The offset parameter to avoid dividing by 0.
+
+    alpha : float, optional
+        The scaling parameter.
+
+    beta : float, optional
+        The exponent parameter.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
+    """
+    return _make.lrn(data, size, axis, alpha, beta, bias)
+
+
+def l2_normalize(data, eps, axis=None):
+    """Perform L2 normalization on the input data
+
+    .. math::
+        y(i, j) = x(i, j) / sqrt(max(sum(x^2), eps))
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The input data to the operator.
+
+    eps : float
+        epsilon value
+
+    axis : list of int, optional
+        axis over the normalization applied
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The computed result.
+    """
+    return _make.l2_normalize(data, eps, axis)
+
+
+def dropout(data, rate=0.5):
+    """Applies the dropout operation to the input array.
+
+    During training, each element of the input is set to zero with
+    probability ``p``. The whole array is rescaled by ``1/(1-p)``
+    to keep the expected sum of the input unchanged.
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The input data to the operator.
+
+    rate : float, optional (default=0.5)
+        The probability for an element to be reset to 0.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The result of dropout
+    """
+    result = _make.dropout(data, rate)
+    return TupleWrapper(result, 2)[0]
+
+
+def batch_norm(data,
+               gamma,
+               beta,
+               moving_mean,
+               moving_var,
+               axis=1,
+               epsilon=1e-5,
+               center=True,
+               scale=True):
+    r"""
+    Batch normalization layer (Ioffe and Szegedy, 2014).
+    Normalizes the input at each batch, i.e. applies a transformation
+    that maintains the mean activation close to 0 and the activation
+    standard deviation close to 1.
+
+    .. math::
+
+        data\_mean[i] = mean(data[:,i,:,...]) \\
+        data\_var[i] = var(data[:,i,:,...])
+
+    Then compute the normalized output, which has the same shape as input, as following:
+
+    .. math::
+
+        out[:,i,:,...] = \frac{data[:,i,:,...] - data\_mean[i]}{\sqrt{data\_var[i]+\epsilon}}
+            * gamma[i] + beta[i]
+
+    Both *mean* and *var* returns a scalar by treating the input as a vector.
+
+    Assume the input has size *k* on axis 1, then both ``gamma`` and ``beta``
+    have shape *(k,)*.
+
+    Besides the inputs and the outputs, this operator accepts two auxiliary
+    states, ``moving_mean`` and ``moving_var``, which are *k*-length
+    vectors. They are global statistics for the whole dataset, which are updated by::
+
+    moving_mean = moving_mean * momentum + data_mean * (1 - momentum)
+    moving_var = moving_var * momentum + data_var * (1 - momentum)
+
+    The parameter ``axis`` specifies which axis of the input shape denotes
+    the 'channel' (separately normalized groups).  The default is 1.
+    Specifying -1 sets the channel axis to be the last item in the input shape.
+
+    .. note::
+
+        This operator can be optimized away for inference.
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        Input to which batch_norm will be applied.
+
+    gamma : tvm.relay.Expr
+        The gamma scale factor.
+
+    beta : tvm.relay.Expr
+        The beta offset factor.
+
+    moving_mean : tvm.relay.Expr
+        Running mean of input,
+
+    moving_var : tvm.relay.Expr
+        Running variance of input.
+
+    axis : int, optional, default=1
+        Specify along which shape axis the channel is specified.
+
+    epsilon : double, optional, default=1e-5
+        Small float added to variance to avoid diving by zero.
+
+    center : boolean, optional, default=True
+        If True, add offset of beta to normalized tensor, If False,
+        beta is ignored.
+
+    scale : boolean, optional, default=True
+        If true, multiply by gamma. If False, gamma is not used.
+        When the next layer is piecewise linear (also e.g. nn.relu),
+        this can be disabled since the scaling will be done by the next layer.
+
+    Returns
+    -------
+    result : relay.Tuple([tvm.relay.Expr, tvm.relay.Expr, tvm.relay.Expr])
+        Tuple of normed data (same shape as input),
+        new running mean (k-length vector),
+        and new running variance (k-length vector)
+    """
+    result = _make.batch_norm(data,
+                              gamma,
+                              beta,
+                              moving_mean,
+                              moving_var,
+                              axis,
+                              epsilon,
+                              center,
+                              scale)
+    return TupleWrapper(result, 3)
diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py
new file mode 100644
index 000000000000..b027211acf47
--- /dev/null
+++ b/python/tvm/relay/op/op.py
@@ -0,0 +1,201 @@
+#pylint: disable=unused-argument
+"""The base node types for the Relay language."""
+import topi
+
+from ..._ffi.function import _init_api
+
+from ..base import register_relay_node
+from ..expr import Expr
+from ...api import register_func
+from ...build_module import lower, build
+from . import _make
+
+@register_relay_node
+class Op(Expr):
+    """A Relay operator definition."""
+
+    def __init__(self):
+        raise RuntimeError("Cannot create op, use get instead")
+
+    def get_attr(self, attr_name):
+        """Get additional attribute about the operator.
+
+        Parameters
+        ----------
+        attr_name : str
+            The attribute name.
+
+        Returns
+        -------
+        value : object
+            The attribute value
+        """
+        return _OpGetAttr(self, attr_name)
+
+
+def get(op_name):
+    """Get the Op for a given name
+
+    Parameters
+    ----------
+    op_name : str
+        The operator name
+
+    Returns
+    -------
+    op : Op
+        The op of the corresponding name
+    """
+    return _GetOp(op_name)
+
+
+def register(op_name, attr_key, value=None, level=10):
+    """Register an operator property of an operator.
+
+
+    Parameters
+    ----------
+    op_name : str
+        The name of operator
+
+    attr_key : str
+        The attribute name.
+
+    value : object, optional
+        The value to set
+
+    level : int, optional
+        The priority level
+
+    Returns
+    -------
+    fregister : function
+        Register function if value is not specified.
+    """
+    def _register(v):
+        """internal register function"""
+        _Register(op_name, attr_key, v, level)
+        return v
+    return _register(value) if value is not None else _register
+
+
+class OpPattern(object):
+    """Operator generic patterns
+
+    See Also
+    --------
+    top.tag : Contains explanation of the tag type.
+    """
+    # Elementwise operator
+    ELEMWISE = 0
+    # Broadcast operator
+    BROADCAST = 1
+    # Injective mapping
+    INJECTIVE = 2
+    # Comunication
+    COMM_REDUCE = 3
+    # Complex op, can still fuse ewise into it
+    OUT_ELEMWISE_FUSABLE = 4
+    # Not fusable opaque op
+    OPAQUE = 8
+
+
+def register_schedule(op_name, schedule=None, level=10):
+    """Register schedule function for an op
+
+    Parameters
+    ----------
+    op_name : str
+        The name of the op.
+
+    schedule : function (attrs: Attrs, outs: List[Tensor], target: Target) -> sch: Schedule
+        The schedule function.
+
+    level : int
+        The priority level
+    """
+    return register(op_name, "FTVMSchedule", schedule, level)
+
+
+def register_compute(op_name, compute=None, level=10):
+    """Register compute function for an op.
+
+    Parameters
+    ----------
+    op_name : str
+        The name of the op.
+
+    compute : function (attrs: Attrs, inputs: List[Tensor], out_type: Type, target:Target)
+                       -> List[Tensor]
+        The compute function.
+
+    level : int
+        The priority level
+    """
+    return register(op_name, "FTVMCompute", compute, level)
+
+
+def register_alter_op_layout(op_name, alter_layout=None, level=10):
+    """Register alter op layout function for an op
+
+    Parameters
+    ----------
+    op_name : str
+        The name of the operator
+
+    alter_layout: function (attrs: Attrs, inputs: List[Expr]) -> new_expr: Expr
+        The function for changing the layout or replacing the operator
+
+    level : int
+        The priority level
+    """
+    return register(op_name, "FTVMAlterOpLayout", alter_layout, level)
+
+
+def register_pattern(op_name, pattern, level=10):
+    """Register operator pattern for an op.
+
+    Parameters
+    ----------
+    op_name : str
+        The name of the op.
+
+    pattern : int
+        The pattern being used.
+
+    level : int
+        The priority level
+    """
+    return register(op_name, "TOpPattern", pattern, level)
+
+
+_init_api("relay.op", __name__)
+
+@register_func("relay.op.compiler._lower")
+def _lower(name, schedule, inputs, outputs):
+    return lower(schedule, list(inputs) + list(outputs), name=name)
+
+@register_func("relay.op.compiler._build")
+def _build(lowered_funcs):
+    return build(lowered_funcs, target="llvm")
+
+
+def schedule_injective(attrs, outputs, target):
+    """Generic schedule for binary broadcast."""
+    with target:
+        return topi.generic.schedule_injective(outputs)
+
+__DEBUG_COUNTER__ = 0
+
+def debug(expr, debug_func=None):
+    """The main entry point to the debugger."""
+    global __DEBUG_COUNTER__
+
+    if debug_func:
+        name = "debugger_func{}".format(__DEBUG_COUNTER__)
+        register_func(name, debug_func)
+        __DEBUG_COUNTER__ += 1
+    else:
+        name = ''
+
+    return _make.debug(expr, name)
diff --git a/python/tvm/relay/op/op_attrs.py b/python/tvm/relay/op/op_attrs.py
new file mode 100644
index 000000000000..682d56fb9efc
--- /dev/null
+++ b/python/tvm/relay/op/op_attrs.py
@@ -0,0 +1,14 @@
+"""The attributes node used for Relay operators"""
+
+from ...attrs import Attrs
+from ..base import register_relay_attr_node
+
+@register_relay_attr_node
+class Conv2DAttrs(Attrs):
+    """Attribute of a Convolution Operator"""
+    pass
+
+@register_relay_attr_node
+class GlobalPool2DAttrs(Attrs):
+    """Attribute of a Global 2D Pooling Operator"""
+    pass
diff --git a/python/tvm/relay/op/reduce.py b/python/tvm/relay/op/reduce.py
new file mode 100644
index 000000000000..71c7dea9c0dc
--- /dev/null
+++ b/python/tvm/relay/op/reduce.py
@@ -0,0 +1,219 @@
+"""Reduce operators."""
+# pylint: disable=redefined-builtin
+
+from . import _make
+
+def argmax(data, axis=None, keepdims=False, exclude=False):
+    """Returns the indices of the maximum values along an axis.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    axis : None or int or tuple of int
+        Axis or axes along which a argmin operation is performed.
+        The default, axis=None, will find the indices of maximum element all of the elements of
+        the input array. If axis is negative it counts from the last to the first axis.
+
+    keepdims : bool
+        If this is set to True, the axes which are reduced are left in the result as dimensions
+        with size one.
+        With this option, the result will broadcast correctly against the input array.
+
+    exclude : bool
+        If `exclude` is true, reduction will be performed on the axes that are
+      NOT in axis instead.
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    axis = [axis] if isinstance(axis, int) else axis
+    return _make.argmax(data, axis, keepdims, exclude)
+
+def argmin(data, axis=None, keepdims=False, exclude=False):
+    """Returns the indices of the minimum values along an axis.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    axis : None or int or tuple of int
+        Axis or axes along which a argmin operation is performed.
+        The default, axis=None, will find the indices of minimum element all of the elements of
+        the input array. If axis is negative it counts from the last to the first axis.
+
+    keepdims : bool
+        If this is set to True, the axes which are reduced are left in the result as dimensions
+        with size one.
+        With this option, the result will broadcast correctly against the input array.
+
+    exclude : bool
+        If `exclude` is true, reduction will be performed on the axes that are
+      NOT in axis instead.
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    axis = [axis] if isinstance(axis, int) else axis
+    return _make.argmin(data, axis, keepdims, exclude)
+
+
+def sum(data, axis=None, keepdims=False, exclude=False):
+    """Computes the sum of array elements over given axes.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    axis : None or int or tuple of int
+        Axis or axes along which a argmin operation is performed.
+        The default, axis=None, will find the indices of minimum element all of the elements of
+        the input array. If axis is negative it counts from the last to the first axis.
+
+    keepdims : bool
+        If this is set to True, the axes which are reduced are left in the result as dimensions
+        with size one.
+        With this option, the result will broadcast correctly against the input array.
+
+    exclude : bool
+        If `exclude` is true, reduction will be performed on the axes that are
+      NOT in axis instead.
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    axis = [axis] if isinstance(axis, int) else axis
+    return _make.sum(data, axis, keepdims, exclude)
+
+
+def max(data, axis=None, keepdims=False, exclude=False):
+    """ Computes the max of array elements over given axes.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    axis : None or int or tuple of int
+        Axis or axes along which a argmin operation is performed.
+        The default, axis=None, will find the indices of minimum element all of the elements of
+        the input array. If axis is negative it counts from the last to the first axis.
+
+    keepdims : bool
+        If this is set to True, the axes which are reduced are left in the result as dimensions
+        with size one.
+        With this option, the result will broadcast correctly against the input array.
+
+    exclude : bool
+        If `exclude` is true, reduction will be performed on the axes that are
+      NOT in axis instead.
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    axis = [axis] if isinstance(axis, int) else axis
+    return _make.max(data, axis, keepdims, exclude)
+
+
+def min(data, axis=None, keepdims=False, exclude=False):
+    """Computes the min of array elements over given axes.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    axis : None or int or tuple of int
+        Axis or axes along which a argmin operation is performed.
+        The default, axis=None, will find the indices of minimum element all of the elements of
+        the input array. If axis is negative it counts from the last to the first axis.
+
+    keepdims : bool
+        If this is set to True, the axes which are reduced are left in the result as dimensions
+        with size one.
+        With this option, the result will broadcast correctly against the input array.
+
+    exclude : bool
+        If `exclude` is true, reduction will be performed on the axes that are
+      NOT in axis instead.
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    axis = [axis] if isinstance(axis, int) else axis
+    return _make.min(data, axis, keepdims, exclude)
+
+
+def mean(data, axis=None, keepdims=False, exclude=False):
+    """Computes the mean of array elements over given axes.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    axis : None or int or tuple of int
+        Axis or axes along which a argmin operation is performed.
+        The default, axis=None, will find the indices of minimum element all of the elements of
+        the input array. If axis is negative it counts from the last to the first axis.
+
+    keepdims : bool
+        If this is set to True, the axes which are reduced are left in the result as dimensions
+        with size one.
+        With this option, the result will broadcast correctly against the input array.
+
+    exclude : bool
+        If `exclude` is true, reduction will be performed on the axes that are
+      NOT in axis instead.
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    axis = [axis] if isinstance(axis, int) else axis
+    return _make.mean(data, axis, keepdims, exclude)
+
+
+def prod(data, axis=None, keepdims=False, exclude=False):
+    """Computes the products of array elements over given axes.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    axis : None or int or tuple of int
+        Axis or axes along which a argmin operation is performed.
+        The default, axis=None, will find the indices of minimum element all of the elements of
+        the input array. If axis is negative it counts from the last to the first axis.
+
+    keepdims : bool
+        If this is set to True, the axes which are reduced are left in the result as dimensions
+        with size one.
+        With this option, the result will broadcast correctly against the input array.
+
+    exclude : bool
+        If `exclude` is true, reduction will be performed on the axes that are
+      NOT in axis instead.
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    axis = [axis] if isinstance(axis, int) else axis
+    return _make.prod(data, axis, keepdims, exclude)
diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py
new file mode 100644
index 000000000000..b7845cfaca57
--- /dev/null
+++ b/python/tvm/relay/op/tensor.py
@@ -0,0 +1,618 @@
+"""Basic tensor operations."""
+# pylint: disable=redefined-builtin
+from __future__ import absolute_import as _abs
+from . import _make
+from ..expr import Tuple
+
+# We create a wrapper function for each operator in the
+# python side to call into the positional _make.OpName function.
+#
+# We make this decision so that we can:
+# - Have declare python docstring for each function
+# - Enable keyword arguments easily
+# - Not put too much burden on FFI to support complicated features
+#   like default value and keyword arguments
+
+def log(data):
+    """Compute elementwise log of data.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.log(data)
+
+
+def exp(data):
+    """Compute elementwise exp of data.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.exp(data)
+
+
+def sqrt(data):
+    """Compute elementwise sqrt of data.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.sqrt(data)
+
+def sigmoid(data):
+    """Compute elementwise sigmoid of data.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.sigmoid(data)
+
+
+def floor(data):
+    """Compute element-wise floor of data.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.floor(data)
+
+
+def ceil(data):
+    """Compute element-wise ceil of data.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.ceil(data)
+
+
+def trunc(data):
+    """Compute element-wise trunc of data.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.trunc(data)
+
+
+def round(data):
+    """Compute element-wise round of data.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.round(data)
+
+
+def abs(data):
+    """Compute element-wise absolute of data.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.abs(data)
+
+
+def tanh(data):
+    """Compute element-wise tanh of data.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.tanh(data)
+
+
+def negative(data):
+    """Compute element-wise negative of data.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.negative(data)
+
+
+def add(lhs, rhs):
+    """Addition with numpy-style broadcasting.
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+
+    Examples
+    --------
+    .. code:: python
+
+      x = relay.Var("a") # shape is [2, 3]
+      y = relay.Var("b") # shape is [2, 1]
+      z = relay.add(x, y)  # result shape is [2, 3]
+    """
+    return _make.add(lhs, rhs)
+
+def subtract(lhs, rhs):
+    """Subtraction with numpy-style broadcasting.
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.subtract(lhs, rhs)
+
+def multiply(lhs, rhs):
+    """Multiplication with numpy-style broadcasting.
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.multiply(lhs, rhs)
+
+
+def divide(lhs, rhs):
+    """Division with numpy-style broadcasting.
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.divide(lhs, rhs)
+
+
+def power(lhs, rhs):
+    """Power with numpy-style broadcasting.
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.power(lhs, rhs)
+
+
+def mod(lhs, rhs):
+    """Mod with numpy-style broadcasting.
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.mod(lhs, rhs)
+
+
+def equal(lhs, rhs):
+    """Broadcasted elementwise test for (lhs == rhs).
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.equal(lhs, rhs)
+
+
+def not_equal(lhs, rhs):
+    """Broadcasted elementwise test for (lhs != rhs).
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.not_equal(lhs, rhs)
+
+
+def less(lhs, rhs):
+    """Broadcasted elementwise test for (lhs < rhs).
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.less(lhs, rhs)
+
+
+def less_equal(lhs, rhs):
+    """Broadcasted elementwise test for (lhs <= rhs).
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.less_equal(lhs, rhs)
+
+
+def greater(lhs, rhs):
+    """Broadcasted elementwise test for (lhs > rhs).
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.greater(lhs, rhs)
+
+
+def greater_equal(lhs, rhs):
+    """Broadcasted elementwise test for (lhs >= rhs).
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.greater_equal(lhs, rhs)
+
+
+def maximum(lhs, rhs):
+    """Maximum with numpy-style broadcasting.
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.maximum(lhs, rhs)
+
+
+def minimum(lhs, rhs):
+    """Minimum with numpy-style broadcasting.
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.minimum(lhs, rhs)
+
+
+def right_shift(lhs, rhs):
+    """Right shift with numpy-style broadcasting.
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.right_shift(lhs, rhs)
+
+
+def left_shift(lhs, rhs):
+    """Left shift with numpy-style broadcasting.
+
+    Parameters
+    ----------
+    lhs : relay.Expr
+        The left hand side input data
+    rhs : relay.Expr
+        The right hand side input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.left_shift(lhs, rhs)
+
+
+def zeros(shape, dtype):
+    """Fill array with zeros.
+
+    Parameters
+    ----------
+    shape : tuple of int
+        The shape of the target.
+
+    dtype : data type
+        The data type of the target.
+
+    Returns
+    -------
+    result : relay.Expr
+        The resulting tensor.
+    """
+    return _make.zeros(shape, dtype)
+
+
+def zeros_like(data):
+    """Returns an array of zeros, with same type and shape as the input.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.zeros_like(data)
+
+
+def ones(shape, dtype):
+    """Fill array with ones.
+
+    Parameters
+    ----------
+    shape : tuple of int
+        The shape of the target.
+
+    dtype : data type
+        The data type of the target.
+
+    Returns
+    -------
+    result : relay.Expr
+        The resulting tensor.
+    """
+    return _make.ones(shape, dtype)
+
+
+def ones_like(data):
+    """Returns an array of ones, with same type and shape as the input.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.ones_like(data)
+
+def clip(a, a_min, a_max):
+    """Clip the elements in `a` between `a_min` and `a_max`.
+    `a_min` and `a_max` are cast to `a`'s dtype.
+
+    Parameters
+    ----------
+    a : relay.Expr
+        The input tensor.
+    a_min : float
+        The clip minimum.
+    a_max : float
+        The clip maximum.
+
+    Returns
+    -------
+    result : relay.Expr
+        `a` with elements clipped between `a_min` and `a_max`.
+
+    Examples
+    --------
+    .. code:: python
+      x = relay.Constant(tvm.nd.array([0, 1, 5, 3, 4, 2]))
+      relay.clip(x, 1., 4.)
+      # [1, 1, 4, 3, 4, 2]
+    """
+    return _make.clip(a, a_min, a_max)
+
+
+def concatenate(data, axis):
+    """Concatenate the input tensors along the given axis.
+
+    Parameters
+    ----------
+    data : Union(List[relay.Expr], Tuple[relay.Expr])
+        A list of tensors.
+    axis : int
+        The axis along which the tensors are concatenated.
+
+    Returns
+    -------
+    result: relay.Expr
+        The concatenated tensor.
+    """
+    data = list(data)
+    if not data:
+        raise ValueError("relay.concatenate requires data to be non-empty.")
+    if not isinstance(axis, int):
+        raise ValueError("For now, we only support integer axis")
+    return _make.concatenate(Tuple(data), axis)
+
+
+def copy(data):
+    """Copy a tensor.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The tensor to be copied.
+
+    Returns
+    -------
+    result: relay.Expr
+        The copied result.
+    """
+    return _make.copy(data)
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
new file mode 100644
index 000000000000..bc0a42d6ab30
--- /dev/null
+++ b/python/tvm/relay/op/transform.py
@@ -0,0 +1,451 @@
+"""Transform operators."""
+
+from . import _make
+from ..expr import TupleWrapper
+
+
+def cast(data, dtype):
+    """Cast input tensor to data type.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    dtype: str
+        The target data type
+
+    Returns
+    -------
+    result : relay.Expr
+        The casted result.
+    """
+    from .. import _make as _relay_make
+    return _relay_make.cast(data, dtype)
+
+
+def expand_dims(data, axis, num_newaxis=1):
+    """Insert `num_newaxis` axises at the position given by `axis`.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    axis : int
+        The axis at which the input array is expanded.
+        Should lie in range `[-data.ndim - 1, data.ndim]`.
+        If `axis < 0`, it is the first axis inserted;
+        If `axis >= 0`, it is the last axis inserted in Python's negative indexing.
+
+    num_newaxis : int
+        Number of axes to be inserted. Should be >= 0.
+
+    Returns
+    -------
+    result : relay.Expr
+        The reshaped result.
+    """
+    return _make.expand_dims(data, axis, num_newaxis)
+
+
+def transpose(data, axes=None):
+    """Permutes the dimensions of an array.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    axes : None or List[int]
+        The target axes order, reverse order if not specified.
+
+    Returns
+    -------
+    result : relay.Expr
+        The transposed result.
+    """
+
+    if axes is not None:
+        axes = list(axes)
+    return _make.transpose(data, axes)
+
+
+def squeeze(data, axis=None):
+    """Squeeze axes in the array.
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The input data to the operator.
+
+    axis : None or List[int]
+        The set of axes to remove.
+        If axis = None, remove all axis of dimensions 1.
+        If any specified axis has dimension that does not equal 1, it is an error.
+
+    Returns
+    -------
+    result : tvm.relay.Expr
+        The squeezed result.
+    """
+    return _make.squeeze(data, axis)
+
+
+def reshape(data, newshape):
+    """Reshapes the input array.
+
+    Example::
+
+    To give user more convenience in without doing manual shape inference,
+    some dimensions of the shape can take special values from the set {0, -1, -2, -3, -4}.
+    The significance of each is explained below:
+
+    - ``0``  copy this dimension from the input to the output shape.
+
+    Example::
+
+    - data.shape = (2,3,4), newshape = (4,0,2), result.shape = (4,3,2)
+    - data.shape = (2,3,4), newshape = (2,0,0), result.shape = (2,3,4)
+
+    - ``-1`` infers the dimension of the output shape by using the remainder of the input dimensions
+    keeping the size of the new array same as that of the input array.
+    At most one dimension of shape can be -1.
+
+    Example::
+
+    - data.shape = (2,3,4), newshape = (6,1,-1), result.shape = (6,1,4)
+    - data.shape = (2,3,4), newshape = (3,-1,8), result.shape = (3,1,8)
+    - data.shape = (2,3,4), newshape = (-1,), result.shape = (24,)
+
+    - ``-2`` copy all/remainder of the input dimensions to the output shape.
+
+    Example::
+
+    - data.shape = (2,3,4), newshape = (-2,), result.shape = (2,3,4)
+    - data.shape = (2,3,4), newshape = (2,-2), result.shape = (2,3,4)
+    - data.shape = (2,3,4), newshape = (-2,1,1), result.shape = (2,3,4,1,1)
+
+    - ``-3`` use the product of two consecutive dimensions of the input shape
+    as the output dimension.
+
+    Example::
+
+    - data.shape = (2,3,4), newshape = (-3,4), result.shape = (6,4)
+    - data.shape = (2,3,4,5), newshape = (-3,-3), result.shape = (6,20)
+    - data.shape = (2,3,4), newshape = (0,-3), result.shape = (2,12)
+    - data.shape = (2,3,4), newshape = (-3,-2), result.shape = (6,4)
+
+    - ``-4`` split one dimension of the input into two dimensions passed subsequent
+    to -4 in shape (can contain -1).
+
+    Example::
+
+    - data.shape = (2,3,4), newshape = (-4,1,2,-2), result.shape =(1,2,3,4)
+    - data.shape = (2,3,4), newshape = (2,-4,-1,3,-2), result.shape = (2,1,3,4)
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    newshape : Union[int, Tuple[int], List[int]]
+        The new shape. Should be compatible with the original shape.
+
+    Returns
+    -------
+    result : relay.Expr
+        The reshaped result.
+    """
+    if isinstance(newshape, int):
+        newshape = [newshape]
+    return _make.reshape(data, list(newshape))
+
+
+def reshape_like(data, shape_like):
+    """Reshapes the input array by the size of another array.
+    For an input array with shape ``(d1, d2, ..., dk)``, `reshape_like` operation reshapes
+    the input array into an output array with the same shape as the second input array.
+    .. note::
+    Sizes for both array should be compatible.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    shape_like : tuple of int
+        The new shape. Should be compatible with the original shape.
+
+    Returns
+    -------
+    ret : relay.Expr
+        The computed result.
+    """
+    return _make.reshape_like(data, shape_like)
+
+
+def take(data, indices, axis=None):
+    """Take elements from an array along an axis.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The source array.
+
+    indices : rely.Expr
+        The indices of the values to extract.
+
+    axis : int, optional
+        The axis over which to select values. By default,
+        the flattened input array is used.
+
+    Returns
+    -------
+    ret : relay.Expr
+        The computed result.
+    """
+    return _make.take(data, indices, axis)
+
+
+def full(fill_value, shape=(), dtype=""):
+    """Fill array with scalar value.
+
+    Parameters
+    ----------
+    fill_value : relay.Expr
+        The value to fill. Must be a scalar.
+
+    shape : tuple of int
+        The shape of the target.
+
+    dtype : data type, optional (defaults to data type of the fill value)
+        The data type of the target.
+
+    Returns
+    -------
+    result : relay.Expr
+        The resulting tensor.
+    """
+    return _make.full(fill_value, shape, dtype)
+
+
+def full_like(data, fill_value):
+    """Return an scalar value array with the same shape and type as the input array.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input tensor.
+
+    fill_value : relay.Expr
+        The scalar value to fill.
+
+    Returns
+    -------
+    result : relay.Expr
+        The resulting tensor.
+    """
+    return _make.full_like(data, fill_value)
+
+
+def where(condition, x, y):
+    """Selecting elements from either x or y depending on the value of the
+    condition.
+
+    Parameters
+    ----------
+    condition : relay.Expr
+        The condition array. The n-th element in `y` is selected when the n-th
+        value in the `condition` array is zero. Otherwise, the corresponding
+        element from `x` will be picked.
+
+    x : relay.Expr
+        The first array to be selected.
+
+    y : relay.Expr
+        The second array to be selected.
+
+    Returns
+    -------
+    result : relay.Expr
+		The selected array.
+
+    Examples
+    --------
+    .. code-block:: python
+
+        x = [[1, 2], [3, 4]]
+        y = [[5, 6], [7, 8]]
+        condition = [[0, 1], [-1, 0]]
+        relay.where(conditon, x, y) = [[5, 2], [3, 8]]
+
+        condition = [1, 0]
+        relay.where(conditon, x, y) = [[1, 2], [7, 8]]
+
+    Note that the shape of condition, x, and y needs to be the same.
+    """
+    return _make.where(condition, x, y)
+
+def broadcast_to(data, shape):
+    """Return an scalar value array with the same type, broadcast to
+    the provided shape.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input tensor.
+
+    shape : shape
+        Provide the shape to broadcast to.
+
+    Returns
+    -------
+    result : relay.Expr
+        The resulting tensor.
+    """
+    return _make.broadcast_to(data, shape)
+
+def broadcast_to_like(data, broadcast_type):
+    """Return an scalar value array with the same shape and type as the input array.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input tensor.
+
+    broadcast_type : relay.Expr
+        Provide the type to broadcast to.
+
+    Returns
+    -------
+    result : relay.Expr
+        The resulting tensor.
+    """
+    return _make.broadcast_to_like(data, broadcast_type)
+
+
+def collapse_sum_like(data, collapse_type):
+    """Return an scalar value array with the same shape and type as the input array.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input tensor.
+
+    collapse_type : relay.Expr
+        Provide the type to collapse to.
+
+    Returns
+    -------
+    result : relay.Expr
+        The resulting tensor.
+    """
+    return _make.collapse_sum_like(data, collapse_type)
+
+
+def split(data, indices_or_sections, axis=0):
+    """Split input tensor along axis by sections or indices.
+
+    If indices_or_sections is an integer, the input will be divided equally
+    along given axis. If such a split is not possible, an error is raised.
+
+    If indices_or_sections is a tuple of sorted integers,
+    the entries indicate where along axis the array is split.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The source array.
+
+    indices_or_sections : int or tuple of int
+        Indices or sections to split into. Accepts an int or a tuple
+
+    axis : int, optional
+        The axis over which to split.
+
+    Returns
+    -------
+    ret : relay.Tuple([relay.Expr, relay.Expr])
+        The computed result.
+    """
+    if isinstance(indices_or_sections, int):
+        ret_size = indices_or_sections
+    else:
+        ret_size = len(indices_or_sections) + 1
+    return TupleWrapper(_make.split(data, indices_or_sections, axis), ret_size)
+
+
+def strided_slice(data, begin, end, strides=None):
+    """Strided slice of an array..
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The source array to be sliced.
+
+    begin: list of int
+        The indices to begin with in the slicing.
+
+    end: list of int
+        Indicies indicating end of the slice.
+
+    strides: list of int, optional
+        Specifies the stride values, it can be negative in that case,
+        the input tensor will be reversed in that particular axis.
+
+    Returns
+    -------
+    ret : relay.Expr
+        The computed result.
+    """
+    strides = strides or []
+    return _make.strided_slice(data, list(begin), list(end), list(strides))
+
+
+def slice_like(data, shape_like, axes=None):
+    """Slice the first input with respect to the second input.
+
+    For an input array with shape ``(d1, d2, ..., dk)``, `slice_like` operation slices the
+    the input array corresponding size of second array. By default will slice on all axes.
+
+    Parameters
+    ----------
+    data : tvm.relay.Expr
+        The source array.
+
+    shape_like : tvm.relay.Expr
+        The new shape.
+
+    axes : Optional[Tuple[int]]
+        List of axes on which input data will be sliced according to the corresponding size of
+        the second input. By default will slice on all axes. Negative axes mean counting in reverse.
+
+    Returns
+    -------
+    result : relay.Expr
+        The computed result.
+    """
+    return _make.slice_like(data, shape_like, axes)
+
+
+def layout_transform(data, src_layout, dst_layout):
+    """Transform the layout of a tensor
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The source tensor to be transformed
+
+    src_layout: str
+        The source layout.  (e.g NCHW)
+
+    dst_layout: str
+        The destination layout.  (e.g. NCHW16c)
+
+    Returns
+    -------
+    ret : relay.Expr
+        The transformed tensor.
+    """
+    return _make.layout_transform(data, src_layout, dst_layout)
diff --git a/python/tvm/relay/op/vision/__init__.py b/python/tvm/relay/op/vision/__init__.py
new file mode 100644
index 000000000000..ea3ed69e8f38
--- /dev/null
+++ b/python/tvm/relay/op/vision/__init__.py
@@ -0,0 +1,7 @@
+# pylint: disable=wildcard-import
+"""Vision network related operators."""
+from __future__ import absolute_import as _abs
+
+from .multibox import *
+from .nms import *
+from . import _multibox
diff --git a/python/tvm/relay/op/vision/_make.py b/python/tvm/relay/op/vision/_make.py
new file mode 100644
index 000000000000..614d42f47176
--- /dev/null
+++ b/python/tvm/relay/op/vision/_make.py
@@ -0,0 +1,4 @@
+"""Constructor APIs"""
+from ...._ffi.function import _init_api
+
+_init_api("relay.op.vision._make", __name__)
diff --git a/python/tvm/relay/op/vision/_multibox.py b/python/tvm/relay/op/vision/_multibox.py
new file mode 100644
index 000000000000..e9ef43f7e06f
--- /dev/null
+++ b/python/tvm/relay/op/vision/_multibox.py
@@ -0,0 +1,77 @@
+# pylint: disable=invalid-name, unused-argument
+"""Definition of vision ops"""
+from __future__ import absolute_import
+
+import topi
+from topi.util import get_const_int, get_const_float, get_float_tuple
+from .. import op as reg
+from ..op import OpPattern
+
+
+@reg.register_schedule("vision.multibox_prior")
+def schedule_multibox_prior(_, outs, target):
+    """Schedule definition of multibox_prior"""
+    with target:
+        return topi.generic.schedule_multibox_prior(outs)
+
+
+@reg.register_compute("vision.multibox_prior")
+def compute_multibox_prior(attrs, inputs, _, target):
+    """Compute definition of multibox_prior"""
+    sizes = get_float_tuple(attrs.sizes)
+    ratios = get_float_tuple(attrs.ratios)
+    steps = get_float_tuple(attrs.steps)
+    offsets = get_float_tuple(attrs.offsets)
+    clip = bool(get_const_int(attrs.clip))
+    return [
+        topi.vision.ssd.multibox_prior(inputs[0], sizes, ratios, steps,
+                                       offsets, clip)
+    ]
+
+
+reg.register_pattern("vision.multibox_prior", OpPattern.OPAQUE)
+
+
+# multibox_transform_loc
+@reg.register_schedule("vision.multibox_transform_loc")
+def schedule_multibox_transform_loc(_, outs, target):
+    """Schedule definition of multibox_detection"""
+    with target:
+        return topi.generic.schedule_multibox_transform_loc(outs)
+
+
+@reg.register_compute("vision.multibox_transform_loc")
+def compute_multibox_transform_loc(attrs, inputs, _, target):
+    """Compute definition of multibox_detection"""
+    clip = bool(get_const_int(attrs.clip))
+    threshold = get_const_float(attrs.threshold)
+    variances = get_float_tuple(attrs.variances)
+    return topi.vision.ssd.multibox_transform_loc(
+        inputs[0], inputs[1], inputs[2], clip, threshold, variances)
+
+
+reg.register_pattern("vision.multibox_transform_loc", OpPattern.OPAQUE)
+reg.register_pattern("vision.multibox_detection", OpPattern.OPAQUE)
+
+
+# non-maximum suppression
+@reg.register_schedule("vision.nms")
+def schedule_nms(_, outs, target):
+    """Schedule definition of nms"""
+    with target:
+        return topi.generic.schedule_nms(outs)
+
+
+@reg.register_compute("vision.nms")
+def compute_nms(attrs, inputs, _, target):
+    """Compute definition of nms"""
+    overlap_threshold = get_const_float(attrs.overlap_threshold)
+    force_suppress = bool(get_const_int(attrs.force_suppress))
+    topk = get_const_int(attrs.topk)
+    return [
+        topi.vision.nms(inputs[0], inputs[1], overlap_threshold,
+                        force_suppress, topk)
+    ]
+
+
+reg.register_pattern("vision.nms", OpPattern.OPAQUE)
diff --git a/python/tvm/relay/op/vision/multibox.py b/python/tvm/relay/op/vision/multibox.py
new file mode 100644
index 000000000000..90591da925f5
--- /dev/null
+++ b/python/tvm/relay/op/vision/multibox.py
@@ -0,0 +1,76 @@
+"""Multibox operations."""
+from __future__ import absolute_import as _abs
+from . import _make
+from ...expr import TupleWrapper
+
+def multibox_prior(data,
+                   sizes=(1.0,),
+                   ratios=(1.0,),
+                   steps=(-1.0, -1.0),
+                   offsets=(0.5, 0.5),
+                   clip=False):
+    """Generate prior(anchor) boxes from data, sizes and ratios.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data tensor.
+
+    sizes : tuple of float, optional
+        Tuple of sizes for anchor boxes.
+
+    ratios : tuple of float, optional
+        Tuple of ratios for anchor boxes.
+
+    steps : Tuple of float, optional
+        Priorbox step across y and x, -1 for auto calculation.
+
+    offsets : tuple of int, optional
+        Priorbox center offsets, y and x respectively.
+
+    clip : boolean, optional
+        Whether to clip out-of-boundary boxes.
+
+    Returns
+    -------
+    out : relay.Expr
+        3-D tensor with shape [1, h_in * w_in * (num_sizes + num_ratios - 1), 4]
+    """
+    return _make.multibox_prior(data, sizes, ratios, steps, offsets, clip)
+
+
+def multibox_transform_loc(cls_prob,
+                           loc_pred,
+                           anchor,
+                           clip=True,
+                           threshold=0.01,
+                           variances=(0.1, 0.1, 0.2, 0.2)):
+    """Location transformation for multibox detection
+
+    Parameters
+    ----------
+    cls_prob : tvm.relay.Expr
+        Class probabilities.
+
+    loc_pred : tvm.relay.Expr
+        Location regression predictions.
+
+    anchor : tvm.relay.Expr
+        Prior anchor boxes.
+
+    clip : boolean, optional
+        Whether to clip out-of-boundary boxes.
+
+    threshold : double, optional
+        Threshold to be a positive prediction.
+
+    variances : Tuple of float, optional
+        variances to be decoded from box regression output.
+
+    Returns
+    -------
+    ret : tuple of tvm.relay.Expr
+    """
+    return TupleWrapper(_make.multibox_transform_loc(cls_prob, loc_pred,
+                                                     anchor, clip, threshold,
+                                                     variances), 2)
diff --git a/python/tvm/relay/op/vision/nms.py b/python/tvm/relay/op/vision/nms.py
new file mode 100644
index 000000000000..8035e3030b17
--- /dev/null
+++ b/python/tvm/relay/op/vision/nms.py
@@ -0,0 +1,36 @@
+"""Non-maximum suppression operations."""
+from __future__ import absolute_import as _abs
+from . import _make
+
+def nms(data,
+        valid_count,
+        overlap_threshold=0.5,
+        force_suppress=False,
+        topk=-1):
+    """Non-maximum suppression operator for object detection.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        3-D tensor with shape [batch_size, num_anchors, 6].
+        The last dimension should be in format of
+        [class_id, score, box_left, box_top, box_right, box_bottom].
+
+    valid_count : relay.Expr
+        1-D tensor for valid number of boxes.
+
+    overlap_threshold : float, optional
+        Non-maximum suppression threshold.
+
+    force_suppress : bool, optional
+        Suppress all detections regardless of class_id.
+
+    topk : int, optional
+        Keep maximum top k detections before nms, -1 for no limit.
+
+    Returns
+    -------
+    out : relay.Expr
+        3-D tensor with shape [batch_size, num_anchors, 6].
+    """
+    return _make.nms(data, valid_count, overlap_threshold, force_suppress, topk)
diff --git a/python/tvm/relay/parser.py b/python/tvm/relay/parser.py
new file mode 100644
index 000000000000..51200343f147
--- /dev/null
+++ b/python/tvm/relay/parser.py
@@ -0,0 +1,17 @@
+"""A parser for Relay's text format."""
+from __future__ import absolute_import
+
+def enabled():
+    """Is the parser enabled/Can we import the parser?"""
+    try:
+        # pylint: disable=unused-variable
+        from tvm.relay import _parser
+        return True
+    # pylint: disable=broad-except
+    except Exception:
+        return False
+
+def fromtext(data):
+    """Parse a Relay program."""
+    from tvm.relay import _parser
+    return _parser.fromtext(data)
diff --git a/python/tvm/relay/scope_builder.py b/python/tvm/relay/scope_builder.py
new file mode 100644
index 000000000000..074a4aa66c81
--- /dev/null
+++ b/python/tvm/relay/scope_builder.py
@@ -0,0 +1,185 @@
+"""The scope builder interface """
+from __future__ import absolute_import
+
+from . import expr as _expr
+from .._ffi import base as _base
+
+class WithScope(object):
+    """A wrapper for builder methods which introduce scoping.
+
+    Parameters
+    ----------
+    enter_value: object
+        The value returned by enter.
+    """
+
+    def __init__(self, enter_value, exit_cb):
+        self._enter_value = enter_value
+        self._exit_cb = exit_cb
+
+    def __enter__(self):
+        return self._enter_value
+
+    def __exit__(self, ptype, value, trace):
+        if value:
+            raise value
+        else:
+            self._exit_cb()
+
+
+def _make_lets(bindings, ret_value):
+    """Make a nested let expressions.
+
+    Parameters
+    ----------
+    bindings: List[Tuple[tvm.relay.Var,tvm.relay.Expr]]
+        The sequence of let bindings
+
+    ret_value: tvm.relay.Expr
+        The final value of the expression.
+
+    Returns
+    -------
+    lets: tvm.relay.Expr
+        A nested let expression.
+    """
+    if ret_value is None:
+        raise RuntimeError("ret is not called in this scope")
+    if isinstance(ret_value, _expr.If) and ret_value.false_branch is None:
+        raise RuntimeError("Creating an If expression without else.")
+    let_expr = ret_value
+    for var, value in reversed(bindings):
+        let_expr = _expr.Let(var, value, let_expr)
+    return let_expr
+
+
+class ScopeBuilder(object):
+    """Scope builder class.
+
+    Enables users to build up a nested
+    scope(let, if) expression easily.
+
+    Examples
+    --------
+    .. code-block: python
+
+        sb = relay.ScopeBuilder()
+        cond = relay.var("cond", 'bool')
+        x = relay.var("x")
+        y = relay.var("y")
+
+        with sb.if_scope(cond):
+            one = relay.const(1, "float32")
+            t1 = sb.let(t1, relay.add(x, one))
+            sb.ret(t1)
+        with sb.else_scope():
+            sb.ret(y)
+
+        print(sb.get().astext())
+    """
+    def __init__(self):
+        self._bindings = [[]]
+        self._ret_values = [None]
+
+    def _enter_scope(self):
+        self._bindings.append([])
+        self._ret_values.append(None)
+
+    def _exit_scope(self):
+        bindings = self._bindings.pop()
+        ret_value = self._ret_values.pop()
+        return bindings, ret_value
+
+    def let(self, var, value):
+        """Create a new let binding.
+
+        Parameters
+        ----------
+        var: Union[Tuple[str, relay.Type], tvm.relay.Var]
+            The variable or name of variable.
+
+        value: tvm.relay.Expr
+            The value to be binded
+        """
+        if isinstance(var, (tuple, list)):
+            if len(var) > 2:
+                raise ValueError("Expect var to be Tuple[str, relay.Type]")
+            var = _expr.var(*var)
+        elif isinstance(var, _base.string_types):
+            var = _expr.var(var)
+        self._bindings[-1].append((var, value))
+        return var
+
+    def if_scope(self, cond):
+        """Create a new if scope.
+
+        Parameters
+        ----------
+        cond: tvm.relay.expr.Expr
+            The condition
+
+        Returns
+        -------
+        scope: WithScope
+            The if scope.
+
+        Note
+        ----
+        The user must follows with an else scope.
+        """
+        self._enter_scope()
+        def _on_exit():
+            bindings, ret_value = self._exit_scope()
+            if self._ret_values[-1] is not None:
+                raise RuntimeError("result already returned before if scope")
+            true_branch = _make_lets(bindings, ret_value)
+            self._ret_values[-1] = _expr.If(cond, true_branch, None)
+        return WithScope(None, _on_exit)
+
+    def else_scope(self):
+        """Create a new else scope.
+
+        Returns
+        -------
+        scope: WithScope
+            The if scope.
+        """
+        self._enter_scope()
+
+        def _on_exit():
+            bindings, ret_value = self._exit_scope()
+            partial_if = self._ret_values[-1]
+            no_else = (not isinstance(partial_if, _expr.If) or
+                       partial_if.false_branch is not None)
+            if no_else:
+                raise RuntimeError("else scope must follows")
+            false_branch = _make_lets(bindings, ret_value)
+            self._ret_values[-1] = _expr.If(
+                partial_if.cond,
+                partial_if.true_branch,
+                false_branch)
+        return WithScope(None, _on_exit)
+
+    def ret(self, value):
+        """Set the return value of this scope.
+
+        Parameters
+        ----------
+        value: tvm.relay.expr.Expr
+            The return value.
+        """
+        if self._ret_values[-1] is not None:
+            raise RuntimeError("ret value is already set in this scope.")
+        self._ret_values[-1] = value
+
+    def get(self):
+        """Get the generated result.
+
+        Returns
+        -------
+        value: tvm.relay.expr.Expr
+            The final result of the expression.
+        """
+        if len(self._bindings) != 1:
+            raise RuntimeError("can only call get at the outmost scope")
+        return _make_lets(self._bindings[-1], self._ret_values[-1])
diff --git a/python/tvm/relay/testing/__init__.py b/python/tvm/relay/testing/__init__.py
new file mode 100644
index 000000000000..f49013928748
--- /dev/null
+++ b/python/tvm/relay/testing/__init__.py
@@ -0,0 +1,14 @@
+"""Utilities for testing and benchmarks"""
+from __future__ import absolute_import as _abs
+
+from . import mlp
+from . import resnet
+from . import dqn
+from . import dcgan
+from . import mobilenet
+from . import lstm
+from . import inception_v3
+from . import squeezenet
+from . import vgg
+from . import densenet
+from .config import ctx_list
diff --git a/python/tvm/relay/testing/config.py b/python/tvm/relay/testing/config.py
new file mode 100644
index 000000000000..677b72d979a1
--- /dev/null
+++ b/python/tvm/relay/testing/config.py
@@ -0,0 +1,14 @@
+"""Configuration about tests"""
+from __future__ import absolute_import as _abs
+
+import os
+import tvm
+
+def ctx_list():
+    """Get context list for testcases"""
+    device_list = os.environ.get("RELAY_TEST_TARGETS", "")
+    device_list = (device_list.split(",") if device_list
+                   else ["llvm", "cuda"])
+    device_list = set(device_list)
+    res = [(device, tvm.context(device, 0)) for device in device_list]
+    return [x for x in res if x[1].exist]
diff --git a/python/tvm/relay/testing/dcgan.py b/python/tvm/relay/testing/dcgan.py
new file mode 100644
index 000000000000..d6c1d55df01a
--- /dev/null
+++ b/python/tvm/relay/testing/dcgan.py
@@ -0,0 +1,96 @@
+# pylint: disable=unused-argument
+"""
+Net of the generator of DCGAN
+
+Adopted from:
+https://github.com/tqchen/mxnet-gan/blob/master/mxgan/generator.py
+
+Reference:
+Radford, Alec, Luke Metz, and Soumith Chintala.
+"Unsupervised representation learning with deep convolutional generative adversarial networks."
+arXiv preprint arXiv:1511.06434 (2015).
+"""
+from tvm import relay
+from . import layers
+from .init import create_workload
+
+def deconv2d(data, ishape, oshape, kshape, name, stride=(2, 2)):
+    """a deconv layer that enlarges the feature map"""
+    target_shape = (oshape[-2], oshape[-1])
+
+    pad_y = (kshape[0] - 1) // 2
+    pad_x = (kshape[1] - 1) // 2
+    adj_y = (target_shape[0] + 2 * pad_y - kshape[0]) % stride[0]
+    adj_x = (target_shape[1] + 2 * pad_x - kshape[1]) % stride[1]
+
+    net = layers.conv2d_transpose(data,
+                                  kernel_size=kshape,
+                                  strides=stride,
+                                  channels=oshape[0],
+                                  padding=(pad_y, pad_x),
+                                  output_padding=(adj_y, adj_x),
+                                  name=name)
+    return net
+
+def deconv2d_bn_relu(data, prefix, **kwargs):
+    """a block of deconv + batch norm + relu"""
+    eps = 1e-5 + 1e-12
+    net = deconv2d(data, name="%s_deconv" % prefix, **kwargs)
+    net = layers.batch_norm_infer(net, epsilon=eps, name="%s_batch_norm" % prefix)
+    net = relay.nn.relu(net)
+    return net
+
+def get_net(batch_size, random_len=100, oshape=(3, 64, 64), ngf=128, code=None, dtype="float32"):
+    """get net of dcgan generator"""
+    assert oshape[-1] == 64, "Only support 64x64 image"
+    assert oshape[-2] == 64, "Only support 64x64 image"
+
+    code = relay.var("data", dtype=dtype, shape=(batch_size, random_len)) if code is None else code
+    dense_weight = relay.var("dense_weight")
+    dense = relay.nn.dense(code, weight=dense_weight, units=4*4*ngf*8)
+    relu = relay.nn.relu(dense)
+    # 4 x 4
+    reshape = relay.reshape(relu, newshape=(-1, ngf * 8, 4, 4))
+    # 8 x 8
+    dc8 = deconv2d_bn_relu(
+        reshape, ishape=(ngf * 8, 4, 4), oshape=(ngf * 4, 8, 8), kshape=(4, 4), prefix="g2")
+    # 16x16
+    dc16 = deconv2d_bn_relu(
+        dc8, ishape=(ngf * 4, 8, 8), oshape=(ngf * 2, 16, 16), kshape=(4, 4), prefix="g3")
+    # 32x32
+    dc32 = deconv2d_bn_relu(
+        dc16, ishape=(ngf * 2, 16, 16), oshape=(ngf, 32, 32), kshape=(4, 4), prefix="g4")
+    # 64x64
+    dc64 = deconv2d(
+        dc32, ishape=(ngf, 32, 32), oshape=oshape[-3:], kshape=(4, 4), name="g5_deconv")
+    tanh = relay.tanh(dc64)
+
+    args = relay.ir_pass.free_vars(tanh)
+    return relay.Function(args, tanh)
+
+
+def get_workload(batch_size, oshape=(3, 64, 64), ngf=128, random_len=100, dtype="float32"):
+    """Get benchmark workload for a DCGAN generator
+
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+    oshape : tuple, optional
+        The shape of output image, layout="CHW"
+    ngf: int, optional
+        The number of final feature maps in the generator
+    random_len : int, optional
+        The length of random input
+    dtype : str, optional
+        The data type
+
+    Returns
+    -------
+    net : nnvm.symbol
+        The computational graph
+    params : dict of str to NDArray
+        The parameters.
+    """
+    net = get_net(batch_size, random_len, oshape=oshape, ngf=ngf, dtype=dtype)
+    return create_workload(net)
diff --git a/python/tvm/relay/testing/densenet.py b/python/tvm/relay/testing/densenet.py
new file mode 100644
index 000000000000..7abebc75ecee
--- /dev/null
+++ b/python/tvm/relay/testing/densenet.py
@@ -0,0 +1,123 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# pylint: disable=invalid-name, line-too-long
+"""
+Port of MxNet version of Densenet to Relay.
+https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/gluon/model_zoo/vision/densenet.py
+"""
+# pylint: enable=line-too-long
+from tvm import relay
+from . import layers
+from .init import create_workload
+
+def _make_dense_layer(data, growth_rate, bn_size, index):
+    """Single densenet layer."""
+    bn1 = layers.batch_norm_infer(data, name="batch_1_%s" % index)
+    relu1 = relay.nn.relu(bn1)
+    conv1 = layers.conv2d(relu1, channels=bn_size * growth_rate,
+                          kernel_size=(1, 1), name="conv2d_1_%s" % index)
+    bn2 = layers.batch_norm_infer(conv1, name="batch_2_" + index)
+    relu2 = relay.nn.relu(bn2)
+    conv2 = layers.conv2d(relu2, channels=growth_rate, kernel_size=(3, 3),
+                          padding=(1, 1), name="conv2d_2_%s" % index)
+    return conv2
+
+def _make_dense_block(data, num_layers, bn_size, growth_rate, index):
+    """Makes a block of dense layers of the specified size."""
+    layer_out = data
+    for i in range(num_layers):
+        layer_out = _make_dense_layer(layer_out, growth_rate, bn_size,
+                                      "(%s, %s)" % (index, i))
+    return layer_out
+
+def _make_transition(data, num_output_features, index):
+    """Transition between layers."""
+    bn = layers.batch_norm_infer(data, name="batch_t_%s" % index)
+    relu = relay.nn.relu(bn)
+    conv = layers.conv2d(relu, channels=num_output_features,
+                         kernel_size=(1, 1), name="conv_t_%s" % index)
+    return relay.nn.avg_pool2d(conv, pool_size=(2, 2), strides=(2, 2))
+
+def _make_dense_net(num_init_features, growth_rate, block_config,
+                    data_shape, data_dtype, bn_size=4, classes=1000):
+    """Builds up a densenet."""
+    data = relay.Var("data", relay.TensorType(data_shape, data_dtype)) # (bn_size, 3, 224, 224)))
+    conv1 = layers.conv2d(data, channels=num_init_features,
+                          kernel_size=(7, 7), strides=(2, 2), padding=(3, 3),
+                          name='conv1')
+    bn1 = layers.batch_norm_infer(conv1, name='batch1')
+    relu1 = relay.nn.relu(bn1)
+    mp = relay.nn.max_pool2d(relu1, pool_size=(3, 3), strides=(2, 2), padding=(1, 1))
+
+    num_features = num_init_features
+    layer_out = mp
+    for i, num_layers in enumerate(block_config):
+        layer_out = _make_dense_block(layer_out, num_layers, growth_rate, bn_size, i)
+        num_features = num_features + num_layers*growth_rate
+        if i != len(block_config) - 1:
+            layer_out = _make_transition(layer_out, num_features // 2, i)
+            num_features = num_features // 2
+    bn2 = layers.batch_norm_infer(layer_out, name='batch2')
+    relu2 = relay.nn.relu(bn2)
+    avg = relay.nn.avg_pool2d(relu2, pool_size=(7, 7))
+    flat = relay.nn.batch_flatten(avg)
+
+    ret = layers.dense_add_bias(flat, units=classes, name='dense')
+
+    return relay.Function(relay.ir_pass.free_vars(ret), ret)
+
+def get_workload(densenet_size=121, classes=1000, batch_size=4,
+                 image_shape=(3, 224, 224), dtype='float32'):
+    """Gets benchmark workload for densenet.
+
+    Parameters
+    ----------
+    densenet_size : int, optional (default 121)
+        Parameter for the network size. The supported sizes
+        are 121, 161, 169, and 201.
+
+    classes : int, optional (default 1000)
+        The number of classes.
+
+    batch_size : int, optional (detault 4)
+        The batch size for the network.
+
+    image_shape : shape, optional (default (3, 224, 224))
+        The shape of the input data.
+
+    dtype : data type, optional (default 'float32')
+        The data type of the input data.
+
+    Returns
+    -------
+    net: relay.Function
+        The computation graph representing densenet.
+
+    params : dict of str to NDArray
+        The benchmark paraeters.
+    """
+    specs = {121: (64, 32, [6, 12, 24, 16]),
+             161: (96, 48, [6, 12, 36, 24]),
+             169: (69, 32, [6, 12, 32, 32]),
+             201: (64, 32, [6, 12, 48, 32])}
+
+    num_init_features, growth_rate, block_config = specs[densenet_size]
+    data_shape = tuple([batch_size] + list(image_shape))
+    net = _make_dense_net(num_init_features, growth_rate, block_config,
+                          data_shape, dtype, batch_size, classes)
+    return create_workload(net)
diff --git a/python/tvm/relay/testing/dqn.py b/python/tvm/relay/testing/dqn.py
new file mode 100644
index 000000000000..034ac0a6c2e5
--- /dev/null
+++ b/python/tvm/relay/testing/dqn.py
@@ -0,0 +1,81 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Net of Nature DQN
+Reference:
+Mnih, Volodymyr, et al. "Human-level control through deep reinforcement learning."
+Nature 518.7540 (2015): 529.
+"""
+
+from tvm import relay
+from . import layers
+from .init import create_workload
+
+def get_net(batch_size, num_actions=18, image_shape=(4, 84, 84), dtype="float32"):
+    """get symbol of nature dqn"""
+    data_shape = (batch_size,) + image_shape
+    data = relay.var("data", shape=data_shape, dtype=dtype)
+
+    conv1_bias = relay.var("conv1_bias")
+    conv1 = layers.conv2d(data, kernel_size=(8, 8), strides=(4, 4), padding=(0, 0),
+                          channels=32, name="conv1")
+    conv1 = relay.nn.bias_add(conv1, conv1_bias)
+    relu1 = relay.nn.relu(conv1)
+
+    conv2_bias = relay.var("conv2_bias")
+    conv2 = layers.conv2d(relu1, kernel_size=(4, 4), strides=(2, 2), padding=(0, 0),
+                          channels=64, name="conv2")
+    conv2 = relay.nn.bias_add(conv2, conv2_bias)
+    relu2 = relay.nn.relu(conv2)
+
+    conv3_bias = relay.var("conv3_bias")
+    conv3 = layers.conv2d(relu2, kernel_size=(3, 3), strides=(1, 1), padding=(0, 0),
+                          channels=64, name="conv3")
+    conv3 = relay.nn.bias_add(conv3, conv3_bias)
+    relu3 = relay.nn.relu(conv3)
+
+    bf1 = relay.nn.batch_flatten(relu3)
+    dense1 = layers.dense_add_bias(bf1, units=512, name="dense1")
+    relu4 = relay.nn.relu(dense1)
+    dense2 = layers.dense_add_bias(relu4, units=num_actions, name="dense2")
+
+    args = relay.ir_pass.free_vars(dense2)
+    return relay.Function(args, dense2)
+
+
+def get_workload(batch_size, num_actions=18, image_shape=(4, 84, 84), dtype="float32"):
+    """Get benchmark workload for a Deep Q Network
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+    num_actions : int, optional
+        Number of actions
+    image_shape : tuple, optional
+        The input image shape
+    dtype : str, optional
+        The data type
+    Returns
+    -------
+    net : nnvm.symbol
+        The computational graph
+    params : dict of str to NDArray
+        The parameters.
+    """
+    net = get_net(batch_size, num_actions=num_actions, image_shape=image_shape, dtype=dtype)
+    return create_workload(net)
diff --git a/python/tvm/relay/testing/inception_v3.py b/python/tvm/relay/testing/inception_v3.py
new file mode 100644
index 000000000000..491b221fbe0a
--- /dev/null
+++ b/python/tvm/relay/testing/inception_v3.py
@@ -0,0 +1,284 @@
+"""
+Inception V3, suitable for images with around 299 x 299
+
+Reference:
+Szegedy, Christian, et al. "Rethinking the Inception Architecture for Computer Vision."
+arXiv preprint arXiv:1512.00567 (2015).
+
+Adopted from https://github.com/apache/incubator-mxnet/blob/
+             master/example/image-classification/symbols/inception-v3.py
+"""
+# pylint: disable=invalid-name,missing-docstring,unused-argument
+from tvm import relay
+from .init import create_workload
+from . import layers
+
+def Conv(data, num_filter, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name=None, suffix=''):
+    conv = layers.conv2d(
+        data=data,
+        channels=int(num_filter),
+        kernel_size=kernel,
+        strides=stride,
+        padding=pad,
+        name='%s%s_conv1' % (name, suffix))
+
+    bn = layers.batch_norm_infer(data=conv, epsilon=2e-5, name='%s%s_bn' % (name, suffix))
+    act = relay.nn.relu(data=bn)
+    return act
+
+def Pooling(data, kernel, stride, pad, pool_type, name):
+    if pool_type == 'max':
+        return relay.nn.max_pool2d(data=data, pool_size=kernel, strides=stride, padding=pad)
+    elif pool_type == 'avg':
+        return relay.nn.avg_pool2d(data=data, pool_size=kernel, strides=stride, padding=pad,
+                                   count_include_pad=True)
+    else:
+        raise ValueError("Invalid pooling type: " + pool_type)
+
+def Inception7A(data,
+                num_1x1,
+                num_3x3_red, num_3x3_1, num_3x3_2,
+                num_5x5_red, num_5x5,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data, num_1x1, name=('%s_conv' % name))
+    tower_5x5 = Conv(data, num_5x5_red, name=('%s_tower' % name), suffix='_conv')
+    tower_5x5 = Conv(tower_5x5, num_5x5, kernel=(5, 5), pad=(2, 2), name=('%s_tower' % name),
+                     suffix='_conv_1')
+    tower_3x3 = Conv(data, num_3x3_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_3x3 = Conv(tower_3x3, num_3x3_1, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name),
+                     suffix='_conv_1')
+    tower_3x3 = Conv(tower_3x3, num_3x3_2, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name),
+                     suffix='_conv_2')
+    pooling = Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool,
+                      name=('%s_pool_%s_pool' % (pool, name)))
+
+    cproj = Conv(pooling, proj, name=('%s_tower_2' % name), suffix='_conv')
+    concat = relay.concatenate((tower_1x1, tower_5x5, tower_3x3, cproj), axis=1)
+    return concat
+
+# First Downsample
+def Inception7B(data,
+                num_3x3,
+                num_d3x3_red, num_d3x3_1, num_d3x3_2,
+                pool,
+                name):
+    tower_3x3 = Conv(data, num_3x3, kernel=(3, 3), pad=(0, 0), stride=(2, 2),
+                     name=('%s_conv' % name))
+    tower_d3x3 = Conv(data, num_d3x3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d3x3 = Conv(tower_d3x3, num_d3x3_1, kernel=(3, 3), pad=(1, 1), stride=(1, 1),
+                      name=('%s_tower' % name), suffix='_conv_1')
+    tower_d3x3 = Conv(tower_d3x3, num_d3x3_2, kernel=(3, 3), pad=(0, 0), stride=(2, 2),
+                      name=('%s_tower' % name), suffix='_conv_2')
+    pooling = Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(0, 0), pool_type="max",
+                      name=('max_pool_%s_pool' % name))
+    concat = relay.concatenate((tower_3x3, tower_d3x3, pooling), axis=1)
+    return concat
+
+def Inception7C(data,
+                num_1x1,
+                num_d7_red, num_d7_1, num_d7_2,
+                num_q7_red, num_q7_1, num_q7_2, num_q7_3, num_q7_4,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name))
+    tower_d7 = Conv(data=data, num_filter=num_d7_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d7 = Conv(data=tower_d7, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3),
+                    name=('%s_tower' % name), suffix='_conv_1')
+    tower_d7 = Conv(data=tower_d7, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0),
+                    name=('%s_tower' % name), suffix='_conv_2')
+    tower_q7 = Conv(data=data, num_filter=num_q7_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_1, kernel=(7, 1), pad=(3, 0),
+                    name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_2, kernel=(1, 7), pad=(0, 3),
+                    name=('%s_tower_1' % name), suffix='_conv_2')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_3, kernel=(7, 1), pad=(3, 0),
+                    name=('%s_tower_1' % name), suffix='_conv_3')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_4, kernel=(1, 7), pad=(0, 3),
+                    name=('%s_tower_1' % name), suffix='_conv_4')
+    pooling = Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool,
+                      name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1),
+                 name=('%s_tower_2' % name), suffix='_conv')
+    # concat
+    concat = relay.concatenate((tower_1x1, tower_d7, tower_q7, cproj), axis=1)
+    return concat
+
+def Inception7D(data,
+                num_3x3_red, num_3x3,
+                num_d7_3x3_red, num_d7_1, num_d7_2, num_d7_3x3,
+                pool,
+                name):
+    tower_3x3 = Conv(data=data, num_filter=num_3x3_red, name=('%s_tower' % name),
+                     suffix='_conv')
+    tower_3x3 = Conv(data=tower_3x3, num_filter=num_3x3, kernel=(3, 3), pad=(0, 0), stride=(2, 2),
+                     name=('%s_tower' % name), suffix='_conv_1')
+    tower_d7_3x3 = Conv(data=data, num_filter=num_d7_3x3_red, name=('%s_tower_1' % name),
+                        suffix='_conv')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3),
+                        name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0),
+                        name=('%s_tower_1' % name), suffix='_conv_2')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_3x3, kernel=(3, 3), stride=(2, 2),
+                        name=('%s_tower_1' % name), suffix='_conv_3')
+    pooling = Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type=pool, pad=(0, 0),
+                      name=('%s_pool_%s_pool' % (pool, name)))
+    # concat
+    concat = relay.concatenate((tower_3x3, tower_d7_3x3, pooling), axis=1)
+    return concat
+
+def Inception7E(data,
+                num_1x1,
+                num_d3_red, num_d3_1, num_d3_2,
+                num_3x3_d3_red, num_3x3, num_3x3_d3_1, num_3x3_d3_2,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name))
+    tower_d3 = Conv(data=data, num_filter=num_d3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d3_a = Conv(data=tower_d3, num_filter=num_d3_1, kernel=(1, 3), pad=(0, 1),
+                      name=('%s_tower' % name), suffix='_mixed_conv')
+    tower_d3_b = Conv(data=tower_d3, num_filter=num_d3_2, kernel=(3, 1), pad=(1, 0),
+                      name=('%s_tower' % name), suffix='_mixed_conv_1')
+    tower_3x3_d3 = Conv(data=data, num_filter=num_3x3_d3_red, name=('%s_tower_1' % name),
+                        suffix='_conv')
+    tower_3x3_d3 = Conv(data=tower_3x3_d3, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1),
+                        name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_3x3_d3_a = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_1, kernel=(1, 3), pad=(0, 1),
+                          name=('%s_tower_1' % name), suffix='_mixed_conv')
+    tower_3x3_d3_b = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_2, kernel=(3, 1), pad=(1, 0),
+                          name=('%s_tower_1' % name), suffix='_mixed_conv_1')
+    pooling = Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool,
+                      name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' % name),
+                 suffix='_conv')
+    # concat
+    concat = relay.concatenate(
+        (tower_1x1, tower_d3_a, tower_d3_b, tower_3x3_d3_a, tower_3x3_d3_b, cproj), axis=1)
+    return concat
+
+def get_net(batch_size,
+            num_classes,
+            image_shape,
+            dtype):
+    """Get network a Inception v3 network.
+
+    batch_size : int
+        The batch size used in the model
+
+    num_classes : int, optional
+        Number of claseses
+
+    image_shape : tuple, optional
+        The input image shape
+
+    dtype : str, optional
+        The data type
+
+    Returns
+    -------
+    net : relay.Function
+        The dataflow.
+    """
+    data_shape = (batch_size,) + image_shape
+    data = relay.var("data",
+                     shape=data_shape,
+                     dtype=dtype)
+
+    # stage 1
+    conv = Conv(data, 32, kernel=(3, 3), stride=(2, 2), name="conv")
+    conv_1 = Conv(conv, 32, kernel=(3, 3), name="conv_1")
+    conv_2 = Conv(conv_1, 64, kernel=(3, 3), pad=(1, 1), name="conv_2")
+    pool = Pooling(data=conv_2, kernel=(3, 3), stride=(2, 2), pool_type="max", pad=(0, 0),
+                   name="pool")
+    # stage 2
+    conv_3 = Conv(pool, 80, kernel=(1, 1), name="conv_3")
+    conv_4 = Conv(conv_3, 192, kernel=(3, 3), name="conv_4")
+    pool1 = Pooling(data=conv_4, kernel=(3, 3), stride=(2, 2), pool_type="max", pad=(0, 0),
+                    name="pool1")
+
+    # stage 3
+    in3a = Inception7A(pool1, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 32, "mixed")
+
+    in3b = Inception7A(in3a, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 64, "mixed_1")
+    in3c = Inception7A(in3b, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 64, "mixed_2")
+    in3d = Inception7B(in3c, 384,
+                       64, 96, 96,
+                       "max", "mixed_3")
+    # stage 4
+    in4a = Inception7C(in3d, 192,
+                       128, 128, 192,
+                       128, 128, 128, 128, 192,
+                       "avg", 192, "mixed_4")
+    in4b = Inception7C(in4a, 192,
+                       160, 160, 192,
+                       160, 160, 160, 160, 192,
+                       "avg", 192, "mixed_5")
+    in4c = Inception7C(in4b, 192,
+                       160, 160, 192,
+                       160, 160, 160, 160, 192,
+                       "avg", 192, "mixed_6")
+    in4d = Inception7C(in4c, 192,
+                       192, 192, 192,
+                       192, 192, 192, 192, 192,
+                       "avg", 192, "mixed_7")
+    in4e = Inception7D(in4d, 192, 320,
+                       192, 192, 192, 192,
+                       "max", "mixed_8")
+    # stage 5
+    in5a = Inception7E(in4e, 320,
+                       384, 384, 384,
+                       448, 384, 384, 384,
+                       "avg", 192, "mixed_9")
+    in5b = Inception7E(in5a, 320,
+                       384, 384, 384,
+                       448, 384, 384, 384,
+                       "max", 192, "mixed_10")
+
+    # pool
+    pool = Pooling(data=in5b, kernel=(8, 8), stride=(1, 1), pool_type="avg", pad=(0, 0),
+                   name="global_pool")
+
+    flatten = relay.nn.batch_flatten(pool)
+    fc1 = relay.nn.dense(flatten, relay.var("fc1_weight"), units=num_classes)
+    fc1 = relay.nn.bias_add(fc1, relay.var("fc2_bias"))
+    inception_v3 = relay.nn.softmax(data=fc1)
+    args = relay.ir_pass.free_vars(inception_v3)
+    return relay.Function(args, inception_v3)
+
+def get_workload(batch_size=1, num_classes=1000,
+                 image_shape=(3, 299, 299), dtype="float32"):
+    """Get benchmark workload for InceptionV3
+
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+
+    num_classes : int, optional
+        Number of classes
+
+    image_shape : tuple, optional
+        The input image shape
+
+    dtype : str, optional
+        The data type
+
+    Returns
+    -------
+    net : nnvm.Symbol
+        The computational graph
+
+    params : dict of str to NDArray
+        The parameters.
+    """
+    net = get_net(batch_size, num_classes, image_shape, dtype)
+    return create_workload(net)
diff --git a/python/tvm/relay/testing/init.py b/python/tvm/relay/testing/init.py
new file mode 100644
index 000000000000..7f92b539a1a3
--- /dev/null
+++ b/python/tvm/relay/testing/init.py
@@ -0,0 +1,150 @@
+"""Initializer of parameters."""
+import numpy as np
+
+import tvm
+from tvm import relay
+
+class Initializer(object):
+    """The base class of an initializer."""
+    def __init__(self, **kwargs):
+        self._kwargs = kwargs
+
+    def __call__(self, desc, arr):
+        """Initialize an array
+
+        Parameters
+        ----------
+        desc : str
+            Initialization pattern descriptor.
+
+        arr : NDArray
+            The array to be initialized.
+        """
+        if desc.endswith('weight'):
+            self._init_weight(desc, arr)
+        elif desc.endswith('bias'):
+            self._init_bias(desc, arr)
+        elif desc.endswith('gamma'):
+            self._init_gamma(desc, arr)
+        elif desc.endswith('beta'):
+            self._init_beta(desc, arr)
+        elif desc.endswith('mean'):
+            self._init_mean(desc, arr)
+        elif desc.endswith('var'):
+            self._init_var(desc, arr)
+        else:
+            self._init_default(desc, arr)
+
+    def _init_bias(self, _, arr):
+        arr[:] = 0.0
+
+    def _init_gamma(self, _, arr):
+        arr[:] = 1.0
+
+    def _init_beta(self, _, arr):
+        arr[:] = 0.0
+
+    def _init_mean(self, _, arr):
+        arr[:] = 0.0
+
+    def _init_var(self, _, arr):
+        arr[:] = 1.0
+
+    def _init_weight(self, name, arr):
+        """Abstract method to Initialize weight."""
+        raise NotImplementedError("Must override it")
+
+    def _init_default(self, name, _):
+        raise ValueError(
+            'Unknown initialization pattern for %s. ' \
+            'Default initialization is now limited to '\
+            '"weight", "bias", "gamma" (1.0), and "beta" (0.0).' \
+            'Please use mx.sym.Variable(init=mx.init.*) to set initialization pattern' % name)
+
+
+class Xavier(Initializer):
+    """ "Xavier" initialization for weights
+
+    Parameters
+    ----------
+    rnd_type: str, optional
+        Random generator type, can be ``'gaussian'`` or ``'uniform'``.
+
+    factor_type: str, optional
+        Can be ``'avg'``, ``'in'``, or ``'out'``.
+
+    magnitude: float, optional
+        Scale of random number.
+    """
+    def __init__(self, rnd_type="uniform", factor_type="avg", magnitude=3):
+        super(Xavier, self).__init__(rnd_type=rnd_type,
+                                     factor_type=factor_type,
+                                     magnitude=magnitude)
+        self.rnd_type = rnd_type
+        self.factor_type = factor_type
+        self.magnitude = float(magnitude)
+
+    def _init_weight(self, name, arr):
+        shape = arr.shape
+        hw_scale = 1.
+        if len(shape) < 2:
+            raise ValueError('Xavier initializer cannot be applied to vector {0}. It requires at'
+                             ' least 2D.'.format(name))
+        if len(shape) > 2:
+            hw_scale = np.prod(shape[2:])
+        fan_in, fan_out = shape[1] * hw_scale, shape[0] * hw_scale
+        factor = 1.
+        if self.factor_type == "avg":
+            factor = (fan_in + fan_out) / 2.0
+        elif self.factor_type == "in":
+            factor = fan_in
+        elif self.factor_type == "out":
+            factor = fan_out
+        else:
+            raise ValueError("Incorrect factor type")
+        # Hack for mobilenet, because there is less connectivity
+        if "depthwise" in name:
+            factor = hw_scale
+        scale = np.sqrt(self.magnitude / factor)
+        if self.rnd_type == "uniform":
+            arr[:] = np.random.uniform(-scale, scale, size=arr.shape)
+        else:
+            raise ValueError("Unknown random type")
+
+
+def create_workload(net, initializer=None, seed=0):
+    """Helper function to create benchmark image classification workload.
+
+    Parameters
+    ----------
+    net : tvm.relay.Function
+        The selected function of the network.
+
+    initializer : Initializer
+        The initializer used
+
+    seed : int
+        The seed used in initialization.
+
+    Returns
+    -------
+    net : tvm.relay.Function
+        The updated dataflow
+
+    params : dict of str to NDArray
+        The parameters.
+    """
+    net = relay.ir_pass.infer_type(net)
+    shape_dict = {
+        v.name_hint : v.checked_type for v in net.params}
+    net.astext()
+    np.random.seed(seed)
+    initializer = initializer if initializer else Xavier()
+    params = {}
+    for k, v in shape_dict.items():
+        if k == "data":
+            continue
+        init_value = np.zeros(v.concrete_shape).astype(v.dtype)
+        initializer(k, init_value)
+        params[k] = tvm.nd.array(init_value, ctx=tvm.cpu(0))
+    return net, params
diff --git a/python/tvm/relay/testing/layers.py b/python/tvm/relay/testing/layers.py
new file mode 100644
index 000000000000..9d4d3b3b4e13
--- /dev/null
+++ b/python/tvm/relay/testing/layers.py
@@ -0,0 +1,138 @@
+"""Simple Layer DSL wrapper to ease creation of neural nets."""
+from tvm import relay
+
+def batch_norm_infer(data,
+                     gamma=None,
+                     beta=None,
+                     moving_mean=None,
+                     moving_var=None,
+                     **kwargs):
+    """Wrapper of batch_norm.
+
+    This function automatically creates weights and return
+    the first output(normalized result).
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input expression.
+
+    gamma : relay.Expr
+        The gamma scale factor.
+
+    beta : relay.Expr
+        The beta offset factor.
+
+    moving_mean : relay.Expr
+        Running mean of input,
+
+    moving_var : relay.Expr
+        Running variance of input.
+
+    kwargs : dict
+        Additional arguments.
+
+    Returns
+    -------
+    result : relay.Expr
+        The result.
+    """
+    name = kwargs.get("name")
+    kwargs.pop("name")
+    if not gamma:
+        gamma = relay.var(name + "_gamma")
+    if not beta:
+        beta = relay.var(name + "_beta")
+    if not moving_mean:
+        moving_mean = relay.var(name + "_moving_mean")
+    if not moving_var:
+        moving_var = relay.var(name + "_moving_var")
+    return relay.nn.batch_norm(data,
+                               gamma=gamma,
+                               beta=beta,
+                               moving_mean=moving_mean,
+                               moving_var=moving_var,
+                               **kwargs)[0]
+
+
+def conv2d(data, weight=None, **kwargs):
+    """Wrapper of conv2d which automatically creates weights if not given.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input expression.
+
+    weight : relay.Expr
+        The weight to conv2d.
+
+    kwargs : dict
+        Additional arguments.
+
+    Returns
+    -------
+    result : relay.Expr
+        The result.
+    """
+    name = kwargs.get("name")
+    kwargs.pop("name")
+    if not weight:
+        weight = relay.var(name + "_weight")
+    return relay.nn.conv2d(data, weight, **kwargs)
+
+def conv2d_transpose(data, weight=None, **kwargs):
+    """Wrapper of conv2d_transpose which automatically creates weights if not given.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input expression.
+
+    weight : relay.Expr
+        The weight to conv2d_transpose.
+
+    kwargs : dict
+        Additional arguments.
+
+    Returns
+    -------
+    result : relay.Expr
+        The result.
+    """
+    name = kwargs.get("name")
+    kwargs.pop("name")
+    if not weight:
+        weight = relay.var(name + "_weight")
+    return relay.nn.conv2d_transpose(data, weight, **kwargs)
+
+def dense_add_bias(data, weight=None, bias=None, units=None, **kwargs):
+    """Wrapper of dense which automatically creates weights if not given.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input expression.
+
+    weight : relay.Expr
+        The weight to conv2d.
+
+    bias : relay.Expr
+        The bias.
+
+    kwargs : dict
+        Additional arguments.
+
+    Returns
+    -------
+    result : relay.Expr
+        The result.
+    """
+    name = kwargs.get("name")
+    kwargs.pop("name")
+    if not weight:
+        weight = relay.var(name + "_weight")
+    if not bias:
+        bias = relay.var(name + "_bias")
+    data = relay.nn.dense(data, weight, units, **kwargs)
+    data = relay.nn.bias_add(data, bias)
+    return data
diff --git a/python/tvm/relay/testing/lstm.py b/python/tvm/relay/testing/lstm.py
new file mode 100644
index 000000000000..b0915e033ccb
--- /dev/null
+++ b/python/tvm/relay/testing/lstm.py
@@ -0,0 +1,182 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Implementation of a Long Short-Term Memory (LSTM) cell.
+
+Adapted from:
+https://gist.github.com/merrymercy/5eb24e3b019f84200645bd001e9caae9
+"""
+
+from tvm import relay
+from . import layers
+from .init import create_workload
+
+def lstm_cell(num_hidden, batch_size=1, dtype="float32", name=""):
+    """Long-Short Term Memory (LSTM) network cell.
+
+    Parameters
+    ----------
+    num_hidden : int
+        Number of units in output symbol.
+
+    batch_size : int
+        Batch size (length of states).
+
+    Returns
+    -------
+    result : tvm.relay.Function
+        A Relay function that evaluates an LSTM cell.
+        The function takes in a tensor of input data, a tuple of two
+        states, and weights and biases for dense operations on the
+        inputs and on the state. It returns a tuple with two members,
+        an output tensor and a tuple of two new states.
+    """
+    builder = relay.ScopeBuilder()
+
+    input_type = relay.TensorType((batch_size, num_hidden), dtype)
+    weight_type = relay.TensorType((4*num_hidden, num_hidden), dtype)
+    bias_type = relay.TensorType((4*num_hidden,), dtype)
+
+    dense_type = relay.TensorType((batch_size, 4*num_hidden), dtype)
+    slice_type = relay.TupleType([input_type, input_type,
+                                  input_type, input_type])
+    ret_type = relay.TupleType([input_type,
+                                relay.TupleType([input_type, input_type])])
+
+    inputs = relay.Var("inputs", input_type)
+    states = relay.Var("states",
+                       relay.TupleType([input_type, input_type]))
+
+    i2h_weight = relay.Var("i2h_weight", weight_type)
+    i2h_bias = relay.Var("i2h_bias", bias_type)
+
+    h2h_weight = relay.Var("h2h_weight", weight_type)
+    h2h_bias = relay.Var("h2h_bias", bias_type)
+
+    i2h = builder.let(("i2h", dense_type),
+                      layers.dense_add_bias(
+                          data=inputs,
+                          units=num_hidden * 4,
+                          weight=i2h_weight, bias=i2h_bias,
+                          name="%si2h" % name))
+    h2h = builder.let(("h2h", dense_type),
+                      layers.dense_add_bias(
+                          data=relay.TupleGetItem(states, 0),
+                          units=num_hidden * 4,
+                          weight=h2h_weight, bias=h2h_bias,
+                          name="%sh2h" % name))
+
+    gates = builder.let(("gates", dense_type), relay.add(i2h, h2h))
+    slice_gates = builder.let(("slice_gates", slice_type),
+                              relay.split(gates,
+                                          indices_or_sections=4,
+                                          axis=1).astuple())
+
+    in_gate = builder.let(("in_gate", input_type),
+                          relay.sigmoid(relay.TupleGetItem(slice_gates, 0)))
+    forget_gate = builder.let(("forget_gate", input_type),
+                              relay.sigmoid(relay.TupleGetItem(slice_gates, 1)))
+    in_transform = builder.let(("in_transform", input_type),
+                               relay.tanh(relay.TupleGetItem(slice_gates, 2)))
+    out_gate = builder.let(("out_gate", input_type),
+                           relay.sigmoid(relay.TupleGetItem(slice_gates, 3)))
+
+    next_c = builder.let(("next_c", input_type),
+                         relay.add(relay.multiply(forget_gate,
+                                                  relay.TupleGetItem(states, 1)),
+                                   relay.multiply(in_gate, in_transform)))
+    next_h = builder.let(("next_h", input_type),
+                         relay.multiply(out_gate, relay.tanh(next_c)))
+    ret = builder.let(("ret", ret_type),
+                      relay.Tuple([next_h, relay.Tuple([next_h, next_c])]))
+    builder.ret(ret)
+
+    body = builder.get()
+
+    return relay.Function([inputs, states, i2h_weight,
+                           i2h_bias, h2h_weight, h2h_bias],
+                          body, ret_type)
+
+
+def get_net(iterations, num_hidden, batch_size=1, dtype="float32"):
+    '''Constructs an unrolled RNN with LSTM cells'''
+    input_type = relay.TensorType((batch_size, num_hidden), dtype)
+    weight_type = relay.TensorType((4*num_hidden, num_hidden), dtype)
+    bias_type = relay.TensorType((4*num_hidden,), dtype)
+
+    state_type = relay.TupleType([input_type, input_type])
+    cell_type = relay.TupleType([input_type, state_type])
+
+    builder = relay.ScopeBuilder()
+
+    zeros = builder.let(("zeros", input_type),
+                        relay.zeros((batch_size, num_hidden), dtype))
+    init_states = builder.let(("init_states", state_type),
+                              relay.Tuple([zeros, zeros]))
+
+    states = init_states
+    out = None
+
+    for i in range(iterations):
+        inputs = relay.Var("data", input_type)
+        i2h_weight = relay.Var("i2h_%s_weight" % i, weight_type)
+        i2h_bias = relay.Var("i2h_%i_bias" % i, bias_type)
+        h2h_weight = relay.Var("h2h_%s_weight" % i, weight_type)
+        h2h_bias = relay.Var("h2h_%s_bias" % i, bias_type)
+
+        cell_fn = lstm_cell(num_hidden, batch_size, dtype, "lstm_%s" % i)
+
+        call = builder.let(("call_%s" % i, cell_type),
+                           relay.Call(cell_fn,
+                                      [inputs, states, i2h_weight,
+                                       i2h_bias, h2h_weight, h2h_bias]))
+        new_out = builder.let(("out_%s" % i, input_type),
+                              relay.TupleGetItem(call, 0))
+        new_states = builder.let(("states_%s" % i, state_type),
+                                 relay.TupleGetItem(call, 1))
+        states = new_states
+        out = new_out
+
+    builder.ret(out)
+    body = builder.get()
+    args = relay.ir_pass.free_vars(body)
+    return relay.Function(args, body, input_type)
+
+
+def get_workload(iterations, num_hidden, batch_size=1, dtype="float32"):
+    """Get benchmark workload for an LSTM RNN.
+
+    Parameters
+    ----------
+    iterations : int
+        The number of iterations in the desired LSTM RNN.
+    num_hidden : int
+        The size of the hiddxen state
+    batch_size : int, optional (default 1)
+        The batch size used in the model
+    dtype : str, optional (default "float32")
+        The data type
+    Returns
+    -------
+    net : nnvm.symbol
+        The computational graph
+    params : dict of str to NDArray
+        The parameters.
+    """
+    net = get_net(iterations, num_hidden, batch_size, dtype)
+    return create_workload(net)
diff --git a/python/tvm/relay/testing/mlp.py b/python/tvm/relay/testing/mlp.py
new file mode 100644
index 000000000000..7d7d984f7526
--- /dev/null
+++ b/python/tvm/relay/testing/mlp.py
@@ -0,0 +1,94 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+a simple multilayer perceptron
+"""
+from __future__ import absolute_import
+from tvm import relay
+from .init import create_workload
+
+def get_net(batch_size,
+            num_classes=10,
+            image_shape=(1, 28, 28),
+            dtype="float32"):
+    """Get network a simple multilayer perceptron.
+
+    batch_size : int
+        The batch size used in the model
+
+    num_classes : int, optional
+        Number of claseses
+
+    image_shape : tuple, optional
+        The input image shape
+
+    dtype : str, optional
+        The data type
+
+    Returns
+    -------
+    net : relay.Function
+        The dataflow.
+    """
+    data_shape = (batch_size,) + image_shape
+    data = relay.var("data",
+                     shape=data_shape,
+                     dtype=dtype)
+    data = relay.nn.batch_flatten(data)
+    fc1 = relay.nn.dense(data, relay.var("fc1_weight"), units=128)
+    fc1 = relay.nn.bias_add(fc1, relay.var("fc2_bias"))
+    act1 = relay.nn.relu(fc1)
+    fc2 = relay.nn.dense(act1, relay.var("fc2_weight"), units=64)
+    fc2 = relay.nn.bias_add(fc2, relay.var("fc2_bias"))
+    act2 = relay.nn.relu(fc2)
+    fc3 = relay.nn.dense(act2, relay.var("fc3_weight"), units=num_classes)
+    fc3 = relay.nn.bias_add(fc3, relay.var("fc3_bias"))
+    mlp = relay.nn.softmax(data=fc3)
+    args = relay.ir_pass.free_vars(mlp)
+    return relay.Function(args, mlp)
+
+
+def get_workload(batch_size,
+                 num_classes=10,
+                 image_shape=(1, 28, 28),
+                 dtype="float32"):
+    """Get benchmark workload for a simple multilayer perceptron.
+
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+
+    num_classes : int, optional
+        Number of claseses
+
+    image_shape : tuple, optional
+        The input image shape
+
+    dtype : str, optional
+        The data type
+
+    Returns
+    -------
+    net : relay.Function
+        The dataflow.
+
+    params : dict of str to NDArray
+        The parameters.
+    """
+    net = get_net(batch_size, num_classes, image_shape, dtype)
+    return create_workload(net)
diff --git a/python/tvm/relay/testing/mobilenet.py b/python/tvm/relay/testing/mobilenet.py
new file mode 100644
index 000000000000..78e1d82456c8
--- /dev/null
+++ b/python/tvm/relay/testing/mobilenet.py
@@ -0,0 +1,142 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Port of NNVM version of MobileNet to Relay.
+"""
+# pylint: disable=invalid-name
+
+from tvm import relay
+from . import layers
+from .init import create_workload
+
+def conv_block(data, name, channels, kernel_size=(3, 3), strides=(1, 1),
+               padding=(1, 1), epsilon=1e-5):
+    """Helper function to construct conv_bn-relu"""
+    # convolution + bn + relu
+    conv = layers.conv2d(
+        data=data,
+        channels=channels,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_layout='NCHW',
+        name=name+'_conv')
+    bn = layers.batch_norm_infer(data=conv, epsilon=epsilon, name=name + '_bn')
+    act = relay.nn.relu(data=bn)
+    return act
+
+
+def separable_conv_block(data, name, depthwise_channels, pointwise_channels,
+                         kernel_size=(3, 3), downsample=False, padding=(1, 1),
+                         epsilon=1e-5):
+    """Helper function to get a separable conv block"""
+    if downsample:
+        strides = (2, 2)
+    else:
+        strides = (1, 1)
+    # depthwise convolution + bn + relu
+    conv1 = layers.conv2d(
+        data=data,
+        channels=depthwise_channels,
+        groups=depthwise_channels,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        name=name+'_depthwise_conv1')
+    bn1 = layers.batch_norm_infer(data=conv1, epsilon=epsilon, name=name+'_bn1')
+    act1 = relay.nn.relu(data=bn1)
+    # pointwise convolution + bn + relu
+    conv2 = layers.conv2d(
+        data=act1,
+        channels=pointwise_channels,
+        kernel_size=(1, 1),
+        strides=(1, 1),
+        padding=(0, 0),
+        data_layout='NCHW',
+        name=name + '_conv2')
+    bn2 = layers.batch_norm_infer(data=conv2, epsilon=epsilon, name=name+'_bn2')
+    act2 = relay.nn.relu(data=bn2)
+    return act2
+
+
+def mobile_net(num_classes=1000, data_shape=(1, 3, 224, 224),
+               dtype='float32', alpha=1.0, is_shallow=False):
+    """Function to construct a MobileNet"""
+    data = relay.var("data", shape=data_shape, dtype=dtype)
+    body = conv_block(data, 'conv_block_1', int(32*alpha), strides=(2, 2))
+    body = separable_conv_block(body, 'separable_conv_block_1',
+                                int(32*alpha), int(64*alpha))
+    body = separable_conv_block(body, 'separable_conv_block_2',
+                                int(64*alpha), int(128*alpha), downsample=True)
+    body = separable_conv_block(body, 'separable_conv_block_3',
+                                int(128*alpha), int(128*alpha))
+    body = separable_conv_block(body, 'separable_conv_block_4',
+                                int(128*alpha), int(256*alpha), downsample=True)
+    body = separable_conv_block(body, 'separable_conv_block_5',
+                                int(256*alpha), int(256*alpha))
+    body = separable_conv_block(body, 'separable_conv_block_6',
+                                int(256*alpha), int(512*alpha), downsample=True)
+    if is_shallow:
+        body = separable_conv_block(body, 'separable_conv_block_7',
+                                    int(512*alpha), int(1024*alpha), downsample=True)
+        body = separable_conv_block(body, 'separable_conv_block_8',
+                                    int(1024*alpha), int(1024*alpha), downsample=True)
+    else:
+        for i in range(7, 12):
+            body = separable_conv_block(body, 'separable_conv_block_%d' % i,
+                                        int(512*alpha), int(512*alpha))
+        body = separable_conv_block(body, 'separable_conv_block_12',
+                                    int(512*alpha), int(1024*alpha), downsample=True)
+        body = separable_conv_block(body, 'separable_conv_block_13',
+                                    int(1024*alpha), int(1024*alpha))
+    pool = relay.nn.global_avg_pool2d(data=body)
+    flatten = relay.nn.batch_flatten(data=pool)
+    weight = relay.var('fc_weight')
+    fc = relay.nn.dense(data=flatten, weight=weight, units=num_classes)
+    softmax = relay.nn.softmax(data=fc)
+    return relay.Function(relay.ir_pass.free_vars(softmax), softmax)
+
+
+def get_workload(batch_size=1, num_classes=1000, image_shape=(3, 224, 224), dtype='float32'):
+    """Get benchmark workload for mobilenet
+
+    Parameters
+    ----------
+    batch_size : int, optional
+        The batch size used in the model
+
+    num_classes : int, optional
+        Number of classes
+
+    image_shape : tuple, optional
+        The input image shape
+
+    dtype : str, optional
+        The data type
+
+    Returns
+    -------
+    net : relay.Function
+        The computational graph
+
+    params : dict of str to NDArray
+        The parameters.
+    """
+    data_shape = tuple([batch_size] + list(image_shape))
+    net = mobile_net(num_classes=num_classes, data_shape=data_shape,
+                     dtype=dtype, alpha=1.0, is_shallow=False)
+    return create_workload(net)
diff --git a/python/tvm/relay/testing/resnet.py b/python/tvm/relay/testing/resnet.py
new file mode 100644
index 000000000000..9ba57ae09ef5
--- /dev/null
+++ b/python/tvm/relay/testing/resnet.py
@@ -0,0 +1,276 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Adapted from https://github.com/tornadomeet/ResNet/blob/master/symbol_resnet.py
+Original author Wei Wu
+
+Implemented the following paper:
+
+Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. "Identity Mappings in Deep Residual Networks"
+"""
+# pylint: disable=unused-argument
+from tvm import relay
+from .init import create_workload
+from . import layers
+
+def residual_unit(data,
+                  num_filter,
+                  stride,
+                  dim_match,
+                  name,
+                  bottle_neck=True):
+    """Return ResNet Unit symbol for building ResNet
+
+    Parameters
+    ----------
+    data : str
+        Input data
+
+    num_filter : int
+        Number of output channels
+
+    bnf : int
+        Bottle neck channels factor with regard to num_filter
+
+    stride : tuple
+        Stride used in convolution
+
+    dim_match : bool
+        True means channel number between input and output is the same,
+        otherwise means differ
+
+    name : str
+        Base name of the operators
+    """
+    if bottle_neck:
+        bn1 = layers.batch_norm_infer(data=data,
+                                      epsilon=2e-5,
+                                      name=name + '_bn1')
+        act1 = relay.nn.relu(data=bn1)
+        conv1 = layers.conv2d(
+            data=act1,
+            channels=int(num_filter*0.25),
+            kernel_size=(1, 1),
+            strides=stride,
+            padding=(0, 0),
+            name=name + '_conv1')
+        bn2 = layers.batch_norm_infer(data=conv1, epsilon=2e-5, name=name + '_bn2')
+        act2 = relay.nn.relu(data=bn2)
+        conv2 = layers.conv2d(
+            data=act2, channels=int(num_filter*0.25), kernel_size=(3, 3),
+            strides=(1, 1), padding=(1, 1), name=name + '_conv2')
+        bn3 = layers.batch_norm_infer(data=conv2, epsilon=2e-5, name=name + '_bn3')
+        act3 = relay.nn.relu(data=bn3)
+        conv3 = layers.conv2d(
+            data=act3, channels=num_filter, kernel_size=(1, 1),
+            strides=(1, 1), padding=(0, 0), name=name + '_conv3')
+        if dim_match:
+            shortcut = data
+        else:
+            shortcut = layers.conv2d(
+                data=act1, channels=num_filter, kernel_size=(1, 1),
+                strides=stride, name=name+'_sc')
+        return relay.add(conv3, shortcut)
+    else:
+        bn1 = layers.batch_norm_infer(data=data, epsilon=2e-5, name=name + '_bn1')
+        act1 = relay.nn.relu(data=bn1)
+        conv1 = layers.conv2d(
+            data=act1, channels=num_filter, kernel_size=(3, 3),
+            strides=stride, padding=(1, 1), name=name + '_conv1')
+        bn2 = layers.batch_norm_infer(data=conv1, epsilon=2e-5, name=name + '_bn2')
+        act2 = relay.nn.relu(data=bn2)
+        conv2 = layers.conv2d(
+            data=act2, channels=num_filter, kernel_size=(3, 3),
+            strides=(1, 1), padding=(1, 1), name=name + '_conv2')
+        if dim_match:
+            shortcut = data
+        else:
+            shortcut = layers.conv2d(
+                data=act1, channels=num_filter, kernel_size=(1, 1),
+                strides=stride, name=name+'_sc')
+        return relay.add(conv2, shortcut)
+
+
+def resnet(units,
+           num_stages,
+           filter_list,
+           num_classes,
+           data_shape,
+           bottle_neck=True,
+           dtype="float32"):
+    """Return ResNet Program.
+
+    Parameters
+    ----------
+    units : list
+        Number of units in each stage
+
+    num_stages : int
+        Number of stages
+
+    filter_list : list
+        Channel size of each stage
+
+    num_classes : int
+        Ouput size of symbol
+
+    data_shape : tuple of int.
+        The shape of input data.
+
+    bottle_neck : bool
+        Whether apply bottleneck transformation.
+
+    dtype : str
+        The global data type.
+    """
+    num_unit = len(units)
+    assert num_unit == num_stages
+    data = relay.var("data", shape=data_shape, dtype=dtype)
+    data = layers.batch_norm_infer(data=data, epsilon=2e-5, scale=False, name='bn_data')
+    (_, _, height, _) = data_shape
+    if height <= 32:            # such as cifar10
+        body = layers.conv2d(
+            data=data, channels=filter_list[0], kernel_size=(3, 3),
+            strides=(1, 1), padding=(1, 1), name="conv0")
+    else:                       # often expected to be 224 such as imagenet
+        body = layers.conv2d(
+            data=data, channels=filter_list[0], kernel_size=(7, 7),
+            strides=(2, 2), padding=(3, 3), name="conv0")
+        body = layers.batch_norm_infer(data=body, epsilon=2e-5, name='bn0')
+        body = relay.nn.relu(data=body)
+        body = relay.nn.max_pool2d(data=body, pool_size=(3, 3), strides=(2, 2), padding=(1, 1))
+
+    for i in range(num_stages):
+        body = residual_unit(
+            body, filter_list[i+1], (1 if i == 0 else 2, 1 if i == 0 else 2),
+            False, name='stage%d_unit%d' % (i + 1, 1), bottle_neck=bottle_neck)
+        for j in range(units[i]-1):
+            body = residual_unit(
+                body, filter_list[i+1], (1, 1), True,
+                name='stage%d_unit%d' % (i + 1, j + 2), bottle_neck=bottle_neck)
+    bn1 = layers.batch_norm_infer(data=body, epsilon=2e-5, name='bn1')
+    relu1 = relay.nn.relu(data=bn1)
+    # Although kernel is not used here when global_pool=True, we should put one
+    pool1 = relay.nn.global_avg_pool2d(data=relu1)
+    flat = relay.nn.batch_flatten(data=pool1)
+    fc1 = layers.dense_add_bias(data=flat, units=num_classes, name='fc1')
+    net = relay.nn.softmax(data=fc1)
+    return relay.Function(relay.ir_pass.free_vars(net), net)
+
+
+def get_net(batch_size,
+            num_classes,
+            num_layers=50,
+            image_shape=(3, 224, 224),
+            dtype="float32",
+            **kwargs):
+    """
+    Adapted from https://github.com/tornadomeet/ResNet/blob/master/train_resnet.py
+    Original author Wei Wu
+    """
+    (_, height, _) = image_shape
+    data_shape = (batch_size,) + image_shape
+    if height <= 28:
+        num_stages = 3
+        if (num_layers-2) % 9 == 0 and num_layers >= 164:
+            per_unit = [(num_layers-2)//9]
+            filter_list = [16, 64, 128, 256]
+            bottle_neck = True
+        elif (num_layers-2) % 6 == 0 and num_layers < 164:
+            per_unit = [(num_layers-2)//6]
+            filter_list = [16, 16, 32, 64]
+            bottle_neck = False
+        else:
+            raise ValueError("no experiments done on num_layers {}".format(num_layers))
+        units = per_unit * num_stages
+    else:
+        if num_layers >= 50:
+            filter_list = [64, 256, 512, 1024, 2048]
+            bottle_neck = True
+        else:
+            filter_list = [64, 64, 128, 256, 512]
+            bottle_neck = False
+        num_stages = 4
+        if num_layers == 18:
+            units = [2, 2, 2, 2]
+        elif num_layers == 34:
+            units = [3, 4, 6, 3]
+        elif num_layers == 50:
+            units = [3, 4, 6, 3]
+        elif num_layers == 101:
+            units = [3, 4, 23, 3]
+        elif num_layers == 152:
+            units = [3, 8, 36, 3]
+        elif num_layers == 200:
+            units = [3, 24, 36, 3]
+        elif num_layers == 269:
+            units = [3, 30, 48, 8]
+        else:
+            raise ValueError("no experiments done on num_layers {}".format(num_layers))
+
+    return resnet(units=units,
+                  num_stages=num_stages,
+                  filter_list=filter_list,
+                  num_classes=num_classes,
+                  data_shape=data_shape,
+                  bottle_neck=bottle_neck,
+                  dtype=dtype)
+
+
+def get_workload(batch_size=1,
+                 num_classes=1000,
+                 num_layers=18,
+                 image_shape=(3, 224, 224),
+                 dtype="float32",
+                 **kwargs):
+    """Get benchmark workload for resnet
+
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+
+    num_classes : int, optional
+        Number of classes
+
+    num_layers : int, optional
+        Number of layers
+
+    image_shape : tuple, optional
+        The input image shape
+
+    dtype : str, optional
+        The data type
+
+    kwargs : dict
+        Extra arguments
+
+    Returns
+    -------
+    net : relay.Function
+        The computational graph
+
+    params : dict of str to NDArray
+        The parameters.
+    """
+    net = get_net(batch_size=batch_size,
+                  num_classes=num_classes,
+                  num_layers=num_layers,
+                  image_shape=image_shape,
+                  dtype=dtype,
+                  **kwargs)
+    return create_workload(net)
diff --git a/python/tvm/relay/testing/squeezenet.py b/python/tvm/relay/testing/squeezenet.py
new file mode 100644
index 000000000000..c7b8e8db166b
--- /dev/null
+++ b/python/tvm/relay/testing/squeezenet.py
@@ -0,0 +1,160 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# coding: utf-8
+# pylint: disable=unused-argument
+
+"""
+Symbol of SqueezeNet
+
+Reference:
+Iandola, Forrest N., et al.
+"Squeezenet: Alexnet-level accuracy with 50x fewer parameters and< 0.5 mb model size." (2016).
+"""
+
+from tvm import relay
+from .init import create_workload
+from . import layers
+
+# Helpers
+def _make_fire(net, squeeze_channels, expand1x1_channels, expand3x3_channels, prefix):
+    net = _make_fire_conv(net, squeeze_channels, 1, 0, "%s_input" % prefix)
+
+    left = _make_fire_conv(net, expand1x1_channels, 1, 0, "%s_left" % prefix)
+    right = _make_fire_conv(net, expand3x3_channels, 3, 1, "%s_right" % prefix)
+    # NOTE : Assume NCHW layout here
+    net = relay.concatenate((left, right), axis=1)
+    return net
+
+def _make_fire_conv(net, channels, kernel_size, padding=0, prefix=""):
+    net = layers.conv2d(net,
+                        channels=channels,
+                        kernel_size=(kernel_size, kernel_size),
+                        padding=(padding, padding), name="%s_conv" % prefix)
+    net = relay.nn.bias_add(net, relay.var("%s_conv_bias" % prefix))
+    net = relay.nn.relu(net)
+    return net
+
+# Net
+def get_net(batch_size, image_shape, num_classes, version, dtype):
+    """Get symbol of SqueezeNet
+
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+
+    image_shape : tuple, optional
+        The input image shape
+
+    num_classes: int
+        The number of classification results
+
+    version : str, optional
+        "1.0" or "1.1" of SqueezeNet
+    """
+    assert version in ['1.0', '1.1'], ("Unsupported SqueezeNet version {version}:"
+                                       "1.0 or 1.1 expected".format(version=version))
+    data_shape = (batch_size,) + image_shape
+    net = relay.var("data", shape=data_shape, dtype=dtype)
+    if version == '1.0':
+        net = layers.conv2d(net,
+                            channels=96,
+                            kernel_size=(7, 7),
+                            strides=(2, 2),
+                            padding=(3, 3),
+                            name="conv1")
+        net = relay.nn.bias_add(net, relay.var("conv1_bias"))
+        net = relay.nn.relu(net)
+        net = relay.nn.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
+        net = _make_fire(net, 16, 64, 64, "fire1")
+        net = _make_fire(net, 16, 64, 64, "fire2")
+        net = _make_fire(net, 32, 128, 128, "fire3")
+        net = relay.nn.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
+        net = _make_fire(net, 32, 128, 128, "fire4")
+        net = _make_fire(net, 48, 192, 192, "fire5")
+        net = _make_fire(net, 48, 192, 192, "fire6")
+        net = _make_fire(net, 64, 256, 256, "fire7")
+        net = relay.nn.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
+        net = _make_fire(net, 64, 256, 256, "fire8")
+    else:
+        net = layers.conv2d(net,
+                            channels=64,
+                            kernel_size=(3, 3),
+                            strides=(2, 2),
+                            padding=(1, 1),
+                            name="conv1")
+        net = relay.nn.bias_add(net, relay.var("conv1_bias"))
+        net = relay.nn.relu(net)
+        net = relay.nn.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
+        net = _make_fire(net, 16, 64, 64, "fire1")
+        net = _make_fire(net, 16, 64, 64, "fire2")
+        net = relay.nn.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
+        net = _make_fire(net, 32, 128, 128, "fire3")
+        net = _make_fire(net, 32, 128, 128, "fire4")
+        net = relay.nn.max_pool2d(net, pool_size=(3, 3), strides=(2, 2))
+        net = _make_fire(net, 48, 192, 192, "fire5")
+        net = _make_fire(net, 48, 192, 192, "fire6")
+        net = _make_fire(net, 64, 256, 256, "fire7")
+        net = _make_fire(net, 64, 256, 256, "fire8")
+    net = relay.nn.dropout(net, rate=0.5)
+    net = layers.conv2d(
+        net, channels=num_classes, kernel_size=(1, 1), name="conv_final")
+    net = relay.nn.bias_add(net, relay.var("conv_final_bias"))
+    net = relay.nn.relu(net)
+    net = relay.nn.global_avg_pool2d(net)
+    net = relay.nn.batch_flatten(net)
+    net = relay.nn.softmax(net)
+    args = relay.ir_pass.free_vars(net)
+    return relay.Function(args, net)
+
+
+def get_workload(batch_size=1,
+                 num_classes=1000,
+                 version='1.0',
+                 image_shape=(3, 224, 224),
+                 dtype="float32"):
+    """Get benchmark workload for SqueezeNet
+
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+
+    num_classes : int, optional
+        Number of classes
+
+    version : str, optional
+        "1.0" or "1.1" of SqueezeNet
+
+    image_shape : tuple, optional
+        The input image shape
+
+    dtype : str, optional
+        The data type
+
+    Returns
+    -------
+    net : nnvm.Symbol
+        The computational graph
+
+    params : dict of str to NDArray
+        The parameters.
+    """
+
+    net = get_net(batch_size, image_shape, num_classes, version, dtype)
+    return create_workload(net)
diff --git a/python/tvm/relay/testing/vgg.py b/python/tvm/relay/testing/vgg.py
new file mode 100644
index 000000000000..bec141f70ffd
--- /dev/null
+++ b/python/tvm/relay/testing/vgg.py
@@ -0,0 +1,134 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""References:
+
+Simonyan, Karen, and Andrew Zisserman. "Very deep convolutional networks for
+large-scale image recognition." arXiv preprint arXiv:1409.1556 (2014).
+"""
+from tvm import relay
+from .init import create_workload
+from . import layers as wrapper
+
+
+def get_feature(internal_layer, layers, filters, batch_norm=False):
+    """Get VGG feature body as stacks of convoltions."""
+    for i, num in enumerate(layers):
+        for j in range(num):
+            internal_layer = wrapper.conv2d(
+                data=internal_layer, kernel_size=(3, 3), padding=(1, 1),
+                channels=filters[i], name="conv%s_%s" % (i + 1, j + 1))
+            internal_layer = relay.nn.bias_add(
+                internal_layer, relay.var("conv%s_%s_bias" % (i + 1, j + 1)))
+            if batch_norm:
+                internal_layer = wrapper.batch_norm_infer(
+                    data=internal_layer, name="bn%s_%s" %(i + 1, j + 1))
+            internal_layer = relay.nn.relu(data=internal_layer)
+        internal_layer = relay.nn.max_pool2d(
+            data=internal_layer, pool_size=(2, 2), strides=(2, 2))
+    return internal_layer
+
+
+def get_classifier(input_data, num_classes):
+    """Get VGG classifier layers as fc layers."""
+    flatten = relay.nn.batch_flatten(data=input_data)
+    fc6 = wrapper.dense_add_bias(data=flatten, units=4096, name="fc6")
+    relu6 = relay.nn.relu(data=fc6)
+    drop6 = relay.nn.dropout(data=relu6, rate=0.5)
+    fc7 = wrapper.dense_add_bias(data=drop6, units=4096, name="fc7")
+    relu7 = relay.nn.relu(data=fc7)
+    drop7 = relay.nn.dropout(data=relu7, rate=0.5)
+    fc8 = wrapper.dense_add_bias(data=drop7, units=num_classes, name="fc8")
+    return fc8
+
+
+def get_net(batch_size, image_shape, num_classes, dtype, num_layers=11, batch_norm=False):
+    """
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+
+    image_shape : tuple, optional
+        The input image shape
+
+    num_classes : int, optional
+        Number of claseses
+
+    dtype : str, optional
+        The data type
+
+    num_layers : int
+        Number of layers for the variant of vgg. Options are 11, 13, 16, 19.
+
+    batch_norm : bool, default False
+        Use batch normalization.
+    """
+    vgg_spec = {11: ([1, 1, 2, 2, 2], [64, 128, 256, 512, 512]),
+                13: ([2, 2, 2, 2, 2], [64, 128, 256, 512, 512]),
+                16: ([2, 2, 3, 3, 3], [64, 128, 256, 512, 512]),
+                19: ([2, 2, 4, 4, 4], [64, 128, 256, 512, 512])}
+    if num_layers not in vgg_spec:
+        raise ValueError("Invalide num_layers {}. Choices are 11,13,16,19.".format(num_layers))
+    layers, filters = vgg_spec[num_layers]
+    data_shape = (batch_size,) + image_shape
+    data = relay.var("data", shape=data_shape, dtype=dtype)
+    feature = get_feature(data, layers, filters, batch_norm)
+    classifier = get_classifier(feature, num_classes)
+    symbol = relay.nn.softmax(data=classifier)
+    args = relay.ir_pass.free_vars(symbol)
+    return relay.Function(args, symbol)
+
+
+def get_workload(batch_size,
+                 num_classes=1000,
+                 image_shape=(3, 224, 224),
+                 dtype="float32",
+                 num_layers=11,
+                 batch_norm=False):
+    """Get benchmark workload for VGG nets.
+
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+
+    num_classes : int, optional
+        Number of claseses
+
+    image_shape : tuple, optional
+        The input image shape
+
+    dtype : str, optional
+        The data type
+
+    num_layers : int
+        Number of layers for the variant of vgg. Options are 11, 13, 16, 19.
+
+    batch_norm : bool
+        Use batch normalization.
+
+    Returns
+    -------
+    net : nnvm.Symbol
+        The computational graph
+
+    params : dict of str to NDArray
+        The parameters.
+    """
+    net = get_net(batch_size, image_shape, num_classes, dtype, num_layers, batch_norm)
+    return create_workload(net)
diff --git a/python/tvm/relay/ty.py b/python/tvm/relay/ty.py
new file mode 100644
index 000000000000..96dde5acb4df
--- /dev/null
+++ b/python/tvm/relay/ty.py
@@ -0,0 +1,228 @@
+# pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name
+"""The type nodes of the Relay language."""
+from enum import IntEnum
+from .base import RelayNode, register_relay_node
+from . import _make
+
+
+class Type(RelayNode):
+    """The base type for all Relay types."""
+
+    def __eq__(self, other):
+        """Compare two Relay types for structural equivalence using
+           alpha equivalence.
+        """
+        return bool(_make._type_alpha_equal(self, other))
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+    def same_as(self, other):
+        """Compares two Relay types by referential equality."""
+        return super().__eq__(other)
+
+
+@register_relay_node
+class TensorType(Type):
+    """A concrete TensorType in Relay.
+
+    This is the type assigned to tensor's with a known dype and shape. For
+    example a tensor of `float32` and `(5, 5)`.
+
+    Parameters
+    ----------
+    shape : List[tvm.Expr]
+        The shape of the Tensor
+
+    dtype : Optional[str]
+        The content data type.
+        Default to "float32".
+
+    Returns
+    -------
+    tensor_type : tvm.relay.TensorType
+        The tensor type.
+    """
+    def __init__(self, shape, dtype="float32"):
+        self.__init_handle_by_constructor__(
+            _make.TensorType, shape, dtype)
+
+    @property
+    def concrete_shape(self):
+        """Get shape of the type as concrete tuple of int.
+
+        Returns
+        -------
+        shape : List[int]
+            The concrete shape of the Type.
+
+        Raises
+        ------
+        TypeError : If the shape is symbolic
+        """
+        return tuple(int(x) for x in self.shape)
+
+
+class Kind(IntEnum):
+    """The kind of a type parameter, represents a variable shape,
+       base type, type, or dimension.
+
+       This controls what a type parameter is allowed to be instantiated
+       with. For example one's of kind BaseType can only be `float32`, `int32`,
+       and so on.
+    """
+    Type = 0
+    ShapeVar = 1
+    BaseType = 2
+    Shape = 3
+
+@register_relay_node
+class TypeVar(Type):
+    """A type variable used for generic types in Relay,
+    see tvm/relay/type.h for more details.
+
+    A type variable represents a type placeholder which will
+    be filled in later on. This allows the user to write
+    functions which are generic over types.
+    """
+
+    def __init__(self, var, kind=Kind.Type):
+        """Construct a TypeVar.
+
+        Parameters
+        ----------
+        var : tvm.expr.Var
+            The tvm.Var which backs the type parameter.
+
+        kind : Optional[Kind]
+            The kind of the type parameter.
+            Default to Kind.Type.
+
+        Returns
+        -------
+        type_var : tvm.relay.TypeVar
+            The type variable.
+        """
+        self.__init_handle_by_constructor__(_make.TypeVar, var, kind)
+
+
+@register_relay_node
+class TypeConstraint(Type):
+    """Abstract class representing a type constraint."""
+    pass
+
+
+@register_relay_node
+class TupleType(Type):
+    """A tuple type in Relay, see tvm/relay/type.h for more details.
+
+    Lists the type of each field in the tuple.
+    """
+
+    def __init__(self, fields):
+        """Constructs a tuple type
+
+        Parameters
+        ----------
+        fields : List[tvm.relay.Type]
+            The fields in the tuple
+
+        Returns
+        -------
+        tuple_type : tvm.relay.TupleType
+            the tuple type
+        """
+        self.__init_handle_by_constructor__(_make.TupleType, fields)
+
+
+@register_relay_node
+class FuncType(Type):
+    """A function type in Relay, see tvm/relay/type.h for more details.
+
+    This is the type assigned to functions in Relay. They consist of
+    a list of type parameters which enable the definition of generic
+    functions, a set of type constraints which we omit for the time
+    being, a sequence of argument types, and a return type.
+
+    We informally write them as:
+    `forall (type_params), (arg_types) -> ret_type where type_constraints`
+
+    Parameters
+    ----------
+    arg_types : List[tvm.relay.Type]
+        The argument types
+
+    ret_type : tvm.relay.Type
+        The return type.
+
+    type_params : Optional[List[tvm.relay.TypeVar]]
+        The type parameters
+
+    type_constraints : Optional[List[tvm.relay.TypeConstraint]]
+        The type constraints.
+    """
+    def __init__(self,
+                 arg_types,
+                 ret_type,
+                 type_params=None,
+                 type_constraints=None):
+        if type_params is None:
+            type_params = []
+        if type_constraints is None:
+            type_constraints = []
+        self.__init_handle_by_constructor__(
+            _make.FuncType, arg_types, ret_type, type_params, type_constraints)
+
+
+@register_relay_node
+class IncompleteType(Type):
+    """An incomplete type."""
+    def __init__(self, kind=Kind.Type):
+        self.__init_handle_by_constructor__(_make.IncompleteType, kind)
+
+
+@register_relay_node
+class TypeRelation(TypeConstraint):
+    """Type relation in relay.
+
+    Parameters
+    ----------
+    func : EnvFunc
+        User defined relation function.
+
+    args : [tvm.relay.Type]
+        List of types to the func.
+
+    num_inputs : int
+        Number of input arguments in args,
+        this act as a hint for type inference.
+
+    attrs : Attrs
+        The attribute attached to the relation information
+
+    Returns
+    -------
+    type_relation : tvm.relay.TypeRelation
+        The type relation.
+    """
+    def __init__(self, func, args, num_inputs, attrs):
+        self.__init_handle_by_constructor__(_make.TypeRelation,
+                                            func, args, num_inputs, attrs)
+
+
+def scalar_type(dtype):
+    """Creates a scalar type.
+
+    This function returns TensorType((), dtype)
+
+    Parameters
+    ----------
+    dtype : str
+        The content data type.
+
+    Returns
+    -------
+    s_type : tvm.relay.TensorType
+        The result type.
+    """
+    return TensorType((), dtype)
diff --git a/python/tvm/relay/ty.pyi b/python/tvm/relay/ty.pyi
new file mode 100644
index 000000000000..c4d5df7ac06c
--- /dev/null
+++ b/python/tvm/relay/ty.pyi
@@ -0,0 +1,183 @@
+# pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name
+"""The type nodes of the Relay language."""
+from enum import IntEnum
+from .base import NodeBase, register_relay_node
+from . import _make
+
+
+class Type(NodeBase):
+    """The base type for all Relay types."""
+
+    def __eq__(self, other):
+        """Compare two Relay types for structural equivalence using
+           alpha equivalence.
+        """
+        return bool(_make._type_alpha_eq(self, other))
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+    def same_as(self, other):
+        """Compares two Relay types by referential equality."""
+        return super().__eq__(other)
+
+
+@register_relay_node
+class TensorType(Type):
+    """A concrete TensorType in Relay, see tvm/relay/type.h for more details.
+
+    This is the type assigned to tensor's with a known dype and shape. For
+    example a tensor of `float32` and `(5, 5)`.
+    """
+
+    def __init__(self, shape, dtype):
+        """Construct a tensor type.
+
+        Parameters
+        ----------
+        shape: list of tvm.Expr
+        dtype: str
+
+        Returns
+        -------
+        tensor_type: The TensorType
+        """
+        self.__init_handle_by_constructor__(_make.TensorType, shape, dtype)
+
+
+class Kind(IntEnum):
+    """The kind of a type parameter, represents a variable shape,
+       base type, type, or dimension.
+
+       This controls what a type parameter is allowed to be instantiated
+       with. For example one's of kind BaseType can only be `float32`, `int32`,
+       and so on.
+    """
+    ShapeVar = 0
+    Shape = 1
+    BaseType = 2
+    Type = 3
+
+
+@register_relay_node
+class TypeParam(Type):
+    """A type parameter used for generic types in Relay,
+    see tvm/relay/type.h for more details.
+
+    A type parameter represents a type placeholder which will
+    be filled in later on. This allows the user to write
+    functions which are generic over types.
+    """
+
+    def __init__(self, var, kind):
+        """Construct a TypeParam.
+
+        Parameters
+        ----------
+        var: tvm.expr.Var
+            The tvm.Var which backs the type parameter.
+
+        kind: Kind
+            The kind of the type parameter.
+
+        Returns
+        -------
+        type_param: TypeParam
+            The type parameter.
+        """
+        self.__init_handle_by_constructor__(_make.TypeParam, var, kind)
+
+
+@register_relay_node
+class TypeConstraint(Type):
+    """Abstract class representing a type constraint."""
+    pass
+
+
+@register_relay_node
+class TupleType(Type):
+    """A tuple type in Relay, see tvm/relay/type.h for more details.
+
+    Lists the type of each field in the tuple.
+    """
+
+    def __init__(self, fields):
+        """Constructs a tuple type
+
+        Parameters
+        ----------
+        fields: list of tvm.Type
+
+        Returns
+        -------
+        tuple_type: the tuple type
+        """
+        self.__init_handle_by_constructor__(_make.TupleType, fields)
+
+
+@register_relay_node
+class FuncType(Type):
+    """A function type in Relay, see tvm/relay/type.h for more details.
+
+    This is the type assigned to functions in Relay. They consist of
+    a list of type parameters which enable the definition of generic
+    functions, a set of type constraints which we omit for the time
+    being, a sequence of argument types, and a return type.
+
+    We informally write them as:
+    `forall (type_params), (arg_types) -> ret_type where type_constraints`
+    """
+
+    def __init__(self,
+                 arg_types,
+                 ret_type,
+                 type_params,
+                 type_constraints,
+                 ):
+        """Construct a function type.
+
+        Parameters
+        ----------
+        arg_types:  list of Type
+        ret_type: Type
+        type_params: list of TypeParam
+        type_constraints: list of TypeConstraint
+
+        Returns
+        -------
+        func_type: FuncType
+            The function type.
+        """
+        self.__init_handle_by_constructor__(
+            _make.FuncType, arg_types, ret_type, type_params, type_constraints)
+
+
+@register_relay_node
+class IncompleteType(Type):
+    """An incomplete type."""
+
+    def __init__(self, kind=Kind.Type):
+        self.__init_handle_by_constructor__(_make.IncompleteType, kind)
+
+@register_relay_node
+class TypeRelation(TypeConstraint):
+    """Type relation in relay.
+
+    Parameters
+    ----------
+    func : EnvFunc
+        User defined relation function.
+
+    args : list of types
+        List of types to the func.
+
+    num_inputs: int
+        Number of input arguments in args,
+        this act as a hint for type inference.
+
+    attrs : Attrs
+        The attribute attached to the relation information
+    """
+    def __init__(self, func, args, num_inputs, attrs):
+        self.__init_handle_by_constructor__(_make.TypeRelation,
+                                            func, args, num_inputs, attrs)
diff --git a/python/tvm/relay/vision.py b/python/tvm/relay/vision.py
new file mode 100644
index 000000000000..d2c08bc0cc45
--- /dev/null
+++ b/python/tvm/relay/vision.py
@@ -0,0 +1,4 @@
+# pylint: disable=wildcard-import, unused-import, unused-wildcard-import
+"""Vision network related operators."""
+# Re-export in a specific file name so that autodoc can pick it up
+from .op.vision import *
diff --git a/python/tvm/rpc/client.py b/python/tvm/rpc/client.py
index 57f368b0e660..c975ec64aa76 100644
--- a/python/tvm/rpc/client.py
+++ b/python/tvm/rpc/client.py
@@ -103,6 +103,19 @@ def download(self, path):
                 "tvm.rpc.server.download")
         return self._remote_funcs["download"](path)
 
+    def remove(self, path):
+        """Remove file from remote temp folder.
+
+        Parameters
+        ----------
+        path: str
+            The relative location to remote temp folder.
+        """
+        if "remove" not in self._remote_funcs:
+            self._remote_funcs["remove"] = self.get_function(
+                "tvm.rpc.server.remove")
+        self._remote_funcs["remove"](path)
+
     def load_module(self, path):
         """Load a remote module, the file need to be uploaded first.
 
@@ -130,6 +143,10 @@ def cl(self, dev_id=0):
         """Construct OpenCL device."""
         return self.context(4, dev_id)
 
+    def vulkan(self, dev_id=0):
+        """Construct Vulkan device."""
+        return self.context(7, dev_id)
+
     def metal(self, dev_id=0):
         """Construct Metal device."""
         return self.context(8, dev_id)
@@ -214,6 +231,9 @@ def summary(self):
     def text_summary(self):
         """Get a text summary of the tracker."""
         data = self.summary()
+
+        total_ct = {}
+
         res = ""
         res += "Server List\n"
         res += "----------------------------\n"
@@ -221,8 +241,12 @@ def text_summary(self):
         res += "----------------------------\n"
         for item in data["server_info"]:
             addr = item["addr"]
-            res += addr[0] + ":" + str(addr[1])+ "\t"
+            res += addr[0] + ":" + str(addr[1]) + "\t"
             res += item["key"] + "\n"
+            key = item['key'].split(':')[1]   # 'server:rasp3b` -> 'rasp3b'
+            if key not in total_ct:
+                total_ct[key] = 0
+            total_ct[key] += 1
         res += "----------------------------\n"
         res += "\n"
 
@@ -236,14 +260,16 @@ def text_summary(self):
             max_key_len = 0
 
         res += "Queue Status\n"
-        res += "----------------------------\n"
-        res += ("%%-%ds" % max_key_len + "\tfree\tpending\n") % 'key'
-        res += "----------------------------\n"
+        title = ("%%-%ds" % max_key_len + "   total  free  pending\n") % 'key'
+        separate_line = '-' * len(title) + '\n'
+        res += separate_line + title + separate_line
         for k in keys:
-            res += ("%%-%ds" % max_key_len + "\t%d\t%g\n") % \
-                   (k, queue_info[k]["free"], queue_info[k]["pending"])
-
-        res += "----------------------------\n"
+            total = total_ct.get(k, 0)
+            free, pending = queue_info[k]["free"], queue_info[k]["pending"]
+            if total or pending:
+                res += ("%%-%ds" % max_key_len + "   %-5d  %-4d  %-7d\n") % \
+                       (k, total, free, pending)
+        res += separate_line
         return res
 
     def request(self, key, priority=1, session_timeout=0, max_retry=5):
diff --git a/python/tvm/rpc/server.py b/python/tvm/rpc/server.py
index 792685b94a18..ccd222c67b76 100644
--- a/python/tvm/rpc/server.py
+++ b/python/tvm/rpc/server.py
@@ -250,7 +250,7 @@ class Server(object):
     """Start RPC server on a separate process.
 
     This is a simple python implementation based on multi-processing.
-    It is also possible to implement a similar C based sever with
+    It is also possible to implement a similar C based server with
     TVM runtime which does not depend on the python.
 
     Parameters
@@ -313,7 +313,7 @@ def __init__(self,
         self.use_popen = use_popen
 
         if silent:
-            logger.setLevel(logging.WARN)
+            logger.setLevel(logging.ERROR)
 
         if use_popen:
             cmd = [sys.executable,
diff --git a/python/tvm/rpc/tornado_util.py b/python/tvm/rpc/tornado_util.py
index 00e1fd13865b..eafea2e85394 100644
--- a/python/tvm/rpc/tornado_util.py
+++ b/python/tvm/rpc/tornado_util.py
@@ -66,6 +66,8 @@ def _update_write(self):
         while self._pending_write:
             try:
                 msg = self._pending_write[0]
+                if self._sock is None:
+                    return
                 nsend = self._sock.send(msg)
                 if nsend != len(msg):
                     self._pending_write[0] = msg[nsend:]
diff --git a/python/tvm/rpc/tracker.py b/python/tvm/rpc/tracker.py
index de39c97b5000..88868ad6e978 100644
--- a/python/tvm/rpc/tracker.py
+++ b/python/tvm/rpc/tracker.py
@@ -78,6 +78,16 @@ def request(self, user, priority, callback):
         """
         raise NotImplementedError()
 
+    def remove(self, value):
+        """Remove a resource in the scheduler
+
+        Parameters
+        ----------
+        value: object
+            The resource to remove
+        """
+        pass
+
     def summary(self):
         """Get summary information of the scheduler."""
         raise NotImplementedError()
@@ -108,6 +118,11 @@ def request(self, user, priority, callback):
         heapq.heappush(self._requests, (-priority, time.time(), callback))
         self._schedule()
 
+    def remove(self, value):
+        if value in self._values:
+            self._values.remove(value)
+            self._schedule()
+
     def summary(self):
         """Get summary information of the scheduler."""
         return {"free": len(self._values),
@@ -132,6 +147,7 @@ def __init__(self, tracker, sock, addr):
         # list of pending match keys that has not been used.
         self.pending_matchkeys = set()
         self._tracker._connections.add(self)
+        self.put_values = []
 
     def name(self):
         """name of connection"""
@@ -199,9 +215,11 @@ def call_handler(self, args):
             self.pending_matchkeys.add(matchkey)
             # got custom address (from rpc server)
             if args[3] is not None:
-                self._tracker.put(key, (self, args[3], port, matchkey))
+                value = (self, args[3], port, matchkey)
             else:
-                self._tracker.put(key, (self, self._addr[0], port, matchkey))
+                value = (self, self._addr[0], port, matchkey)
+            self._tracker.put(key, value)
+            self.put_values.append(value)
             self.ret_value(TrackerCode.SUCCESS)
         elif code == TrackerCode.REQUEST:
             key = args[1]
@@ -239,7 +257,7 @@ def _cb(value):
             self.close()
 
     def on_close(self):
-        self._tracker._connections.remove(self)
+        self._tracker.close(self)
 
     def on_error(self, err):
         logger.warning("%s: Error in RPC Tracker: %s", self.name(), err)
@@ -285,6 +303,13 @@ def request(self, key, user, priority, callback):
             self._scheduler_map[key] = self.create_scheduler(key)
         self._scheduler_map[key].request(user, priority, callback)
 
+    def close(self, conn):
+        self._connections.remove(conn)
+        if 'key' in conn._info:
+            key = conn._info['key'].split(':')[1]  # 'server:rasp3b' -> 'rasp3b'
+            for value in conn.put_values:
+                self._scheduler_map[key].remove(value)
+
     def stop(self):
         """Safely stop tracker."""
         for conn in list(self._connections):
diff --git a/python/tvm/schedule.py b/python/tvm/schedule.py
index 594c2f2dc8bd..6c261a453457 100644
--- a/python/tvm/schedule.py
+++ b/python/tvm/schedule.py
@@ -362,7 +362,7 @@ def split(self, parent, factor=None, nparts=None):
         """
         if nparts is not None:
             if factor is not None:
-                raise ValueError("Donot need to provide both outer and nparts")
+                raise ValueError("Do not need to provide both outer and nparts")
             outer, inner = _api_internal._StageSplitByNParts(self, parent, nparts)
         else:
             if factor is None:
diff --git a/python/tvm/stmt.py b/python/tvm/stmt.py
index 1f5fea11a472..48d91dfa8044 100644
--- a/python/tvm/stmt.py
+++ b/python/tvm/stmt.py
@@ -15,65 +15,376 @@
 """
 from __future__ import absolute_import as _abs
 from ._ffi.node import NodeBase, register_node
+from . import make as _make
+
 
 class Stmt(NodeBase):
     pass
 
 @register_node
 class LetStmt(Stmt):
-    pass
+    """LetStmt node.
+
+    Parameters
+    ----------
+    var : Var
+        The variable in the binding.
+
+    value : Expr
+        The value in to be binded.
+
+    body : Stmt
+        The body statement.
+    """
+    def __init__(self, var, value, body):
+        self.__init_handle_by_constructor__(
+            _make.LetStmt, var, value, body)
+
 
 @register_node
 class AssertStmt(Stmt):
-    pass
+    """AssertStmt node.
+
+    Parameters
+    ----------
+    condition : Expr
+        The assert condition.
+
+    message : Expr
+        The error message.
+
+    body : Stmt
+        The body statement.
+    """
+    def __init__(self, condition, message, body):
+        self.__init_handle_by_constructor__(
+            _make.AssertStmt, condition, message, body)
+
 
 @register_node
 class ProducerConsumer(Stmt):
-    pass
+    """ProducerConsumer node.
+
+    Parameters
+    ----------
+    func : Operation
+        The Operation.
+
+    is_producer : bool
+        Whether if the node is producer.
+
+    body : Stmt
+        The body statement.
+    """
+    def __init__(self, func, is_producer, body):
+        self.__init_handle_by_constructor__(
+            _make.ProducerConsumer, func, is_producer, body)
+
 
 @register_node
 class For(Stmt):
+    """For node.
+
+    Parameters
+    ----------
+    loop_var : Var
+        The loop variable.
+
+    min_val : Expr
+        The begining value.
+
+    extent : Expr
+        The length of the loop.
+
+    for_type : int
+        The for type.
+
+    device_api : int
+        The device api type.
+
+    body : Stmt
+        The body statement.
+    """
     Serial = 0
     Parallel = 1
     Vectorized = 2
     Unrolled = 3
+    def __init__(self,
+                 loop_var,
+                 min_val,
+                 extent,
+                 for_type,
+                 device_api,
+                 body):
+        self.__init_handle_by_constructor__(
+            _make.For, loop_var, min_val, extent,
+            for_type, device_api, body)
+
 
 @register_node
 class Store(Stmt):
-    pass
+    """Store node.
+
+    Parameters
+    ----------
+    buffer_var : Var
+        The buffer Variable.
+
+    value : Expr
+        The value we want to store.
+
+    index : Expr
+        The index in the store expression.
+
+    predicate : Expr
+        The store predicate.
+    """
+    def __init__(self, buffer_var, value, index, predicate):
+        self.__init_handle_by_constructor__(
+            _make.Store, buffer_var, value, index, predicate)
+
 
 @register_node
 class Provide(Stmt):
-    pass
+    """Provide node.
+
+    Parameters
+    ----------
+    func : Operation
+        The operation to create the function.
+
+    value_index : int
+        The output value index
+
+    value : Expr
+        The value to be stored.
+
+    args : list of Expr
+        The index arguments of the Provide.
+    """
+    def __init__(self, func, value_index, value, args):
+        self.__init_handle_by_constructor__(
+            _make.Provide, func, value_index, value, args)
+
 
 @register_node
 class Allocate(Stmt):
-    pass
+    """Allocate node.
+
+    Parameters
+    ----------
+    buffer_var : Var
+        The buffer variable.
+
+    dtype : str
+        The data type of the buffer.
+
+    extents : list of Expr
+        The extents of the allocate
+
+    condition : Expr
+        The condition.
+
+    body : Stmt
+        The body statement.
+    """
+    def __init__(self,
+                 buffer_var,
+                 dtype,
+                 extents,
+                 condition,
+                 body):
+        self.__init_handle_by_constructor__(
+            _make.Allocate, buffer_var, dtype,
+            extents, condition, body)
+
 
 @register_node
 class AttrStmt(Stmt):
-    pass
+    """AttrStmt node.
+
+    Parameters
+    ----------
+    node : Node
+        The node to annotate the attribute
+
+    attr_key : str
+        Attribute type key.
+
+    value : Expr
+        The value of the attribute
+
+    body : Stmt
+        The body statement.
+    """
+    def __init__(self, node, attr_key, value, body):
+        self.__init_handle_by_constructor__(
+            _make.AttrStmt, node, attr_key, value, body)
+
 
 @register_node
 class Free(Stmt):
-    pass
+    """Free node.
+
+    Parameters
+    ----------
+    buffer_var : Var
+        The buffer variable.
+    """
+    def __init__(self, buffer_var):
+        self.__init_handle_by_constructor__(
+            _make.Free, buffer_var)
+
 
 @register_node
 class Realize(Stmt):
-    pass
+    """Realize node.
+
+    Parameters
+    ----------
+    func : Operation
+        The operation to create the function.
+
+    value_index : int
+        The output value index
+
+    dtype : str
+        The data type of the operation.
+
+    bounds : list of range
+        The bound of realize
+
+    condition : Expr
+        The realize condition.
+
+    body : Stmt
+        The realize body
+    """
+    def __init__(self,
+                 func,
+                 value_index,
+                 dtype,
+                 bounds,
+                 condition,
+                 body):
+        self.__init_handle_by_constructor__(
+            _make.Realize, func, value_index, dtype,
+            bounds, condition, body)
+
 
 @register_node
 class Block(Stmt):
-    pass
+    """Block node.
+
+    Parameters
+    ----------
+    first : Stmt
+        The first statement.
+
+    rest : Stmt
+        The following statement.
+    """
+    def __init__(self, first, rest):
+        self.__init_handle_by_constructor__(
+            _make.Block, first, rest)
+
 
 @register_node
 class IfThenElse(Stmt):
-    pass
+    """IfThenElse node.
+
+    Parameters
+    ----------
+    condition : Expr
+        The expression
+
+    then_case : Stmt
+        The statement to execute if condition is true.
+
+    else_case : Stmt
+        The statement to execute if condition is false.
+    """
+    def __init__(self, condition, then_case, else_case):
+        self.__init_handle_by_constructor__(
+            _make.IfThenElse, condition, then_case, else_case)
+
 
 @register_node
 class Evaluate(Stmt):
-    pass
+    """Evaluate node.
+
+    Parameters
+    ----------
+    value : Expr
+        The expression to be evalued.
+    """
+    def __init__(self, value):
+        self.__init_handle_by_constructor__(
+            _make.Evaluate, value)
+
 
 @register_node
 class Prefetch(Stmt):
-    pass
+    """Prefetch node.
+
+    Parameters
+    ----------
+    func : Operation
+        The operation to create the function.
+
+    value_index : int
+        The output value index
+
+    dtype : str
+        The data type to be prefetched.
+
+    bounds : list of Range
+        The bounds to be prefetched.
+    """
+    def __init__(self, func, value_index, dtype, bounds):
+        self.__init_handle_by_constructor__(
+            _make.Prefetch, func, value_index, dtype, bounds)
+
+
+def stmt_seq(*args):
+    """Make sequence of statements
+
+    Parameters
+    ----------
+    args : list of Expr or Var
+        List of statements to be combined as sequence.
+
+    Returns
+    -------
+    stmt : Stmt
+        The combined statement.
+    """
+    ret = None
+    for value in args:
+        if not isinstance(value, Stmt):
+            value = Evaluate(value)
+        ret = value if ret is None else Block(ret, value)
+    return ret if ret else Evaluate(0)
+
+
+def stmt_list(stmt):
+    """Make list of stmt from blocks.
+
+    Parameters
+    ----------
+    stmt : A block statement
+
+    Returns
+    -------
+    stmt_list : list of Stmt
+         The unpacked list of statements
+    """
+    if isinstance(stmt, Block):
+        return stmt_list(stmt.first) + stmt_list(stmt.rest)
+    elif isinstance(stmt, ProducerConsumer):
+        return stmt_list(stmt.body)
+    return [stmt]
+
+
+_make.stmt_list = stmt_list
+_make.stmt_seq = stmt_seq
diff --git a/python/tvm/tag.py b/python/tvm/tag.py
index 5f6091a80a17..de9f8403de2a 100644
--- a/python/tvm/tag.py
+++ b/python/tvm/tag.py
@@ -1,25 +1,36 @@
 """Tag class for TVM operators."""
+import warnings
 from ._ffi.base import decorate
 
 class TagScope(object):
     """Tag scope object to set tag for operators, working as context
     manager and decorator both. See also tag_scope.
     """
-    current = None
+    _current = None
+
+    @classmethod
+    def get_current(cls):
+        if cls._current:
+            cls._current.accessed = True
+        return cls._current
+
     def __init__(self, tag):
         self._old_scope = None
         self.tag = tag
+        self.accessed = False
 
     def __enter__(self):
-        if TagScope.current is not None:
+        if TagScope._current is not None:
             raise ValueError("nested op_tag is not allowed for now")
-        self._old_scope = TagScope.current
-        TagScope.current = self
+        self._old_scope = TagScope._current
+        TagScope._current = self
         return self
 
     def __exit__(self, ptype, value, trace):
         assert self._old_scope is None
-        TagScope.current = self._old_scope
+        if not self.accessed:
+            warnings.warn("Tag '%s' declared via TagScope was not used." % (self.tag,))
+        TagScope._current = self._old_scope
 
     def __call__(self, fdecl):
         def tagged_fdecl(func, *args, **kwargs):
diff --git a/python/tvm/target.py b/python/tvm/target.py
index fed20c3914c6..75f82743f9fa 100644
--- a/python/tvm/target.py
+++ b/python/tvm/target.py
@@ -79,11 +79,13 @@ class Target(NodeBase):
     - :any:`tvm.target.mali` create Mali target
     - :any:`tvm.target.intel_graphics` create Intel Graphics target
     """
-    def __init__(self, handle):
-        super(Target, self).__init__(handle)
-        self._keys = None
-        self._options = None
-        self._libs = None
+    def __new__(cls):
+        # Always override new to enable class
+        obj = NodeBase.__new__(cls)
+        obj._keys = None
+        obj._options = None
+        obj._libs = None
+        return obj
 
     @property
     def keys(self):
@@ -103,6 +105,13 @@ def libs(self):
             self._libs = [l.value for l in self.libs_array]
         return self._libs
 
+    @property
+    def model(self):
+        for opt in self.options_array:
+            if opt.value.startswith('-model='):
+                return opt.value[7:]
+        return 'unknown'
+
     def __enter__(self):
         _api_internal._EnterTargetScope(self)
         return self
@@ -263,6 +272,7 @@ def dispatch_func(func, *args, **kwargs):
                     "Keyword arguments cannot be used when invoking generic_func %s" % func_name)
             return generic_func_node(*args)
         fresult = decorate(fdefault, dispatch_func)
+        fresult.fdefault = fdefault
         fresult.register = register
         return fresult
     return fdecorate
@@ -351,57 +361,65 @@ def dispatch_func(func, *args, **kwargs):
     return fdecorate
 
 
-def cuda(options=None):
+def cuda(model='unknown', options=None):
     """Returns a cuda target.
 
     Parameters
     ----------
+    model: str
+        The model of cuda device (e.g. 1080ti)
     options : str or list of str
         Additional options
     """
-    options = _merge_opts([], options)
-    return _api_internal._TargetCreate("cuda", *options)
+    opts = _merge_opts(['-model=%s' % model], options)
+    return _api_internal._TargetCreate("cuda", *opts)
 
 
-def rocm(options=None):
+def rocm(model='unknown', options=None):
     """Returns a ROCM target.
 
     Parameters
     ----------
+    model: str
+        The model of this device
     options : str or list of str
         Additional options
     """
-    options = _merge_opts([], options)
-    return _api_internal._TargetCreate("rocm", *options)
+    opts = _merge_opts(["-model=%s" % model], options)
+    return _api_internal._TargetCreate("rocm", *opts)
 
 
-def mali(options=None):
+def mali(model='unknown', options=None):
     """Returns a ARM Mali GPU target.
 
     Parameters
     ----------
+    model: str
+        The model of this device
     options : str or list of str
         Additional options
     """
-    opts = ["-device=mali"]
+    opts = ["-device=mali", '-model=%s' % model]
     opts = _merge_opts(opts, options)
     return _api_internal._TargetCreate("opencl", *opts)
 
 
-def intel_graphics(options=None):
+def intel_graphics(model='unknown', options=None):
     """Returns an Intel Graphics target.
 
     Parameters
     ----------
+    model: str
+        The model of this device
     options : str or list of str
         Additional options
     """
-    opts = ["-device=intel_graphics"]
+    opts = ["-device=intel_graphics", '-model=%s' % model]
     opts = _merge_opts(opts, options)
     return _api_internal._TargetCreate("opencl", *opts)
 
 
-def opengl(options=None):
+def opengl(model='unknown', options=None):
     """Returns a OpenGL target.
 
     Parameters
@@ -409,8 +427,8 @@ def opengl(options=None):
     options : str or list of str
         Additional options
     """
-    options = _merge_opts([], options)
-    return _api_internal._TargetCreate("opengl", *options)
+    opts = _merge_opts(["-model=%s" % model], options)
+    return _api_internal._TargetCreate("opengl", *opts)
 
 
 def arm_cpu(model='unknown', options=None):
@@ -424,23 +442,19 @@ def arm_cpu(model='unknown', options=None):
     options : str or list of str
         Additional options
     """
-    from . import autotvm
-
     trans_table = {
-        "pixel2":    ["-model=snapdragon835", "-target=arm64-linux-android"],
-        "mate10":    ["-model=kirin970", "-target=arm64-linux-android"],
-        "mate10pro": ["-model=kirin970", "-target=arm64-linux-android"],
-        "p20":       ["-model=kirin970", "-target=arm64-linux-android"],
-        "p20pro":    ["-model=kirin970", "-target=arm64-linux-android"],
-        "rasp3b":    ["-model=bcm2837", "-target=armv7l-linux-gnueabihf"],
-        "rk3399":    ["-model=rk3399", "-target=aarch64-linux-gnu"],
-        "pynq":      ["-model=pynq", "-target=armv7a-linux-eabi"],
+        "pixel2":    ["-model=snapdragon835", "-target=arm64-linux-android -mattr=+neon"],
+        "mate10":    ["-model=kirin970", "-target=arm64-linux-android -mattr=+neon"],
+        "mate10pro": ["-model=kirin970", "-target=arm64-linux-android -mattr=+neon"],
+        "p20":       ["-model=kirin970", "-target=arm64-linux-android -mattr=+neon"],
+        "p20pro":    ["-model=kirin970", "-target=arm64-linux-android -mattr=+neon"],
+        "rasp3b":    ["-model=bcm2837", "-target=armv7l-linux-gnueabihf -mattr=+neon"],
+        "rk3399":    ["-model=rk3399", "-target=aarch64-linux-gnu -mattr=+neon"],
+        "pynq":      ["-model=pynq", "-target=armv7a-linux-eabi -mattr=+neon"],
+        "ultra96":   ["-model=ultra96", "-target=aarch64-linux-gnu -mattr=+neon"],
     }
     pre_defined_opt = trans_table.get(model, ["-model=%s" % model])
 
-    # download pre-tuned parameters for arm_cpu if there is not any.
-    autotvm.tophub.check_package('arm_cpu')
-
     opts = ["-device=arm_cpu"] + pre_defined_opt
     opts = _merge_opts(opts, options)
     return _api_internal._TargetCreate("llvm", *opts)
@@ -496,5 +510,4 @@ def current_target(allow_none=True):
     ------
     ValueError if current target is not set.
     """
-    target_str = _api_internal._GetCurrentTarget(allow_none)
-    return create(target_str) if target_str is not None else None
+    return _api_internal._GetCurrentTarget(allow_none)
diff --git a/python/tvm/tensor.py b/python/tvm/tensor.py
index f169ff1b64ac..9a98e9a6e769 100644
--- a/python/tvm/tensor.py
+++ b/python/tvm/tensor.py
@@ -6,8 +6,10 @@
 from . import make as _make
 from . import expr as _expr
 
+
 class TensorSlice(NodeGeneric, _expr.ExprOp):
     """Auxiliary data structure for enable slicing syntax from tensor."""
+
     def __init__(self, tensor, indices):
         if not isinstance(indices, tuple):
             indices = (indices,)
@@ -28,12 +30,19 @@ def dtype(self):
         """Data content of the tensor."""
         return self.tensor.dtype
 
+@register_node
+class TensorIntrinCall(NodeBase):
+    """Intermediate structure for calling a tensor intrinsic."""
+    pass
+
 
 itervar_cls = None
 
+
 @register_node
 class Tensor(NodeBase, _expr.ExprOp):
     """Tensor object, to construct, see function.Tensor"""
+
     def __call__(self, *indices):
         ndim = self.ndim
         if len(indices) != ndim:
@@ -102,8 +111,10 @@ def name(self):
         return "%s.v%d" % (op.name, self.value_index)
 
 
+
 class Operation(NodeBase):
     """Represent an operation that generate a tensor"""
+
     def output(self, index):
         """Get the index-th output of the operation
 
@@ -150,6 +161,12 @@ def reduce_axis(self):
         return self.__getattr__("reduce_axis")
 
 
+@register_node
+class TensorComputeOp(Operation):
+    """Tensor operation."""
+    pass
+
+
 @register_node
 class ScanOp(Operation):
     """Scan operation."""
@@ -163,3 +180,8 @@ def scan_axis(self):
 class ExternOp(Operation):
     """Extern operation."""
     pass
+
+@register_node
+class HybridOp(Operation):
+    """Hybrid operation."""
+    pass
diff --git a/python/tvm/tensor_intrin.py b/python/tvm/tensor_intrin.py
index 62f8c8897d10..f1f26655fe27 100644
--- a/python/tvm/tensor_intrin.py
+++ b/python/tvm/tensor_intrin.py
@@ -6,9 +6,25 @@
 from . import stmt as _stmt
 from . import make as _make
 from . import tensor as _tensor
+from . import schedule as _schedule
 from .build_module import current_build_config
 from ._ffi.node import NodeBase, register_node
 
+
+def _get_region(tslice):
+    region = []
+    for idx in tslice.indices:
+        if isinstance(idx, slice):
+            assert idx.step is None
+            region.append(_api.Range(idx.start, idx.stop))
+        else:
+            if isinstance(idx, _schedule.IterVar):
+                begin = idx.var
+            else:
+                begin = idx
+            region.append(_make.range_by_min_extent(begin, 1))
+    return region
+
 @register_node
 class TensorIntrin(NodeBase):
     """Tensor intrinsic functions for certain computation.
@@ -17,8 +33,16 @@ class TensorIntrin(NodeBase):
     --------
     decl_tensor_intrin: Construct a TensorIntrin
     """
-    pass
-
+    def __call__(self, *args, **kwargs):
+        tensors = [x.tensor for x in args]
+        regions = [_get_region(x) for x in args]
+        reduce_axis = []
+        if "reduce_axis" in kwargs:
+            reduce_axis = kwargs["reduce_axis"]
+            if not isinstance(reduce_axis, (list, tuple)):
+                reduce_axis = [reduce_axis]
+            reduce_axis = _api.convert(reduce_axis)
+        return _api_internal._TensorIntrinCall(self, tensors, regions, reduce_axis)
 
 def decl_tensor_intrin(op,
                        fcompute,
@@ -72,7 +96,7 @@ def decl_tensor_intrin(op,
     binds_list = []
     for t in inputs:
         if not isinstance(t.op, _tensor.PlaceholderOp):
-            raise ValueError("Donot yet support composition op")
+            raise ValueError("Do not yet support composition op")
 
     cfg = current_build_config()
     for t in tensors:
diff --git a/python/tvm/testing.py b/python/tvm/testing.py
new file mode 100644
index 000000000000..1a6666bdee2a
--- /dev/null
+++ b/python/tvm/testing.py
@@ -0,0 +1,147 @@
+""" TVM testing utilities """
+import logging
+import numpy as np
+
+def assert_allclose(actual, desired, rtol=1e-7, atol=1e-7):
+    """ Version of np.testing.assert_allclose with `atol` and `rtol` fields set
+    in reasonable defaults.
+
+    Arguments `actual` and `desired` are not interchangable, since the function
+    compares the `abs(actual-desired)` with `atol+rtol*abs(desired)`.  Since we
+    often allow `desired` to be close to zero, we generally want non-zero `atol`.
+    """
+    np.testing.assert_allclose(actual, desired, rtol=rtol, atol=atol, verbose=True)
+
+
+def check_numerical_grads(function, input_values, grad_values, function_value=None,
+                          delta=1e-3, atol=1e-2, rtol=0.1):
+    """A helper function that checks that numerical gradients of a function are
+    equal to gradients computed in some different way (analytical gradients).
+
+    Numerical gradients are computed using finite difference approximation. To
+    reduce the number of function evaluations, the number of points used is
+    gradually increased if the error value is too high (up to 5 points).
+
+    Parameters
+    ----------
+    function
+        A function that takes inputs either as positional or as keyword
+        arguments (either `function(*input_values)` or `function(**input_values)`
+        should be correct) and returns a scalar result. Should accept numpy
+        ndarrays.
+
+    input_values : Dict[str, numpy.ndarray] or List[numpy.ndarray]
+        A list of values or a dict assigning values to variables. Represents the
+        point at which gradients should be computed.
+
+    grad_values : Dict[str, numpy.ndarray] or List[numpy.ndarray]
+        Gradients computed using a different method.
+
+    function_value : float, optional
+        Should be equal to `function(**input_values)`.
+
+    delta : float, optional
+        A small number used for numerical computation of partial derivatives.
+        The default 1e-3 is a good choice for float32.
+
+    atol : float, optional
+        Absolute tolerance. Gets multiplied by `sqrt(n)` where n is the size of a
+        gradient.
+
+    rtol : float, optional
+        Relative tolerance.
+    """
+    # If input_values is a list then function accepts positional arguments
+    # In this case transform it to a function taking kwargs of the form {"0": ..., "1": ...}
+    if not isinstance(input_values, dict):
+        input_len = len(input_values)
+        input_values = {str(idx): val for idx, val in enumerate(input_values)}
+
+        def _function(_input_len=input_len, _orig_function=function, **kwargs):
+            return _orig_function(*(kwargs[str(i)] for i in range(input_len)))
+        function = _function
+
+        grad_values = {str(idx): val for idx, val in enumerate(grad_values)}
+
+    if function_value is None:
+        function_value = function(**input_values)
+
+    # a helper to modify j-th element of val by a_delta
+    def modify(val, j, a_delta):
+        val = val.copy()
+        val.reshape(-1)[j] = val.reshape(-1)[j] + a_delta
+        return val
+
+    # numerically compute a partial derivative with respect to j-th element of the var `name`
+    def derivative(x_name, j, a_delta):
+        modified_values = {n: modify(val, j, a_delta) if n == x_name else val
+                           for n, val in input_values.items()}
+        return (function(**modified_values) - function_value)/a_delta
+
+    def compare_derivative(j, n_der, grad):
+        der = grad.reshape(-1)[j]
+        return np.abs(n_der - der) < atol + rtol*np.abs(n_der)
+
+    for x_name, grad in grad_values.items():
+        if grad.shape != input_values[x_name].shape:
+            raise AssertionError(
+                "Gradient wrt '{}' has unexpected shape {}, expected {} "
+                .format(x_name, grad.shape, input_values[x_name].shape))
+
+        ngrad = np.zeros_like(grad)
+
+        wrong_positions = []
+
+        # compute partial derivatives for each position in this variable
+        for j in range(np.prod(grad.shape)):
+            # forward difference approximation
+            nder = derivative(x_name, j, delta)
+
+            # if the derivative is not equal to the analytical one, try to use more
+            # precise and expensive methods
+            if not compare_derivative(j, nder, grad):
+                # central difference approximation
+                nder = (derivative(x_name, j, -delta) + nder)/2
+
+                if not compare_derivative(j, nder, grad):
+                    # central difference approximation using h = delta/2
+                    cnder2 = (derivative(x_name, j, delta/2) + derivative(x_name, j, -delta/2))/2
+                    # five-point derivative
+                    nder = (4*cnder2 - nder)/3
+
+            # if the derivatives still don't match, add this position to the
+            # list of wrong positions
+            if not compare_derivative(j, nder, grad):
+                wrong_positions.append(np.unravel_index(j, grad.shape))
+
+            ngrad.reshape(-1)[j] = nder
+
+        wrong_percentage = int(100*len(wrong_positions)/np.prod(grad.shape))
+
+        dist = np.sqrt(np.sum((ngrad - grad)**2))
+        grad_norm = np.sqrt(np.sum(ngrad**2))
+
+        if not (np.isfinite(dist) and np.isfinite(grad_norm)):
+            raise ValueError(
+                "NaN or infinity detected during numerical gradient checking wrt '{}'\n"
+                "analytical grad = {}\n numerical grad = {}\n"
+                .format(x_name, grad, ngrad))
+
+        # we multiply atol by this number to make it more universal for different sizes
+        sqrt_n = np.sqrt(float(np.prod(grad.shape)))
+
+        if dist > atol*sqrt_n + rtol*grad_norm:
+            raise AssertionError(
+                "Analytical and numerical grads wrt '{}' differ too much\n"
+                "analytical grad = {}\n numerical grad = {}\n"
+                "{}% of elements differ, first 10 of wrong positions: {}\n"
+                "distance > atol*sqrt(n) + rtol*grad_norm\n"
+                "distance {} > {}*{} + {}*{}"
+                .format(x_name, grad, ngrad, wrong_percentage, wrong_positions[:10],
+                        dist, atol, sqrt_n, rtol, grad_norm))
+
+        max_diff = np.max(np.abs(ngrad - grad))
+        avg_diff = np.mean(np.abs(ngrad - grad))
+        logging.info("Numerical grad test wrt '%s' of shape %s passes, "
+                     "dist = %f, max_diff = %f, avg_diff = %f",
+                     x_name, grad.shape, dist, max_diff, avg_diff)
diff --git a/python/update_version.py b/python/update_version.py
new file mode 100644
index 000000000000..9e958f109479
--- /dev/null
+++ b/python/update_version.py
@@ -0,0 +1,66 @@
+"""
+This is the global script that set the version information of TVM.
+This script runs and update all the locations that related to versions
+
+List of affected files:
+- tvm-root/python/tvm/_ffi/libinfo.py
+- tvm-root/include/tvm/runtime/c_runtime_api.h
+- tvm-root/web/tvm_runtime.js
+- tvm-root/conda/tvm/meta.yaml
+- tvm-root/conda/topi/meta.yaml
+- tvm-root/conda/nnvm/meta.yaml
+- tvm-root/conda/tvm-libs/meta.yaml
+"""
+import os
+import re
+# current version
+# We use the version of the incoming release for code
+# that is under development
+__version__ = "0.5.dev"
+
+# Implementations
+def update(file_name, pattern, repl):
+    update = []
+    hit_counter = 0
+    need_update = False
+    for l in open(file_name):
+        result = re.findall(pattern, l)
+        if result:
+            assert len(result) == 1
+            hit_counter += 1
+            if result[0] != repl:
+                l = re.sub(pattern, repl, l)
+                need_update = True
+                print("%s: %s->%s" % (file_name, result[0], repl))
+            else:
+                print("%s: version is already %s" % (file_name, repl))
+
+        update.append(l)
+    if hit_counter != 1:
+        raise RuntimeError("Cannot find version in %s" % file_name)
+
+    if need_update:
+        with open(file_name, "w") as output_file:
+            for l in update:
+                output_file.write(l)
+
+
+def main():
+    curr_dir = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+    proj_root = os.path.abspath(os.path.join(curr_dir, ".."))
+    # python path
+    update(os.path.join(proj_root, "python", "tvm", "_ffi", "libinfo.py"),
+           r"(?<=__version__ = \")[.0-9a-z]+", __version__)
+    # C++ header
+    update(os.path.join(proj_root, "include", "tvm", "runtime", "c_runtime_api.h"),
+           "(?<=TVM_VERSION \")[.0-9a-z]+", __version__)
+    # conda
+    for path in ["tvm", "topi", "nnvm", "tvm-libs"]:
+        update(os.path.join(proj_root, "conda", path, "meta.yaml"),
+               "(?<=version = \")[.0-9a-z]+", __version__)
+    # web
+    update(os.path.join(proj_root, "web", "tvm_runtime.js"),
+           "(?<=@version )[.0-9a-z]+", __version__)
+
+if __name__ == "__main__":
+    main()
diff --git a/rust/.gitignore b/rust/.gitignore
new file mode 100644
index 000000000000..230ab66104df
--- /dev/null
+++ b/rust/.gitignore
@@ -0,0 +1,3 @@
+Cargo.lock
+target/
+**/*.rs.bk
diff --git a/rust/.rustfmt.toml b/rust/.rustfmt.toml
new file mode 100644
index 000000000000..dbf3347a32bd
--- /dev/null
+++ b/rust/.rustfmt.toml
@@ -0,0 +1,59 @@
+max_width = 100
+hard_tabs = false
+tab_spaces = 2
+newline_style = "Auto"
+use_small_heuristics = "Default"
+indent_style = "Block"
+wrap_comments = false
+comment_width = 80
+normalize_comments = false
+format_strings = false
+format_macro_matchers = false
+format_macro_bodies = true
+empty_item_single_line = true
+struct_lit_single_line = true
+fn_single_line = false
+where_single_line = false
+imports_indent = "Block"
+imports_layout = "Mixed"
+merge_imports = true
+reorder_imports = true
+reorder_modules = true
+reorder_impl_items = false
+type_punctuation_density = "Wide"
+space_before_colon = false
+space_after_colon = true
+spaces_around_ranges = false
+binop_separator = "Front"
+remove_nested_parens = true
+combine_control_expr = true
+struct_field_align_threshold = 0
+match_arm_blocks = true
+force_multiline_blocks = false
+fn_args_density = "Tall"
+brace_style = "SameLineWhere"
+control_brace_style = "AlwaysSameLine"
+trailing_semicolon = true
+trailing_comma = "Vertical"
+match_block_trailing_comma = false
+blank_lines_upper_bound = 1
+blank_lines_lower_bound = 0
+edition = "2015"
+merge_derives = true
+use_try_shorthand = true
+use_field_init_shorthand = false
+force_explicit_abi = true
+condense_wildcard_suffixes = false
+color = "Auto"
+required_version = "0.99.5"
+unstable_features = false
+disable_all_formatting = false
+skip_children = false
+hide_parse_errors = false
+error_on_line_overflow = false
+error_on_unformatted = false
+report_todo = "Never"
+report_fixme = "Never"
+ignore = []
+emit_mode = "Files"
+make_backup = false
diff --git a/rust/.travis.yml b/rust/.travis.yml
new file mode 100644
index 000000000000..63a3d0277c1b
--- /dev/null
+++ b/rust/.travis.yml
@@ -0,0 +1,5 @@
+language: rust
+rust:
+  - nightly
+matrix:
+  fast_finish: true
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
new file mode 100644
index 000000000000..0819e0c70023
--- /dev/null
+++ b/rust/Cargo.toml
@@ -0,0 +1,28 @@
+[package]
+name = "tvm"
+version = "0.1.0"
+license = "Apache-2.0"
+description = "TVM Rust runtime"
+repository = "https://github.com/dmlc/tvm"
+readme = "README.md"
+keywords = ["tvm", "nnvm"]
+categories = ["api-bindings", "science"]
+authors = ["Nick Hynes <nhynes@berkeley.edu>"]
+
+[features]
+default = ["nom/std"]
+sgx = ["nom/alloc"]
+
+[dependencies]
+bounded-spsc-queue = "0.4.0"
+error-chain = { version = "0.12.0", default-features = false }
+itertools = "0.7.8"
+lazy_static = "1.1.0"
+ndarray = "0.11.2"
+nom = {version = "4.0.0", default-features = false }
+serde = "1.0.59"
+serde_derive = "1.0.79"
+serde_json = "1.0.17"
+
+[target.'cfg(not(target_env = "sgx"))'.dependencies]
+num_cpus = "1.8.0"
diff --git a/rust/src/errors.rs b/rust/src/errors.rs
new file mode 100644
index 000000000000..f9da7180b8cc
--- /dev/null
+++ b/rust/src/errors.rs
@@ -0,0 +1,39 @@
+#[cfg(target_env = "sgx")]
+use alloc::alloc;
+#[cfg(not(target_env = "sgx"))]
+use std::alloc;
+use std::num;
+
+use ndarray;
+use serde_json;
+
+error_chain! {
+  errors {
+    TryFromTVMRetValueError(expected: String, actual: i64) {
+      description("mismatched types while downcasting TVMRetValue")
+      display("invalid downcast: expected `{}` but was `{}`", expected, actual)
+    }
+
+    GraphFormatError(msg: String) {
+      description("unable to load graph")
+      display("could not load graph json: {}", msg)
+    }
+
+    LoadGraphParamsError(msg: String) {
+      description("unable to load graph params")
+      display("could not load graph params: {}", msg)
+    }
+  }
+  foreign_links {
+    Alloc(alloc::AllocErr);
+    GraphDeserialize(serde_json::Error);
+    ParseInt(num::ParseIntError);
+    ShapeError(ndarray::ShapeError);
+  }
+}
+
+impl From<alloc::LayoutErr> for Error {
+  fn from(_err: alloc::LayoutErr) -> Error {
+    Error::from_kind(ErrorKind::Msg("Layout error".to_string()))
+  }
+}
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
new file mode 100644
index 000000000000..e17c66911b18
--- /dev/null
+++ b/rust/src/lib.rs
@@ -0,0 +1,67 @@
+//! This crate is an implementation of the TVM runtime for modules compiled with `--system-lib`.
+//! It's mainly useful for compiling to WebAssembly and SGX,
+//! but also native if you prefer Rust to C++.
+//!
+//! For TVM graphs, the entrypoint to this crate is `runtime::GraphExecutor`.
+//! Single-function modules are used via the `packed_func!` macro after obtaining
+//! the function from `runtime::SystemLibModule`
+//!
+//! The main entrypoints to this crate are `GraphExecutor`
+//! For examples of use, please refer to the multi-file tests in the `tests` directory.
+
+#![feature(
+  alloc,
+  allocator_api,
+  box_syntax,
+  fn_traits,
+  try_from,
+  unboxed_closures,
+  vec_remove_item
+)]
+
+#[cfg(target_env = "sgx")]
+extern crate alloc;
+extern crate bounded_spsc_queue;
+#[cfg(target_env = "sgx")]
+extern crate core;
+#[macro_use]
+extern crate error_chain;
+#[macro_use]
+extern crate itertools;
+#[macro_use]
+extern crate lazy_static;
+extern crate ndarray;
+#[macro_use]
+extern crate nom;
+#[cfg(not(target_env = "sgx"))]
+extern crate num_cpus;
+extern crate serde;
+#[macro_use]
+extern crate serde_derive;
+extern crate serde_json;
+
+pub mod ffi {
+  #![allow(
+    non_camel_case_types,
+    non_snake_case,
+    non_upper_case_globals,
+    unused
+  )]
+
+  pub mod runtime {
+    use std::os::raw::{c_char, c_int, c_void};
+
+    include!(concat!(
+      env!("CARGO_MANIFEST_DIR"),
+      "/src/runtime/c_runtime_api.rs"
+    ));
+
+    pub type BackendPackedCFunc =
+      extern "C" fn(args: *const TVMValue, type_codes: *const c_int, num_args: c_int) -> c_int;
+  }
+}
+
+pub mod errors;
+pub mod runtime;
+
+pub use errors::*;
diff --git a/rust/src/runtime/allocator.rs b/rust/src/runtime/allocator.rs
new file mode 100644
index 000000000000..d704336bff1f
--- /dev/null
+++ b/rust/src/runtime/allocator.rs
@@ -0,0 +1,52 @@
+#[cfg(target_env = "sgx")]
+use alloc::alloc::{self, Layout};
+#[cfg(not(target_env = "sgx"))]
+use std::alloc::{self, Layout};
+
+use errors::*;
+
+const DEFAULT_ALIGN_BYTES: usize = 4;
+
+#[derive(PartialEq, Eq)]
+pub struct Allocation {
+  layout: Layout,
+  ptr: *mut u8,
+}
+
+impl Allocation {
+  /// Allocates a chunk of memory of `size` bytes with optional alignment.
+  pub fn new(size: usize, align: Option<usize>) -> Result<Self> {
+    let alignment = align.unwrap_or(DEFAULT_ALIGN_BYTES);
+    let layout = Layout::from_size_align(size, alignment)?;
+    let ptr = unsafe { alloc::alloc(layout.clone()) };
+    if ptr.is_null() {
+      alloc::handle_alloc_error(layout);
+    }
+    Ok(Self {
+      ptr: ptr,
+      layout: layout,
+    })
+  }
+
+  pub fn as_mut_ptr(&self) -> *mut u8 {
+    self.ptr
+  }
+
+  /// Returns the size of the Allocation in bytes.
+  pub fn size(&self) -> usize {
+    self.layout.size()
+  }
+
+  /// Returns the byte alignment of the Allocation.
+  pub fn align(&self) -> usize {
+    self.layout.align()
+  }
+}
+
+impl Drop for Allocation {
+  fn drop(&mut self) {
+    unsafe {
+      alloc::dealloc(self.ptr, self.layout.clone());
+    }
+  }
+}
diff --git a/rust/src/runtime/array.rs b/rust/src/runtime/array.rs
new file mode 100644
index 000000000000..100258d9a157
--- /dev/null
+++ b/rust/src/runtime/array.rs
@@ -0,0 +1,500 @@
+use std::{
+  any::TypeId,
+  convert::TryFrom,
+  mem,
+  os::raw::{c_int, c_void},
+  ptr, slice,
+};
+
+use ndarray;
+
+use super::allocator::Allocation;
+use errors::*;
+use ffi::runtime::{
+  DLContext, DLDataType, DLDataTypeCode_kDLFloat, DLDataTypeCode_kDLInt, DLDataTypeCode_kDLUInt,
+  DLDeviceType_kDLCPU, DLTensor,
+};
+
+/// A `Storage` is a container which holds `Tensor` data.
+#[derive(PartialEq)]
+pub enum Storage<'a> {
+  /// A `Storage` which owns its contained bytes.
+  Owned(Allocation),
+
+  /// A view of an existing `Storage`.
+  View(&'a mut [u8], usize), // ptr, align
+}
+
+impl<'a> Storage<'a> {
+  pub fn new(size: usize, align: Option<usize>) -> Result<Storage<'static>> {
+    Ok(Storage::Owned(Allocation::new(size, align)?))
+  }
+
+  pub fn as_mut_ptr(&self) -> *mut u8 {
+    match self {
+      Storage::Owned(alloc) => alloc.as_mut_ptr(),
+      Storage::View(slice, _) => slice.as_ptr() as *mut u8,
+    }
+  }
+
+  pub fn size(&self) -> usize {
+    match self {
+      Storage::Owned(alloc) => alloc.size(),
+      Storage::View(slice, _) => slice.len(),
+    }
+  }
+
+  pub fn align(&self) -> usize {
+    match self {
+      Storage::Owned(alloc) => alloc.align(),
+      Storage::View(_, align) => *align,
+    }
+  }
+
+  pub fn as_ptr(&self) -> *const u8 {
+    self.as_mut_ptr() as *const _
+  }
+
+  /// Returns a `Storage::View` which points to an owned `Storage::Owned`.
+  pub fn view(&self) -> Storage<'a> {
+    match self {
+      Storage::Owned(alloc) => Storage::View(
+        unsafe { slice::from_raw_parts_mut(alloc.as_mut_ptr(), self.size()) },
+        self.align(),
+      ),
+      Storage::View(slice, _) => Storage::View(
+        unsafe { slice::from_raw_parts_mut(self.as_mut_ptr(), slice.len()) },
+        self.align(),
+      ),
+    }
+  }
+
+  pub fn is_owned(&self) -> bool {
+    match self {
+      Storage::Owned(_) => true,
+      _ => false,
+    }
+  }
+
+  /// Returns an owned version of this storage via cloning.
+  pub fn to_owned(&self) -> Storage<'static> {
+    let s = Storage::new(self.size(), Some(self.align())).unwrap();
+    unsafe {
+      s.as_mut_ptr()
+        .copy_from_nonoverlapping(self.as_ptr(), self.size())
+    }
+    s
+  }
+}
+
+impl<'a, T> From<&'a [T]> for Storage<'a> {
+  fn from(data: &'a [T]) -> Self {
+    let data = unsafe {
+      slice::from_raw_parts_mut(
+        data.as_ptr() as *const u8 as *mut u8,
+        data.len() * mem::size_of::<T>() as usize,
+      )
+    };
+    Storage::View(data, mem::align_of::<T>())
+  }
+}
+
+/// A n-dimensional array type which can be converted to/from `tvm::DLTensor` and `ndarray::Array`.
+/// `Tensor` is primarily a holder of data which can be operated on via TVM (via `DLTensor`) or
+/// converted to `ndarray::Array` for non-TVM processing.
+///
+/// # Examples
+///
+/// ```
+/// extern crate ndarray;
+///
+/// let mut a_nd: ndarray::Array = ndarray::Array::from_vec(vec![1f32, 2., 3., 4.]);
+/// let mut a: Tensor = a_nd.into();
+/// let mut a_dl: DLTensor = (&mut t).into();
+/// call_packed!(tvm_fn, &mut a_dl);
+///
+/// // Array -> Tensor is mostly useful when post-processing TVM graph outputs.
+/// let mut a_nd = ndarray::Array::try_from(&a).unwrap();
+/// ```
+#[derive(PartialEq)]
+pub struct Tensor<'a> {
+  /// The bytes which contain the data this `Tensor` represents.
+  pub(super) data: Storage<'a>,
+  pub(super) ctx: TVMContext,
+  pub(super) dtype: DataType,
+  pub(super) shape: Vec<i64>, // not usize because `typedef int64_t tvm_index_t` in c_runtime_api.h
+  /// The `Tensor` strides. Can be `None` if the `Tensor` is contiguous.
+  pub(super) strides: Option<Vec<usize>>,
+  pub(super) byte_offset: isize,
+  /// The number of elements in the `Tensor`.
+  pub(super) size: usize,
+}
+
+unsafe impl<'a> Send for Tensor<'a> {}
+
+impl<'a> Tensor<'a> {
+  pub fn shape(&self) -> Vec<i64> {
+    self.shape.clone()
+  }
+
+  /// Returns the data of this `Tensor` as a `Vec`.
+  ///
+  /// # Panics
+  ///
+  /// Panics if the `Tensor` is not contiguous or does not contain elements of type `T`.
+  pub fn to_vec<T: 'static>(&self) -> Vec<T> {
+    assert!(self.is_contiguous());
+    assert!(self.dtype.is_type::<T>());
+    let mut vec: Vec<T> = Vec::with_capacity(self.size * self.dtype.itemsize());
+    unsafe {
+      vec.as_mut_ptr().copy_from_nonoverlapping(
+        self.data.as_ptr().offset(self.byte_offset) as *const T,
+        self.size,
+      );
+      vec.set_len(self.size);
+    }
+    vec
+  }
+
+  /// Returns `true` iff this `Tensor` is represented by a contiguous region of memory.
+  pub fn is_contiguous(&self) -> bool {
+    match self.strides {
+      None => true,
+      Some(ref strides) => {
+        // check that stride for each dimension is the product of all trailing dimensons' shapes
+        self
+          .shape
+          .iter()
+          .zip(strides)
+          .rfold(
+            (true, 1),
+            |(is_contig, expected_stride), (shape, stride)| {
+              (
+                is_contig && *stride == expected_stride,
+                expected_stride * (*shape as usize),
+              )
+            },
+          )
+          .0
+      }
+    }
+  }
+
+  /// Returns a clone of this `Tensor`.
+  ///
+  /// # Panics
+  ///
+  /// Panics if the `Tensor` is not contiguous or does not contain elements of type `T`.
+  pub fn copy(&mut self, other: &Tensor) {
+    assert!(
+      self.dtype == other.dtype && self.size == other.size,
+      "Tensor shape/dtype mismatch."
+    );
+    assert!(
+      self.is_contiguous() && other.is_contiguous(),
+      "copy currently requires contiguous tensors\n`self.strides = {:?}` `other.strides = {:?}`",
+      self.strides,
+      other.strides
+    );
+    unsafe {
+      self
+        .data
+        .as_mut_ptr()
+        .offset(self.byte_offset as isize)
+        .copy_from_nonoverlapping(
+          other.data.as_mut_ptr().offset(other.byte_offset),
+          other.size * other.dtype.itemsize(),
+        );
+    }
+  }
+
+  /// Returns an owned version of this `Tensor` via cloning.
+  pub fn to_owned(&self) -> Tensor<'static> {
+    let t = Tensor {
+      data: self.data.to_owned(),
+      ctx: self.ctx.clone(),
+      dtype: self.dtype.clone(),
+      size: self.size.clone(),
+      shape: self.shape.clone(),
+      strides: None,
+      byte_offset: 0,
+    };
+    unsafe { mem::transmute::<Tensor<'a>, Tensor<'static>>(t) }
+  }
+
+  fn from_array_storage<'s, T, D: ndarray::Dimension>(
+    arr: &ndarray::Array<T, D>,
+    storage: Storage<'s>,
+    type_code: usize,
+  ) -> Tensor<'s> {
+    let type_width = mem::size_of::<T>() as usize;
+    Tensor {
+      data: storage,
+      ctx: TVMContext::default(),
+      dtype: DataType {
+        code: type_code,
+        bits: 8 * type_width,
+        lanes: 1,
+      },
+      size: arr.len(),
+      shape: arr.shape().iter().map(|&v| v as i64).collect(),
+      strides: Some(arr.strides().into_iter().map(|&v| v as usize).collect()),
+      byte_offset: 0,
+    }
+  }
+}
+
+/// Conversions to `ndarray::Array` from `Tensor`, if the types match.
+macro_rules! impl_ndarray_try_from_tensor {
+  ($type:ty, $dtype:expr) => {
+    impl<'a, 't> TryFrom<&'a Tensor<'t>> for ndarray::ArrayD<$type> {
+      type Error = Error;
+      fn try_from(tensor: &'a Tensor) -> Result<ndarray::ArrayD<$type>> {
+        ensure!(
+          tensor.dtype == $dtype,
+          "Cannot convert Tensor with dtype {:?} to ndarray",
+          tensor.dtype
+        );
+        Ok(ndarray::Array::from_shape_vec(
+          tensor
+            .shape
+            .iter()
+            .map(|s| *s as usize)
+            .collect::<Vec<usize>>(),
+          tensor.to_vec::<$type>(),
+        )?)
+      }
+    }
+  };
+}
+
+impl_ndarray_try_from_tensor!(i32, DTYPE_INT32);
+impl_ndarray_try_from_tensor!(u32, DTYPE_UINT32);
+impl_ndarray_try_from_tensor!(f32, DTYPE_FLOAT32);
+impl_ndarray_try_from_tensor!(f64, DTYPE_FLOAT64);
+
+impl DLTensor {
+  pub(super) fn from_tensor<'a>(tensor: &'a Tensor, flatten: bool) -> Self {
+    assert!(!flatten || tensor.is_contiguous());
+    Self {
+      data: unsafe { tensor.data.as_mut_ptr().offset(tensor.byte_offset) } as *mut c_void,
+      ctx: DLContext::from(&tensor.ctx),
+      ndim: if flatten { 1 } else { tensor.shape.len() } as i32,
+      dtype: DLDataType::from(&tensor.dtype),
+      shape: if flatten {
+        &tensor.size as *const _ as *mut i64
+      } else {
+        tensor.shape.as_ptr()
+      } as *mut i64,
+      strides: if flatten || tensor.is_contiguous() {
+        ptr::null_mut()
+      } else {
+        tensor.strides.as_ref().unwrap().as_ptr()
+      } as *mut i64,
+      byte_offset: 0,
+    }
+  }
+}
+
+impl<'a, 't> From<&'a Tensor<'t>> for DLTensor {
+  fn from(tensor: &'a Tensor<'t>) -> Self {
+    DLTensor::from_tensor(tensor, false /* flatten */)
+  }
+}
+
+impl<'a, 't> From<&'a mut Tensor<'t>> for DLTensor {
+  fn from(tensor: &'a mut Tensor<'t>) -> Self {
+    DLTensor::from_tensor(tensor, false /* flatten */)
+  }
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub struct DataType {
+  pub(super) code: usize,
+  pub(super) bits: usize,
+  pub(super) lanes: usize,
+}
+
+impl DataType {
+  /// Returns the number of bytes occupied by an element of this `DataType`.
+  pub fn itemsize(&self) -> usize {
+    (self.bits * self.lanes) >> 3
+  }
+
+  /// Returns whether this `DataType` represents primitive type `T`.
+  pub fn is_type<T: 'static>(&self) -> bool {
+    if self.lanes != 1 {
+      return false;
+    }
+    let typ = TypeId::of::<T>();
+    (typ == TypeId::of::<i32>() && self.code == 0 && self.bits == 32)
+      || (typ == TypeId::of::<i64>() && self.code == 0 && self.bits == 64)
+      || (typ == TypeId::of::<u32>() && self.code == 1 && self.bits == 32)
+      || (typ == TypeId::of::<u64>() && self.code == 1 && self.bits == 64)
+      || (typ == TypeId::of::<f32>() && self.code == 2 && self.bits == 32)
+      || (typ == TypeId::of::<f64>() && self.code == 2 && self.bits == 64)
+  }
+}
+
+impl<'a> From<&'a DataType> for DLDataType {
+  fn from(dtype: &'a DataType) -> Self {
+    Self {
+      code: dtype.code as u8,
+      bits: dtype.bits as u8,
+      lanes: dtype.lanes as u16,
+    }
+  }
+}
+
+impl From<DLDataType> for DataType {
+  fn from(dtype: DLDataType) -> Self {
+    Self {
+      code: dtype.code as usize,
+      bits: dtype.bits as usize,
+      lanes: dtype.lanes as usize,
+    }
+  }
+}
+
+macro_rules! make_dtype_const {
+  ($name: ident, $code: ident, $bits: expr, $lanes: expr) => {
+    const $name: DataType = DataType {
+      code: $code as usize,
+      bits: $bits,
+      lanes: $lanes,
+    };
+  };
+}
+
+make_dtype_const!(DTYPE_INT32, DLDataTypeCode_kDLInt, 32, 1);
+make_dtype_const!(DTYPE_UINT32, DLDataTypeCode_kDLUInt, 32, 1);
+// make_dtype_const!(DTYPE_FLOAT16, DLDataTypeCode_kDLFloat, 16, 1);
+make_dtype_const!(DTYPE_FLOAT32, DLDataTypeCode_kDLFloat, 32, 1);
+make_dtype_const!(DTYPE_FLOAT64, DLDataTypeCode_kDLFloat, 64, 1);
+
+impl Default for DLContext {
+  fn default() -> Self {
+    DLContext {
+      device_type: DLDeviceType_kDLCPU,
+      device_id: 0,
+    }
+  }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub struct TVMContext {
+  pub(super) device_type: usize,
+  pub(super) device_id: usize,
+}
+
+impl<'a> From<&'a TVMContext> for DLContext {
+  fn from(ctx: &'a TVMContext) -> Self {
+    Self {
+      device_type: ctx.device_type as u32,
+      device_id: ctx.device_id as i32,
+    }
+  }
+}
+
+impl Default for TVMContext {
+  fn default() -> Self {
+    Self {
+      device_type: DLDeviceType_kDLCPU as usize,
+      device_id: 0,
+    }
+  }
+}
+
+impl<'a> From<DLTensor> for Tensor<'a> {
+  fn from(dlt: DLTensor) -> Self {
+    unsafe {
+      let dtype = DataType::from(dlt.dtype);
+      let shape = slice::from_raw_parts(dlt.shape, dlt.ndim as usize).to_vec();
+      let size = shape.iter().map(|v| *v as usize).product::<usize>() as usize;
+      let storage = Storage::from(slice::from_raw_parts(
+        dlt.data as *const u8,
+        dtype.itemsize() * size,
+      ));
+      Self {
+        data: storage,
+        ctx: TVMContext::default(),
+        dtype: dtype,
+        size: size,
+        shape: shape,
+        strides: if dlt.strides == ptr::null_mut() {
+          None
+        } else {
+          Some(slice::from_raw_parts_mut(dlt.strides as *mut usize, size).to_vec())
+        },
+        byte_offset: dlt.byte_offset as isize,
+      }
+    }
+  }
+}
+
+/// `From` conversions to `Tensor` for owned or borrowed `ndarray::Array`.
+///
+/// # Panics
+///
+/// Panics if the ndarray is not contiguous.
+macro_rules! impl_tensor_from_ndarray {
+  ($type:ty, $typecode:expr) => {
+    impl<D: ndarray::Dimension> From<ndarray::Array<$type, D>> for Tensor<'static> {
+      fn from(arr: ndarray::Array<$type, D>) -> Self {
+        assert!(arr.is_standard_layout(), "Array must be contiguous.");
+        let size = arr.len() * mem::size_of::<$type>() as usize;
+        let storage =
+          Storage::from(unsafe { slice::from_raw_parts(arr.as_ptr() as *const u8, size) });
+        Tensor::from_array_storage(&arr, storage, $typecode as usize)
+      }
+    }
+    impl<'a, D: ndarray::Dimension> From<&'a ndarray::Array<$type, D>> for Tensor<'a> {
+      fn from(arr: &'a ndarray::Array<$type, D>) -> Self {
+        assert!(arr.is_standard_layout(), "Array must be contiguous.");
+        Tensor::from_array_storage(
+          arr,
+          Storage::from(arr.as_slice().unwrap()),
+          $typecode as usize,
+        )
+      }
+    }
+  };
+}
+
+/// `From` conversions to `DLTensor` for `ndarray::Array`.
+/// Takes a reference to the `ndarray` since `DLTensor` is not owned.
+macro_rules! impl_dltensor_from_ndarray {
+  ($type:ty, $typecode:expr) => {
+    impl<'a, D: ndarray::Dimension> From<&'a mut ndarray::Array<$type, D>> for DLTensor {
+      fn from(arr: &'a mut ndarray::Array<$type, D>) -> Self {
+        DLTensor {
+          data: arr.as_mut_ptr() as *mut c_void,
+          ctx: DLContext::default(),
+          ndim: arr.ndim() as c_int,
+          dtype: DLDataType {
+            code: $typecode as u8,
+            bits: 8 * mem::size_of::<$type>() as u8,
+            lanes: 1,
+          },
+          shape: arr.shape().as_ptr() as *const i64 as *mut i64,
+          strides: arr.strides().as_ptr() as *const isize as *mut i64,
+          byte_offset: 0,
+        }
+      }
+    }
+  };
+}
+
+impl_dltensor_from_ndarray!(f32, DLDataTypeCode_kDLFloat);
+impl_dltensor_from_ndarray!(f64, DLDataTypeCode_kDLFloat);
+impl_dltensor_from_ndarray!(i32, DLDataTypeCode_kDLInt);
+impl_dltensor_from_ndarray!(i64, DLDataTypeCode_kDLInt);
+impl_dltensor_from_ndarray!(u32, DLDataTypeCode_kDLUInt);
+impl_dltensor_from_ndarray!(u64, DLDataTypeCode_kDLUInt);
+
+impl_tensor_from_ndarray!(f32, DLDataTypeCode_kDLFloat);
+impl_tensor_from_ndarray!(f64, DLDataTypeCode_kDLFloat);
+impl_tensor_from_ndarray!(i32, DLDataTypeCode_kDLInt);
+impl_tensor_from_ndarray!(i64, DLDataTypeCode_kDLInt);
+impl_tensor_from_ndarray!(u32, DLDataTypeCode_kDLUInt);
+impl_tensor_from_ndarray!(u64, DLDataTypeCode_kDLUInt);
diff --git a/rust/src/runtime/c_runtime_api.rs b/rust/src/runtime/c_runtime_api.rs
new file mode 100644
index 000000000000..6facf9ca274f
--- /dev/null
+++ b/rust/src/runtime/c_runtime_api.rs
@@ -0,0 +1,770 @@
+/* automatically generated by rust-bindgen for TVM revision 6292c78 */
+
+pub const TVM_VERSION: &'static [u8; 8usize] = b"0.5.dev\0";
+pub const DLPACK_VERSION: u32 = 8;
+pub const _STDINT_H: u32 = 1;
+pub const _FEATURES_H: u32 = 1;
+pub const _DEFAULT_SOURCE: u32 = 1;
+pub const __USE_ISOC11: u32 = 1;
+pub const __USE_ISOC99: u32 = 1;
+pub const __USE_ISOC95: u32 = 1;
+pub const __USE_POSIX_IMPLICITLY: u32 = 1;
+pub const _POSIX_SOURCE: u32 = 1;
+pub const _POSIX_C_SOURCE: u32 = 200809;
+pub const __USE_POSIX: u32 = 1;
+pub const __USE_POSIX2: u32 = 1;
+pub const __USE_POSIX199309: u32 = 1;
+pub const __USE_POSIX199506: u32 = 1;
+pub const __USE_XOPEN2K: u32 = 1;
+pub const __USE_XOPEN2K8: u32 = 1;
+pub const _ATFILE_SOURCE: u32 = 1;
+pub const __USE_MISC: u32 = 1;
+pub const __USE_ATFILE: u32 = 1;
+pub const __USE_FORTIFY_LEVEL: u32 = 0;
+pub const _STDC_PREDEF_H: u32 = 1;
+pub const __STDC_IEC_559__: u32 = 1;
+pub const __STDC_IEC_559_COMPLEX__: u32 = 1;
+pub const __STDC_ISO_10646__: u32 = 201505;
+pub const __STDC_NO_THREADS__: u32 = 1;
+pub const __GNU_LIBRARY__: u32 = 6;
+pub const __GLIBC__: u32 = 2;
+pub const __GLIBC_MINOR__: u32 = 23;
+pub const _SYS_CDEFS_H: u32 = 1;
+pub const __WORDSIZE: u32 = 64;
+pub const __WORDSIZE_TIME64_COMPAT32: u32 = 1;
+pub const __SYSCALL_WORDSIZE: u32 = 64;
+pub const _BITS_WCHAR_H: u32 = 1;
+pub const INT8_MIN: i32 = -128;
+pub const INT16_MIN: i32 = -32768;
+pub const INT32_MIN: i32 = -2147483648;
+pub const INT8_MAX: u32 = 127;
+pub const INT16_MAX: u32 = 32767;
+pub const INT32_MAX: u32 = 2147483647;
+pub const UINT8_MAX: u32 = 255;
+pub const UINT16_MAX: u32 = 65535;
+pub const UINT32_MAX: u32 = 4294967295;
+pub const INT_LEAST8_MIN: i32 = -128;
+pub const INT_LEAST16_MIN: i32 = -32768;
+pub const INT_LEAST32_MIN: i32 = -2147483648;
+pub const INT_LEAST8_MAX: u32 = 127;
+pub const INT_LEAST16_MAX: u32 = 32767;
+pub const INT_LEAST32_MAX: u32 = 2147483647;
+pub const UINT_LEAST8_MAX: u32 = 255;
+pub const UINT_LEAST16_MAX: u32 = 65535;
+pub const UINT_LEAST32_MAX: u32 = 4294967295;
+pub const INT_FAST8_MIN: i32 = -128;
+pub const INT_FAST16_MIN: i64 = -9223372036854775808;
+pub const INT_FAST32_MIN: i64 = -9223372036854775808;
+pub const INT_FAST8_MAX: u32 = 127;
+pub const INT_FAST16_MAX: u64 = 9223372036854775807;
+pub const INT_FAST32_MAX: u64 = 9223372036854775807;
+pub const UINT_FAST8_MAX: u32 = 255;
+pub const UINT_FAST16_MAX: i32 = -1;
+pub const UINT_FAST32_MAX: i32 = -1;
+pub const INTPTR_MIN: i64 = -9223372036854775808;
+pub const INTPTR_MAX: u64 = 9223372036854775807;
+pub const UINTPTR_MAX: i32 = -1;
+pub const PTRDIFF_MIN: i64 = -9223372036854775808;
+pub const PTRDIFF_MAX: u64 = 9223372036854775807;
+pub const SIG_ATOMIC_MIN: i32 = -2147483648;
+pub const SIG_ATOMIC_MAX: u32 = 2147483647;
+pub const SIZE_MAX: i32 = -1;
+pub const WINT_MIN: u32 = 0;
+pub const WINT_MAX: u32 = 4294967295;
+pub type int_least8_t = ::std::os::raw::c_schar;
+pub type int_least16_t = ::std::os::raw::c_short;
+pub type int_least32_t = ::std::os::raw::c_int;
+pub type int_least64_t = ::std::os::raw::c_long;
+pub type uint_least8_t = ::std::os::raw::c_uchar;
+pub type uint_least16_t = ::std::os::raw::c_ushort;
+pub type uint_least32_t = ::std::os::raw::c_uint;
+pub type uint_least64_t = ::std::os::raw::c_ulong;
+pub type int_fast8_t = ::std::os::raw::c_schar;
+pub type int_fast16_t = ::std::os::raw::c_long;
+pub type int_fast32_t = ::std::os::raw::c_long;
+pub type int_fast64_t = ::std::os::raw::c_long;
+pub type uint_fast8_t = ::std::os::raw::c_uchar;
+pub type uint_fast16_t = ::std::os::raw::c_ulong;
+pub type uint_fast32_t = ::std::os::raw::c_ulong;
+pub type uint_fast64_t = ::std::os::raw::c_ulong;
+pub type intmax_t = ::std::os::raw::c_long;
+pub type uintmax_t = ::std::os::raw::c_ulong;
+pub type wchar_t = ::std::os::raw::c_int;
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct max_align_t {
+  pub __clang_max_align_nonce1: ::std::os::raw::c_longlong,
+  pub __bindgen_padding_0: u64,
+  pub __clang_max_align_nonce2: f64,
+}
+pub const DLDeviceType_kDLCPU: DLDeviceType = 1;
+pub const DLDeviceType_kDLGPU: DLDeviceType = 2;
+pub const DLDeviceType_kDLCPUPinned: DLDeviceType = 3;
+pub const DLDeviceType_kDLOpenCL: DLDeviceType = 4;
+pub const DLDeviceType_kDLMetal: DLDeviceType = 8;
+pub const DLDeviceType_kDLVPI: DLDeviceType = 9;
+pub const DLDeviceType_kDLROCM: DLDeviceType = 10;
+/// \brief The device type in DLContext.
+pub type DLDeviceType = u32;
+/// \brief A Device context for Tensor and operator.
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct DLContext {
+  /// \brief The device type used in the device.
+  pub device_type: DLDeviceType,
+  /// \brief The device index
+  pub device_id: ::std::os::raw::c_int,
+}
+pub const DLDataTypeCode_kDLInt: DLDataTypeCode = 0;
+pub const DLDataTypeCode_kDLUInt: DLDataTypeCode = 1;
+pub const DLDataTypeCode_kDLFloat: DLDataTypeCode = 2;
+/// \brief The type code options DLDataType.
+pub type DLDataTypeCode = u32;
+/// \brief The data type the tensor can hold.
+///
+/// Examples
+/// - float: type_code = 2, bits = 32, lanes=1
+/// - float4(vectorized 4 float): type_code = 2, bits = 32, lanes=4
+/// - int8: type_code = 0, bits = 8, lanes=1
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct DLDataType {
+  /// \brief Type code of base types.
+  /// We keep it uint8_t instead of DLDataTypeCode for minimal memory
+  /// footprint, but the value should be one of DLDataTypeCode enum values.
+  ///
+  pub code: u8,
+  /// \brief Number of bits, common choices are 8, 16, 32.
+  pub bits: u8,
+  /// \brief Number of lanes in the type, used for vector types.
+  pub lanes: u16,
+}
+/// \brief Plain C Tensor object, does not manage memory.
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct DLTensor {
+  /// \brief The opaque data pointer points to the allocated data.
+  /// This will be CUDA device pointer or cl_mem handle in OpenCL.
+  /// This pointer is always aligns to 256 bytes as in CUDA.
+  pub data: *mut ::std::os::raw::c_void,
+  /// \brief The device context of the tensor
+  pub ctx: DLContext,
+  /// \brief Number of dimensions
+  pub ndim: ::std::os::raw::c_int,
+  /// \brief The data type of the pointer
+  pub dtype: DLDataType,
+  /// \brief The shape of the tensor
+  pub shape: *mut i64,
+  /// \brief strides of the tensor,
+  /// can be NULL, indicating tensor is compact.
+  pub strides: *mut i64,
+  /// \brief The offset in bytes to the beginning pointer to data
+  pub byte_offset: u64,
+}
+/// \brief C Tensor object, manage memory of DLTensor. This data structure is
+/// intended to faciliate the borrowing of DLTensor by another framework. It is
+/// not meant to transfer the tensor. When the borrowing framework doesn't need
+/// the tensor, it should call the deleter to notify the host that the resource
+/// is no longer needed.
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct DLManagedTensor {
+  /// \brief DLTensor which is being memory managed
+  pub dl_tensor: DLTensor,
+  /// \brief the context of the original host framework of DLManagedTensor in
+  /// which DLManagedTensor is used in the framework. It can also be NULL.
+  pub manager_ctx: *mut ::std::os::raw::c_void,
+  /// \brief Destructor signature void (*)(void*) - this should be called
+  /// to destruct manager_ctx which holds the DLManagedTensor. It can be NULL
+  /// if there is no way for the caller to provide a reasonable destructor.
+  pub deleter: ::std::option::Option<unsafe extern "C" fn(self_: *mut DLManagedTensor)>,
+}
+/// \brief type of array index.
+pub type tvm_index_t = i64;
+pub const TVMDeviceExtType_kDLAOCL: TVMDeviceExtType = 5;
+pub const TVMDeviceExtType_kDLSDAccel: TVMDeviceExtType = 6;
+pub const TVMDeviceExtType_kDLVulkan: TVMDeviceExtType = 7;
+pub const TVMDeviceExtType_kOpenGL: TVMDeviceExtType = 11;
+pub const TVMDeviceExtType_kExtDev: TVMDeviceExtType = 12;
+/// \brief Extension device types in TVM
+pub type TVMDeviceExtType = u32;
+pub const TVMTypeCode_kHandle: TVMTypeCode = 3;
+pub const TVMTypeCode_kNull: TVMTypeCode = 4;
+pub const TVMTypeCode_kTVMType: TVMTypeCode = 5;
+pub const TVMTypeCode_kTVMContext: TVMTypeCode = 6;
+pub const TVMTypeCode_kArrayHandle: TVMTypeCode = 7;
+pub const TVMTypeCode_kNodeHandle: TVMTypeCode = 8;
+pub const TVMTypeCode_kModuleHandle: TVMTypeCode = 9;
+pub const TVMTypeCode_kFuncHandle: TVMTypeCode = 10;
+pub const TVMTypeCode_kStr: TVMTypeCode = 11;
+pub const TVMTypeCode_kBytes: TVMTypeCode = 12;
+pub const TVMTypeCode_kNDArrayContainer: TVMTypeCode = 13;
+pub const TVMTypeCode_kExtBegin: TVMTypeCode = 15;
+pub const TVMTypeCode_kNNVMFirst: TVMTypeCode = 16;
+pub const TVMTypeCode_kNNVMLast: TVMTypeCode = 20;
+pub const TVMTypeCode_kExtReserveEnd: TVMTypeCode = 64;
+pub const TVMTypeCode_kExtEnd: TVMTypeCode = 128;
+/// \brief The type code in TVMType
+/// \note TVMType is used in two places.
+pub type TVMTypeCode = u32;
+/// \brief The data type used in TVM Runtime.
+///
+/// Examples
+/// - float: type_code = 2, bits = 32, lanes=1
+/// - float4(vectorized 4 float): type_code = 2, bits = 32, lanes=4
+/// - int8: type_code = 0, bits = 8, lanes=1
+///
+/// \note Arguments TVM API function always takes bits=64 and lanes=1
+pub type TVMType = DLDataType;
+/// \brief The Device information, abstract away common device types.
+pub type TVMContext = DLContext;
+/// \brief The tensor array stucture to TVM API.
+pub type TVMArray = DLTensor;
+/// \brief the array handle
+pub type TVMArrayHandle = *mut TVMArray;
+/// \brief Union type of values
+/// being passed through API and function calls.
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub union TVMValue {
+  pub v_int64: i64,
+  pub v_float64: f64,
+  pub v_handle: *mut ::std::os::raw::c_void,
+  pub v_str: *const ::std::os::raw::c_char,
+  pub v_type: TVMType,
+  pub v_ctx: TVMContext,
+  _bindgen_union_align: u64,
+}
+/// \brief Byte array type used to pass in byte array
+/// When kBytes is used as data type.
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct TVMByteArray {
+  pub data: *const ::std::os::raw::c_char,
+  pub size: usize,
+}
+/// \brief Handle to TVM runtime modules.
+pub type TVMModuleHandle = *mut ::std::os::raw::c_void;
+/// \brief Handle to packed function handle.
+pub type TVMFunctionHandle = *mut ::std::os::raw::c_void;
+/// \brief Handle to hold return value.
+pub type TVMRetValueHandle = *mut ::std::os::raw::c_void;
+/// \brief The stream that is specific to device
+/// can be NULL, which indicates the default one.
+pub type TVMStreamHandle = *mut ::std::os::raw::c_void;
+extern "C" {
+  /// \brief Used for implementing C API function.
+  /// Set last error message before return.
+  /// \param msg The error message to be set.
+  pub fn TVMAPISetLastError(msg: *const ::std::os::raw::c_char);
+}
+extern "C" {
+  /// \brief return str message of the last error
+  /// all function in this file will return 0 when success
+  /// and -1 when an error occured,
+  /// TVMGetLastError can be called to retrieve the error
+  ///
+  /// this function is threadsafe and can be called by different thread
+  /// \return error info
+  pub fn TVMGetLastError() -> *const ::std::os::raw::c_char;
+}
+extern "C" {
+  /// \brief Load module from file.
+  /// \param file_name The file name to load the module from.
+  /// \param format The format of the module.
+  /// \param out The result module
+  ///
+  /// \return 0 when success, -1 when failure happens
+  /// \note The resulting module do not contain import relation.
+  /// It can be reconstructed by TVMModImport.
+  pub fn TVMModLoadFromFile(
+    file_name: *const ::std::os::raw::c_char,
+    format: *const ::std::os::raw::c_char,
+    out: *mut TVMModuleHandle,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Add dep to mod's dependency.
+  /// This allows functions in this module to use modules.
+  ///
+  /// \param mod The module handle.
+  /// \param dep The dependent module to be imported.
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMModImport(mod_: TVMModuleHandle, dep: TVMModuleHandle) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Get function from the module.
+  /// \param mod The module handle.
+  /// \param func_name The name of the function.
+  /// \param query_imports Whether to query imported modules
+  /// \param out The result function, can be NULL if it is not available.
+  /// \return 0 when no error is thrown, -1 when failure happens
+  pub fn TVMModGetFunction(
+    mod_: TVMModuleHandle,
+    func_name: *const ::std::os::raw::c_char,
+    query_imports: ::std::os::raw::c_int,
+    out: *mut TVMFunctionHandle,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Free front-end extension type resource.
+  /// \param handle The extension handle.
+  /// \param type_code The type of of the extension type.
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMExtTypeFree(
+    handle: *mut ::std::os::raw::c_void,
+    type_code: ::std::os::raw::c_int,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Free the Module
+  /// \param mod The module to be freed.
+  ///
+  /// \note This may not free up the module's resources.
+  /// If there is active TVMFunctionHandle uses the module
+  /// Or if this module is imported by another active module.
+  ///
+  /// The all functions remains valid until TVMFuncFree is called.
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMModFree(mod_: TVMModuleHandle) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Free the function when it is no longer needed.
+  /// \param func The function handle
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMFuncFree(func: TVMFunctionHandle) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Call a Packed TVM Function.
+  ///
+  /// \param func node handle of the function.
+  /// \param arg_values The arguments
+  /// \param type_codes The type codes of the arguments
+  /// \param num_args Number of arguments.
+  ///
+  /// \param ret_val The return value.
+  /// \param ret_type_code the type code of return value.
+  ///
+  /// \return 0 when success, -1 when failure happens
+  /// \note TVM calls always exchanges with type bits=64, lanes=1
+  ///
+  /// \note API calls always exchanges with type bits=64, lanes=1
+  /// If API call returns container handles (e.g. FunctionHandle)
+  /// these handles should be managed by the front-end.
+  /// The front-end need to call free function (e.g. TVMFuncFree)
+  /// to free these handles.
+  pub fn TVMFuncCall(
+    func: TVMFunctionHandle,
+    arg_values: *mut TVMValue,
+    type_codes: *mut ::std::os::raw::c_int,
+    num_args: ::std::os::raw::c_int,
+    ret_val: *mut TVMValue,
+    ret_type_code: *mut ::std::os::raw::c_int,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Set the return value of TVMPackedCFunc.
+  ///
+  /// This function is called by TVMPackedCFunc to set the return value.
+  /// When this function is not called, the function returns null by default.
+  ///
+  /// \param ret The return value handle, pass by ret in TVMPackedCFunc
+  /// \param value The value to be returned.
+  /// \param type_code The type of the value to be returned.
+  /// \param num_ret Number of return values, for now only 1 is supported.
+  pub fn TVMCFuncSetReturn(
+    ret: TVMRetValueHandle,
+    value: *mut TVMValue,
+    type_code: *mut ::std::os::raw::c_int,
+    num_ret: ::std::os::raw::c_int,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Inplace translate callback argument value to return value.
+  /// This is only needed for non-POD arguments.
+  ///
+  /// \param value The value to be translated.
+  /// \param code The type code to be translated.
+  /// \note This function will do a shallow copy when necessary.
+  ///
+  /// \return 0 when success, -1 when failure happens.
+  pub fn TVMCbArgToReturn(
+    value: *mut TVMValue,
+    code: ::std::os::raw::c_int,
+  ) -> ::std::os::raw::c_int;
+}
+/// \brief C type of packed function.
+///
+/// \param args The arguments
+/// \param type_codes The type codes of the arguments
+/// \param num_args Number of arguments.
+/// \param ret The return value handle.
+/// \param resource_handle The handle additional resouce handle from fron-end.
+/// \return 0 if success, -1 if failure happens, set error via TVMAPISetLastError.
+/// \sa TVMCFuncSetReturn
+pub type TVMPackedCFunc = ::std::option::Option<
+  unsafe extern "C" fn(
+    args: *mut TVMValue,
+    type_codes: *mut ::std::os::raw::c_int,
+    num_args: ::std::os::raw::c_int,
+    ret: TVMRetValueHandle,
+    resource_handle: *mut ::std::os::raw::c_void,
+  ) -> ::std::os::raw::c_int,
+>;
+/// \brief C callback to free the resource handle in C packed function.
+/// \param resource_handle The handle additional resouce handle from fron-end.
+pub type TVMPackedCFuncFinalizer =
+  ::std::option::Option<unsafe extern "C" fn(resource_handle: *mut ::std::os::raw::c_void)>;
+/// \brief Signature for extension function declarer.
+///
+/// TVM call this function to get the extension functions
+/// The declarer will call register_func to register function and their name.
+///
+/// \param register_func_handle The register function
+/// \return 0 if success, -1 if failure happens
+pub type TVMExtensionFuncDeclarer = ::std::option::Option<
+  unsafe extern "C" fn(register_func_handle: TVMFunctionHandle) -> ::std::os::raw::c_int,
+>;
+extern "C" {
+  /// \brief Wrap a TVMPackedCFunc to become a FunctionHandle.
+  ///
+  /// The resource_handle will be managed by TVM API, until the function is no longer used.
+  ///
+  /// \param func The packed C function.
+  /// \param resource_handle The resource handle from front-end, can be NULL.
+  /// \param fin The finalizer on resource handle when the FunctionHandle get freed, can be NULL
+  /// \param out the result function handle.
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMFuncCreateFromCFunc(
+    func: TVMPackedCFunc,
+    resource_handle: *mut ::std::os::raw::c_void,
+    fin: TVMPackedCFuncFinalizer,
+    out: *mut TVMFunctionHandle,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Register the function to runtime's global table.
+  ///
+  /// The registered function then can be pulled by the backend by the name.
+  ///
+  /// \param name The name of the function.
+  /// \param f The function to be registered.
+  /// \param override Whether allow override already registered function.
+  pub fn TVMFuncRegisterGlobal(
+    name: *const ::std::os::raw::c_char,
+    f: TVMFunctionHandle,
+    override_: ::std::os::raw::c_int,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Get a global function.
+  ///
+  /// \param name The name of the function.
+  /// \param out the result function pointer, NULL if it does not exist.
+  ///
+  /// \note The function handle of global function is managed by TVM runtime,
+  /// So TVMFuncFree is should not be called when it get deleted.
+  pub fn TVMFuncGetGlobal(
+    name: *const ::std::os::raw::c_char,
+    out: *mut TVMFunctionHandle,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief List all the globally registered function name
+  /// \param out_size The number of functions
+  /// \param out_array The array of function names.
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMFuncListGlobalNames(
+    out_size: *mut ::std::os::raw::c_int,
+    out_array: *mut *mut *const ::std::os::raw::c_char,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Allocate a nd-array's memory,
+  /// including space of shape, of given spec.
+  ///
+  /// \param shape The shape of the array, the data content will be copied to out
+  /// \param ndim The number of dimension of the array.
+  /// \param dtype_code The type code of the dtype
+  /// \param dtype_bits The number of bits of dtype
+  /// \param dtype_lanes The number of lanes in the dtype.
+  /// \param device_type The device type of context
+  /// \param device_id The device id of context.
+  /// \param out The output handle.
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMArrayAlloc(
+    shape: *const tvm_index_t,
+    ndim: ::std::os::raw::c_int,
+    dtype_code: ::std::os::raw::c_int,
+    dtype_bits: ::std::os::raw::c_int,
+    dtype_lanes: ::std::os::raw::c_int,
+    device_type: ::std::os::raw::c_int,
+    device_id: ::std::os::raw::c_int,
+    out: *mut TVMArrayHandle,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Free the TVM Array.
+  /// \param handle The array handle to be freed.
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMArrayFree(handle: TVMArrayHandle) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Copy array data from CPU byte array.
+  /// \param handle The array handle.
+  /// \param data the data pointer
+  /// \param nbytes The number of bytes to copy.
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMArrayCopyFromBytes(
+    handle: TVMArrayHandle,
+    data: *mut ::std::os::raw::c_void,
+    nbytes: usize,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Copy array data to CPU byte array.
+  /// \param handle The array handle.
+  /// \param data the data pointer
+  /// \param nbytes The number of bytes to copy.
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMArrayCopyToBytes(
+    handle: TVMArrayHandle,
+    data: *mut ::std::os::raw::c_void,
+    nbytes: usize,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Copy the array, both from and to must be valid during the copy.
+  /// \param from The array to be copied from.
+  /// \param to The target space.
+  /// \param stream The stream where the copy happens, can be NULL.
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMArrayCopyFromTo(
+    from: TVMArrayHandle,
+    to: TVMArrayHandle,
+    stream: TVMStreamHandle,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Produce an array from the DLManagedTensor that shares data memory
+  /// with the DLManagedTensor.
+  /// \param from The source DLManagedTensor.
+  /// \param out The output array handle.
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMArrayFromDLPack(
+    from: *mut DLManagedTensor,
+    out: *mut TVMArrayHandle,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Produce a DLMangedTensor from the array that shares data memory with
+  /// the array.
+  /// \param from The source array.
+  /// \param out The DLManagedTensor handle.
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMArrayToDLPack(
+    from: TVMArrayHandle,
+    out: *mut *mut DLManagedTensor,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Delete (free) a DLManagedTensor's data.
+  /// \param dltensor Pointer to the DLManagedTensor.
+  pub fn TVMDLManagedTensorCallDeleter(dltensor: *mut DLManagedTensor);
+}
+extern "C" {
+  /// \brief Create a new runtime stream.
+  ///
+  /// \param device_type The device type of context
+  /// \param device_id The device id of context
+  /// \param out The new stream handle
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMStreamCreate(
+    device_type: ::std::os::raw::c_int,
+    device_id: ::std::os::raw::c_int,
+    out: *mut TVMStreamHandle,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Free a created stream handle.
+  ///
+  /// \param device_type The device type of context
+  /// \param device_id The device id of context
+  /// \param stream The stream to be freed
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMStreamFree(
+    device_type: ::std::os::raw::c_int,
+    device_id: ::std::os::raw::c_int,
+    stream: TVMStreamHandle,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Set the runtime stream of current thread to be stream.
+  /// The subsequent calls to the same device_type
+  /// will use the setted stream handle.
+  /// The specific type of stream is runtime device dependent.
+  ///
+  /// \param device_type The device type of context
+  /// \param device_id The device id of context.
+  /// \param handle The stream handle.
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMSetStream(
+    device_type: ::std::os::raw::c_int,
+    device_id: ::std::os::raw::c_int,
+    handle: TVMStreamHandle,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Wait until all computations on stream completes.
+  ///
+  /// \param device_type The device type of context
+  /// \param device_id The device id of context.
+  /// \param stream The stream to be synchronized.
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMSynchronize(
+    device_type: ::std::os::raw::c_int,
+    device_id: ::std::os::raw::c_int,
+    stream: TVMStreamHandle,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Synchronize two streams of execution.
+  ///
+  /// \param device_type The device type of context
+  /// \param device_id The device id of context
+  /// \param src The source stream to synchronize.
+  /// \param dst The destination stream to synchronize.
+  /// \return 0 when success, -1 when failure happens
+  pub fn TVMStreamStreamSynchronize(
+    device_type: ::std::os::raw::c_int,
+    device_id: ::std::os::raw::c_int,
+    src: TVMStreamHandle,
+    dst: TVMStreamHandle,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Backend function for modules to get function
+  /// from its environment mod_node (its imports and global function).
+  /// The user do should not call TVMFuncFree on func.
+  ///
+  /// \param mod_node The module handle.
+  /// \param func_name The name of the function.
+  /// \param out The result function.
+  /// \return 0 when no error is thrown, -1 when failure happens
+  pub fn TVMBackendGetFuncFromEnv(
+    mod_node: *mut ::std::os::raw::c_void,
+    func_name: *const ::std::os::raw::c_char,
+    out: *mut TVMFunctionHandle,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Backend function to register system-wide library symbol.
+  ///
+  /// \param name The name of the symbol
+  /// \param ptr The symbol address.
+  /// \return 0 when no error is thrown, -1 when failure happens
+  pub fn TVMBackendRegisterSystemLibSymbol(
+    name: *const ::std::os::raw::c_char,
+    ptr: *mut ::std::os::raw::c_void,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Backend function to allocate temporal workspace.
+  ///
+  /// \note The result allocate spaced is ensured to be aligned to kTempAllocaAlignment.
+  ///
+  /// \param nbytes The size of the space requested.
+  /// \param device_type The device type which the space will be allocated.
+  /// \param device_id The device id which the space will be allocated.
+  /// \param dtype_code_hint The type code of the array elements. Only used in
+  /// certain backends such as OpenGL.
+  /// \param dtype_bits_hint The type bits of the array elements. Only used in
+  /// certain backends such as OpenGL.
+  /// \return nullptr when error is thrown, a valid ptr if success
+  pub fn TVMBackendAllocWorkspace(
+    device_type: ::std::os::raw::c_int,
+    device_id: ::std::os::raw::c_int,
+    nbytes: u64,
+    dtype_code_hint: ::std::os::raw::c_int,
+    dtype_bits_hint: ::std::os::raw::c_int,
+  ) -> *mut ::std::os::raw::c_void;
+}
+extern "C" {
+  /// \brief Backend function to free temporal workspace.
+  ///
+  /// \param ptr The result allocated space pointer.
+  /// \param device_type The device type which the space will be allocated.
+  /// \param device_id The device id which the space will be allocated.
+  /// \return 0 when no error is thrown, -1 when failure happens
+  ///
+  /// \sa TVMBackendAllocWorkspace
+  pub fn TVMBackendFreeWorkspace(
+    device_type: ::std::os::raw::c_int,
+    device_id: ::std::os::raw::c_int,
+    ptr: *mut ::std::os::raw::c_void,
+  ) -> ::std::os::raw::c_int;
+}
+/// \brief Environment for TVM parallel task.
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct TVMParallelGroupEnv {
+  /// \brief Auxiliary used for synchronization
+  pub sync_handle: *mut ::std::os::raw::c_void,
+  /// \brief total amount of task
+  pub num_task: i32,
+}
+/// \brief The callback function to execute a parallel lambda
+/// \param task_id the task id of the function.
+/// \param penv The parallel environment backs the execution.
+/// \param cdata The supporting closure data.
+pub type FTVMParallelLambda = ::std::option::Option<
+  unsafe extern "C" fn(
+    task_id: ::std::os::raw::c_int,
+    penv: *mut TVMParallelGroupEnv,
+    cdata: *mut ::std::os::raw::c_void,
+  ) -> ::std::os::raw::c_int,
+>;
+extern "C" {
+  /// \brief Backend function for running parallel jobs.
+  ///
+  /// \param flambda The parallel function to be launched.
+  /// \param cdata The closure data.
+  /// \param num_task Number of tasks to launch, can be 0, means launch
+  /// with all available threads.
+  ///
+  /// \return 0 when no error is thrown, -1 when failure happens
+  pub fn TVMBackendParallelLaunch(
+    flambda: FTVMParallelLambda,
+    cdata: *mut ::std::os::raw::c_void,
+    num_task: ::std::os::raw::c_int,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief BSP barrrier between parallel threads
+  /// \param task_id the task id of the function.
+  /// \param penv The parallel environment backs the execution.
+  /// \return 0 when no error is thrown, -1 when failure happens
+  pub fn TVMBackendParallelBarrier(
+    task_id: ::std::os::raw::c_int,
+    penv: *mut TVMParallelGroupEnv,
+  ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+  /// \brief Simple static initialization function.
+  /// Run f once and set handle to be not null.
+  /// This function is mainly used for test purpose.
+  ///
+  /// \param handle An global address to indicate f
+  /// \param f The function to be ran
+  /// \param cdata The closure data to pass to the function.
+  /// \param nbytes Number of bytes in the closure data.
+  /// \return 0 when no error is thrown, -1 when failure happens
+  pub fn TVMBackendRunOnce(
+    handle: *mut *mut ::std::os::raw::c_void,
+    f: ::std::option::Option<
+      unsafe extern "C" fn(arg1: *mut ::std::os::raw::c_void) -> ::std::os::raw::c_int,
+    >,
+    cdata: *mut ::std::os::raw::c_void,
+    nbytes: ::std::os::raw::c_int,
+  ) -> ::std::os::raw::c_int;
+}
diff --git a/rust/src/runtime/graph.rs b/rust/src/runtime/graph.rs
new file mode 100644
index 000000000000..08fbd5938380
--- /dev/null
+++ b/rust/src/runtime/graph.rs
@@ -0,0 +1,472 @@
+use std::{cmp, collections::HashMap, convert::TryFrom, iter::FromIterator, mem, str};
+
+use nom::{alpha1, digit1, le_i32, le_i64, le_u16, le_u32, le_u64, le_u8, types::CompleteStr};
+use serde;
+use serde_json;
+
+use super::{DataType, Module, Storage, TVMArgValue, TVMContext, Tensor};
+use errors::{Error, ErrorKind, Result};
+use ffi::runtime::{
+  DLDataTypeCode_kDLFloat, DLDataTypeCode_kDLInt, DLDataTypeCode_kDLUInt, DLTensor,
+};
+
+// Magic number for NDArray file. @see `kTVMNDArrayMagic` in `ndarray.h`
+const _NDARRAY_MAGIC: u64 = 0xDD5E40F096B4A13F;
+// Magic number for NDArray list file. @see `kTVMNDArrayListMagic` in `graph_runtime.h`
+const _NDARRAY_LIST_MAGIC: u64 = 0xF7E58D4F05049CB7;
+
+/// A TVM computation graph.
+///
+/// # Examples
+///
+/// ```
+/// let graph_json = fs::read_to_string("graph.json")).unwrap();
+/// let graph = Graph::try_from(&graph_json).unwrap();
+/// ```
+#[derive(Serialize, Deserialize, Debug)]
+pub struct Graph {
+  pub nodes: Vec<Node>,
+  pub arg_nodes: Vec<usize>,
+  pub heads: Vec<Entry>,
+  pub node_row_ptr: Option<Vec<usize>>,
+  pub attrs: Option<HashMap<String, serde_json::Value>>,
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct Entry {
+  pub id: usize,
+  pub index: usize,
+  pub version: usize,
+}
+
+impl Graph {
+  fn entry_index(&self, entry: &Entry) -> Result<usize> {
+    self
+      .node_row_ptr
+      .as_ref()
+      .map(|nrp| nrp[entry.id] + entry.index)
+      .ok_or("Missing node_row_ptr.".into())
+  }
+
+  /// Attempt to deserialize a JSON attribute to a type `T`.
+  fn get_attr<T: serde::de::DeserializeOwned>(&self, attr: &str) -> Result<T> {
+    Ok(serde_json::from_value::<T>(
+      self
+        .attrs
+        .as_ref()
+        .ok_or(ErrorKind::GraphFormatError(
+          "Missing graph attrs".to_string(),
+        ))?
+        .get(attr)
+        .ok_or(ErrorKind::GraphFormatError(format!(
+          "Missing {} attr",
+          attr
+        )))?
+        .to_owned(),
+    )?)
+  }
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct Node {
+  pub op: String,
+  pub name: String,
+  pub inputs: Vec<Entry>,
+  pub attrs: Option<HashMap<String, String>>,
+  pub control_deps: Option<Vec<Entry>>,
+}
+
+struct NodeAttrs {
+  func_name: String,
+  num_outputs: usize,
+  flatten_data: bool,
+}
+
+impl Node {
+  fn parse_attrs(&self) -> Result<NodeAttrs> {
+    let attrs = self
+      .attrs
+      .as_ref()
+      .ok_or(format!("Missing node.attrs for `{}`", self.name))?;
+    let func_name = attrs
+      .get("func_name")
+      .ok_or(format!("Node `{}` is missing attrs.func_name", self.name))?
+      .to_string();
+    let num_outputs = attrs
+      .get("num_outputs")
+      .ok_or(format!("Node `{}` is missing attrs.num_outputs", self.name))?
+      .parse::<usize>()?;
+    let flatten_data = attrs
+      .get("flatten_data")
+      .ok_or(format!(
+        "Node `{}` is missing attrs.flatten_data",
+        self.name
+      ))?
+      .parse::<u8>()?
+      == 1;
+    Ok(NodeAttrs {
+      func_name,
+      num_outputs,
+      flatten_data,
+    })
+  }
+}
+
+impl<'a> TryFrom<&'a String> for Graph {
+  type Error = Error;
+  fn try_from(graph_json: &String) -> Result<Self> {
+    let graph = serde_json::from_str(graph_json)?;
+    Ok(graph)
+  }
+}
+
+impl<'a> TryFrom<&'a str> for Graph {
+  type Error = Error;
+  fn try_from(graph_json: &'a str) -> Result<Self> {
+    let graph = serde_json::from_str(graph_json)?;
+    Ok(graph)
+  }
+}
+
+/// A executor for a TVM computation graph.
+///
+/// # Examples
+///
+/// ```
+/// use ndarray::Array;
+///
+/// let syslib = SystemLibModule::default(); // a provider of TVM functions
+///
+/// let mut params_bytes = Vec::new();
+/// fs::File::open("graph.params").unwrap().read_to_end(&mut params_bytes).unwrap();
+/// let params = tvm::runtime::load_param_dict(&params_bytes).unwrap();
+///
+/// let graph = Graph::try_from(&fs::read_to_string("graph.json").unwrap()).unwrap();
+///
+/// let mut exec = GraphExecutor::new(graph, &syslib).unwrap();
+/// exec.load_params(params);
+///
+/// let x = Array::from_vec(vec![1f32, 2., 3., 4.]);
+/// exec.set_input("data", x.into());
+/// exec.run();
+/// let output = exec.get_output(0).unwrap();
+///
+/// println!("{:#?}", Array::try_from(output).unwrap());
+/// ```
+pub struct GraphExecutor<'m, 't> {
+  graph: Graph,
+  op_execs: Vec<Box<Fn() + 'm>>,
+  tensors: Vec<Tensor<'t>>,
+}
+
+unsafe impl<'m, 't> Send for GraphExecutor<'m, 't> {}
+
+impl<'m, 't> GraphExecutor<'m, 't> {
+  pub fn new<M: 'm + Module>(graph: Graph, lib: &'m M) -> Result<Self> {
+    let tensors = Self::setup_storages(&graph)?;
+    Ok(GraphExecutor {
+      op_execs: Self::setup_op_execs(&graph, lib, &tensors)?,
+      tensors: tensors,
+      graph: graph,
+    })
+  }
+
+  /// Runs the computation graph.
+  pub fn run(&self) {
+    self.op_execs.iter().for_each(|op_exec| {
+      op_exec();
+    });
+  }
+
+  /// Allocates `Storages` for each `storage_id` and returns `Tensor`s to hold each output.
+  fn setup_storages<'a>(graph: &'a Graph) -> Result<Vec<Tensor<'t>>> {
+    let storage_ids = graph.get_attr::<(String, Vec<usize>)>("storage_id")?.1;
+    let shapes = graph.get_attr::<(String, Vec<Vec<i64>>)>("shape")?.1;
+    let dtypes = graph
+      .get_attr::<(String, Vec<String>)>("dltype")?
+      .1
+      .iter()
+      .map(|dltype| {
+        if let Ok((_, dtype)) = tvm_str_to_type(CompleteStr(dltype)) {
+          Ok(dtype)
+        } else {
+          Err(ErrorKind::GraphFormatError(format!("Invalid dltype: {}", dltype).to_string()).into())
+        }
+      })
+      .collect::<Result<Vec<DataType>>>()?;
+
+    let align = dtypes.iter().map(|dtype| dtype.bits as usize).max();
+    let mut storage_num_bytes = vec![0usize; *storage_ids.iter().max().unwrap_or(&1) + 1];
+    for (i, &storage_id) in storage_ids.iter().enumerate() {
+      let dtype_size = dtypes[i].bits * dtypes[i].lanes >> 3;
+      let nbytes = dtype_size * shapes[i].iter().product::<i64>() as usize;
+      storage_num_bytes[storage_id] = cmp::max(nbytes, storage_num_bytes[storage_id]);
+    }
+
+    let mut storages: Vec<Storage> = storage_num_bytes
+      .into_iter()
+      .map(|nbytes| Storage::new(nbytes, align))
+      .collect::<Result<Vec<Storage>>>()?;
+
+    let tensors = izip!(storage_ids, shapes, dtypes)
+      .map(|(storage_id, shape, dtype)| {
+        let storage = storages[storage_id].view();
+        Tensor {
+          data: mem::replace(&mut storages[storage_id], storage),
+          ctx: TVMContext::default(),
+          dtype: dtype,
+          size: shape.iter().product::<i64>() as usize,
+          shape: shape,
+          strides: None,
+          byte_offset: 0,
+        }
+      })
+      .collect();
+
+    Ok(tensors)
+  }
+
+  /// Creates closures which represent the computation performed by this graph.
+  fn setup_op_execs<M: 'm + Module>(
+    graph: &Graph,
+    lib: &'m M,
+    tensors: &Vec<Tensor<'t>>,
+  ) -> Result<Vec<Box<Fn() + 'm>>> {
+    ensure!(graph.node_row_ptr.is_some(), "Missing node_row_ptr.");
+    let node_row_ptr = graph.node_row_ptr.as_ref().unwrap();
+
+    let mut op_execs = Vec::new();
+    for (i, node) in graph.nodes.iter().enumerate() {
+      if node.op == "null" {
+        continue;
+      }
+      ensure!(node.op == "tvm_op", "Only TVM ops are supported.");
+      ensure!(node.attrs.is_some(), "Missing node attrs.");
+
+      let attrs = node.parse_attrs()?;
+
+      if attrs.func_name == "__nop" {
+        continue;
+      }
+
+      let func = lib
+        .get_function(&attrs.func_name)
+        .ok_or(format!("Missing function {}", attrs.func_name))?;
+      let arg_indices = node
+        .inputs
+        .iter()
+        .map(|entry| graph.entry_index(entry))
+        .chain((0..attrs.num_outputs).map(|oi| Ok(node_row_ptr[i].clone() + oi)));
+
+      let dl_tensors = arg_indices
+        .map(|idx| {
+          let tensor = &tensors[idx?];
+          Ok(if attrs.flatten_data {
+            DLTensor::from_tensor(tensor, true /* flatten */)
+          } else {
+            DLTensor::from(tensor)
+          })
+        })
+        .collect::<Result<Vec<DLTensor>>>()
+        .unwrap();
+      let op: Box<Fn()> = box move || {
+        let args = dl_tensors
+          .iter()
+          .map(|t| t.into())
+          .collect::<Vec<TVMArgValue>>();
+        func(args.as_slice());
+      };
+      op_execs.push(op);
+    }
+    Ok(op_execs)
+  }
+
+  pub fn load_params(&mut self, params: HashMap<String, Tensor<'t>>) {
+    params.into_iter().for_each(|(name, param)| {
+      self.set_input(name, param);
+    })
+  }
+
+  pub fn set_input<S: AsRef<str>>(&mut self, name: S, value: Tensor<'t>) {
+    if let Some(idx) = self.get_input_index(name.as_ref()) {
+      // TODO: consider `new_with_params` to avoid ever allocating
+      let ptr = self.tensors[idx].data.as_ptr();
+      let mut to_replace = self.tensors.iter_mut().filter(|t| t.data.as_ptr() == ptr);
+      let mut owner = to_replace.nth(0).unwrap();
+      if value.data.is_owned() {
+        // FIXME: for no-copy, need setup_op_execs to not capture tensor ptr
+        // mem::replace(&mut (*owner), value);
+        // to_replace.for_each(|t| {
+        //   panic!("replacing");
+        //   t.data = owner.data.view();
+        // });
+        owner.copy(&value);
+      } else {
+        owner.copy(&value);
+      }
+    } else {
+      println!("Unexpected input `{}`", name.as_ref());
+    }
+  }
+
+  /// Returns the graph input with name `name`, if it exists.
+  pub fn get_input<S: AsRef<str>>(&mut self, name: S) -> Option<&Tensor> {
+    self
+      .get_input_index(name.as_ref())
+      .and_then(move |idx| Some(&self.tensors[idx]))
+  }
+
+  /// Returns the graph output with index `index`, if it exists.
+  pub fn get_output(&self, idx: usize) -> Option<&Tensor> {
+    let graph = &self.graph;
+    graph.heads.get(idx).and_then(|entry| {
+      graph
+        .entry_index(entry)
+        .map(|idx| self.tensors.get(idx))
+        .unwrap_or(None)
+    })
+  }
+
+  /// Returns the index for graph input with name `name`, if it exists.
+  pub fn get_input_index<S: AsRef<str>>(&self, name: S) -> Option<usize> {
+    let graph = &self.graph;
+    (0..graph.nodes.len())
+      .skip_while(|&i| graph.nodes[i].name != name.as_ref())
+      .nth(0)
+      .and_then(|i| {
+        if graph.arg_nodes.iter().any(|&id| id == i) {
+          graph.node_row_ptr.as_ref().map(|nrp| nrp[i])
+        } else {
+          None
+        }
+      })
+  }
+}
+
+/// Converts a string to TVM DLDataTypeCode. @see `String2TVMType` in packed_func.h
+named!(
+  tvm_str_to_type<CompleteStr, DataType>,
+  do_parse!(
+    type_name: alpha1 >>
+    bits: digit1 >>
+    lanes: opt!(tuple!(tag!("x"), digit1)) >>
+    (DataType {
+      code: match type_name {
+        CompleteStr("int") => DLDataTypeCode_kDLInt,
+        CompleteStr("uint") => DLDataTypeCode_kDLUInt,
+        CompleteStr("float") => DLDataTypeCode_kDLFloat,
+        _ => DLDataTypeCode_kDLFloat,
+      } as usize,
+      bits: bits.parse::<u8>().unwrap() as usize,
+      lanes: match lanes {
+        Some(lanes) => lanes.1.parse::<u16>().unwrap() as usize,
+        None => 1,
+      },
+    })
+  )
+);
+
+/// Converts a bytes to String.
+named!(
+  name<String>,
+  map_res!(length_bytes!(le_u64), |b: &[u8]| String::from_utf8(
+    b.to_vec()
+  ))
+);
+
+/// Parses a TVMContext
+named!(
+  tvm_ctx<&[u8], TVMContext>,
+  do_parse!(
+    device_type: le_u32 >>
+    device_id: le_i32 >>
+    (TVMContext { device_type: device_type as usize, device_id: device_id as usize })
+  )
+);
+
+/// Parses a DataType
+named!(
+  data_type<&[u8], DataType>,
+  do_parse!(
+    code: le_u8 >>
+    bits: le_u8 >>
+    lanes: le_u16 >>
+    (DataType { code: code as usize, bits: bits as usize, lanes: lanes as usize })
+  )
+);
+
+/// Parses a Tensor from a TVM array file.
+named!(
+  tensor<Tensor>,
+  do_parse!(
+    take!(8)
+      >> bits!(tag_bits!(u64, 64, 0))
+      >> ctx: tvm_ctx
+      >> ndim: le_u32
+      >> dtype: data_type
+      >> shape: count!(map!(le_i64, |sz| sz as i64), ndim as usize)
+      >> length: le_i64
+      >> data: take!(length)
+      >> (Tensor {
+        data: Storage::from(data),
+        ctx: ctx,
+        dtype: dtype,
+        size: shape.iter().product::<i64>() as usize,
+        shape: shape,
+        strides: None,
+        byte_offset: 0,
+      })
+  )
+);
+
+/// Parses a graph params dict from a params binary file.
+named!(
+  parse_param_dict<HashMap<String, Tensor>>,
+  do_parse!(
+    take!(8)
+      >> bits!(tag_bits!(u64, 64, 0))
+      >> names: length_count!(le_u64, name)
+      >> tensors: length_count!(le_u64, tensor)
+      >> (HashMap::from_iter(names.into_iter().zip(tensors.into_iter())))
+  )
+);
+
+/// Loads a param dict saved using `nnvm.compiler.save_param_dict`.
+pub fn load_param_dict(bytes: &[u8]) -> Result<HashMap<String, Tensor>> {
+  if let Ok((remaining_bytes, param_dict)) = parse_param_dict(bytes) {
+    if remaining_bytes.len() > 0 {
+      bail!(ErrorKind::LoadGraphParamsError("extra input".to_string()))
+    } else {
+      Ok(param_dict)
+    }
+  } else {
+    bail!(ErrorKind::LoadGraphParamsError(
+      "invalid parameters file".to_string()
+    ))
+  }
+}
+
+#[cfg(test)]
+mod tests {
+  use super::*;
+
+  #[test]
+  fn test_str_to_type() {
+    assert_eq!(
+      tvm_str_to_type(CompleteStr("float24")).unwrap().1,
+      DataType {
+        code: DLDataTypeCode_kDLFloat as usize,
+        bits: 24,
+        lanes: 1
+      }
+    );
+    assert_eq!(
+      tvm_str_to_type(CompleteStr("uint111x44")).unwrap().1,
+      DataType {
+        code: DLDataTypeCode_kDLUInt as usize,
+        bits: 111,
+        lanes: 44
+      }
+    );
+  }
+}
diff --git a/rust/src/runtime/mod.rs b/rust/src/runtime/mod.rs
new file mode 100644
index 000000000000..1a9c5ba7c7bd
--- /dev/null
+++ b/rust/src/runtime/mod.rs
@@ -0,0 +1,28 @@
+mod allocator;
+mod array;
+mod module;
+#[macro_use]
+mod packed_func;
+mod graph;
+#[cfg(target_env = "sgx")]
+#[macro_use]
+pub mod sgx;
+mod threading;
+mod workspace;
+
+use std::os::raw::c_char;
+
+pub use self::{array::*, graph::*, module::*, packed_func::*, threading::*, workspace::*};
+
+#[cfg(target_env = "sgx")]
+use self::sgx::ocall_packed_func;
+
+#[no_mangle]
+pub extern "C" fn TVMAPISetLastError(cmsg: *const c_char) {
+  #[cfg(not(target_env = "sgx"))]
+  unsafe {
+    panic!(std::ffi::CStr::from_ptr(cmsg).to_str().unwrap());
+  }
+  #[cfg(target_env = "sgx")]
+  ocall_packed!("__sgx_set_last_error__", cmsg);
+}
diff --git a/rust/src/runtime/module.rs b/rust/src/runtime/module.rs
new file mode 100644
index 000000000000..2594756d9885
--- /dev/null
+++ b/rust/src/runtime/module.rs
@@ -0,0 +1,46 @@
+use std::{
+  collections::HashMap, convert::AsRef, ffi::CStr, os::raw::c_char, string::String, sync::Mutex,
+};
+
+use ffi::runtime::BackendPackedCFunc;
+use runtime::packed_func::{wrap_backend_packed_func, PackedFunc};
+
+pub trait Module {
+  fn get_function<S: AsRef<str>>(&self, name: S) -> Option<PackedFunc>;
+}
+
+pub struct SystemLibModule;
+
+lazy_static! {
+  static ref SYSTEM_LIB_FUNCTIONS: Mutex<HashMap<String, BackendPackedCFunc>> =
+    Mutex::new(HashMap::new());
+}
+
+impl Module for SystemLibModule {
+  fn get_function<S: AsRef<str>>(&self, name: S) -> Option<PackedFunc> {
+    SYSTEM_LIB_FUNCTIONS
+      .lock()
+      .unwrap()
+      .get(name.as_ref())
+      .map(|func| wrap_backend_packed_func(func.to_owned()))
+  }
+}
+
+impl Default for SystemLibModule {
+  fn default() -> Self {
+    SystemLibModule {}
+  }
+}
+
+#[no_mangle]
+pub extern "C" fn TVMBackendRegisterSystemLibSymbol(
+  cname: *const c_char,
+  func: BackendPackedCFunc,
+) -> i32 {
+  let name = unsafe { CStr::from_ptr(cname).to_str().unwrap() };
+  SYSTEM_LIB_FUNCTIONS
+    .lock()
+    .unwrap()
+    .insert(name.to_string(), func);
+  return 0;
+}
diff --git a/rust/src/runtime/packed_func.rs b/rust/src/runtime/packed_func.rs
new file mode 100644
index 000000000000..a6ad7fc35821
--- /dev/null
+++ b/rust/src/runtime/packed_func.rs
@@ -0,0 +1,342 @@
+use std::{any::Any, convert::TryFrom, marker::PhantomData, os::raw::c_void};
+
+use super::Tensor;
+use ffi::runtime::{
+  BackendPackedCFunc, DLDataTypeCode_kDLFloat, DLDataTypeCode_kDLInt, DLTensor,
+  TVMTypeCode_kArrayHandle, TVMTypeCode_kHandle, TVMTypeCode_kNDArrayContainer, TVMValue,
+};
+
+use errors::*;
+
+pub type PackedFunc = Box<Fn(&[TVMArgValue]) -> TVMRetValue + Send + Sync>;
+
+/// Calls a packed function and returns a `TVMRetValue`.
+///
+/// # Example
+///
+/// `call_packed!(my_tvm_func, &mut arg1, &mut arg2)`
+#[macro_export]
+macro_rules! call_packed {
+  ($fn:expr, $($args:expr),+) => {
+    $fn(&[$($args.into(),)+])
+  };
+  ($fn:expr) => {
+    $fn(&Vec::new())
+  };
+}
+
+/// A borrowed TVMPODValue. Can be constructed using `into()` but the preferred way
+/// to obtain a `TVMArgValue` is automatically via `call_packed!`.
+#[derive(Clone, Copy)]
+pub struct TVMArgValue<'a> {
+  _lifetime: PhantomData<&'a ()>,
+  pub(crate) value: TVMValue,
+  pub(crate) type_code: i64,
+}
+
+impl<'a> TVMArgValue<'a> {
+  pub fn new(value: TVMValue, type_code: i64) -> Self {
+    TVMArgValue {
+      _lifetime: PhantomData,
+      value: value,
+      type_code: type_code,
+    }
+  }
+}
+
+/// Creates a conversion to a `TVMArgValue` for a primitive type and DLDataTypeCode.
+macro_rules! impl_prim_tvm_arg {
+  ($type:ty, $field:ident, $code:expr, $as:ty) => {
+    impl<'a> From<$type> for TVMArgValue<'a> {
+      fn from(val: $type) -> Self {
+        TVMArgValue {
+          value: TVMValue { $field: val as $as },
+          type_code: $code as i64,
+          _lifetime: PhantomData,
+        }
+      }
+    }
+    impl<'a> TryFrom<TVMArgValue<'a>> for $type {
+      type Error = Error;
+      fn try_from(val: TVMArgValue<'a>) -> Result<Self> {
+        ensure!(
+          val.type_code == $code as i64,
+          "Could not downcast arg. Expected `{}`, got `{}`",
+          $code,
+          val.type_code
+        );
+        Ok(unsafe { val.value.$field as $type })
+      }
+    }
+  };
+  ($type:ty, $field:ident, $code:expr) => {
+    impl_prim_tvm_arg!($type, $field, $code, $type);
+  };
+  ($type:ty,v_int64) => {
+    impl_prim_tvm_arg!($type, v_int64, DLDataTypeCode_kDLInt, i64);
+  };
+  ($type:ty,v_float64) => {
+    impl_prim_tvm_arg!($type, v_float64, DLDataTypeCode_kDLFloat, f64);
+  };
+}
+
+impl_prim_tvm_arg!(f32, v_float64);
+impl_prim_tvm_arg!(f64, v_float64);
+impl_prim_tvm_arg!(i8, v_int64);
+impl_prim_tvm_arg!(u8, v_int64);
+impl_prim_tvm_arg!(i32, v_int64);
+impl_prim_tvm_arg!(u32, v_int64);
+impl_prim_tvm_arg!(i64, v_int64);
+impl_prim_tvm_arg!(u64, v_int64);
+
+/// Creates a conversion to a `TVMArgValue` for an object handle.
+impl<'a, T> From<*const T> for TVMArgValue<'a> {
+  fn from(ptr: *const T) -> Self {
+    TVMArgValue {
+      value: TVMValue {
+        v_handle: ptr as *mut T as *mut c_void,
+      },
+      type_code: TVMTypeCode_kArrayHandle as i64,
+      _lifetime: PhantomData,
+    }
+  }
+}
+
+/// Creates a conversion to a `TVMArgValue` for a mutable object handle.
+impl<'a, T> From<*mut T> for TVMArgValue<'a> {
+  fn from(ptr: *mut T) -> Self {
+    TVMArgValue {
+      value: TVMValue {
+        v_handle: ptr as *mut c_void,
+      },
+      type_code: TVMTypeCode_kHandle as i64,
+      _lifetime: PhantomData,
+    }
+  }
+}
+
+impl<'a> From<&'a mut DLTensor> for TVMArgValue<'a> {
+  fn from(arr: &'a mut DLTensor) -> Self {
+    TVMArgValue {
+      value: TVMValue {
+        v_handle: arr as *mut _ as *mut c_void,
+      },
+      type_code: TVMTypeCode_kArrayHandle as i64,
+      _lifetime: PhantomData,
+    }
+  }
+}
+
+impl<'a> From<&'a DLTensor> for TVMArgValue<'a> {
+  fn from(arr: &'a DLTensor) -> Self {
+    TVMArgValue {
+      value: TVMValue {
+        v_handle: arr as *const _ as *mut DLTensor as *mut c_void,
+      },
+      type_code: TVMTypeCode_kArrayHandle as i64,
+      _lifetime: PhantomData,
+    }
+  }
+}
+
+impl<'a> TryFrom<TVMArgValue<'a>> for Tensor<'a> {
+  type Error = Error;
+  fn try_from(val: TVMArgValue<'a>) -> Result<Self> {
+    ensure!(
+      val.type_code == TVMTypeCode_kArrayHandle as i64
+        || val.type_code == TVMTypeCode_kNDArrayContainer as i64,
+      "Could not downcast arg. Expected `{}` or `{}`, but got `{}`",
+      TVMTypeCode_kArrayHandle,
+      TVMTypeCode_kNDArrayContainer,
+      val.type_code,
+    );
+
+    let dlt = unsafe { *(val.value.v_handle as *mut DLTensor as *const DLTensor) };
+    Ok(dlt.into())
+  }
+}
+
+/// An owned TVMPODValue. Can be converted from a variety of primitive and object types.
+/// Can be downcasted using `try_from` if it contains the desired type.
+///
+/// # Example
+///
+/// ```
+/// let a = 42u32;
+/// let b: i64 = TVMRetValue::from(a).try_into().unwrap();
+///
+/// let s = "hello, world!";
+/// let t: TVMRetValue = s.into();
+/// assert_eq!(String::try_from(t).unwrap(), s);
+/// ```
+pub struct TVMRetValue {
+  /// A primitive return value, if any.
+  prim_value: u64,
+  /// An object return value, if any.
+  box_value: Box<Any>,
+  /// The DLDataTypeCode which determines whether `prim_value` or `box_value` is in use.
+  type_code: i64,
+}
+
+#[cfg(target_env = "sgx")]
+impl TVMRetValue {
+  pub(crate) fn from_tvm_value(value: TVMValue, type_code: i64) -> Self {
+    unsafe {
+      Self {
+        prim_value: match type_code {
+          0 | 1 => value.v_int64 as u64,
+          2 => value.v_float64 as u64,
+          3 | 7 | 8 | 9 | 10 => value.v_handle as u64,
+          11 | 12 => value.v_str as u64,
+          _ => 0,
+        } as u64,
+        box_value: box (),
+        type_code: type_code,
+      }
+    }
+  }
+
+  pub fn into_tvm_value(self) -> (TVMValue, i64) {
+    let val = match self.type_code {
+      0 | 1 => TVMValue {
+        v_int64: self.prim_value.clone() as i64,
+      },
+      2 => TVMValue {
+        v_float64: self.prim_value.clone() as f64,
+      },
+      3 | 7 | 8 | 9 | 10 | 13 => TVMValue {
+        v_handle: Box::into_raw(self.box_value) as *mut c_void,
+      },
+      11 | 12 => TVMValue {
+        v_str: Box::into_raw(self.box_value) as *const _,
+      },
+      _ => unreachable!(),
+    };
+    (val, self.type_code)
+  }
+}
+
+impl Default for TVMRetValue {
+  fn default() -> Self {
+    TVMRetValue {
+      prim_value: 0,
+      box_value: box (),
+      type_code: 0,
+    }
+  }
+}
+
+macro_rules! impl_prim_ret_value {
+  ($type:ty, $code:expr) => {
+    impl From<$type> for TVMRetValue {
+      fn from(val: $type) -> Self {
+        TVMRetValue {
+          prim_value: val as u64,
+          box_value: box (),
+          type_code: $code,
+        }
+      }
+    }
+    impl TryFrom<TVMRetValue> for $type {
+      type Error = Error;
+      fn try_from(ret: TVMRetValue) -> Result<$type> {
+        if ret.type_code == $code {
+          Ok(ret.prim_value as $type)
+        } else {
+          bail!(ErrorKind::TryFromTVMRetValueError(
+            stringify!($type).to_string(),
+            ret.type_code
+          ))
+        }
+      }
+    }
+  };
+}
+
+macro_rules! impl_boxed_ret_value {
+  ($type:ty, $code:expr) => {
+    impl From<$type> for TVMRetValue {
+      fn from(val: $type) -> Self {
+        TVMRetValue {
+          prim_value: 0,
+          box_value: box val,
+          type_code: $code,
+        }
+      }
+    }
+    impl TryFrom<TVMRetValue> for $type {
+      type Error = Error;
+      fn try_from(ret: TVMRetValue) -> Result<$type> {
+        if let Ok(val) = ret.box_value.downcast::<$type>() {
+          Ok(*val)
+        } else {
+          bail!(ErrorKind::TryFromTVMRetValueError(
+            stringify!($type).to_string(),
+            ret.type_code
+          ))
+        }
+      }
+    }
+  };
+}
+
+impl_prim_ret_value!(i8, 0);
+impl_prim_ret_value!(u8, 1);
+impl_prim_ret_value!(i16, 0);
+impl_prim_ret_value!(u16, 1);
+impl_prim_ret_value!(i32, 0);
+impl_prim_ret_value!(u32, 1);
+impl_prim_ret_value!(f32, 2);
+impl_prim_ret_value!(i64, 0);
+impl_prim_ret_value!(u64, 1);
+impl_prim_ret_value!(f64, 2);
+impl_prim_ret_value!(isize, 0);
+impl_prim_ret_value!(usize, 1);
+impl_boxed_ret_value!(String, 11);
+
+impl<'a, 't> From<&'t Tensor<'a>> for TVMRetValue {
+  fn from(val: &'t Tensor<'a>) -> Self {
+    TVMRetValue {
+      prim_value: 0,
+      box_value: box DLTensor::from(val),
+      type_code: TVMTypeCode_kNDArrayContainer as i64,
+    }
+  }
+}
+
+impl<'a> TryFrom<TVMRetValue> for Tensor<'a> {
+  type Error = Error;
+  fn try_from(ret: TVMRetValue) -> Result<Self> {
+    ensure!(
+      ret.type_code == TVMTypeCode_kArrayHandle as i64
+        || ret.type_code == TVMTypeCode_kNDArrayContainer as i64,
+      "Could not downcast arg. Expected `{}` or `{}`, but got `{}`",
+      TVMTypeCode_kArrayHandle,
+      TVMTypeCode_kNDArrayContainer,
+      ret.type_code,
+    );
+
+    let dlt = unsafe { *(ret.prim_value as *mut DLTensor as *const DLTensor) };
+    Ok(dlt.into())
+  }
+}
+
+// @see `WrapPackedFunc` in `llvm_module.cc`.
+pub(super) fn wrap_backend_packed_func(func: BackendPackedCFunc) -> PackedFunc {
+  box move |args: &[TVMArgValue]| {
+    func(
+      args
+        .iter()
+        .map(|ref arg| arg.value)
+        .collect::<Vec<TVMValue>>()
+        .as_ptr(),
+      args
+        .iter()
+        .map(|ref arg| arg.type_code as i32)
+        .collect::<Vec<i32>>()
+        .as_ptr() as *const i32,
+      args.len() as i32,
+    );
+    TVMRetValue::default()
+  }
+}
diff --git a/rust/src/runtime/sgx.rs b/rust/src/runtime/sgx.rs
new file mode 100644
index 000000000000..00be3ee3b608
--- /dev/null
+++ b/rust/src/runtime/sgx.rs
@@ -0,0 +1,82 @@
+use std::{
+  ffi::CString,
+  os::raw::{c_char, c_int},
+};
+
+use errors::Result;
+use ffi::runtime::TVMValue;
+use runtime::{threading::sgx_join_threads, SystemLibModule, TVMArgValue, TVMRetValue};
+
+pub use runtime::threading::tvm_run_worker as run_worker;
+
+#[macro_export]
+macro_rules! tvm_ocall {
+  ($func: expr) => {
+    match $func {
+      0 => Ok(()),
+      err => Err(format!("SGX error: {}", err)),
+    }
+  };
+}
+
+pub type SgxStatus = u32;
+
+#[cfg(target_env = "sgx")]
+extern "C" {
+  fn tvm_ocall_packed_func(
+    name: *const c_char,
+    arg_values: *const TVMValue,
+    type_codes: *const c_int,
+    num_args: c_int,
+    ret_val: *mut TVMValue,
+    ret_type_code: *mut c_int,
+  ) -> SgxStatus;
+}
+
+pub fn ocall_packed_func<S: AsRef<str>>(fn_name: S, args: &[TVMArgValue]) -> Result<TVMRetValue> {
+  let mut ret_val = TVMValue { v_int64: 0 };
+  let ret_type_code = 0i64;
+  unsafe {
+    tvm_ocall!(tvm_ocall_packed_func(
+      CString::new(fn_name.as_ref()).unwrap().as_ptr(),
+      args
+        .iter()
+        .map(|ref arg| arg.value)
+        .collect::<Vec<TVMValue>>()
+        .as_ptr(),
+      args
+        .iter()
+        .map(|ref arg| arg.type_code as i32)
+        .collect::<Vec<i32>>()
+        .as_ptr() as *const i32,
+      args.len() as i32,
+      &mut ret_val as *mut TVMValue,
+      &mut (ret_type_code as i32) as *mut c_int,
+    ))?;
+  }
+  Ok(TVMRetValue::from_tvm_value(ret_val, ret_type_code as i64))
+}
+
+#[macro_export]
+macro_rules! ocall_packed {
+  ($fn_name:expr, $($args:expr),+) => {
+    ocall_packed_func($fn_name, &[$($args.into(),)+])
+      .expect(concat!("Error calling `", $fn_name, "`"))
+  };
+  ($fn_name:expr) => {
+    ocall_packed_func($fn_name, &Vec::new())
+      .expect(concat!("Error calling `", $fn_name, "`"))
+  }
+}
+
+pub fn shutdown() {
+  if env!("TVM_NUM_THREADS") != "0" {
+    sgx_join_threads()
+  }
+}
+
+impl Drop for SystemLibModule {
+  fn drop(&mut self) {
+    shutdown()
+  }
+}
diff --git a/rust/src/runtime/threading.rs b/rust/src/runtime/threading.rs
new file mode 100644
index 000000000000..1d6d7fc78834
--- /dev/null
+++ b/rust/src/runtime/threading.rs
@@ -0,0 +1,337 @@
+use std::{
+  os::raw::{c_int, c_void},
+  sync::{
+    atomic::{AtomicUsize, Ordering, ATOMIC_USIZE_INIT},
+    Arc, Barrier,
+  },
+};
+
+#[cfg(not(target_env = "sgx"))]
+use num_cpus;
+#[cfg(not(target_env = "sgx"))]
+use std::{
+  env,
+  thread::{self, JoinHandle},
+};
+
+#[cfg(target_env = "sgx")]
+use std::{collections::VecDeque, ptr, sync::Mutex};
+
+use bounded_spsc_queue::{self, Producer};
+
+use super::super::errors::*;
+use ffi::runtime::TVMParallelGroupEnv;
+
+#[cfg(target_env = "sgx")]
+use super::{sgx::ocall_packed_func, TVMArgValue, TVMRetValue};
+
+type FTVMParallelLambda =
+  extern "C" fn(task_id: usize, penv: *const TVMParallelGroupEnv, cdata: *const c_void) -> i32;
+
+/// Holds a parallel job request made by a TVM library function.
+struct Job {
+  cb: FTVMParallelLambda,
+  cdata: *const c_void,
+  req_num_tasks: usize,
+  pending: Arc<AtomicUsize>,
+}
+
+impl Job {
+  /// Splits this job into a number of `Task`s which can be scheduled.
+  fn tasks(&self, num_workers: usize) -> Vec<Task> {
+    let num_tasks = if self.req_num_tasks == 0 {
+      num_workers
+    } else {
+      self.req_num_tasks.min(num_workers)
+    };
+    self.pending.store(num_tasks, Ordering::SeqCst);
+
+    let barrier = Arc::new(Barrier::new(num_tasks));
+
+    (0..num_tasks)
+      .map(move |i| Task {
+        id: i,
+        flambda: self.cb,
+        penv: TVMParallelGroupEnv {
+          sync_handle: &Arc::clone(&barrier) as *const _ as *mut c_void,
+          num_task: num_tasks as i32,
+        },
+        cdata: self.cdata,
+        pending: Arc::clone(&self.pending),
+      })
+      .collect()
+  }
+
+  /// Waits for all tasks in this `Job` to be completed.
+  fn wait(&self) -> Result<()> {
+    while self.pending.load(Ordering::Acquire) > 0 {
+      #[cfg(not(target_env = "sgx"))]
+      thread::yield_now();
+    }
+    Ok(())
+  }
+}
+
+/// A chunk of work requested by a TVM function.
+struct Task {
+  id: usize,
+  flambda: FTVMParallelLambda,
+  penv: TVMParallelGroupEnv,
+  cdata: *const c_void,
+  pending: Arc<AtomicUsize>,
+}
+unsafe impl Send for Task {}
+unsafe impl Sync for Task {}
+
+impl FnOnce<()> for Task {
+  type Output = i32;
+  extern "rust-call" fn call_once(self, _args: ()) -> Self::Output {
+    let status = (self.flambda)(self.id, &self.penv as *const _, self.cdata);
+    self.pending.fetch_sub(1, Ordering::AcqRel);
+    status
+  }
+}
+
+#[derive(Default)]
+struct Threads {
+  #[allow(unused)]
+  #[cfg(not(target_env = "sgx"))]
+  handles: Vec<JoinHandle<()>>,
+  queues: Vec<Producer<Task>>,
+}
+
+impl<'a> Threads {
+  #[cfg(not(target_env = "sgx"))]
+  fn launch<F: Sync + Send + FnOnce(Consumer<Task>) + 'static + Copy>(
+    num_threads: usize,
+    cb: F,
+  ) -> Self {
+    let (handles, queues) = (0..num_threads)
+      .map(|_| {
+        let (p, c) = bounded_spsc_queue::make(2);
+        let handle = thread::spawn(move || cb(c.into()));
+        (handle, p)
+      })
+      .unzip();
+    Threads {
+      handles: handles,
+      queues: queues,
+    }
+  }
+
+  #[cfg(target_env = "sgx")]
+  fn launch<F: Sync + Send + FnOnce(Consumer<Task>) + 'static + Copy>(
+    num_threads: usize,
+    _cb: F,
+  ) -> Self {
+    let mut consumer_queues = SGX_QUEUES.lock().unwrap();
+    let queues = (0..num_threads)
+      .map(|_| {
+        let (p, c) = bounded_spsc_queue::make(2);
+        consumer_queues.push_back(c.into());
+        p
+      })
+      .collect();
+    ocall_packed!("__sgx_thread_group_launch__", num_threads as u64);
+    Threads { queues: queues }
+  }
+}
+
+struct ThreadPool {
+  num_workers: usize,
+  #[allow(unused)]
+  threads: Threads,
+}
+
+thread_local!(static THREAD_POOL: ThreadPool = ThreadPool::new());
+
+impl ThreadPool {
+  fn new() -> Self {
+    let num_workers = max_concurrency();
+    ThreadPool {
+      num_workers: num_workers,
+      threads: Threads::launch(num_workers, ThreadPool::run_worker),
+    }
+  }
+
+  fn launch(&self, job: Job) {
+    let mut tasks = job.tasks(self.num_workers + 1);
+
+    for (i, task) in tasks.split_off(1).into_iter().enumerate() {
+      self.threads.queues[i].push(task);
+    }
+
+    tasks.pop().unwrap()();
+    job.wait().unwrap();
+  }
+
+  fn run_worker(queue: Consumer<Task>) {
+    loop {
+      let task = queue.pop();
+      let result = task();
+      if result == <i32>::min_value() {
+        break;
+      } else if result != 0 {
+        panic!("Error running task.");
+      }
+    }
+  }
+}
+
+// Send + Sync wrapper for bounded_spsc_queue::Consumer
+struct Consumer<T> {
+  consumer: bounded_spsc_queue::Consumer<T>,
+}
+impl<T> From<bounded_spsc_queue::Consumer<T>> for Consumer<T> {
+  fn from(c: bounded_spsc_queue::Consumer<T>) -> Self {
+    Consumer { consumer: c }
+  }
+}
+impl<T> Consumer<T> {
+  fn pop(&self) -> T {
+    self.consumer.pop()
+  }
+}
+unsafe impl<T> Send for Consumer<T> {}
+unsafe impl<T> Sync for Consumer<T> {}
+
+#[cfg(target_env = "sgx")]
+lazy_static! {
+  /// Holds tasks for untrusted threads which re-enter the enclave to execute.
+  static ref SGX_QUEUES: Mutex<VecDeque<Consumer<Task>>> = Mutex::new(VecDeque::new());
+}
+
+#[cfg(all(not(target_arch = "wasm32"), not(target_env = "sgx")))]
+fn max_concurrency() -> usize {
+  if let Ok(threads_str) = env::var("TVM_NUM_THREADS").or(env::var("OMP_NUM_THREADS")) {
+    if let Ok(threads) = usize::from_str_radix(&threads_str, 10) {
+      return threads;
+    }
+  }
+  num_cpus::get_physical()
+}
+
+#[cfg(target_env = "sgx")]
+fn max_concurrency() -> usize {
+  usize::from_str_radix(env!("TVM_NUM_THREADS"), 10).unwrap_or(1)
+}
+
+#[cfg(target_arch = "wasm32")]
+fn max_concurrency() -> usize {
+  0 // wasm doesn't support threads yet
+}
+
+#[cfg(target_env = "sgx")]
+pub fn tvm_run_worker(_args: &[TVMArgValue]) -> TVMRetValue {
+  let q = {
+    let mut qs = SGX_QUEUES.lock().unwrap();
+    qs.pop_front()
+    // `qs: MutexGuard` needs to be dropped here since `run_worker` won't return
+  };
+  if let Some(q) = q {
+    ThreadPool::run_worker(q);
+  }
+  TVMRetValue::default()
+}
+
+#[no_mangle]
+pub extern "C" fn TVMBackendParallelLaunch(
+  cb: FTVMParallelLambda,
+  cdata: *const c_void,
+  num_task: usize,
+) -> c_int {
+  if max_concurrency() == 0 {
+    let penv = TVMParallelGroupEnv {
+      sync_handle: 0 as *mut c_void,
+      num_task: 1,
+    };
+    cb(0, &penv as *const _, cdata);
+  } else {
+    THREAD_POOL.with(|pool| {
+      pool.launch(Job {
+        cb: cb,
+        cdata: cdata,
+        req_num_tasks: num_task,
+        pending: Arc::new(ATOMIC_USIZE_INIT),
+      });
+    });
+  }
+  return 0;
+}
+
+#[cfg(target_env = "sgx")]
+pub(crate) fn sgx_join_threads() {
+  extern "C" fn poison_pill(
+    _task_id: usize,
+    _penv: *const TVMParallelGroupEnv,
+    _cdata: *const c_void,
+  ) -> i32 {
+    <i32>::min_value()
+  }
+
+  THREAD_POOL.with(|pool| {
+    pool.launch(Job {
+      cb: poison_pill,
+      cdata: ptr::null(),
+      req_num_tasks: 0,
+      pending: Arc::new(ATOMIC_USIZE_INIT),
+    });
+  });
+  ocall_packed!("__sgx_thread_group_join__", 0);
+}
+
+// @see https://github.com/dmlc/tvm/issues/988 for information on why this function is used.
+#[no_mangle]
+pub extern "C" fn TVMBackendParallelBarrier(_task_id: usize, penv: *const TVMParallelGroupEnv) {
+  let barrier: &Arc<Barrier> = unsafe { &*((*penv).sync_handle as *const Arc<Barrier>) };
+  barrier.wait();
+}
+
+#[cfg(test)]
+mod tests {
+  use std::{ptr, thread, time::Duration};
+
+  use super::*;
+
+  #[test]
+  fn test_max_concurrency() {
+    env::set_var("TVM_NUM_THREADS", "42");
+    env::set_var("OMP_NUM_THREADS", "24");
+    assert_eq!(max_concurrency(), 42);
+    env::remove_var("TVM_NUM_THREADS");
+    assert_eq!(max_concurrency(), 24);
+  }
+
+  extern "C" fn flambda(
+    task_id: usize,
+    penv: *const TVMParallelGroupEnv,
+    cdata: *const c_void,
+  ) -> i32 {
+    if cdata == ptr::null() {
+      return 0;
+    }
+    unsafe {
+      let &(ref counter, ref task_ids_sum) = &*(cdata as *const (AtomicUsize, AtomicUsize));
+      thread::sleep(Duration::from_millis(50 * task_id as u64));
+      counter.fetch_add(1, Ordering::SeqCst);
+      task_ids_sum.fetch_add(task_id, Ordering::SeqCst);
+      assert_eq!((*penv).num_task, 3);
+    }
+    0
+  }
+
+  #[test]
+  fn test_parallel_launch() {
+    TVMBackendParallelLaunch(flambda, ptr::null(), 6);
+    let counter = ATOMIC_USIZE_INIT;
+    let task_ids_sum = ATOMIC_USIZE_INIT;
+    let cdata = (counter, task_ids_sum);
+    let num_tasks = 3;
+    TVMBackendParallelLaunch(flambda, &cdata as *const _ as *const c_void, num_tasks);
+    assert_eq!(cdata.0.load(Ordering::SeqCst), num_tasks);
+    assert_eq!(
+      cdata.1.load(Ordering::SeqCst),
+      (0..num_tasks).sum::<usize>()
+    );
+  }
+}
diff --git a/rust/src/runtime/workspace.rs b/rust/src/runtime/workspace.rs
new file mode 100644
index 000000000000..d0e6d8c89255
--- /dev/null
+++ b/rust/src/runtime/workspace.rs
@@ -0,0 +1,119 @@
+use std::{
+  cell::RefCell,
+  os::raw::{c_int, c_void},
+  ptr,
+};
+
+use super::allocator::Allocation;
+use errors::*;
+
+const WS_ALIGN: usize = 64; // taken from `kTempAllocaAlignment` in `device_api.h`
+
+struct WorkspacePool {
+  workspaces: Vec<Allocation>,
+  free: Vec<usize>,
+  in_use: Vec<usize>,
+}
+
+impl WorkspacePool {
+  fn new() -> Self {
+    WorkspacePool {
+      workspaces: Vec::new(),
+      free: Vec::new(),
+      in_use: Vec::new(),
+    }
+  }
+
+  fn alloc_new(&mut self, size: usize) -> Result<*mut u8> {
+    self.workspaces.push(Allocation::new(size, Some(WS_ALIGN))?);
+    self.in_use.push(self.workspaces.len() - 1);
+    Ok(self.workspaces[self.workspaces.len() - 1].as_mut_ptr())
+  }
+
+  fn alloc(&mut self, size: usize) -> Result<*mut u8> {
+    if self.free.len() == 0 {
+      return self.alloc_new(size);
+    }
+    let idx = self
+      .free
+      .iter()
+      .fold(None, |cur_ws_idx: Option<usize>, &idx| {
+        let ws_size = self.workspaces[idx].size();
+        if !ws_size >= size {
+          return cur_ws_idx;
+        }
+        cur_ws_idx.or(Some(idx)).and_then(|cur_idx| {
+          let cur_size = self.workspaces[cur_idx].size();
+          Some(match ws_size <= cur_size {
+            true => idx,
+            false => cur_idx,
+          })
+        })
+      });
+    match idx {
+      Some(idx) => {
+        self.free.remove_item(&idx).unwrap();
+        self.in_use.push(idx);
+        Ok(self.workspaces[idx].as_mut_ptr())
+      }
+      None => self.alloc_new(size),
+    }
+  }
+
+  fn free(&mut self, ptr: *mut u8) -> Result<()> {
+    let mut ws_idx = None;
+    for i in 0..self.in_use.len() {
+      let idx = self.in_use[i];
+      if self.workspaces[idx].as_mut_ptr() == ptr {
+        self.in_use.remove(i);
+        ws_idx = Some(idx);
+        break;
+      }
+    }
+    Ok(
+      self
+        .free
+        .push(ws_idx.ok_or("Tried to free nonexistent workspace.")?),
+    )
+  }
+}
+
+thread_local!(static WORKSPACE_POOL: RefCell<WorkspacePool> = RefCell::new(WorkspacePool::new()));
+
+const WORKSPACE_PAGE_SIZE: usize = 4 << 10;
+
+#[no_mangle]
+pub extern "C" fn TVMBackendAllocWorkspace(
+  _device_type: c_int,
+  _device_id: c_int,
+  size: u64,
+  _dtype_code_hint: c_int,
+  _dtype_bits_hint: c_int,
+) -> *mut c_void {
+  let nbytes = if size == 0 {
+    WORKSPACE_PAGE_SIZE
+  } else {
+    size as usize
+  };
+  WORKSPACE_POOL.with(|pool_cell| {
+    pool_cell
+      .borrow_mut()
+      .alloc(nbytes as usize)
+      .unwrap_or(ptr::null_mut()) as *mut c_void
+  })
+}
+
+#[no_mangle]
+pub extern "C" fn TVMBackendFreeWorkspace(
+  _device_type: c_int,
+  _device_id: c_int,
+  ptr: *mut c_void,
+) -> c_int {
+  WORKSPACE_POOL.with(|pool_cell| {
+    (match pool_cell.borrow_mut().free(ptr as *mut u8) {
+      Ok(()) => 0,
+      Err(_) => -1,
+    }) as c_int
+  });
+  return 0;
+}
diff --git a/rust/tests/.gitignore b/rust/tests/.gitignore
new file mode 100644
index 000000000000..811076739bfa
--- /dev/null
+++ b/rust/tests/.gitignore
@@ -0,0 +1,3 @@
+*.json
+*.params
+*.o
diff --git a/rust/tests/build_model.py b/rust/tests/build_model.py
new file mode 100644
index 000000000000..e0b90495159f
--- /dev/null
+++ b/rust/tests/build_model.py
@@ -0,0 +1,53 @@
+"""Builds a simple NNVM graph for testing."""
+
+from os import path as osp
+
+import nnvm
+from nnvm import sym
+from nnvm.compiler import graph_util
+from nnvm.testing import init
+import numpy as np
+import tvm
+
+CWD = osp.dirname(osp.abspath(osp.expanduser(__file__)))
+
+
+def _get_model(dshape):
+    data = sym.Variable('data', shape=dshape)
+    fc1 = sym.dense(data, units=dshape[-1]*2, use_bias=True)
+    left, right = sym.split(fc1, indices_or_sections=2, axis=1)
+    return sym.Group(((left + 1), (right - 1)))
+
+
+def _init_params(graph, input_shapes, initializer=init.Xavier(), seed=10):
+    if isinstance(graph, sym.Symbol):
+        graph = nnvm.graph.create(graph)
+    ishapes, _ = graph_util.infer_shape(graph, **input_shapes)
+    param_shapes = dict(zip(graph.index.input_names, ishapes))
+    np.random.seed(seed)
+    params = {}
+    for param, shape in param_shapes.items():
+        if param in {'data', 'label'} or not shape:
+            continue
+        init_value = np.empty(shape).astype('float32')
+        initializer(param, init_value)
+        params[param] = tvm.nd.array(init_value)
+    return params
+
+def main():
+    dshape = (32, 16)
+    net = _get_model(dshape)
+    ishape_dict = {'data': dshape}
+    params = _init_params(net, ishape_dict)
+    graph, lib, params = nnvm.compiler.build(net, 'llvm',
+                                             shape=ishape_dict,
+                                             params=params,
+                                             dtype='float32')
+
+    with open(osp.join(CWD, 'graph.json'), 'w') as f_resnet:
+        f_resnet.write(graph.json())
+    with open(osp.join(CWD, 'graph.params'), 'wb') as f_params:
+        f_params.write(nnvm.compiler.save_param_dict(params))
+
+if __name__ == '__main__':
+    main()
diff --git a/rust/tests/test_graph_serde.rs b/rust/tests/test_graph_serde.rs
new file mode 100644
index 000000000000..b02c12889794
--- /dev/null
+++ b/rust/tests/test_graph_serde.rs
@@ -0,0 +1,39 @@
+#![feature(try_from)]
+
+extern crate serde;
+extern crate serde_json;
+
+extern crate tvm;
+
+use std::{convert::TryFrom, fs, io::Read};
+
+use tvm::runtime::Graph;
+
+#[test]
+fn test_load_graph() {
+  let mut params_bytes = Vec::new();
+  fs::File::open(concat!(env!("CARGO_MANIFEST_DIR"), "/tests/graph.params"))
+    .expect("Could not find TVM graph. Did you run `tests/build_model.py`?")
+    .read_to_end(&mut params_bytes)
+    .unwrap();
+  let _params = tvm::runtime::load_param_dict(&params_bytes);
+
+  let graph = Graph::try_from(
+    &fs::read_to_string(concat!(env!("CARGO_MANIFEST_DIR"), "/tests/graph.json")).unwrap(),
+  )
+  .unwrap();
+
+  assert_eq!(graph.nodes[3].op, "tvm_op");
+  assert_eq!(
+    graph.nodes[3]
+      .attrs
+      .as_ref()
+      .unwrap()
+      .get("func_name")
+      .unwrap(),
+    "fuse_dense"
+  );
+  assert_eq!(graph.nodes[5].inputs[0].index, 0);
+  assert_eq!(graph.nodes[6].inputs[0].index, 1);
+  assert_eq!(graph.heads.len(), 2);
+}
diff --git a/rust/tests/test_nnvm/Cargo.toml b/rust/tests/test_nnvm/Cargo.toml
new file mode 100644
index 000000000000..7e6ce5fb729c
--- /dev/null
+++ b/rust/tests/test_nnvm/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "test-nnvm"
+version = "0.0.0"
+license = "Apache-2.0"
+authors = ["Nick Hynes <nhynes@berkeley.edu>"]
+
+[dependencies]
+ndarray = "0.11.2"
+tvm = { path = "../../" }
+serde = "1.0.59"
+serde_json = "1.0.17"
+
+[build-dependencies]
+ar = "0.6.0"
diff --git a/rust/tests/test_nnvm/build.rs b/rust/tests/test_nnvm/build.rs
new file mode 100644
index 000000000000..cb3a4e0d574d
--- /dev/null
+++ b/rust/tests/test_nnvm/build.rs
@@ -0,0 +1,28 @@
+extern crate ar;
+
+use std::{env, path::PathBuf, process::Command};
+
+use ar::Builder;
+use std::fs::File;
+
+fn main() {
+  let out_dir = env::var("OUT_DIR").unwrap();
+
+  let output = Command::new(concat!(
+    env!("CARGO_MANIFEST_DIR"),
+    "/src/build_test_graph.py"
+  )).arg(&out_dir)
+    .output()
+    .expect("Failed to execute command");
+  if output.stderr.len() > 0 {
+    panic!(String::from_utf8(output.stderr).unwrap());
+  }
+
+  let in_path: PathBuf = [&out_dir, "graph.o"].iter().collect();
+  let out_path: PathBuf = [&out_dir, "libgraph.a"].iter().collect();
+  let mut builder = Builder::new(File::create(out_path.to_str().unwrap()).unwrap());
+  builder.append_path(in_path.to_str().unwrap()).unwrap();
+
+  println!("cargo:rustc-link-lib=static=graph");
+  println!("cargo:rustc-link-search=native={}", out_dir);
+}
diff --git a/rust/tests/test_nnvm/src/build_test_graph.py b/rust/tests/test_nnvm/src/build_test_graph.py
new file mode 100755
index 000000000000..429cc2128931
--- /dev/null
+++ b/rust/tests/test_nnvm/src/build_test_graph.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+
+"""Builds a simple NNVM graph for testing."""
+
+from os import path as osp
+import sys
+
+import nnvm
+from nnvm import sym
+from nnvm.compiler import graph_util
+from nnvm.testing import init
+import numpy as np
+import tvm
+
+
+def _get_model(dshape):
+    data = sym.Variable('data', shape=dshape)
+    fc = sym.dense(data, units=dshape[-1]*2, use_bias=True)
+    left, right = sym.split(fc, indices_or_sections=2, axis=1)
+    return sym.Group(((left + 1), (right - 1), fc))
+
+
+def _init_params(graph, input_shapes, initializer=init.Xavier(), seed=10):
+    if isinstance(graph, sym.Symbol):
+        graph = nnvm.graph.create(graph)
+    ishapes, _ = graph_util.infer_shape(graph, **input_shapes)
+    param_shapes = dict(zip(graph.index.input_names, ishapes))
+    np.random.seed(seed)
+    params = {}
+    for param, shape in param_shapes.items():
+        if param in {'data', 'label'} or not shape:
+            continue
+
+        init_value = np.arange(np.product(shape), 0, -1).reshape(*shape).astype('float32')
+        if param.endswith('_bias'):
+            params[param] = tvm.nd.array(init_value)
+            continue
+
+        init_value = np.empty(shape).astype('float32')
+        initializer(param, init_value)
+        # init_value /= init_value.sum() + 1e-10
+        params[param] = tvm.nd.array(init_value)
+    return params
+
+def main():
+    dshape = (4, 8)
+    net = _get_model(dshape)
+    ishape_dict = {'data': dshape}
+    params = _init_params(net, ishape_dict)
+    graph, lib, params = nnvm.compiler.build(net, 'llvm --system-lib',
+                                             shape=ishape_dict,
+                                             params=params,
+                                             dtype='float32')
+
+    out_dir = sys.argv[1]
+    lib.save(osp.join(sys.argv[1], 'graph.o'))
+    with open(osp.join(out_dir, 'graph.json'), 'w') as f_resnet:
+        f_resnet.write(graph.json())
+    with open(osp.join(out_dir, 'graph.params'), 'wb') as f_params:
+        f_params.write(nnvm.compiler.save_param_dict(params))
+
+if __name__ == '__main__':
+    main()
diff --git a/rust/tests/test_nnvm/src/main.rs b/rust/tests/test_nnvm/src/main.rs
new file mode 100644
index 000000000000..0953ce2a2603
--- /dev/null
+++ b/rust/tests/test_nnvm/src/main.rs
@@ -0,0 +1,80 @@
+#![feature(try_from)]
+
+#[macro_use]
+extern crate ndarray;
+extern crate serde;
+extern crate serde_json;
+
+extern crate tvm;
+use std::{collections::HashMap, convert::TryFrom, fs, io::Read};
+
+use ndarray::Array;
+use tvm::runtime::{Graph, GraphExecutor, SystemLibModule, Tensor};
+
+const BATCH_SIZE: usize = 4;
+const IN_DIM: usize = 8;
+
+macro_rules! check_sum {
+  ($e:expr, $a:ident, $b:ident) => {
+    let a = Array::try_from($e.get_input(stringify!($a)).unwrap()).unwrap();
+    check_sum!(a, $b);
+  };
+  ($e:expr, $a:expr, $b:ident) => {
+    let a = Array::try_from($e.get_output($a).unwrap()).unwrap();
+    check_sum!(a, $b);
+  };
+  ($a:ident, $b:ident) => {
+    let a_sum: f32 = $a.scalar_sum();
+    let b_sum: f32 = $b.scalar_sum();
+    assert!((a_sum - b_sum).abs() < 1e-2, "{} != {}", a_sum, b_sum);
+  };
+}
+
+fn main() {
+  let syslib = SystemLibModule::default();
+
+  let mut params_bytes = Vec::new();
+  fs::File::open(concat!(env!("OUT_DIR"), "/graph.params"))
+    .unwrap()
+    .read_to_end(&mut params_bytes)
+    .unwrap();
+  let params = tvm::runtime::load_param_dict(&params_bytes)
+    .unwrap()
+    .into_iter()
+    .map(|(k, v)| (k, v.to_owned()))
+    .collect::<HashMap<String, Tensor<'static>>>();
+
+  let graph =
+    Graph::try_from(&fs::read_to_string(concat!(env!("OUT_DIR"), "/graph.json")).unwrap()).unwrap();
+  let mut exec = GraphExecutor::new(graph, &syslib).unwrap();
+
+  let x = Array::from_shape_vec(
+    (BATCH_SIZE, IN_DIM),
+    (0..BATCH_SIZE * IN_DIM)
+      .map(|x| x as f32)
+      .collect::<Vec<f32>>(),
+  ).unwrap();
+  let w = Array::try_from(params.get("dense0_weight").unwrap())
+    .unwrap()
+    .into_shape((IN_DIM * 2, IN_DIM))
+    .unwrap();
+  let b = Array::try_from(params.get("dense0_bias").unwrap()).unwrap();
+  let dense = x.dot(&w.t()) + &b;
+  let left = dense.slice(s![.., 0..IN_DIM]);
+  let right = dense.slice(s![.., IN_DIM..]);
+  let expected_o0 = &left + 1f32;
+  let expected_o1 = &right - 1f32;
+
+  exec.load_params(params);
+  exec.set_input("data", x.clone().into());
+
+  check_sum!(exec, data, x);
+  check_sum!(exec, dense0_weight, w);
+  check_sum!(exec, dense0_bias, b);
+
+  exec.run();
+
+  check_sum!(exec, 0, expected_o0);
+  check_sum!(exec, 1, expected_o1);
+  check_sum!(exec, 2, dense);
+}
diff --git a/rust/tests/test_tvm_basic/Cargo.toml b/rust/tests/test_tvm_basic/Cargo.toml
new file mode 100644
index 000000000000..bd4193bcb8fb
--- /dev/null
+++ b/rust/tests/test_tvm_basic/Cargo.toml
@@ -0,0 +1,12 @@
+[package]
+name = "test-tvm-basic"
+version = "0.0.0"
+license = "Apache-2.0"
+authors = ["Nick Hynes <nhynes@berkeley.edu>"]
+
+[dependencies]
+ndarray = "0.11.2"
+tvm = { path = "../../" }
+
+[build-dependencies]
+ar = "0.6.0"
diff --git a/rust/tests/test_tvm_basic/build.rs b/rust/tests/test_tvm_basic/build.rs
new file mode 100644
index 000000000000..778dd1cab1ca
--- /dev/null
+++ b/rust/tests/test_tvm_basic/build.rs
@@ -0,0 +1,28 @@
+extern crate ar;
+
+use std::{env, path::PathBuf, process::Command};
+
+use ar::Builder;
+use std::fs::File;
+
+fn main() {
+  let out_dir = env::var("OUT_DIR").unwrap();
+
+  let output = Command::new(concat!(
+    env!("CARGO_MANIFEST_DIR"),
+    "/src/build_test_lib.py"
+  )).arg(&out_dir)
+    .output()
+    .expect("Failed to execute command");
+  if output.stderr.len() > 0 {
+    panic!(String::from_utf8(output.stderr).unwrap());
+  }
+
+  let in_path: PathBuf = [&out_dir, "test.o"].iter().collect();
+  let out_path: PathBuf = [&out_dir, "libtest.a"].iter().collect();
+  let mut builder = Builder::new(File::create(out_path.to_str().unwrap()).unwrap());
+  builder.append_path(in_path.to_str().unwrap()).unwrap();
+
+  println!("cargo:rustc-link-lib=static=test");
+  println!("cargo:rustc-link-search=native={}", out_dir);
+}
diff --git a/rust/tests/test_tvm_basic/src/build_test_lib.py b/rust/tests/test_tvm_basic/src/build_test_lib.py
new file mode 100755
index 000000000000..7289a778fcec
--- /dev/null
+++ b/rust/tests/test_tvm_basic/src/build_test_lib.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+
+"""Prepares a simple TVM library for testing."""
+
+from os import path as osp
+import sys
+
+import tvm
+
+def main():
+    n = tvm.var('n')
+    A = tvm.placeholder((n,), name='A')
+    B = tvm.placeholder((n,), name='B')
+    C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
+    s = tvm.create_schedule(C.op)
+    s[C].parallel(s[C].op.axis[0])
+    print(tvm.lower(s, [A, B, C], simple_mode=True))
+    tvm.build(s, [A, B, C], 'llvm --system-lib').save(osp.join(sys.argv[1], 'test.o'))
+
+if __name__ == '__main__':
+    main()
diff --git a/rust/tests/test_tvm_basic/src/main.rs b/rust/tests/test_tvm_basic/src/main.rs
new file mode 100644
index 000000000000..b6c11451d12a
--- /dev/null
+++ b/rust/tests/test_tvm_basic/src/main.rs
@@ -0,0 +1,25 @@
+extern crate ndarray;
+#[macro_use]
+extern crate tvm;
+
+use ndarray::Array;
+use tvm::{
+  ffi::runtime::DLTensor,
+  runtime::{Module, SystemLibModule},
+};
+
+fn main() {
+  let syslib = SystemLibModule::default();
+  let add = syslib
+    .get_function("default_function")
+    .expect("main function not found");
+  let mut a = Array::from_vec(vec![1f32, 2., 3., 4.]);
+  let mut b = Array::from_vec(vec![1f32, 0., 1., 0.]);
+  let mut c = Array::from_vec(vec![0f32; 4]);
+  let e = Array::from_vec(vec![2f32, 2., 4., 4.]);
+  let mut a_dl: DLTensor = (&mut a).into();
+  let mut b_dl: DLTensor = (&mut b).into();
+  let mut c_dl: DLTensor = (&mut c).into();
+  call_packed!(add, &mut a_dl, &mut b_dl, &mut c_dl);
+  assert!(c.all_close(&e, 1e-8f32));
+}
diff --git a/src/README.md b/src/README.md
index dfa7a1d33d22..b0363a04411a 100644
--- a/src/README.md
+++ b/src/README.md
@@ -5,12 +5,14 @@ There can be internal header files within each module that sit in src.
 
 ## Modules
 - common: Internal common utilities.
-- api: API function registration
-- lang: The definition of DSL related data structure
-- arithmetic: Arithmetic expression and set simplification
-- op: The detail implementations about each operation(compute, scan, placeholder)
+- api: API function registration.
+- lang: The definition of DSL related data structure.
+- arithmetic: Arithmetic expression and set simplification.
+- op: The detail implementations about each operation(compute, scan, placeholder).
 - schedule: The operations on the schedule graph before converting to IR.
-- pass: The optimization pass on the IR structure
+- pass: The optimization pass on the IR structure.
 - codegen: The code generator.
-- runtime: Minimum runtime related codes
-- contrib: Contrib extension libraries
+- runtime: Minimum runtime related codes.
+- autotvm: The auto-tuning module.
+- relay: Implementation of Relay. The second generation of NNVM, a new IR for deep learning frameworks.
+- contrib: Contrib extension libraries.
diff --git a/src/api/api_base.cc b/src/api/api_base.cc
index 37970e69e24f..47895c61e2c0 100644
--- a/src/api/api_base.cc
+++ b/src/api/api_base.cc
@@ -3,6 +3,7 @@
  *  Implementation of basic API functions
  * \file api_base.cc
  */
+#include <dmlc/memory_io.h>
 #include <tvm/expr.h>
 #include <tvm/tensor.h>
 #include <tvm/api_registry.h>
@@ -24,25 +25,46 @@ TVM_REGISTER_API("_raw_ptr")
   });
 
 TVM_REGISTER_API("_save_json")
-.set_body([](TVMArgs args,  TVMRetValue *ret) {
-    *ret = SaveJSON(args[0]);
-  });
+.set_body_typed<std::string(NodeRef)>(SaveJSON);
 
 TVM_REGISTER_API("_load_json")
-.set_body([](TVMArgs args,  TVMRetValue *ret) {
-    *ret = LoadJSON<NodeRef>(args[0]);
-  });
-
-TVM_REGISTER_API("_nop")
-.set_body([](TVMArgs args,  TVMRetValue *ret) {
-  });
+.set_body_typed<NodeRef(std::string)>(LoadJSON<NodeRef>);
 
 TVM_REGISTER_API("_TVMSetStream")
 .set_body([](TVMArgs args,  TVMRetValue *ret) {
     TVMSetStream(args[0], args[1], args[2]);
   });
+TVM_REGISTER_API("_save_param_dict")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+    CHECK_EQ(args.size() % 2, 0u);
+    constexpr uint64_t TVMNDArrayListMagic = 0xF7E58D4F05049CB7;
+    size_t num_params = args.size() / 2;
+    std::vector<std::string> names;
+    names.reserve(num_params);
+    std::vector<DLTensor*> arrays;
+    arrays.reserve(num_params);
+    for (size_t i = 0; i < num_params * 2; i += 2) {
+      names.emplace_back(args[i].operator std::string());
+      arrays.emplace_back(args[i + 1].operator DLTensor*());
+    }
+    std::string bytes;
+    dmlc::MemoryStringStream strm(&bytes);
+    dmlc::Stream* fo = &strm;
+    uint64_t header = TVMNDArrayListMagic, reserved = 0;
+    fo->Write(header);
+    fo->Write(reserved);
+    fo->Write(names);
+    {
+      uint64_t sz = static_cast<uint64_t>(arrays.size());
+      fo->Write(sz);
+      for (size_t i = 0; i < sz; ++i) {
+        tvm::runtime::SaveDLTensor(fo, arrays[i]);
+      }
+    }
+    TVMByteArray arr;
+    arr.data = bytes.c_str();
+    arr.size = bytes.length();
+    *rv = arr;
+  });
 
-TVM_DLL::dmlc::Registry<::tvm::NodeFactoryReg > * GetTVMNodeFactoryRegistry() {
-  return ::dmlc::Registry<::tvm::NodeFactoryReg>::Get();
-}
 }  // namespace tvm
diff --git a/src/api/api_ir.cc b/src/api/api_ir.cc
index bc9293c20b7a..1040f6ce6f66 100644
--- a/src/api/api_ir.cc
+++ b/src/api/api_ir.cc
@@ -5,7 +5,7 @@
  */
 #include <tvm/expr.h>
 #include <tvm/ir.h>
-#include <ir/IROperator.h>
+#include <tvm/ir_operator.h>
 #include <tvm/api_registry.h>
 #include <tvm/ir_operator.h>
 
@@ -117,6 +117,50 @@ TVM_REGISTER_API("make.CommReducer")
       *ret = Node::make(args[0], args[1], args[2], args[3], args[4]);   \
     })                                                                  \
 
+
+REGISTER_MAKE5(Reduce);
+REGISTER_MAKE4(AttrStmt);
+
+REGISTER_MAKE2(IntImm);
+REGISTER_MAKE2(UIntImm);
+REGISTER_MAKE2(FloatImm);
+REGISTER_MAKE1(StringImm);
+
+REGISTER_MAKE2(Add);
+REGISTER_MAKE2(Sub);
+REGISTER_MAKE2(Mul);
+REGISTER_MAKE2(Div);
+REGISTER_MAKE2(Mod);
+REGISTER_MAKE2(Min);
+REGISTER_MAKE2(Max);
+REGISTER_MAKE2(EQ);
+REGISTER_MAKE2(NE);
+REGISTER_MAKE2(LT);
+REGISTER_MAKE2(LE);
+REGISTER_MAKE2(GT);
+REGISTER_MAKE2(GE);
+REGISTER_MAKE2(And);
+REGISTER_MAKE2(Or);
+
+REGISTER_MAKE1(Not);
+REGISTER_MAKE3(Select);
+REGISTER_MAKE3(Ramp);
+REGISTER_MAKE2(Cast);
+REGISTER_MAKE2(Broadcast);
+REGISTER_MAKE2(Shuffle);
+REGISTER_MAKE3(Let);
+REGISTER_MAKE3(LetStmt);
+REGISTER_MAKE3(AssertStmt);
+REGISTER_MAKE3(ProducerConsumer);
+REGISTER_MAKE5(Allocate);
+REGISTER_MAKE4(Provide);
+REGISTER_MAKE4(Prefetch);
+REGISTER_MAKE1(Free);
+REGISTER_MAKE2(Block);
+REGISTER_MAKE3(IfThenElse);
+REGISTER_MAKE1(Evaluate);
+
+// operator overloading, smarter than make
 #define REGISTER_MAKE_BINARY_OP(Node, Func)                  \
   TVM_REGISTER_API("make."#Node)                             \
   .set_body([](TVMArgs args,  TVMRetValue *ret) {            \
@@ -138,49 +182,27 @@ TVM_REGISTER_API("make.CommReducer")
       }                                                                 \
     })
 
-REGISTER_MAKE5(Reduce);
-REGISTER_MAKE4(AttrStmt);
 
-REGISTER_MAKE2(IntImm);
-REGISTER_MAKE2(UIntImm);
-REGISTER_MAKE2(FloatImm);
-REGISTER_MAKE1(StringImm);
-REGISTER_MAKE_BINARY_OP(Add, operator+);
-REGISTER_MAKE_BINARY_OP(Sub, operator-);
-REGISTER_MAKE_BINARY_OP(Mul, operator*);
-REGISTER_MAKE_BINARY_OP(Div, operator/);
-REGISTER_MAKE_BINARY_OP(Mod, operator%);
-REGISTER_MAKE_BINARY_OP(Min, min);
-REGISTER_MAKE_BINARY_OP(Max, max);
-REGISTER_MAKE_BINARY_OP(EQ, operator==);
-REGISTER_MAKE_BINARY_OP(NE, operator!=);
-REGISTER_MAKE_BINARY_OP(LT, operator<); // NOLINT(*)
-REGISTER_MAKE_BINARY_OP(LE, operator<=); // NOLINT(*)
-REGISTER_MAKE_BINARY_OP(GT, operator>);  // NOLINT(*)
-REGISTER_MAKE_BINARY_OP(GE, operator>=);
-REGISTER_MAKE_BINARY_OP(And, operator&&);
-REGISTER_MAKE_BINARY_OP(Or, operator||);
+REGISTER_MAKE_BINARY_OP(_OpAdd, operator+);
+REGISTER_MAKE_BINARY_OP(_OpSub, operator-);
+REGISTER_MAKE_BINARY_OP(_OpMul, operator*);
+REGISTER_MAKE_BINARY_OP(_OpDiv, operator/);
+REGISTER_MAKE_BINARY_OP(_OpMod, operator%);
+REGISTER_MAKE_BINARY_OP(_OpMin, min);
+REGISTER_MAKE_BINARY_OP(_OpMax, max);
+REGISTER_MAKE_BINARY_OP(_OpEQ, operator==);
+REGISTER_MAKE_BINARY_OP(_OpNE, operator!=);
+REGISTER_MAKE_BINARY_OP(_OpLT, operator<); // NOLINT(*)
+REGISTER_MAKE_BINARY_OP(_OpLE, operator<=); // NOLINT(*)
+REGISTER_MAKE_BINARY_OP(_OpGT, operator>);  // NOLINT(*)
+REGISTER_MAKE_BINARY_OP(_OpGE, operator>=);
+REGISTER_MAKE_BINARY_OP(_OpAnd, operator&&);
+REGISTER_MAKE_BINARY_OP(_OpOr, operator||);
 REGISTER_MAKE_BIT_OP(bitwise_and, operator&);
 REGISTER_MAKE_BIT_OP(bitwise_or, operator|);
 REGISTER_MAKE_BIT_OP(bitwise_xor, operator^);
 REGISTER_MAKE_BIT_OP(left_shift, operator<<); // NOLINT(*)
 REGISTER_MAKE_BIT_OP(right_shift, operator>>);
-REGISTER_MAKE1(Not);
-REGISTER_MAKE3(Select);
-REGISTER_MAKE3(Ramp);
-REGISTER_MAKE2(Cast);
-REGISTER_MAKE2(Broadcast);
-REGISTER_MAKE3(Let);
-REGISTER_MAKE3(LetStmt);
-REGISTER_MAKE3(AssertStmt);
-REGISTER_MAKE3(ProducerConsumer);
-REGISTER_MAKE5(Allocate);
-REGISTER_MAKE4(Provide);
-REGISTER_MAKE4(Prefetch);
-REGISTER_MAKE1(Free);
-REGISTER_MAKE2(Block);
-REGISTER_MAKE3(IfThenElse);
-REGISTER_MAKE1(Evaluate);
 
 }  // namespace ir
 }  // namespace tvm
diff --git a/src/api/api_lang.cc b/src/api/api_lang.cc
index 8c55684ed851..e30111e938bd 100644
--- a/src/api/api_lang.cc
+++ b/src/api/api_lang.cc
@@ -45,11 +45,15 @@ TVM_REGISTER_API("_str")
 
 TVM_REGISTER_API("_Array")
 .set_body([](TVMArgs args,  TVMRetValue* ret) {
-    std::vector<std::shared_ptr<Node> > data;
+    std::vector<NodePtr<Node> > data;
     for (int i = 0; i < args.size(); ++i) {
-      data.push_back(args[i].node_sptr());
+      if (args[i].type_code() != kNull) {
+        data.push_back(args[i].node_sptr());
+      } else {
+        data.push_back(NodePtr<Node>(nullptr));
+      }
     }
-    auto node = std::make_shared<ArrayNode>();
+    auto node = make_node<ArrayNode>();
     node->data = std::move(data);
     *ret = node;
   });
@@ -87,7 +91,7 @@ TVM_REGISTER_API("_Map")
         data.emplace(std::make_pair(args[i].operator std::string(),
                                     args[i + 1].node_sptr()));
       }
-      auto node = std::make_shared<StrMapNode>();
+      auto node = make_node<StrMapNode>();
       node->data = std::move(data);
       *ret = node;
     } else {
@@ -101,7 +105,7 @@ TVM_REGISTER_API("_Map")
         data.emplace(std::make_pair(args[i].node_sptr(),
                                     args[i + 1].node_sptr()));
       }
-      auto node = std::make_shared<MapNode>();
+      auto node = make_node<MapNode>();
       node->data = std::move(data);
       *ret = node;
     }
@@ -163,7 +167,7 @@ TVM_REGISTER_API("_MapItems")
     auto& sptr = args[0].node_sptr();
     if (sptr->is_type<MapNode>()) {
       auto* n = static_cast<const MapNode*>(sptr.get());
-      auto rkvs = std::make_shared<ArrayNode>();
+      auto rkvs = make_node<ArrayNode>();
       for (const auto& kv : n->data) {
         rkvs->data.push_back(kv.first);
         rkvs->data.push_back(kv.second);
@@ -171,7 +175,7 @@ TVM_REGISTER_API("_MapItems")
       *ret = rkvs;
     } else {
       auto* n = static_cast<const StrMapNode*>(sptr.get());
-      auto rkvs = std::make_shared<ArrayNode>();
+      auto rkvs = make_node<ArrayNode>();
       for (const auto& kv : n->data) {
         rkvs->data.push_back(ir::StringImm::make(kv.first).node_);
         rkvs->data.push_back(kv.second);
@@ -239,6 +243,14 @@ TVM_REGISTER_API("_TensorIntrin")
                                   args[6]);
   });
 
+TVM_REGISTER_API("_TensorIntrinCall")
+.set_body([](TVMArgs args,  TVMRetValue* ret) {
+    *ret = TensorIntrinCallNode::make(args[0],
+                                      args[1],
+                                      args[2],
+                                      args[3]);
+  });
+
 TVM_REGISTER_API("_TensorEqual")
 .set_body([](TVMArgs args,  TVMRetValue* ret) {
     *ret = args[0].operator Tensor() == args[1].operator Tensor();
@@ -278,6 +290,18 @@ TVM_REGISTER_API("_ScanOp")
                             args[7]);
   });
 
+TVM_REGISTER_API("_TensorComputeOp")
+.set_body([](TVMArgs args,  TVMRetValue* ret) {
+    *ret = TensorComputeOpNode::make(args[0],
+                                     args[1],
+                                     args[2],
+                                     args[3],
+                                     args[4],
+                                     args[5],
+                                     args[6],
+                                     args[7]);
+  });
+
 TVM_REGISTER_API("_ExternOp")
 .set_body([](TVMArgs args,  TVMRetValue* ret) {
     *ret = ExternOpNode::make(args[0],
@@ -289,6 +313,16 @@ TVM_REGISTER_API("_ExternOp")
                               args[6]);
   });
 
+TVM_REGISTER_API("_HybridOp")
+.set_body([](TVMArgs args,  TVMRetValue* ret) {
+    *ret = HybridOpNode::make(args[0],
+                              args[1],
+                              args[2],
+                              args[3],
+                              args[4],
+                              args[5]);
+  });
+
 TVM_REGISTER_API("_OpGetOutput")
 .set_body([](TVMArgs args,  TVMRetValue* ret) {
     *ret = args[0].operator Operation().output(
diff --git a/src/api/api_pass.cc b/src/api/api_pass.cc
index a0048a2ed771..bf9e85e8134a 100644
--- a/src/api/api_pass.cc
+++ b/src/api/api_pass.cc
@@ -5,6 +5,7 @@
  */
 #include <tvm/expr.h>
 #include <tvm/ir.h>
+#include <tvm/attrs.h>
 #include <tvm/ir_pass.h>
 #include <tvm/ir_visitor.h>
 #include <tvm/ir_mutator.h>
@@ -65,6 +66,26 @@ TVM_REGISTER_API("ir_pass.Equal")
     }
   });
 
+TVM_REGISTER_API("ir_pass.StorageFlatten")
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    if (args.size() <= 3) {
+      *ret = StorageFlatten(args[0], args[1], args[2]);
+    } else {
+      *ret = StorageFlatten(args[0], args[1], args[2], args[3]);
+    }
+  });
+
+TVM_REGISTER_API("ir_pass.AttrsEqual")
+.set_body_typed<bool(const NodeRef&, const NodeRef&)>([](const NodeRef& lhs, const NodeRef& rhs) {
+    return AttrsEqual()(lhs, rhs);
+  });
+
+TVM_REGISTER_API("ir_pass.AttrsHash")
+.set_body_typed<int64_t(const NodeRef&)>([](const NodeRef &node) {
+    return AttrsHash()(node);
+  });
+
+
 TVM_REGISTER_API("ir_pass.ExprUseVar")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
     *ret = ExprUseVar(args[0].operator Expr(), args[1].operator Var());
@@ -113,7 +134,6 @@ REGISTER_PASS1(ConvertSSA);
 REGISTER_PASS1(VerifySSA);
 REGISTER_PASS1(RewriteUnsafeSelect);
 REGISTER_PASS4(Inline);
-REGISTER_PASS3(StorageFlatten);
 REGISTER_PASS4(IRTransform);
 REGISTER_PASS1(VectorizeLoop);
 REGISTER_PASS5(UnrollLoop);
@@ -141,5 +161,7 @@ REGISTER_PASS1(LowerTVMBuiltin);
 REGISTER_PASS1(CombineContextCall);
 REGISTER_PASS2(VerifyMemory);
 REGISTER_PASS2(VerifyGPUCode);
+REGISTER_PASS1(DecorateDeviceScope);
+REGISTER_PASS1(InstrumentBoundCheckers);
 }  // namespace ir
 }  // namespace tvm
diff --git a/src/api/api_test.cc b/src/api/api_test.cc
new file mode 100644
index 000000000000..2c637a28f01a
--- /dev/null
+++ b/src/api/api_test.cc
@@ -0,0 +1,60 @@
+ /*!
+ *  Copyright (c) 2018 by Contributors
+ *  Code mainly used for test purposes.
+ * \file api_test.cc
+ */
+#include <tvm/expr.h>
+#include <tvm/tensor.h>
+#include <tvm/attrs.h>
+#include <tvm/api_registry.h>
+
+namespace tvm {
+// Attrs used to python API
+struct TestAttrs : public AttrsNode<TestAttrs> {
+  int axis;
+  std::string name;
+  Array<Expr> padding;
+  TypedEnvFunc<int(int)> func;
+
+  TVM_DECLARE_ATTRS(TestAttrs, "attrs.TestAttrs") {
+    TVM_ATTR_FIELD(axis)
+        .set_default(10)
+        .set_lower_bound(1)
+        .set_upper_bound(10)
+        .describe("axis field");
+    TVM_ATTR_FIELD(name)
+        .describe("name");
+    TVM_ATTR_FIELD(padding)
+        .describe("padding of input")
+        .set_default(Array<Expr>({0, 0}));
+    TVM_ATTR_FIELD(func)
+        .describe("some random env function")
+        .set_default(TypedEnvFunc<int(int)>(nullptr));
+  }
+};
+
+TVM_REGISTER_NODE_TYPE(TestAttrs);
+
+TVM_REGISTER_API("_nop")
+.set_body([](TVMArgs args,  TVMRetValue *ret) {
+  });
+
+TVM_REGISTER_API("_context_test")
+.set_body([](TVMArgs args,  TVMRetValue *ret) {
+    DLContext ctx = args[0];
+    int dtype = args[1];
+    int did = args[2];
+    CHECK_EQ(static_cast<int>(ctx.device_type), dtype);
+    CHECK_EQ(static_cast<int>(ctx.device_id), did);
+    *ret = ctx;
+  });
+
+// internal function used for debug and testing purposes
+TVM_REGISTER_API("_ndarray_use_count")
+.set_body([](TVMArgs args,  TVMRetValue *ret) {
+    runtime::NDArray nd = args[0];
+    // substract the current one
+    *ret = (nd.use_count() - 1);
+  });
+
+}  // namespace tvm
diff --git a/src/api/dsl_api.cc b/src/api/dsl_api.cc
index 80d7c3163e10..1c2c294a5f30 100644
--- a/src/api/dsl_api.cc
+++ b/src/api/dsl_api.cc
@@ -7,6 +7,7 @@
 #include <dmlc/logging.h>
 #include <dmlc/thread_local.h>
 #include <tvm/api_registry.h>
+#include <tvm/attrs.h>
 #include <vector>
 #include <string>
 #include <exception>
@@ -27,7 +28,7 @@ struct TVMAPIThreadLocalEntry {
 /*! \brief Thread local store that can be used to hold return values. */
 typedef dmlc::ThreadLocalStore<TVMAPIThreadLocalEntry> TVMAPIThreadLocalStore;
 
-using TVMAPINode = std::shared_ptr<Node>;
+using TVMAPINode = NodePtr<Node>;
 
 struct APIAttrGetter : public AttrVisitor {
   std::string skey;
@@ -124,22 +125,35 @@ class DSLAPIImpl : public DSLAPI {
         (*static_cast<TVMAPINode*>(handle))->type_index());
   }
   void NodeGetAttr(NodeHandle handle,
-                  const char* key,
-                  TVMValue* ret_val,
-                  int* ret_type_code,
-                  int* ret_success) const final {
+                   const char* key,
+                   TVMValue* ret_val,
+                   int* ret_type_code,
+                   int* ret_success) const final {
     TVMRetValue rv;
     APIAttrGetter getter;
+    TVMAPINode* tnode = static_cast<TVMAPINode*>(handle);
     getter.skey = key;
     getter.ret = &rv;
-    TVMAPINode* tnode = static_cast<TVMAPINode*>(handle);
     if (getter.skey == "type_key") {
       ret_val->v_str = (*tnode)->type_key();
       *ret_type_code = kStr;
       *ret_success = 1;
-    } else {
+      return;
+    } else if (!(*tnode)->is_type<DictAttrsNode>()) {
       (*tnode)->VisitAttrs(&getter);
       *ret_success = getter.found_ref_object || rv.type_code() != kNull;
+    } else {
+      // specially handle dict attr
+      DictAttrsNode* dnode = static_cast<DictAttrsNode*>(tnode->get());
+      auto it = dnode->dict.find(key);
+      if (it != dnode->dict.end()) {
+        *ret_success = 1;
+        rv = (*it).second;
+      } else {
+        *ret_success = 0;
+      }
+    }
+    if (*ret_success) {
       if (rv.type_code() == kStr ||
           rv.type_code() == kTVMType) {
         TVMAPIThreadLocalEntry *e = TVMAPIThreadLocalStore::Get();
@@ -159,7 +173,16 @@ class DSLAPIImpl : public DSLAPI {
     TVMAPINode* tnode = static_cast<TVMAPINode*>(handle);
     APIAttrDir dir;
     dir.names = &(ret->ret_vec_str);
-    (*tnode)->VisitAttrs(&dir);
+
+    if (!(*tnode)->is_type<DictAttrsNode>()) {
+      (*tnode)->VisitAttrs(&dir);
+    } else {
+      // specially handle dict attr
+      DictAttrsNode* dnode = static_cast<DictAttrsNode*>(tnode->get());
+      for (const auto& kv : dnode->dict) {
+        ret->ret_vec_str.push_back(kv.first);
+      }
+    }
     ret->ret_vec_charp.clear();
     for (size_t i = 0; i < ret->ret_vec_str.size(); ++i) {
       ret->ret_vec_charp.push_back(ret->ret_vec_str[i].c_str());
diff --git a/src/arithmetic/canonical.cc b/src/arithmetic/canonical.cc
index ed6239961a3b..2151ebf2adba 100644
--- a/src/arithmetic/canonical.cc
+++ b/src/arithmetic/canonical.cc
@@ -6,8 +6,12 @@
 #include <tvm/ir_mutator.h>
 #include <tvm/arithmetic.h>
 #include <tvm/ir_pass.h>
-#include "./canonical.h"
-#include "./compute_expr.h"
+#include <algorithm>
+#include <map>
+#include <limits>
+#include <vector>
+#include "canonical.h"
+#include "compute_expr.h"
 #include "arithmetic/Simplify.h"
 
 namespace tvm {
@@ -44,7 +48,7 @@ struct ComExprEntry {
 };
 
 // canonical expression for communicative expression.
-struct ComExprNode {
+struct ComExprNode : public NodeBase {
   // base constant value.
   int64_t base{0};
   // The values to be sumed.
@@ -56,7 +60,7 @@ struct ComExpr {
  public:
   // constructor
   ComExpr() {}
-  explicit ComExpr(std::shared_ptr<ComExprNode> ptr) : ptr_(ptr) {}
+  explicit ComExpr(NodePtr<ComExprNode> ptr) : ptr_(ptr) {}
   // get member
   ComExprNode* operator->() const {
     return ptr_.get();
@@ -102,7 +106,7 @@ struct ComExpr {
   }
 
  private:
-  std::shared_ptr<ComExprNode> ptr_;
+  NodePtr<ComExprNode> ptr_;
 };
 
 // binary comparison op.
@@ -169,7 +173,7 @@ class Canonical::Internal : public IRMutator {
       if (sum.defined()) return sum;
       const int64_t *v1 = as_const_int(value);
       const uint64_t *v2 = as_const_uint(value);
-      std::shared_ptr<ComExprNode> n = std::make_shared<ComExprNode>();
+      auto n = make_node<ComExprNode>();
       if (v1) {
         n->base = *v1;
       } else if (v2) {
@@ -232,6 +236,24 @@ class Canonical::Internal : public IRMutator {
   bool EnableOpt(Type t) const {
     return (t.lanes() == 1 && (t.is_int() || t.is_uint()));
   }
+  // Max
+  Expr Mutate_(const Max* op, const Expr& e) final {
+    CacheEntry a = Produce(op->a);
+    CacheEntry b = Produce(op->b);
+    if (a.has_side_effect || b.has_side_effect) {
+      return Binary_(op, e, a.value, b.value);
+    }
+    return Binary(op, e);
+  }
+  // Min
+  Expr Mutate_(const Min* op, const Expr& e) final {
+    CacheEntry a = Produce(op->a);
+    CacheEntry b = Produce(op->b);
+    if (a.has_side_effect || b.has_side_effect) {
+      return Binary_(op, e, a.value, b.value);
+    }
+    return Binary(op, e);
+  }
   // Add
   Expr Mutate_(const Add* op, const Expr& e) final {
     if (!EnableOpt(op->type)) {
@@ -273,7 +295,7 @@ class Canonical::Internal : public IRMutator {
     } else if (is_const(b.value)) {
       return SumMulConst(a.AsSum(), b.value);
     } else {
-      return Binary_(op, e, a.value, b.value);
+      return Binary(op, e);
     }
   }
   // Variable
@@ -467,8 +489,8 @@ class Canonical::Internal : public IRMutator {
     Type type = coeff.type();
     int64_t value = GetConstIntValue(coeff);
     if (value < 0) return {};
-    std::shared_ptr<ComExprNode> xnode = std::make_shared<ComExprNode>();
-    std::shared_ptr<ComExprNode> ynode = std::make_shared<ComExprNode>();
+    auto xnode = make_node<ComExprNode>();
+    auto ynode = make_node<ComExprNode>();
     if (a->base % value == 0) {
       xnode->base = a->base;
     } else {
@@ -503,7 +525,7 @@ class Canonical::Internal : public IRMutator {
     std::vector<ComExpr> pair = TryLinearEquation(a, v);
     if (pair.size() == 0) {
       int64_t value = GetConstIntValue(v);
-      std::shared_ptr<ComExprNode> n = std::make_shared<ComExprNode>();
+      auto n = make_node<ComExprNode>();
       n->base = a->base % value;
       for (auto e : a->elem) {
         if (e.scale % value == 0) continue;
@@ -511,7 +533,15 @@ class Canonical::Internal : public IRMutator {
         n->elem.push_back(e);
       }
       Expr ret = Sum2Expr(ComExpr(n), v.type()) % v;
-      return Binary(ret.as<Mod>(), ret);
+      if (const Mod* mod = ret.as<Mod>()) {
+        return Binary(mod, ret);
+      } else {
+        // Sometimes the result is a constant, this may happen when value is -1
+        CHECK(is_const(ret)) << "CanonicalSimplify: "
+          << Sum2Expr(ComExpr(n), v.type()) << " % " << v << " is " << ret
+          << " which is neither Mod, nor a constant";
+        return ret;
+      }
     }
     ret_entry_.sum = pair[1];
     ret_entry_.max_level = stack_.back().max_level;
@@ -550,8 +580,7 @@ class Canonical::Internal : public IRMutator {
     if (value == 0) {
       return make_zero(v.type());
     }
-    std::shared_ptr<ComExprNode> vsum =
-        std::make_shared<ComExprNode>(*a.operator->());
+    auto vsum = make_node<ComExprNode>(*a.operator->());
     vsum->base *= value;
     for (auto& e : vsum->elem) {
       e.scale *= value;
@@ -572,15 +601,14 @@ class Canonical::Internal : public IRMutator {
   ComExpr SumAdd_(const ComExpr& suma,
                   const ComExpr& sumb,
                   int bscale) {
-    std::shared_ptr<ComExprNode> n = std::make_shared<ComExprNode>();
+    auto n = make_node<ComExprNode>();
     n->base = suma->base + sumb->base * bscale;
     // merge of suma and sumb;
     size_t i = 0, j = 0;
     while (i < suma->elem.size() && j < sumb->elem.size()) {
       const auto& a = suma->elem[i];
       const auto& b = sumb->elem[j];
-      if (a.value.same_as(b.value)) {
-        CHECK_EQ(a.level, b.level);
+      if (a.value.same_as(b.value) && a.level == b.level) {
         ComExprEntry e = a;
         e.scale = a.scale + b.scale * bscale;
         if (e.scale != 0) {
diff --git a/src/arithmetic/compute_expr.h b/src/arithmetic/compute_expr.h
index 5f44347f3539..218e9d218a66 100644
--- a/src/arithmetic/compute_expr.h
+++ b/src/arithmetic/compute_expr.h
@@ -14,10 +14,6 @@
 namespace tvm {
 namespace arith {
 
-using HalideIR::Internal::add_would_overflow;
-using HalideIR::Internal::sub_would_overflow;
-using HalideIR::Internal::mul_would_overflow;
-
 /*!
  * \brief Compute the expression with the given binary op.
  * \param lhs The left operand
@@ -42,23 +38,9 @@ template<typename Op>
 inline Expr ComputeReduce(
     const Array<Expr>& values, Expr empty_value);
 
-template<typename T>
-inline bool GetConst(Expr e, T* out);
-
-template<>
-inline bool GetConst<int64_t>(Expr e, int64_t *out) {
-  if (e.type().is_vector()) return false;
-  const int64_t *v = as_const_int(e);
-  if (v) {
-    *out = *v; return true;
-  } else {
-    return false;
-  }
-}
-template<>
-inline bool GetConst<uint64_t>(Expr e, uint64_t *out) {
+inline bool GetConst(Expr e, int64_t* out) {
   if (e.type().is_vector()) return false;
-  const uint64_t *v = as_const_uint(e);
+  const int64_t* v = as_const_int(e);
   if (v) {
     *out = *v; return true;
   } else {
@@ -69,66 +51,37 @@ inline bool GetConst<uint64_t>(Expr e, uint64_t *out) {
 // get a small constant int
 inline bool GetConstInt(Expr e, int* out) {
   int64_t v1 = 0;
-  uint64_t v2 = 0;
   if (GetConst(e, &v1)) {
     if (v1 > static_cast<int64_t>(
             std::numeric_limits<int>::max())) return false;
     *out = static_cast<int>(v1); return true;
   }
-  if (GetConst(e, &v2)) {
-    if (v2 > static_cast<uint64_t>(
-            std::numeric_limits<int>::max())) return false;
-    *out = static_cast<int>(v2); return true;
-  }
   return false;
 }
 
-#define TVM_CONST_PROPAGATION(OP_NAME, OP)                       \
-  int64_t ia = 0, ib = 0;                                        \
-  if (GetConst(a, &ia) && GetConst(b, &ib)) {                    \
-    if (OP_NAME ## _would_overflow(a.type().bits(), ia, ib)) {   \
-      LOG(FATAL) << "signed int overflow";                       \
-    }                                                            \
-    return ir::IntImm::make(a.type(), ia OP ib);                 \
-  }                                                              \
-  uint64_t ua = 0, ub = 0;                                       \
-  if (GetConst(a, &ua) && GetConst(b, &ub)) {                    \
-    return ir::UIntImm::make(a.type(), ua OP ub);                \
-  }                                                              \
-
 template<>
 inline Expr ComputeExpr<ir::Add>(Expr a, Expr b) {
-  if (is_zero(a)) return b;
-  if (is_zero(b)) return a;
-  TVM_CONST_PROPAGATION(add, +);
-  return ir::Add::make(a, b);
+  return a + b;
 }
 
 template<>
 inline Expr ComputeExpr<ir::Sub>(Expr a, Expr b) {
-  if (is_zero(b)) return a;
-  TVM_CONST_PROPAGATION(sub, -);
-  return ir::Sub::make(a, b);
+  return a - b;
 }
 
 template<>
 inline Expr ComputeExpr<ir::Mul>(Expr a, Expr b) {
-  if (is_one(a)) return b;
-  if (is_one(b)) return a;
-  TVM_CONST_PROPAGATION(mul, *);
-  return ir::Mul::make(a, b);
+  return a * b;
 }
 
 template<>
 inline Expr ComputeExpr<ir::Div>(Expr a, Expr b) {
-  if (is_one(b)) return a;
-  return ir::Div::make(a, b);
+  return a / b;
 }
 
 template<>
 inline Expr ComputeExpr<ir::Mod>(Expr a, Expr b) {
-  if (is_zero(a)) return make_zero(a.type());
-  return ir::Mod::make(a, b);
+  return a % b;
 }
 
 template<>
diff --git a/src/arithmetic/detect_linear_equation.cc b/src/arithmetic/detect_linear_equation.cc
index 642a866866d2..6f4d3cfb53bb 100644
--- a/src/arithmetic/detect_linear_equation.cc
+++ b/src/arithmetic/detect_linear_equation.cc
@@ -8,7 +8,7 @@
 #include <tvm/ir_visitor.h>
 #include <tvm/ir_functor_ext.h>
 #include <tvm/arithmetic.h>
-#include "./compute_expr.h"
+#include "compute_expr.h"
 
 namespace tvm {
 namespace arith {
@@ -111,8 +111,9 @@ class LinearEqDetector
     return ComputeExpr<Add>(a, b);
   }
   Expr SubCombine(Expr a, Expr b) {
-    if (!a.defined()) return -b;
+    // Check b first in case they are both undefined
     if (!b.defined()) return a;
+    if (!a.defined()) return -b;
     return ComputeExpr<Sub>(a, b);
   }
   Expr MulCombine(Expr a, Expr b) {
@@ -194,7 +195,7 @@ bool DetectClipBound(
   if (!LinearEqDetector(var).Detect(canonical, &ret)) return false;
   ret.coeff = Simplify(ret.coeff);
   IntervalEntry& p = (*bmap)[var.get()];
-  if (is_one(ret.coeff)) {
+  if (is_const_int(ret.coeff, 1)) {
     // var + shift >=0 -> var >= -shift
     if (p.min_value.defined()) {
       p.min_value = ir::Max::make(p.min_value, -ret.base);
@@ -203,7 +204,7 @@ bool DetectClipBound(
     }
     return true;
   }
-  if (is_const(ret.coeff, -1)) {
+  if (is_const_int(ret.coeff, -1)) {
     // -var + shift >=0 -> var <= shift
     if (p.max_value.defined()) {
       p.max_value = ir::Min::make(p.max_value, ret.base);
diff --git a/src/arithmetic/int_set.cc b/src/arithmetic/int_set.cc
index c004b9666a58..78c592471a1a 100644
--- a/src/arithmetic/int_set.cc
+++ b/src/arithmetic/int_set.cc
@@ -9,8 +9,8 @@
 #include <tvm/ir_functor_ext.h>
 #include <arithmetic/Interval.h>
 #include <unordered_map>
-#include "./compute_expr.h"
-#include "./int_set_internal.h"
+#include "compute_expr.h"
+#include "int_set_internal.h"
 
 namespace tvm {
 namespace arith {
@@ -329,7 +329,7 @@ inline IntSet AsStrideSet(IntSet a) {
   if (a.as<StrideSet>()) return a;
   const IntervalSet* s = a.as<IntervalSet>();
   CHECK(s->i.is_bounded());
-  std::shared_ptr<StrideSet> n = std::make_shared<StrideSet>();
+  NodePtr<StrideSet> n = make_node<StrideSet>();
   n->base = s->i;
   return IntSet(n);
 }
@@ -348,7 +348,7 @@ inline IntSet CombineSets<Add>(IntSet a, IntSet b) {
   b = AsStrideSet(b);
   const StrideSet* a_stride = a.as<StrideSet>();
   const StrideSet* b_stride = b.as<StrideSet>();
-  auto n = std::make_shared<StrideSet>(*a_stride);
+  auto n = make_node<StrideSet>(*a_stride);
   for (size_t i = 0; i < b_stride->extents.size(); ++i) {
     n->extents.push_back(b_stride->extents[i]);
     n->strides.push_back(b_stride->strides[i]);
diff --git a/src/arithmetic/int_set_internal.h b/src/arithmetic/int_set_internal.h
index 9284e6e016e0..e28fe2a9d958 100644
--- a/src/arithmetic/int_set_internal.h
+++ b/src/arithmetic/int_set_internal.h
@@ -21,14 +21,14 @@ struct IntervalSet : public IntSetNode {
   Interval i;
 
   static IntSet make(Interval i) {
-    std::shared_ptr<IntervalSet> n =
-        std::make_shared<IntervalSet>();
+    NodePtr<IntervalSet> n =
+        make_node<IntervalSet>();
     n->i = i;
     return IntSet(n);
   }
   static IntSet make(Expr min, Expr max) {
-    std::shared_ptr<IntervalSet> n =
-        std::make_shared<IntervalSet>();
+    NodePtr<IntervalSet> n =
+        make_node<IntervalSet>();
     n->i.min = min;
     n->i.max = max;
     return IntSet(n);
diff --git a/src/arithmetic/modular.cc b/src/arithmetic/modular.cc
index c0eee45cc395..d79300eb7782 100644
--- a/src/arithmetic/modular.cc
+++ b/src/arithmetic/modular.cc
@@ -8,7 +8,7 @@
 #include <tvm/ir_visitor.h>
 #include <tvm/arithmetic.h>
 #include <limits>
-#include "./int_set_internal.h"
+#include "int_set_internal.h"
 
 namespace tvm {
 namespace arith {
@@ -159,7 +159,7 @@ IntSet EvalModular(const Expr& e,
     CHECK(m) << "Need to pass ModularSet for Modular Analysis";
     mmap[kv.first.get()] = m->e;
   }
-  std::shared_ptr<ModularSet> n = std::make_shared<ModularSet>();
+  NodePtr<ModularSet> n = make_node<ModularSet>();
   n->e = ModularEvaluator(mmap)(e);
   return IntSet(n);
 }
diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc
index 0cb0ec3cc4be..859fdb2bc86c 100644
--- a/src/codegen/build_module.cc
+++ b/src/codegen/build_module.cc
@@ -32,7 +32,7 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
 */
 Target CreateTarget(const std::string& target_name,
                     const std::vector<std::string>& options) {
-  auto target = Target(std::make_shared<TargetNode>());
+  auto target = Target(make_node<TargetNode>());
   auto t = static_cast<TargetNode*>(target.node_.get());
 
   t->target_name = target_name;
@@ -58,7 +58,7 @@ Target CreateTarget(const std::string& target_name,
   }
   t->device_type = kDLCPU;
   t->thread_warp_size = 1;
-  if (target_name == "llvm") {
+  if (target_name == "c" || target_name == "llvm") {
     t->keys_array.push_back(ir::StringImm::make("cpu"));
   } else if (target_name == "cuda" || target_name == "nvptx") {
     t->device_type = kDLGPU;
@@ -92,7 +92,7 @@ Target CreateTarget(const std::string& target_name,
     t->device_type = kDLOpenCL;
     t->keys_array.push_back(ir::StringImm::make("sdaccel"));
     t->keys_array.push_back(ir::StringImm::make("hls"));
-  } else if (target_name == "aocl") {
+  } else if (target_name == "aocl" || target_name == "aocl_sw_emu") {
     t->device_type = kDLAOCL;
     t->keys_array.push_back(ir::StringImm::make("aocl"));
     t->keys_array.push_back(ir::StringImm::make("hls"));
@@ -102,7 +102,7 @@ Target CreateTarget(const std::string& target_name,
   } else if (target_name == "stackvm") {
     t->device_type = kDLCPU;
   } else if (target_name == "ext_dev") {
-    t->device_type = kExtDev;
+    t->device_type = kDLExtDev;
   } else {
     LOG(ERROR) << "Unknown target name " << target_name;
     return target::stackvm();
@@ -154,13 +154,15 @@ std::unordered_set<std::string> TargetNode::libs() const {
   return result;
 }
 
-std::string TargetNode::str() const {
+const std::string& TargetNode::str() const {
+  if (str_repr_.length() != 0) return str_repr_;
   std::ostringstream result;
   result << target_name;
   for (const auto &x : options()) {
     result << " " << x;
   }
-  return result.str();
+  str_repr_ = result.str();
+  return str_repr_;
 }
 
 
@@ -362,7 +364,8 @@ Stmt BuildStmt(Schedule sch,
   stmt = ir::InjectPrefetch(stmt);
 
   // Phase 1
-  stmt = ir::StorageFlatten(stmt, out_binds, 64);
+  stmt = ir::StorageFlatten(stmt, out_binds, 64,
+                            config->instrument_bound_checkers);
   stmt = ir::CanonicalSimplify(stmt);
   if (loop_partition) {
     stmt = ir::LoopPartition(stmt, config->partition_const_loop);
@@ -380,6 +383,9 @@ Stmt BuildStmt(Schedule sch,
   stmt = ir::RemoveNoOp(stmt);
   stmt = ir::RewriteUnsafeSelect(stmt);
 
+  if (config->instrument_bound_checkers)
+    stmt = ir::InstrumentBoundCheckers(stmt);
+
   return stmt;
 }
 
@@ -475,7 +481,7 @@ runtime::Module build(const Array<LoweredFunc>& funcs,
 }
 
 BuildConfig build_config() {
-  return BuildConfig(std::make_shared<BuildConfigNode>());
+  return BuildConfig(make_node<BuildConfigNode>());
 }
 
 /*! \brief Entry to hold the BuildConfig context stack. */
@@ -533,7 +539,7 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
 });
 
 struct GenericFunc::Manager {
-  std::unordered_map<std::string, std::shared_ptr<Node> > fmap;
+  std::unordered_map<std::string, NodePtr<Node> > fmap;
   // mutex
   std::mutex mutex;
 
@@ -551,7 +557,7 @@ GenericFunc GenericFunc::Get(const std::string& name) {
   std::lock_guard<std::mutex>(m->mutex);
   auto it = m->fmap.find(name);
   if (it == m->fmap.end()) {
-    auto f = std::make_shared<GenericFuncNode>();
+    auto f = make_node<GenericFuncNode>();
     f->name_ = name;
     m->fmap[name] = f;
     return GenericFunc(f);
@@ -669,7 +675,7 @@ TVM_REGISTER_API("_BuildConfigGetAddLowerPassInfo")
 
 TVM_REGISTER_API("_GenericFuncCreate")
 .set_body([](TVMArgs args, TVMRetValue* ret) {
-  *ret = GenericFunc(std::make_shared<GenericFuncNode>());
+  *ret = GenericFunc(make_node<GenericFuncNode>());
   });
 
 TVM_REGISTER_API("_GenericFuncGetGlobal")
diff --git a/src/codegen/codegen.cc b/src/codegen/codegen.cc
index 8bc7d238a866..12570e5881a9 100644
--- a/src/codegen/codegen.cc
+++ b/src/codegen/codegen.cc
@@ -40,7 +40,6 @@ std::string PackImportsToC(const runtime::Module& mod, bool system_lib) {
     CHECK_EQ(im->imports().size(), 0U)
         << "Only support simply one-level hierarchy";
     std::string tkey = im->type_key();
-    std::string bin;
     stream->Write(tkey);
     im->SaveToBinary(stream);
   }
diff --git a/src/codegen/codegen_aocl.cc b/src/codegen/codegen_aocl.cc
index 8830588758ef..d9167a7aadcd 100644
--- a/src/codegen/codegen_aocl.cc
+++ b/src/codegen/codegen_aocl.cc
@@ -5,15 +5,16 @@
 #include <tvm/build_module.h>
 #include <vector>
 #include <string>
-#include "./codegen_opencl.h"
-#include "./build_common.h"
+#include "codegen_opencl.h"
+#include "build_common.h"
 #include "../runtime/opencl/aocl/aocl_module.h"
 #include "../runtime/file_util.h"
 
 namespace tvm {
 namespace codegen {
 
-runtime::Module BuildAOCL(Array<LoweredFunc> funcs, std::string target_str) {
+runtime::Module BuildAOCL(Array<LoweredFunc> funcs, std::string target_str,
+                          bool emulation) {
   // Get code.
   using tvm::runtime::Registry;
   bool output_ssa = false;
@@ -31,17 +32,16 @@ runtime::Module BuildAOCL(Array<LoweredFunc> funcs, std::string target_str) {
   runtime::SaveBinaryToFile("aocl.cl", code.c_str());
 
   // Compile the .cl file.
+  std::string cmd = "aoc aocl.cl";
+  // AOCL supports fp64.
+  cmd += " -Dcl_khr_fp64";
   Target target = Target::create(target_str);
-  if (target->device_name == "") {
-    LOG(FATAL) << "AOCL device name not specified in build target.";
+  if (target->device_name != "") {
+    cmd += " -board=" + target->device_name;
   }
-  std::string cmd = "aoc aocl.cl";
-  for (std::string option : target->options()) {
-    if (option == "-mattr=emulator") {
-      cmd += " -march=emulator";
-    }
+  if (emulation) {
+    cmd += " -march=emulator";
   }
-  cmd += " -board=" + target->device_name;
   if (system(cmd.c_str()) != 0) {
     LOG(FATAL) << "OpenCL offline compilation error.";
   }
@@ -55,7 +55,12 @@ runtime::Module BuildAOCL(Array<LoweredFunc> funcs, std::string target_str) {
 
 TVM_REGISTER_API("codegen.build_aocl")
 .set_body([](TVMArgs args, TVMRetValue* rv) {
-    *rv = BuildAOCL(args[0], args[1]);
+    *rv = BuildAOCL(args[0], args[1], false);
+  });
+
+TVM_REGISTER_API("codegen.build_aocl_sw_emu")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    *rv = BuildAOCL(args[0], args[1], true);
   });
 
 }  // namespace codegen
diff --git a/src/codegen/codegen_c.cc b/src/codegen/codegen_c.cc
index ec27f41cc702..3624dc0403aa 100644
--- a/src/codegen/codegen_c.cc
+++ b/src/codegen/codegen_c.cc
@@ -4,7 +4,7 @@
  */
 #include <iomanip>
 #include <cctype>
-#include "./codegen_c.h"
+#include "codegen_c.h"
 #include "../pass/ir_util.h"
 #include "../arithmetic/compute_expr.h"
 
@@ -22,12 +22,43 @@ void CodeGenC::InitFuncState(LoweredFunc f) {
   handle_data_type_.clear();
   CodeGenSourceBase::ClearFuncState();
 }
-void CodeGenC::AddFunction(LoweredFunc f) {
-  // clear previous generated state.
-  this->InitFuncState(f);
+
+void CodeGenC::ReserveKeywordsAsUnique() {
   // skip the first underscore, so SSA variable starts from _1
   GetUniqueName("_");
   GetUniqueName("extern");
+  GetUniqueName("void");
+  GetUniqueName("int");
+  GetUniqueName("float");
+  GetUniqueName("double");
+  GetUniqueName("char");
+  GetUniqueName("unsigned");
+  GetUniqueName("short");
+  GetUniqueName("long");
+  GetUniqueName("if");
+  GetUniqueName("else");
+  GetUniqueName("switch");
+  GetUniqueName("case");
+  GetUniqueName("default");
+  GetUniqueName("for");
+  GetUniqueName("do");
+  GetUniqueName("while");
+  GetUniqueName("goto");
+  GetUniqueName("register");
+  GetUniqueName("continue");
+  GetUniqueName("break");
+  GetUniqueName("typedef");
+  GetUniqueName("struct");
+  GetUniqueName("enum");
+  GetUniqueName("union");
+  GetUniqueName("return");
+}
+
+void CodeGenC::AddFunction(LoweredFunc f) {
+  // clear previous generated state.
+  this->InitFuncState(f);
+  // reserve keywords
+  ReserveKeywordsAsUnique();
   // add to alloc buffer type.
   for (const auto & kv : f->handle_data_type) {
     RegisterHandleType(kv.first.get(), kv.second.type());
@@ -187,6 +218,7 @@ std::string CodeGenC::GetStructRef(
       case intrinsic::kArrNDim: os << "ndim"; break;
       case intrinsic::kArrTypeCode: os << "dtype.code"; break;
       case intrinsic::kArrTypeBits: os << "dtype.bits"; break;
+      case intrinsic::kArrByteOffset: os << "byte_offset"; break;
       case intrinsic::kArrTypeLanes: os << "dtype.lanes"; break;
       case intrinsic::kArrDeviceId: os << "ctx.device_id"; break;
       case intrinsic::kArrDeviceType: os << "ctx.device_type"; break;
@@ -207,7 +239,7 @@ std::string CodeGenC::GetStructRef(
     } else if (t.is_int()) {
       os << "v_int64";
     } else {
-      LOG(FATAL) << "donot know how to handle type" << t;
+      LOG(FATAL) << "Do not know how to handle type" << t;
     }
     os << ")";
     return os.str();
@@ -652,11 +684,10 @@ void CodeGenC::VisitStmt_(const Store* op) {
 }
 
 void CodeGenC::VisitExpr_(const Let* op, std::ostream& os) {  // NOLINT(*)
-  CHECK(print_ssa_form_)
-      << "LetExpr is only supported by print SSA form";
   std::string value = PrintExpr(op->value);
   CHECK(!var_idmap_.count(op->var.get()));
   var_idmap_[op->var.get()] = value;
+  os << PrintExpr(op->body);
 }
 
 void CodeGenC::VisitExpr_(const Ramp* op, std::ostream& os) {  // NOLINT(*)
@@ -835,8 +866,10 @@ void CodeGenC::VisitStmt_(const Evaluate *op) {
     }
   }
   std::string vid = this->PrintExpr(op->value);
-  this->PrintIndent();
-  this->stream << "(void)" << vid << ";\n";
+  if (vid != "") {
+    this->PrintIndent();
+    this->stream << "(void)" << vid << ";\n";
+  }
 }
 
 void CodeGenC::VisitStmt_(const ProducerConsumer *op) {
diff --git a/src/codegen/codegen_c.h b/src/codegen/codegen_c.h
index 0f14415f2af6..c9af24a04a3c 100644
--- a/src/codegen/codegen_c.h
+++ b/src/codegen/codegen_c.h
@@ -14,7 +14,7 @@
 #include <vector>
 #include <unordered_map>
 #include <unordered_set>
-#include "./codegen_source_base.h"
+#include "codegen_source_base.h"
 
 namespace tvm {
 namespace codegen {
@@ -183,6 +183,8 @@ class CodeGenC :
   std::unordered_map<const Variable*, std::string> alloc_storage_scope_;
   /*! \brief the data type of allocated buffers */
   std::unordered_map<const Variable*, Type> handle_data_type_;
+  /*! \brief reserves common C keywords */
+  void ReserveKeywordsAsUnique();
 
  private:
   /*! \brief whether to print in SSA form */
diff --git a/src/codegen/codegen_c_host.cc b/src/codegen/codegen_c_host.cc
new file mode 100644
index 000000000000..248354dbc339
--- /dev/null
+++ b/src/codegen/codegen_c_host.cc
@@ -0,0 +1,252 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file codegen_c_host.cc
+ */
+#include <tvm/packed_func_ext.h>
+#include <vector>
+#include <string>
+#include "codegen_c_host.h"
+#include "build_common.h"
+
+namespace tvm {
+namespace codegen {
+
+CodeGenCHost::CodeGenCHost() {
+  module_name = GetUniqueName("__tvm_module_ctx");
+}
+
+void CodeGenCHost::Init(bool output_ssa) {
+  decl_stream << "#include \"tvm/runtime/c_runtime_api.h\"\n";
+  decl_stream << "#include \"tvm/runtime/c_backend_api.h\"\n";
+  decl_stream << "extern void* " << module_name << " = NULL;\n";
+  CodeGenC::Init(output_ssa);
+}
+
+void CodeGenCHost::AddFunction(LoweredFunc f) {
+  // clear previous generated state.
+  this->InitFuncState(f);
+  // reserve keywords
+  ReserveKeywordsAsUnique();
+  // add to alloc buffer type.
+  for (const auto & kv : f->handle_data_type) {
+    RegisterHandleType(kv.first.get(), kv.second.type());
+  }
+
+  this->stream << "#ifdef __cplusplus\n";
+  this->stream << "extern \"C\"\n";
+  this->stream << "#endif\n";
+  this->stream << "TVM_DLL int32_t " << f->name << "(";
+  for (size_t i = 0; i < f->args.size(); ++i) {
+    Var v = f->args[i];
+    std::string vid = AllocVarID(v.get());
+    if (i != 0) stream << ", ";
+    if (v.type().is_handle()) {
+      auto it = alloc_storage_scope_.find(v.get());
+      if (it != alloc_storage_scope_.end()) {
+        PrintStorageScope(it->second, stream);
+      }
+      stream << ' ';
+
+      if (handle_data_type_.count(v.get())) {
+        PrintType(handle_data_type_.at(v.get()), stream);
+      } else {
+        stream << "void";
+      }
+      stream << "*";
+
+      if (f->is_restricted && restrict_keyword_.length() != 0) {
+        stream << ' ' << restrict_keyword_;
+      }
+    } else {
+      PrintType(v.type(), stream);
+    }
+    stream << ' ' << vid;
+  }
+  stream << ") {\n";
+  this->PreFunctionBody(f);
+  int func_scope = this->BeginScope();
+  this->PrintStmt(f->body);
+  this->PrintIndent();
+  this->stream << "return 0;\n";
+  this->EndScope(func_scope);
+  this->PrintIndent();
+  this->stream << "}\n\n";
+}
+
+std::string CodeGenCHost::Finish() {
+  return CodeGenC::Finish();
+}
+
+void CodeGenCHost::PrintType(Type t, std::ostream& os) {  // NOLINT(*)
+  int lanes = t.lanes();
+  if (t.is_handle()) {
+    CHECK_EQ(lanes, 1)
+        << "does not support vector types";
+    os << "void*"; return;
+  }
+  if (t == Bool()) {
+    os << "bool"; return;
+  }
+  bool fail = false;
+  if (t.is_float()) {
+    switch (t.bits()) {
+      case 16:
+        os << "half";
+        break;
+      case 32: os << "float"; break;
+      case 64:
+        os << "double";
+        break;
+      default: fail = true; break;
+    }
+    if (!fail && lanes == 1) return;
+    if (!fail && (lanes >= 2 && lanes <= 16)) {
+      os << lanes; return;
+    }
+  } else if (t.is_uint() || t.is_int()) {
+    if (t.is_uint()) {
+      os << 'u';
+    }
+    switch (t.bits()) {
+      case 8: os << "int8_t"; break;
+      case 16: os << "int16_t"; break;
+      case 32: os << "int32_t"; break;
+      case 64: os << "int64_t"; break;
+      case 1: os << "int32_t"; break;
+      default: fail = true; break;
+    }
+    if (!fail && lanes == 1) return;
+    if (!fail && (lanes >= 2 && lanes <= 16)) {
+      os << lanes; return;
+    }
+  }
+  LOG(FATAL) << "Cannot convert type " << t << " to C type";
+}
+
+void CodeGenCHost::VisitExpr_(const Broadcast* op, std::ostream& os) {   // NOLINT(*)
+  std::string v = PrintExpr(op->value);
+  os << "((";
+  PrintType(op->type, os);
+  os << ")(";
+  for (int i = 0; i < op->lanes; ++i) {
+    if (i != 0) os << ", ";
+    os << v;
+  }
+  os << "))";
+}
+
+void CodeGenCHost::PrintGetFuncFromBackend(std::string func_name, std::string packed_func_name) {
+  this->PrintIndent();
+  this->stream << "if (" << packed_func_name << " == NULL) {\n";
+  int packed_func_if_scope = this->BeginScope();
+  this->PrintIndent();
+  this->stream << "if (TVMBackendGetFuncFromEnv(" << module_name
+              << ", \"" << func_name << "\""
+              << ", &" << packed_func_name << ") != 0) {\n";
+  int get_func_env_scope = this->BeginScope();
+  this->PrintIndent();
+  this->stream << "return -1;\n";
+  this->EndScope(get_func_env_scope);
+  this->PrintIndent();
+  this->stream << "}\n";
+  this->EndScope(packed_func_if_scope);
+  this->PrintIndent();
+  this->stream << "}\n";
+}
+
+void CodeGenCHost::PrintFuncCall(std::string packed_func_name, int num_args) {
+  this->PrintIndent();
+  std::string ret_val = GetUniqueName("ret_val");
+  std::string ret_type_code = GetUniqueName("ret_type_code");
+  this->stream << "TVMValue " << ret_val << ";\n";
+  this->PrintIndent();
+  this->stream << "int " << ret_type_code << ";\n";
+  this->PrintIndent();
+  this->stream << "if (TVMFuncCall(" << packed_func_name << ", "
+               << "(TVMValue*) stack_value" << ", " << "(int*) stack_tcode" << ", "
+               << num_args << ", " << "&" << ret_val << ", " << "&"
+               << ret_type_code << ") != 0) {\n";
+  int func_call_scope = this->BeginScope();
+  this->PrintIndent();
+  this->stream << "return -1;\n";
+  this->EndScope(func_call_scope);
+  this->PrintIndent();
+  this->stream << "}\n";
+}
+
+void CodeGenCHost::VisitExpr_(const Call *op, std::ostream& os) { // NOLINT(*)
+  if (op->is_intrinsic(intrinsic::tvm_stack_alloca)) {
+    std::string stack_name = GetUniqueName("stack");
+    const std::string& type = op->args[0].as<StringImm>()->value;
+    const IntImm* num = op->args[1].as<IntImm>();
+    CHECK(num != nullptr);
+    static_assert(alignof(TVMValue) % alignof(TVMArray) == 0, "invariant");
+    size_t unit = sizeof(TVMValue);
+    size_t size = 0;
+    if (type == "shape") {
+      size = (num->value * sizeof(tvm_index_t) + unit - 1) / unit;
+    } else if (type == "arg_value") {
+      size = (num->value * sizeof(TVMValue) + unit - 1) / unit;
+    } else if (type == "arg_tcode") {
+      size = (num->value * sizeof(int) + unit - 1) / unit;
+    } else if (type == "array") {
+      size = (num->value * sizeof(TVMArray) + unit - 1) / unit;
+    } else {
+      LOG(FATAL) << "Unknown stack alloca type " << type;
+    }
+    this->PrintIndent();
+    this->stream << "TVMValue " << stack_name << "[" << size << "];\n";
+    os << stack_name;
+  } else if (op->is_intrinsic(intrinsic::tvm_call_packed_lowered)) {
+    const StringImm* s = op->args[0].as<StringImm>();
+    CHECK(s != nullptr) << "tvm_call_packed_lowered expects first argument as function name";
+    int64_t begin = op->args[3].as<IntImm>()->value;
+    int64_t end = op->args[4].as<IntImm>()->value;
+    int64_t num_args = end - begin;
+    CHECK_GE(num_args, 0);
+    std::string func_name = s->value;
+    std::string packed_func_name = GetUniqueName(func_name + "_packed");
+    decl_stream << "static void* " << packed_func_name << " = NULL;\n";
+    this->PrintGetFuncFromBackend(func_name, packed_func_name);
+    this->PrintFuncCall(packed_func_name, num_args);
+  } else if (op->is_intrinsic(intrinsic::tvm_throw_last_error)) {
+    this->PrintIndent();
+    this->stream << "return -1;\n";
+  } else {
+    CodeGenC::VisitExpr_(op, os);
+  }
+}
+
+void CodeGenCHost::VisitStmt_(const AssertStmt *op) { // NOLINT(*)
+  std::string cond = PrintExpr(op->condition);
+  PrintIndent();
+  stream << "if (!(" << cond << ")) {\n";
+  int assert_if_scope = this->BeginScope();
+  PrintIndent();
+  stream << "TVMAPISetLastError(\"" << op->message.as<StringImm>()->value << "\");\n";
+  PrintIndent();
+  stream << "return -1;\n";
+  this->EndScope(assert_if_scope);
+  PrintIndent();
+  stream << "}\n";
+  this->PrintStmt(op->body);
+}
+
+runtime::Module BuildCHost(Array<LoweredFunc> funcs) {
+  using tvm::runtime::Registry;
+  bool output_ssa = false;
+  CodeGenCHost cg;
+  cg.Init(output_ssa);
+  for (LoweredFunc f : funcs) {
+    cg.AddFunction(f);
+  }
+  std::string code = cg.Finish();
+  return CSourceModuleCreate(code, "c");
+}
+
+TVM_REGISTER_API("codegen.build_c")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    *rv = BuildCHost(args[0]);
+  });
+}  // namespace codegen
+}  // namespace tvm
diff --git a/src/codegen/codegen_c_host.h b/src/codegen/codegen_c_host.h
new file mode 100644
index 000000000000..eb47a7829e2c
--- /dev/null
+++ b/src/codegen/codegen_c_host.h
@@ -0,0 +1,40 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file codegen_c_host.h
+ * \brief Generate C host code.
+ */
+#ifndef TVM_CODEGEN_CODEGEN_C_HOST_H_
+#define TVM_CODEGEN_CODEGEN_C_HOST_H_
+
+#include <tvm/codegen.h>
+#include <tvm/packed_func_ext.h>
+#include <string>
+#include "codegen_c.h"
+
+namespace tvm {
+namespace codegen {
+
+class CodeGenCHost final : public CodeGenC {
+ public:
+  CodeGenCHost();
+  void Init(bool output_ssa);
+  void AddFunction(LoweredFunc f);
+  std::string Finish();
+
+  void PrintType(Type t, std::ostream& os) final; // NOLINT(*)
+
+  // overload visitor functions
+  void VisitExpr_(const Broadcast* op, std::ostream& os) final; // NOLINT(*)
+  void VisitExpr_(const Call *op, std::ostream& os) final; // NOLINT(*)
+  void VisitStmt_(const AssertStmt *op) final; // NOLINT(*)
+
+ private:
+  std::string module_name;
+  void PrintGetFuncFromBackend(std::string func_name, std::string packed_func_name);
+  void PrintFuncCall(std::string packed_func_name, int num_args);
+};
+
+}  // namespace codegen
+}  // namespace tvm
+
+#endif  // TVM_CODEGEN_CODEGEN_C_HOST_H_
diff --git a/src/codegen/codegen_cuda.cc b/src/codegen/codegen_cuda.cc
index 44c02830d0fc..0ab56a116eab 100644
--- a/src/codegen/codegen_cuda.cc
+++ b/src/codegen/codegen_cuda.cc
@@ -7,7 +7,7 @@
 #include <tvm/packed_func_ext.h>
 #include <vector>
 #include <string>
-#include "./codegen_cuda.h"
+#include "codegen_cuda.h"
 #include "../arithmetic/compute_expr.h"
 
 namespace tvm {
@@ -42,7 +42,7 @@ std::string CodeGenCUDA::Finish() {
 }
 
 void CodeGenCUDA::VisitStmt_(const ir::For* op) {
-  CHECK(is_zero(op->min));
+  CHECK(is_const_int(op->min, 0));
   if (op->for_type == ir::ForType::Unrolled) {
     PrintIndent();
     stream << "#pragma unroll\n";
@@ -77,6 +77,8 @@ void CodeGenCUDA::PrintType(Type t, std::ostream& os) {  // NOLINT(*)
     if (!fail && (lanes >= 2 && lanes <= 4)) {
       os << lanes; return;
     }
+  } else if (t == Bool()) {
+    os << "bool"; return;
   } else if (t.is_uint() || t.is_int()) {
     if (t.is_uint()) {
       if (t.lanes() != 1) {
@@ -271,6 +273,16 @@ void CodeGenCUDA::VisitExpr_(const Ramp* op, std::ostream& os) {
 }
 
 void CodeGenCUDA::VisitExpr_(const Broadcast* op, std::ostream& os) {   // NOLINT(*)
+  if (op->type.is_int() && op->type.bits() == 8 && op->lanes == 4) {
+    // make_int8x4
+    const int64_t *p = as_const_int(op->value);
+    CHECK(p);
+    int64_t v = *p & 0xFF;
+    v = (v << 24) | (v << 16) | (v << 8) | v;
+    os << "(int)" << v;
+    return;
+  }
+
   std::string v = PrintExpr(op->value);
   os << "make_";
   PrintType(op->type, os);
diff --git a/src/codegen/codegen_cuda.h b/src/codegen/codegen_cuda.h
index f5d9861ec6b2..cef2c77f9901 100644
--- a/src/codegen/codegen_cuda.h
+++ b/src/codegen/codegen_cuda.h
@@ -9,7 +9,7 @@
 #include <tvm/codegen.h>
 #include <tvm/packed_func_ext.h>
 #include <string>
-#include "./codegen_c.h"
+#include "codegen_c.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/codegen_metal.cc b/src/codegen/codegen_metal.cc
index 37121ccb755c..031313190370 100644
--- a/src/codegen/codegen_metal.cc
+++ b/src/codegen/codegen_metal.cc
@@ -5,8 +5,9 @@
 #include <tvm/packed_func_ext.h>
 #include <vector>
 #include <string>
-#include "./codegen_metal.h"
-#include "./build_common.h"
+#include <algorithm>
+#include "codegen_metal.h"
+#include "build_common.h"
 #include "../runtime/metal/metal_module.h"
 #include "../runtime/thread_storage_scope.h"
 
@@ -140,6 +141,9 @@ void CodeGenMetal::PrintType(Type t, std::ostream& os) {  // NOLINT(*)
         << "do not yet support vector types";
     os << "void*"; return;
   }
+  if (t == Bool()) {
+    os << "bool"; return;
+  }
   bool fail = false;
   if (t.is_float()) {
     switch (t.bits()) {
diff --git a/src/codegen/codegen_metal.h b/src/codegen/codegen_metal.h
index 6f8bef64bbcf..9779fb800ff9 100644
--- a/src/codegen/codegen_metal.h
+++ b/src/codegen/codegen_metal.h
@@ -9,7 +9,7 @@
 #include <tvm/codegen.h>
 #include <tvm/packed_func_ext.h>
 #include <string>
-#include "./codegen_c.h"
+#include "codegen_c.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/codegen_opencl.cc b/src/codegen/codegen_opencl.cc
index 2d5026e827e2..a0b3c2000a80 100644
--- a/src/codegen/codegen_opencl.cc
+++ b/src/codegen/codegen_opencl.cc
@@ -5,8 +5,8 @@
 #include <tvm/packed_func_ext.h>
 #include <vector>
 #include <string>
-#include "./codegen_opencl.h"
-#include "./build_common.h"
+#include "codegen_opencl.h"
+#include "build_common.h"
 #include "../runtime/thread_storage_scope.h"
 #include "../runtime/opencl/opencl_module.h"
 
@@ -80,6 +80,9 @@ void CodeGenOpenCL::PrintType(Type t, std::ostream& os) {  // NOLINT(*)
         << "do not yet support vector types";
     os << "void*"; return;
   }
+  if (t == Bool()) {
+    os << "bool"; return;
+  }
   bool fail = false;
   if (t.is_float()) {
     switch (t.bits()) {
diff --git a/src/codegen/codegen_opencl.h b/src/codegen/codegen_opencl.h
index 424bfa5ae2b3..90569d176a0b 100644
--- a/src/codegen/codegen_opencl.h
+++ b/src/codegen/codegen_opencl.h
@@ -9,7 +9,7 @@
 #include <tvm/codegen.h>
 #include <tvm/packed_func_ext.h>
 #include <string>
-#include "./codegen_c.h"
+#include "codegen_c.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/codegen_opengl.cc b/src/codegen/codegen_opengl.cc
index 5e750a39e598..7fd85d35409d 100644
--- a/src/codegen/codegen_opengl.cc
+++ b/src/codegen/codegen_opengl.cc
@@ -8,8 +8,8 @@
 #include <tvm/packed_func_ext.h>
 #include <vector>
 #include <string>
-#include "./codegen_opengl.h"
-#include "./build_common.h"
+#include "codegen_opengl.h"
+#include "build_common.h"
 #include "../runtime/thread_storage_scope.h"
 
 namespace tvm {
diff --git a/src/codegen/codegen_opengl.h b/src/codegen/codegen_opengl.h
index 3cae1e323ec4..aa1552dfcff7 100644
--- a/src/codegen/codegen_opengl.h
+++ b/src/codegen/codegen_opengl.h
@@ -9,7 +9,7 @@
 #include <tvm/codegen.h>
 #include <tvm/packed_func_ext.h>
 #include <string>
-#include "./codegen_c.h"
+#include "codegen_c.h"
 #include "../runtime/opengl/opengl_module.h"
 
 namespace tvm {
diff --git a/src/codegen/codegen_source_base.cc b/src/codegen/codegen_source_base.cc
index cf3a6ec5ab04..39a573ceec68 100644
--- a/src/codegen/codegen_source_base.cc
+++ b/src/codegen/codegen_source_base.cc
@@ -2,7 +2,7 @@
  *  Copyright (c) 2017 by Contributors
  * \file codegen_source_base.cc
  */
-#include "./codegen_source_base.h"
+#include "codegen_source_base.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/codegen_source_base.h b/src/codegen/codegen_source_base.h
index 89c5bbc05ce4..3fc46c35c7f7 100644
--- a/src/codegen/codegen_source_base.h
+++ b/src/codegen/codegen_source_base.h
@@ -23,6 +23,7 @@ namespace codegen {
  */
 class CodeGenSourceBase {
  public:
+  virtual ~CodeGenSourceBase() = default;
   /*!
    * \brief Register constant value appeared in expresion tree
    *  This avoid generated a ssa id for each appearance of the value
@@ -111,6 +112,13 @@ class CodeGenSourceBase {
  */
 runtime::Module SourceModuleCreate(std::string code, std::string fmt);
 
+/*!
+ * \brief Create a C source module for viewing and compiling GCC code.
+ * \param code The code to be viewed.
+ * \param fmt The code. format.
+ */
+runtime::Module CSourceModuleCreate(std::string code, std::string fmt);
+
 /*!
  * \brief Create a source module for viewing and limited saving for device.
  * \param data The code data to be viewed.
diff --git a/src/codegen/codegen_vhls.cc b/src/codegen/codegen_vhls.cc
index b9f9f7505978..5776b895b4b3 100644
--- a/src/codegen/codegen_vhls.cc
+++ b/src/codegen/codegen_vhls.cc
@@ -5,8 +5,8 @@
 #include <tvm/build_module.h>
 #include <vector>
 #include <string>
-#include "./codegen_vhls.h"
-#include "./build_common.h"
+#include "codegen_vhls.h"
+#include "build_common.h"
 #include "../runtime/opencl/sdaccel/sdaccel_module.h"
 
 namespace tvm {
diff --git a/src/codegen/codegen_vhls.h b/src/codegen/codegen_vhls.h
index bcb7d6f49d8c..c0faefc75837 100644
--- a/src/codegen/codegen_vhls.h
+++ b/src/codegen/codegen_vhls.h
@@ -9,7 +9,7 @@
 #include <tvm/codegen.h>
 #include <tvm/packed_func_ext.h>
 #include <string>
-#include "./codegen_c.h"
+#include "codegen_c.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/intrin_rule.cc b/src/codegen/intrin_rule.cc
index 5f15a879c2ed..f326fceb6ee8 100644
--- a/src/codegen/intrin_rule.cc
+++ b/src/codegen/intrin_rule.cc
@@ -3,7 +3,7 @@
  * \file intrin_rule_default.cc
  * \brief Default intrinsic rules.
  */
-#include "./intrin_rule.h"
+#include "intrin_rule.h"
 
 namespace tvm {
 namespace codegen {
@@ -24,6 +24,16 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.sqrt")
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.pow")
 .set_body(DispatchExtern<FloatSuffix>);
 
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.sigmoid")
+.set_body([](const TVMArgs& args, TVMRetValue* rv){
+    Expr e = args[0];
+    const Call* call = e.as<Call>();
+    CHECK(call != nullptr);
+
+    auto one = make_const(call->args[0].type(), 1);
+    *rv = one / (one + exp(-call->args[0]));
+  });
+
 }  // namespace intrin
 }  // namespace codegen
 }  // namespace tvm
diff --git a/src/codegen/intrin_rule_aocl.cc b/src/codegen/intrin_rule_aocl.cc
new file mode 100644
index 000000000000..fc5dbe741d63
--- /dev/null
+++ b/src/codegen/intrin_rule_aocl.cc
@@ -0,0 +1,82 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file intrin_rule_aocl.cc
+ * \brief AOCL intrinsic rules.
+ */
+#include "intrin_rule.h"
+
+namespace tvm {
+namespace codegen {
+namespace intrin {
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.floor")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.ceil")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.trunc")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.fabs")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.round")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.exp")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.log")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.tanh")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.sqrt")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.pow")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.popcount")
+.set_body(DispatchExtern<Direct>);
+
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.floor")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.ceil")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.trunc")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.fabs")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.round")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.exp")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.log")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.tanh")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.sqrt")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.pow")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.popcount")
+.set_body(DispatchExtern<Direct>);
+
+
+}  // namespace intrin
+}  // namespace codegen
+}  // namespace tvm
diff --git a/src/codegen/intrin_rule_cuda.cc b/src/codegen/intrin_rule_cuda.cc
index 43461a15932d..a6867c7f201c 100644
--- a/src/codegen/intrin_rule_cuda.cc
+++ b/src/codegen/intrin_rule_cuda.cc
@@ -3,7 +3,7 @@
  * \file intrin_rule_cuda.cc
  * \brief CUDA intrinsic rules.
  */
-#include "./intrin_rule.h"
+#include "intrin_rule.h"
 
 namespace tvm {
 namespace codegen {
@@ -91,6 +91,8 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.popcount")
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.tvm_warp_shuffle")
 .set_body(DispatchExtern<CUDAShuffle>);
 
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.fmod")
+.set_body(DispatchExtern<CUDAMath>);
 
 }  // namespace intrin
 }  // namespace codegen
diff --git a/src/codegen/intrin_rule_metal.cc b/src/codegen/intrin_rule_metal.cc
index 3c210919132e..2e65d5537dd2 100644
--- a/src/codegen/intrin_rule_metal.cc
+++ b/src/codegen/intrin_rule_metal.cc
@@ -3,7 +3,7 @@
  * \file intrin_rule_metal.cc
  * \brief Metal intrinsic rules.
  */
-#include "./intrin_rule.h"
+#include "intrin_rule.h"
 
 namespace tvm {
 namespace codegen {
@@ -42,6 +42,9 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.metal.pow")
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.metal.popcount")
 .set_body(DispatchExtern<Direct>);
 
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.metal.fmod")
+.set_body(DispatchExtern<Direct>);
+
 }  // namespace intrin
 }  // namespace codegen
 }  // namespace tvm
diff --git a/src/codegen/intrin_rule_opencl.cc b/src/codegen/intrin_rule_opencl.cc
index d91deaeda5fe..e4cf11bf6e64 100644
--- a/src/codegen/intrin_rule_opencl.cc
+++ b/src/codegen/intrin_rule_opencl.cc
@@ -3,7 +3,7 @@
  * \file intrin_rule_opencl.cc
  * \brief OpenCL intrinsic rules.
  */
-#include "./intrin_rule.h"
+#include "intrin_rule.h"
 
 namespace tvm {
 namespace codegen {
@@ -42,6 +42,9 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.opencl.pow")
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.opencl.popcount")
 .set_body(DispatchExtern<Direct>);
 
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.opencl.fmod")
+.set_body(DispatchExtern<Direct>);
+
 // There is no warp shuffle instruction in standard OpenCL
 // When shuffle is used, we assume it is intel's shuffle extension
 struct IntelShuffle {
diff --git a/src/codegen/intrin_rule_opengl.cc b/src/codegen/intrin_rule_opengl.cc
index e9728a25b40c..c9aa21c1a883 100644
--- a/src/codegen/intrin_rule_opengl.cc
+++ b/src/codegen/intrin_rule_opengl.cc
@@ -3,7 +3,7 @@
  * \file intrin_rule_opencl.cc
  * \brief OpenCL intrinsic rules.
  */
-#include "./intrin_rule.h"
+#include "intrin_rule.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/intrin_rule_vhls.cc b/src/codegen/intrin_rule_vhls.cc
index b360142cd985..996c45707364 100644
--- a/src/codegen/intrin_rule_vhls.cc
+++ b/src/codegen/intrin_rule_vhls.cc
@@ -3,7 +3,7 @@
  * \file intrin_rule_vhls.cc
  * \brief VHLS intrinsic rules.
  */
-#include "./intrin_rule.h"
+#include "intrin_rule.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc
index 9d1decb43227..d1a0716bc1d9 100644
--- a/src/codegen/llvm/codegen_amdgpu.cc
+++ b/src/codegen/llvm/codegen_amdgpu.cc
@@ -8,7 +8,7 @@
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/registry.h>
-#include "./codegen_llvm.h"
+#include "codegen_llvm.h"
 #include "../build_common.h"
 #include "../codegen_source_base.h"
 #include "../../pass/ir_util.h"
@@ -47,8 +47,10 @@ class CodeGenAMDGPU : public CodeGenLLVM {
       if (info.scope.rank == runtime::StorageRank::kLocal) {
         // const int local_address_space = 5;
         // TODO(tqchen): for higher version of LLVM, local address space can be set.
-        llvm::AllocaInst* alloca = builder_->CreateAlloca(
-            LLVMType(op->type), ConstInt32(constant_size));
+        llvm::AllocaInst* alloca = WithFunctionEntry([&]() {
+            return builder_->CreateAlloca(
+                LLVMType(op->type), ConstInt32(constant_size));
+          });
         if (alloca->getAlignment() < static_cast<uint32_t>(info.alignment)) {
           alloca->setAlignment(info.alignment);
         }
@@ -160,10 +162,10 @@ runtime::Module BuildAMDGPU(Array<LoweredFunc> funcs, std::string target) {
   config << "-mtriple=amdgcn-amd-amdhsa-hcc -mcpu=gfx"
          << DetectROCMComputeVersion(target)
          << target.substr(4, target.length() - 4);
-  llvm::TargetMachine* tm = GetLLVMTargetMachine(config.str());
+  std::unique_ptr<llvm::TargetMachine> tm = GetLLVMTargetMachine(config.str());
   std::unique_ptr<CodeGenAMDGPU> cg(new CodeGenAMDGPU());
   std::unique_ptr<llvm::LLVMContext> ctx(new llvm::LLVMContext());
-  cg->Init(funcs[0]->name, tm, ctx.get(), false, false);
+  cg->Init(funcs[0]->name, tm.get(), ctx.get(), false, false);
   for (LoweredFunc f :  funcs) {
     cg->AddFunction(f);
   }
diff --git a/src/codegen/llvm/codegen_arm.cc b/src/codegen/llvm/codegen_arm.cc
index 18a0eb54e182..9f19fa1f47f0 100644
--- a/src/codegen/llvm/codegen_arm.cc
+++ b/src/codegen/llvm/codegen_arm.cc
@@ -4,7 +4,7 @@
  * \brief ARM specific code generator
  */
 #ifdef TVM_LLVM_VERSION
-#include "./codegen_cpu.h"
+#include "codegen_cpu.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/llvm/codegen_cpu.cc b/src/codegen/llvm/codegen_cpu.cc
index a8a2127febde..4e005346624b 100644
--- a/src/codegen/llvm/codegen_cpu.cc
+++ b/src/codegen/llvm/codegen_cpu.cc
@@ -6,7 +6,7 @@
 
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/ir_pass.h>
-#include "./codegen_cpu.h"
+#include "codegen_cpu.h"
 #include "../../pass/ir_util.h"
 
 namespace tvm {
@@ -503,7 +503,9 @@ llvm::Value* CodeGenCPU::GetPackedFuncHandle(const std::string& fname) {
       handle_not_null, end_block, init_block, md_very_likely_branch_);
   // Initialize the handle if needed.
   builder_->SetInsertPoint(init_block);
-  llvm::Value* out = builder_->CreateAlloca(t_tvm_func_handle_);
+  llvm::Value* out = WithFunctionEntry([&]() {
+      return builder_->CreateAlloca(t_tvm_func_handle_);
+    });
   llvm::LoadInst* ctx = builder_->CreateAlignedLoad(
       gv_mod_ctx_, gv_mod_ctx_->getAlignment());
   ctx->setMetadata(
@@ -513,6 +515,8 @@ llvm::Value* CodeGenCPU::GetPackedFuncHandle(const std::string& fname) {
       RuntimeTVMGetFuncFromEnv(), {ctx, GetConstString(fname), out});
   init_block = CheckCallSuccess(retcode);
   llvm::Value* loaded_handle = builder_->CreateAlignedLoad(out, align);
+  // Store the handle
+  builder_->CreateStore(loaded_handle, hptr);
   builder_->CreateBr(end_block);
   // end block
   builder_->SetInsertPoint(end_block);
@@ -637,19 +641,23 @@ llvm::Value* CodeGenCPU::CreateIntrinsic(const Call* op) {
   } else if (op->is_intrinsic(intrinsic::tvm_stack_alloca)) {
     CHECK_EQ(op->args.size(), 2U);
     const std::string& type = op->args[0].as<StringImm>()->value;
-    llvm::Value* num = MakeValue(op->args[1]);
-    if (type == "shape") {
-      return builder_->CreateAlloca(t_tvm_shape_index_, num);
-    } else if (type == "arg_value") {
-      return builder_->CreateAlloca(t_tvm_value_, num);
-    } else if (type == "arg_tcode") {
-      return builder_->CreateAlloca(t_int_, num);
-    } else if (type == "array") {
-      return builder_->CreateAlloca(t_tvm_array_, num);
-    } else {
-      LOG(FATAL) << "Unknown stack alloca type " << type;
-      return nullptr;
-    }
+    return WithFunctionEntry([&]() -> llvm::AllocaInst* {
+        const int64_t* pval = as_const_int(op->args[1]);
+        CHECK(pval) << "require stack alloca to contain constant value";
+        llvm::Value* num = ConstInt32(pval[0]);
+        if (type == "shape") {
+          return builder_->CreateAlloca(t_tvm_shape_index_, num);
+        } else if (type == "arg_value") {
+          return builder_->CreateAlloca(t_tvm_value_, num);
+        } else if (type == "arg_tcode") {
+          return builder_->CreateAlloca(t_int_, num);
+        } else if (type == "array") {
+          return builder_->CreateAlloca(t_tvm_array_, num);
+        } else {
+          LOG(FATAL) << "Unknown stack alloca type " << type;
+          return nullptr;
+        }
+      });
   } else {
     return CodeGenLLVM::CreateIntrinsic(op);
   }
diff --git a/src/codegen/llvm/codegen_cpu.h b/src/codegen/llvm/codegen_cpu.h
index 5027dab911bd..b7a95a835d89 100644
--- a/src/codegen/llvm/codegen_cpu.h
+++ b/src/codegen/llvm/codegen_cpu.h
@@ -9,7 +9,7 @@
 #include <utility>
 #include <vector>
 #include <string>
-#include "./codegen_llvm.h"
+#include "codegen_llvm.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc
index ae576c981395..215a6c9c5b1b 100644
--- a/src/codegen/llvm/codegen_llvm.cc
+++ b/src/codegen/llvm/codegen_llvm.cc
@@ -7,8 +7,8 @@
 
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/c_runtime_api.h>
-#include "./codegen_llvm.h"
-#include "./codegen_cpu.h"
+#include "codegen_llvm.h"
+#include "codegen_cpu.h"
 #include "../codegen_common.h"
 #include "../../pass/ir_util.h"
 #include "../../arithmetic/compute_expr.h"
@@ -788,11 +788,7 @@ DEFINE_CODEGEN_CMP_OP(GE);
 llvm::Value* CodeGenLLVM::VisitExpr_(const Div* op) {
   llvm::Value* a = MakeValue(op->a);
   llvm::Value* b = MakeValue(op->b);
-  int shift;
-  if ((op->type.is_int() || op->type.is_uint()) &&
-      is_const_power_of_two_integer(op->b, &shift)) {
-    return builder_->CreateAShr(a, shift);
-  } else if (op->type.is_int()) {
+  if (op->type.is_int()) {
     return builder_->CreateSDiv(a, b);
   } else if (op->type.is_uint()) {
     return builder_->CreateUDiv(a, b);
@@ -1049,8 +1045,10 @@ void CodeGenLLVM::VisitStmt_(const Allocate* op) {
     if (info.alignment > 16) {
       info.alignment = 16;
     }
-    llvm::AllocaInst* alloca = builder_->CreateAlloca(
-        LLVMType(op->type), ConstInt32(constant_size));
+    llvm::AllocaInst* alloca = WithFunctionEntry([&]() {
+        return builder_->CreateAlloca(
+            LLVMType(op->type), ConstInt32(constant_size));
+      });
     if (alloca->getAlignment() < static_cast<uint32_t>(info.alignment)) {
       alloca->setAlignment(info.alignment);
     }
diff --git a/src/codegen/llvm/codegen_llvm.h b/src/codegen/llvm/codegen_llvm.h
index 4e61247f4acf..080306310370 100644
--- a/src/codegen/llvm/codegen_llvm.h
+++ b/src/codegen/llvm/codegen_llvm.h
@@ -15,7 +15,7 @@
 #include <utility>
 #include <vector>
 #include <string>
-#include "./llvm_common.h"
+#include "llvm_common.h"
 #include "../../runtime/thread_storage_scope.h"
 
 namespace tvm {
@@ -132,6 +132,26 @@ class CodeGenLLVM :
     /*! \brief The alignment of allocation */
     int alignment{0};
   };
+  /*!
+   * \brief Execute falloca at the beginning of the
+   *  currrent function and obtain its return value.
+   *
+   *  This is a helper function to make sure that
+   *  alloca always happen in the beginning of the function.
+   *
+   * \param falloca The allocation function to be executed.
+   * \tparam F The function to be executed.
+   * \return The result.
+   */
+  template<typename F>
+  inline llvm::AllocaInst* WithFunctionEntry(F falloca) {
+    llvm::BasicBlock* current = builder_->GetInsertBlock();
+    llvm::BasicBlock* entry = &(function_->getEntryBlock());
+    builder_->SetInsertPoint(entry, entry->begin());
+    llvm::AllocaInst* res = falloca();
+    builder_->SetInsertPoint(current);
+    return res;
+  }
   // create intrinstic given call
   virtual llvm::Value* CreateIntrinsic(const Call* op);
   // create extern function call
diff --git a/src/codegen/llvm/codegen_nvptx.cc b/src/codegen/llvm/codegen_nvptx.cc
index 1cca1eacfe85..2d416d34ea0c 100644
--- a/src/codegen/llvm/codegen_nvptx.cc
+++ b/src/codegen/llvm/codegen_nvptx.cc
@@ -6,7 +6,7 @@
 #ifdef TVM_LLVM_VERSION
 
 #include <tvm/runtime/device_api.h>
-#include "./codegen_llvm.h"
+#include "codegen_llvm.h"
 #include "../build_common.h"
 #include "../../pass/ir_util.h"
 #include "../../runtime/cuda/cuda_module.h"
@@ -49,8 +49,10 @@ class CodeGenNVPTX : public CodeGenLLVM {
       if (info.scope.rank == runtime::StorageRank::kLocal) {
         // const int local_address_space = 5;
         // TODO(tqchen): for higher version of LLVM, local address space can be set.
-        llvm::AllocaInst* alloca = builder_->CreateAlloca(
-            LLVMType(op->type), ConstInt32(constant_size));
+        llvm::AllocaInst* alloca = WithFunctionEntry([&]() {
+            return builder_->CreateAlloca(
+                LLVMType(op->type), ConstInt32(constant_size));
+          });
         if (alloca->getAlignment() < static_cast<uint32_t>(info.alignment)) {
           alloca->setAlignment(info.alignment);
         }
@@ -171,10 +173,10 @@ runtime::Module BuildNVPTX(Array<LoweredFunc> funcs, std::string target) {
   config << "-mtriple=nvptx64-nvidia-cuda -mcpu=sm_"
          << compute_ver
          << target.substr(5, target.length() - 5);
-  llvm::TargetMachine* tm = GetLLVMTargetMachine(config.str());
+  std::unique_ptr<llvm::TargetMachine> tm = GetLLVMTargetMachine(config.str());
   std::unique_ptr<CodeGenNVPTX> cg(new CodeGenNVPTX());
   std::unique_ptr<llvm::LLVMContext> ctx(new llvm::LLVMContext());
-  cg->Init(funcs[0]->name, tm, ctx.get(), false, false);
+  cg->Init(funcs[0]->name, tm.get(), ctx.get(), false, false);
   for (LoweredFunc f :  funcs) {
     cg->AddFunction(f);
   }
diff --git a/src/codegen/llvm/intrin_rule_llvm.cc b/src/codegen/llvm/intrin_rule_llvm.cc
index 4b2a3ca5bd02..307f0a3bc412 100644
--- a/src/codegen/llvm/intrin_rule_llvm.cc
+++ b/src/codegen/llvm/intrin_rule_llvm.cc
@@ -4,7 +4,7 @@
  */
 #ifdef TVM_LLVM_VERSION
 
-#include "./intrin_rule_llvm.h"
+#include "intrin_rule_llvm.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/llvm/intrin_rule_llvm.h b/src/codegen/llvm/intrin_rule_llvm.h
index 85641cb178e7..30e7674c3297 100644
--- a/src/codegen/llvm/intrin_rule_llvm.h
+++ b/src/codegen/llvm/intrin_rule_llvm.h
@@ -11,7 +11,7 @@
 #include <tvm/api_registry.h>
 #include <tvm/codegen.h>
 #include <string>
-#include "./llvm_common.h"
+#include "llvm_common.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/llvm/intrin_rule_rocm.cc b/src/codegen/llvm/intrin_rule_rocm.cc
index b9bee94e9c24..092eb77f8f59 100644
--- a/src/codegen/llvm/intrin_rule_rocm.cc
+++ b/src/codegen/llvm/intrin_rule_rocm.cc
@@ -4,7 +4,7 @@
  */
 #ifdef TVM_LLVM_VERSION
 
-#include "./intrin_rule_llvm.h"
+#include "intrin_rule_llvm.h"
 #include <tvm/ir.h>
 #include <tvm/expr.h>
 #include <tvm/api_registry.h>
diff --git a/src/codegen/llvm/llvm_common.cc b/src/codegen/llvm/llvm_common.cc
index 01f2c8869dc1..48c3e788a7f2 100644
--- a/src/codegen/llvm/llvm_common.cc
+++ b/src/codegen/llvm/llvm_common.cc
@@ -6,7 +6,7 @@
 
 #include <tvm/base.h>
 #include <mutex>
-#include "./llvm_common.h"
+#include "llvm_common.h"
 
 namespace tvm {
 namespace codegen {
@@ -114,7 +114,7 @@ void ParseLLVMTargetOptions(const std::string& target_str,
 }
 
 
-llvm::TargetMachine*
+std::unique_ptr<llvm::TargetMachine>
 GetLLVMTargetMachine(const std::string& target_str,
                      bool allow_null) {
   std::string target_triple, mcpu, mattr;
@@ -143,7 +143,7 @@ GetLLVMTargetMachine(const std::string& target_str,
   }
   llvm::TargetMachine* tm = target->createTargetMachine(
       target_triple, mcpu, mattr, opt, llvm::Reloc::PIC_);
-  return tm;
+  return std::unique_ptr<llvm::TargetMachine>(tm);
 }
 
 }  // namespace codegen
diff --git a/src/codegen/llvm/llvm_common.h b/src/codegen/llvm/llvm_common.h
index d5d27bf83d71..89008bb480d2 100644
--- a/src/codegen/llvm/llvm_common.h
+++ b/src/codegen/llvm/llvm_common.h
@@ -26,6 +26,7 @@
 #include <llvm/IR/Type.h>
 #include <llvm/IR/Intrinsics.h>
 #include <llvm/IR/MDBuilder.h>
+#include <llvm/IR/Verifier.h>
 
 #include <llvm/IR/LegacyPassManager.h>
 #include <llvm/Transforms/Utils/Cloning.h>
@@ -78,7 +79,7 @@ void ParseLLVMTargetOptions(const std::string& target_str,
  * \param allow_null Whether allow null to be returned.
  * \return target machine
  */
-llvm::TargetMachine*
+std::unique_ptr<llvm::TargetMachine>
 GetLLVMTargetMachine(const std::string& target_str, bool allow_null = false);
 
 }  // namespace codegen
diff --git a/src/codegen/llvm/llvm_module.cc b/src/codegen/llvm/llvm_module.cc
index 1b0e43f9c23a..25de224e44cd 100644
--- a/src/codegen/llvm/llvm_module.cc
+++ b/src/codegen/llvm/llvm_module.cc
@@ -7,8 +7,8 @@
 #include <tvm/runtime/packed_func.h>
 #include <tvm/codegen.h>
 #include <mutex>
-#include "./llvm_common.h"
-#include "./codegen_llvm.h"
+#include "llvm_common.h"
+#include "codegen_llvm.h"
 #include "../../runtime/file_util.h"
 #include "../../runtime/module_util.h"
 
@@ -160,14 +160,19 @@ class LLVMModuleNode final : public runtime::ModuleNode {
     bool system_lib = (target.find("-system-lib") != std::string::npos);
     CHECK_NE(funcs.size(), 0U);
     ctx_ = std::make_shared<llvm::LLVMContext>();
-    std::unique_ptr<CodeGenLLVM> cg = CodeGenLLVM::Create(tm_);
+    std::unique_ptr<CodeGenLLVM> cg = CodeGenLLVM::Create(tm_.get());
     entry_func_ = funcs[0]->name;
-    cg->Init(funcs[0]->name, tm_, ctx_.get(), system_lib, system_lib);
+    cg->Init(funcs[0]->name, tm_.get(), ctx_.get(), system_lib, system_lib);
     for (LoweredFunc f :  funcs) {
       cg->AddFunction(f);
     }
     cg->AddMainFunction(funcs[0]->name);
     module_ = cg->Finish();
+    std::string verify_errors_storage;
+    llvm::raw_string_ostream verify_errors(verify_errors_storage);
+    LOG_IF(FATAL, llvm::verifyModule(*module_, &verify_errors))
+        << "LLVM module verification failed with the following errors: \n"
+        << verify_errors.str();
     module_->addModuleFlag(
         llvm::Module::Warning, "tvm_target",
         llvm::MDString::get(*ctx_, target));
@@ -218,8 +223,8 @@ class LLVMModuleNode final : public runtime::ModuleNode {
       builder.setMAttrs(mattrs);
     }
     builder.setTargetOptions(opt);
-    llvm::TargetMachine *tm = builder.selectTarget();
-    llvm::TargetMachine *tm_sys = GetLLVMTargetMachine("llvm");
+    auto tm = std::unique_ptr<llvm::TargetMachine>(builder.selectTarget());
+    std::unique_ptr<llvm::TargetMachine> tm_sys = GetLLVMTargetMachine("llvm");
     if (tm_sys->getTargetTriple().getArch() != tm->getTargetTriple().getArch()) {
       LOG(FATAL) << "Cannot run module, architecture mismatch "
                  << " module=" << tm->getTargetTriple().str()
@@ -231,7 +236,7 @@ class LLVMModuleNode final : public runtime::ModuleNode {
         << mptr_->getDataLayout().getStringRepresentation() << ")"
         << " and ExecutionEngine ("
         << layout.getStringRepresentation() << ")";
-    ee_ = builder.create(tm);
+    ee_ = builder.create(tm.release());
     CHECK(ee_ != nullptr)
         << "Failed to initialize git engine for " << mptr_->getTargetTriple();
     ee_->runStaticConstructorsDestructors(false);
@@ -275,7 +280,7 @@ class LLVMModuleNode final : public runtime::ModuleNode {
   // The raw pointer to the module.
   llvm::Module* mptr_{nullptr};
   // The target machine
-  llvm::TargetMachine* tm_{nullptr};
+  std::unique_ptr<llvm::TargetMachine> tm_{nullptr};
   // The module, can be moved to ee if JIT is enabled.
   std::unique_ptr<llvm::Module> module_;
   // the context.
diff --git a/src/codegen/source_module.cc b/src/codegen/source_module.cc
index 69dbda49976b..56facea1567f 100644
--- a/src/codegen/source_module.cc
+++ b/src/codegen/source_module.cc
@@ -4,7 +4,7 @@
  * \brief Source code module, only for viewing
  */
 #include <tvm/runtime/packed_func.h>
-#include "./codegen_source_base.h"
+#include "codegen_source_base.h"
 #include "../runtime/file_util.h"
 #include "../runtime/meta_data.h"
 
@@ -53,6 +53,52 @@ runtime::Module SourceModuleCreate(std::string code, std::string fmt) {
   return runtime::Module(n);
 }
 
+// Simulator function
+class CSourceModuleNode : public runtime::ModuleNode {
+ public:
+  CSourceModuleNode(std::string code,
+                   std::string fmt)
+      : code_(code), fmt_(fmt) {}
+  const char* type_key() const {
+    return "c";
+  }
+
+  PackedFunc GetFunction(
+      const std::string& name,
+      const std::shared_ptr<ModuleNode>& sptr_to_self) final {
+    LOG(FATAL) << "C Source module cannot execute, to get executable module"
+               << " build TVM with \'" << fmt_ << "\' runtime support";
+    return PackedFunc();
+  }
+
+  std::string GetSource(const std::string& format) final {
+    return code_;
+  }
+
+  void SaveToFile(const std::string& file_name,
+                  const std::string& format) final {
+    std::string fmt = GetFileFormat(file_name, format);
+    std::string meta_file = GetMetaFilePath(file_name);
+    if (fmt == "cc") {
+      CHECK_NE(code_.length(), 0);
+      SaveBinaryToFile(file_name, code_);
+    } else {
+      CHECK_EQ(fmt, fmt_)
+          << "Can only save to format=" << fmt_;
+    }
+  }
+
+ protected:
+  std::string code_;
+  std::string fmt_;
+};
+
+runtime::Module CSourceModuleCreate(std::string code, std::string fmt) {
+  std::shared_ptr<CSourceModuleNode> n =
+      std::make_shared<CSourceModuleNode>(code, fmt);
+  return runtime::Module(n);
+}
+
 // supports limited save without cross compile
 class DeviceSourceModuleNode final : public runtime::ModuleNode {
  public:
diff --git a/src/codegen/spirv/build_vulkan.cc b/src/codegen/spirv/build_vulkan.cc
index 3cd1b56cda43..f5ec5628545a 100644
--- a/src/codegen/spirv/build_vulkan.cc
+++ b/src/codegen/spirv/build_vulkan.cc
@@ -8,7 +8,7 @@
 #include <dmlc/memory_io.h>
 #include <tvm/ir_pass.h>
 
-#include "./codegen_spirv.h"
+#include "codegen_spirv.h"
 #include "../build_common.h"
 #include "../../runtime/vulkan/vulkan_module.h"
 
diff --git a/src/codegen/spirv/codegen_spirv.cc b/src/codegen/spirv/codegen_spirv.cc
index 395bdff1477d..812fee4a114e 100644
--- a/src/codegen/spirv/codegen_spirv.cc
+++ b/src/codegen/spirv/codegen_spirv.cc
@@ -5,8 +5,9 @@
  */
 #include <tvm/ir.h>
 #include <tvm/ir_pass.h>
+#include <string>
 #include "../codegen_common.h"
-#include "./codegen_spirv.h"
+#include "codegen_spirv.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/spirv/codegen_spirv.h b/src/codegen/spirv/codegen_spirv.h
index a6c09362ddf7..6a43182f7f2e 100644
--- a/src/codegen/spirv/codegen_spirv.h
+++ b/src/codegen/spirv/codegen_spirv.h
@@ -12,7 +12,7 @@
 
 #include <vector>
 
-#include "./ir_builder.h"
+#include "ir_builder.h"
 #include "../../runtime/thread_storage_scope.h"
 
 namespace tvm {
diff --git a/src/codegen/spirv/ir_builder.cc b/src/codegen/spirv/ir_builder.cc
index eb7a67228e60..fdf4b9852430 100644
--- a/src/codegen/spirv/ir_builder.cc
+++ b/src/codegen/spirv/ir_builder.cc
@@ -3,7 +3,7 @@
  * \file ir_builder.cc
  * \brief IRBuilder for SPIRV block
  */
-#include "./ir_builder.h"
+#include "ir_builder.h"
 
 namespace tvm {
 namespace codegen {
@@ -438,8 +438,25 @@ Value IRBuilder::Cast(const SType& dst_type, spirv::Value value) {
   const tvm::Type& from = value.stype.type;
   const tvm::Type& to = dst_type.type;
   CHECK_EQ(from.lanes(), to.lanes());
-
-  if (from.is_int() && to.is_int()) {
+  if (from == Bool()) {
+    if (to.is_int()) {
+      return Select(value, IntImm(dst_type, 1), IntImm(dst_type, 0));
+    } else if (to.is_uint()) {
+      return Select(value, UIntImm(dst_type, 1), UIntImm(dst_type, 0));
+    } else {
+      LOG(FATAL) << "cannot cast from " << from << " to " << to;
+      return Value();
+    }
+  } else if (to == Bool()) {
+    if (from.is_int()) {
+      return NE(value, IntImm(value.stype, 0));
+    } else if (to.is_uint()) {
+      return NE(value, UIntImm(value.stype, 0));
+    } else {
+      LOG(FATAL) << "cannot cast from " << from << " to " << to;
+      return Value();
+    }
+  } else if (from.is_int() && to.is_int()) {
     return MakeValue(spv::OpSConvert, dst_type, value);
   } else if (from.is_uint() && to.is_uint()) {
     return MakeValue(spv::OpUConvert, dst_type, value);
diff --git a/src/codegen/stack_vm/stack_vm_module.cc b/src/codegen/stack_vm/stack_vm_module.cc
deleted file mode 100644
index 731663deb448..000000000000
--- a/src/codegen/stack_vm/stack_vm_module.cc
+++ /dev/null
@@ -1,71 +0,0 @@
-/*!
- *  Copyright (c) 2017 by Contributors
- * \file stack_vm_module.cc
- */
-#include <tvm/runtime/registry.h>
-#include <tvm/runtime/module.h>
-#include <tvm/codegen.h>
-#include "./codegen_stack_vm.h"
-
-namespace tvm {
-namespace codegen {
-
-class StackVMModuleNode : public runtime::ModuleNode {
- public:
-  const char* type_key() const {
-    return "stackvm";
-  }
-
-  PackedFunc GetFunction(
-      const std::string& name,
-      const std::shared_ptr<ModuleNode>& sptr_to_self) final {
-    if (name == runtime::symbol::tvm_module_main) {
-      return GetFunction(entry_func_, sptr_to_self);
-    }
-    auto it = fmap_.find(name);
-    if (it == fmap_.end()) return PackedFunc();
-    const StackVM& vm = it->second;
-    // capture sptr_to_self to keep module node alive.
-    return PackedFunc([vm, sptr_to_self](TVMArgs args, TVMRetValue* rv) {
-        vm(args);
-      });
-  }
-
-  std::string GetSource(const std::string& format) final {
-    std::ostringstream os;
-    for (const auto& kv : fmap_) {
-      os << "Function: " << kv.first << '\n';
-      os << kv.second;
-    }
-    return os.str();
-  }
-
-  static runtime::Module Build(const Array<LoweredFunc>& funcs) {
-    CHECK_NE(funcs.size(), 0U);
-    std::shared_ptr<StackVMModuleNode> n =
-        std::make_shared<StackVMModuleNode>();
-    for (LoweredFunc f : funcs) {
-      StackVM vm = codegen::CodeGenStackVM().Compile(f);
-      CHECK(!n->fmap_.count(f->name))
-          << "Function name " << f->name << "already exist in list";
-      vm.mod_ctx = n.get();
-      n->fmap_[f->name] = std::move(vm);
-    }
-    n->entry_func_ = funcs[0]->name;
-    return runtime::Module(n);
-  }
-
- private:
-  // entry function.
-  std::string entry_func_;
-  // internal function map
-  std::unordered_map<std::string, StackVM> fmap_;
-};
-
-TVM_REGISTER_API("codegen.build_stackvm")
-.set_body([](TVMArgs args, TVMRetValue* rv) {
-    *rv = StackVMModuleNode::Build(args[0]);
-  });
-
-}  // namespace codegen
-}  // namespace tvm
diff --git a/src/codegen/stack_vm/codegen_stack_vm.cc b/src/codegen/stackvm/codegen_stackvm.cc
similarity index 95%
rename from src/codegen/stack_vm/codegen_stack_vm.cc
rename to src/codegen/stackvm/codegen_stackvm.cc
index 168e411fa6e2..0bede2dc0751 100644
--- a/src/codegen/stack_vm/codegen_stack_vm.cc
+++ b/src/codegen/stackvm/codegen_stackvm.cc
@@ -1,11 +1,12 @@
 /*!
  *  Copyright (c) 2017 by Contributors
- * \file codegen_stack_vm.cc
+ * \file codegen_stackvm.cc
  */
 #include <tvm/runtime/registry.h>
 #include <tvm/packed_func_ext.h>
 #include <limits>
-#include "./codegen_stack_vm.h"
+#include "codegen_stackvm.h"
+#include "../../runtime/stackvm/stackvm_module.h"
 
 namespace tvm {
 namespace codegen {
@@ -19,6 +20,7 @@ StackVM CodeGenStackVM::Compile(LoweredFunc f) {
     CHECK_EQ(static_cast<size_t>(vid), i);
   }
   this->Push(f->body);
+  vm_.InitCache();
   return std::move(vm_);
 }
 
@@ -486,5 +488,22 @@ void CodeGenStackVM::VisitExpr_(const Let *op) {
   this->PushOp(StackVM::STORE_HEAP, static_cast<int>(vid));
   this->Push(op->body);
 }
+
+runtime::Module BuildStackVM(const Array<LoweredFunc>& funcs) {
+  CHECK_NE(funcs.size(), 0U);
+  std::unordered_map<std::string, StackVM> fmap;
+  for (LoweredFunc f : funcs) {
+    StackVM vm = codegen::CodeGenStackVM().Compile(f);
+    CHECK(!fmap.count(f->name))
+        << "Function name " << f->name << "already exist in list";
+    fmap[f->name] = std::move(vm);
+  }
+  return runtime::StackVMModuleCreate(fmap, funcs[0]->name);
+}
+
+TVM_REGISTER_API("codegen.build_stackvm")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    *rv = BuildStackVM(args[0]);
+  });
 }  // namespace codegen
 }  // namespace tvm
diff --git a/src/codegen/stack_vm/codegen_stack_vm.h b/src/codegen/stackvm/codegen_stackvm.h
similarity index 95%
rename from src/codegen/stack_vm/codegen_stack_vm.h
rename to src/codegen/stackvm/codegen_stackvm.h
index 089284529242..23bd61dcb4c2 100644
--- a/src/codegen/stack_vm/codegen_stack_vm.h
+++ b/src/codegen/stackvm/codegen_stackvm.h
@@ -3,8 +3,8 @@
  * \file codegen_stack_vm.h
  * \brief Codegen into Simple Stack VM.
  */
-#ifndef TVM_CODEGEN_STACK_VM_CODEGEN_STACK_VM_H_
-#define TVM_CODEGEN_STACK_VM_CODEGEN_STACK_VM_H_
+#ifndef TVM_CODEGEN_STACKVM_CODEGEN_STACKVM_H_
+#define TVM_CODEGEN_STACKVM_CODEGEN_STACKVM_H_
 
 #include <tvm/ir.h>
 #include <tvm/ir_functor_ext.h>
@@ -14,12 +14,14 @@
 #include <vector>
 #include <unordered_map>
 
-#include "./stack_vm.h"
+#include "../../runtime/stackvm/stackvm.h"
 
 namespace tvm {
 namespace codegen {
 
 using namespace ir;
+using runtime::StackVM;
+
 /*!
  * \brief A base class to generate a stack VM.
  *  This module is used to generate host wrapper
@@ -145,4 +147,4 @@ class CodeGenStackVM
 
 }  // namespace codegen
 }  // namespace tvm
-#endif  // TVM_CODEGEN_STACK_VM_CODEGEN_STACK_VM_H_
+#endif  // TVM_CODEGEN_STACKVM_CODEGEN_STACKVM_H_
diff --git a/src/codegen/verilog/codegen_verilog.cc b/src/codegen/verilog/codegen_verilog.cc
index a4887390ad5d..af3d2fcfe467 100644
--- a/src/codegen/verilog/codegen_verilog.cc
+++ b/src/codegen/verilog/codegen_verilog.cc
@@ -6,7 +6,7 @@
 #include <cctype>
 #include <sstream>
 #include <iostream>
-#include "./codegen_verilog.h"
+#include "codegen_verilog.h"
 #include "../../arithmetic/compute_expr.h"
 
 namespace tvm {
@@ -213,11 +213,11 @@ VerilogValue CodeGenVerilog::VisitExpr_(const UIntImm *op) {
   return IntConst(op, this);
 }
 VerilogValue CodeGenVerilog::VisitExpr_(const FloatImm *op) {
-  LOG(FATAL) << "Donot support float constant in Verilog";
+  LOG(FATAL) << "Do not support float constant in Verilog";
   return VerilogValue();
 }
 VerilogValue CodeGenVerilog::VisitExpr_(const StringImm *op) {
-  LOG(FATAL) << "Donot support string constant in Verilog";
+  LOG(FATAL) << "Do not support string constant in Verilog";
   return VerilogValue();
 }
 
diff --git a/src/codegen/verilog/codegen_verilog.h b/src/codegen/verilog/codegen_verilog.h
index 7c8b811c2fa6..a38640ac3799 100644
--- a/src/codegen/verilog/codegen_verilog.h
+++ b/src/codegen/verilog/codegen_verilog.h
@@ -14,7 +14,7 @@
 #include <string>
 #include <vector>
 #include <unordered_map>
-#include "./verilog_ir.h"
+#include "verilog_ir.h"
 #include "../codegen_source_base.h"
 
 namespace tvm {
diff --git a/src/codegen/verilog/verilog_ir.cc b/src/codegen/verilog/verilog_ir.cc
index 1a03fc881665..0cc4b9cf3c21 100644
--- a/src/codegen/verilog/verilog_ir.cc
+++ b/src/codegen/verilog/verilog_ir.cc
@@ -5,7 +5,8 @@
 #include <tvm/ir_pass.h>
 #include <tvm/ir_visitor.h>
 #include <tvm/ir_mutator.h>
-#include "./verilog_ir.h"
+#include <utility>
+#include "verilog_ir.h"
 #include "../../arithmetic/compute_expr.h"
 
 namespace tvm {
@@ -16,14 +17,14 @@ using namespace ir;
 
 ControlSignal ControlSignalNode::make(
     ControlSignalType type, int advance_size) {
-  auto n = std::make_shared<ControlSignalNode>();
+  auto n = make_node<ControlSignalNode>();
   n->ctrl_type = type;
   n->advance_size = advance_size;
   return ControlSignal(n);
 }
 
 StageInput StageInputNode::make(Var var, StageInputType input_type) {
-  std::shared_ptr<StageInputNode> n = std::make_shared<StageInputNode>();
+  NodePtr<StageInputNode> n = make_node<StageInputNode>();
   n->var = var;
   n->input_type = input_type;
   return StageInput(n);
@@ -80,7 +81,7 @@ class PipelineExtractor: public IRVisitor {
         arg_handle_[arg.get()] = arg;
       }
     }
-    pipeline_ = std::make_shared<PipelineNode>();
+    pipeline_ = make_node<PipelineNode>();
     this->Visit(f->body);
     // setup channels
     for (const auto &kv : cmap_) {
@@ -112,7 +113,7 @@ class PipelineExtractor: public IRVisitor {
       if (cb.node != nullptr) {
         CHECK(cb.node->channel.same_as(ch));
       } else {
-        cb.node = std::make_shared<ChannelBlockNode>();
+        cb.node = make_node<ChannelBlockNode>();
         cb.node->channel = ch;
       }
       if (op->attr_key == attr::channel_read_scope) {
@@ -166,8 +167,8 @@ class PipelineExtractor: public IRVisitor {
     // The replace logic
     StageInputReplacer repl(var_info_);
     // Setup the compute block.
-    std::shared_ptr<ComputeBlockNode> compute =
-        std::make_shared<ComputeBlockNode>();
+    NodePtr<ComputeBlockNode> compute =
+        make_node<ComputeBlockNode>();
     compute->loop = Array<Stmt>(loop_);
     // setup the advance triggers
     for (const auto& e : trigger_) {
@@ -179,8 +180,8 @@ class PipelineExtractor: public IRVisitor {
       } else {
         ch = Channel(attr->node.node_);
       }
-      std::shared_ptr<SignalTriggerNode> trigger
-          = std::make_shared<SignalTriggerNode>();
+      NodePtr<SignalTriggerNode> trigger
+          = make_node<SignalTriggerNode>();
       trigger->channel_var = ch->handle_var;
       // predicate for the trigger
       Expr predicate = const_true();
@@ -194,7 +195,7 @@ class PipelineExtractor: public IRVisitor {
       ChannelEntry& cb = cmap_.at(ch->handle_var.get());
       trigger->signal_index = static_cast<int>(cb.node->ctrl_signals.size());
       // Grab the advance constant size.
-      int trigger_size;
+      int trigger_size = 0;
       if (attr->attr_key == attr::pipeline_stage_scope) {
         cb.node->ctrl_signals.push_back(
             ControlSignalNode::make(kComputeFinish, 0));
@@ -248,7 +249,7 @@ class PipelineExtractor: public IRVisitor {
     CHECK(!cmap_.count(var))
         << "Multiple access to the same handle";
     ChannelEntry& cb = cmap_[var];
-    cb.node = std::make_shared<ChannelBlockNode>();
+    cb.node = make_node<ChannelBlockNode>();
     cb.node->channel = ChannelNode::make(arg_handle_.at(var), dtype);
     return cb.node->channel;
   }
@@ -256,7 +257,7 @@ class PipelineExtractor: public IRVisitor {
  private:
   // The channel information.
   struct ChannelEntry {
-    std::shared_ptr<ChannelBlockNode> node;
+    NodePtr<ChannelBlockNode> node;
     int read_ref_count{0};
     int write_ref_count{0};
   };
@@ -275,7 +276,7 @@ class PipelineExtractor: public IRVisitor {
   // The argument handle map
   std::unordered_map<const Variable*, Var> arg_handle_;
   // The result block.
-  std::shared_ptr<PipelineNode> pipeline_;
+  NodePtr<PipelineNode> pipeline_;
 };
 
 Pipeline MakePipeline(LoweredFunc f) {
diff --git a/src/codegen/verilog/verilog_module.cc b/src/codegen/verilog/verilog_module.cc
index 0319d6e6556c..0670a02e34ac 100644
--- a/src/codegen/verilog/verilog_module.cc
+++ b/src/codegen/verilog/verilog_module.cc
@@ -6,7 +6,7 @@
 #include <tvm/runtime/packed_func.h>
 #include <tvm/codegen.h>
 #include <mutex>
-#include "./codegen_verilog.h"
+#include "codegen_verilog.h"
 #include "../../runtime/file_util.h"
 #include "../../runtime/meta_data.h"
 
diff --git a/src/codegen/verilog/vpi_device_api.cc b/src/codegen/verilog/vpi_device_api.cc
index d53a12962fd7..656630351cf5 100644
--- a/src/codegen/verilog/vpi_device_api.cc
+++ b/src/codegen/verilog/vpi_device_api.cc
@@ -10,7 +10,7 @@
 #include <unordered_map>
 #include <map>
 #include <queue>
-#include "./vpi_session.h"
+#include "vpi_session.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/verilog/vpi_session.cc b/src/codegen/verilog/vpi_session.cc
index 6fbbbc01d32b..36c08cac3f84 100644
--- a/src/codegen/verilog/vpi_session.cc
+++ b/src/codegen/verilog/vpi_session.cc
@@ -4,7 +4,7 @@
  * \brief IPC session call to verilog simulator via VPI.
  */
 #include <tvm/api_registry.h>
-#include "./vpi_session.h"
+#include "vpi_session.h"
 
 namespace tvm {
 namespace codegen {
@@ -50,7 +50,7 @@ inline VPIHandleNode* VPIHandle::get() const {
 VPIHandle VPIHandleCreate(
     const std::shared_ptr<VPISessionEntry>& sess,
     VPIRawHandle handle) {
-  std::shared_ptr<VPIHandleNode> n = std::make_shared<VPIHandleNode>();
+  auto n = make_node<VPIHandleNode>();
   n->sess = sess;
   n->handle = handle;
   return VPIHandle(n);
@@ -102,7 +102,7 @@ int VPIGetIntProp(VPIHandleNode* h, int code) {
 }
 
 VPISession VPISession::make(int h_pipe_read, int h_pipe_write) {
-  std::shared_ptr<VPISessionNode> n = std::make_shared<VPISessionNode>();
+  auto n = make_node<VPISessionNode>();
   n->sess = std::make_shared<VPISessionEntry>(h_pipe_read, h_pipe_write);
   n->sess->in_control = true;
   VPISession sess(n);
diff --git a/src/codegen/verilog/vpi_session.h b/src/codegen/verilog/vpi_session.h
index 88a7f2f1906e..9fab0f173995 100644
--- a/src/codegen/verilog/vpi_session.h
+++ b/src/codegen/verilog/vpi_session.h
@@ -27,7 +27,7 @@ using runtime::PackedFunc;
 class VPISession : public NodeRef {
  public:
   VPISession() {}
-  explicit VPISession(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit VPISession(NodePtr<Node> n) : NodeRef(n) {}
   /*!
    * \brief Get handle by name.
    * \param name The name of the handle.
@@ -63,7 +63,7 @@ class VPISession : public NodeRef {
 class VPIHandle : public NodeRef {
  public:
   VPIHandle() {}
-  explicit VPIHandle(std::shared_ptr<Node> n) : NodeRef(n) {}
+  explicit VPIHandle(NodePtr<Node> n) : NodeRef(n) {}
   /*!
    * \brief Get handle by name.
    * \param name The name of the handle.
diff --git a/src/common/arena.h b/src/common/arena.h
new file mode 100644
index 000000000000..c5da093a70b8
--- /dev/null
+++ b/src/common/arena.h
@@ -0,0 +1,148 @@
+/*!
+ * Copyright 2018 by Contributors
+ *
+ * \file arena.h
+ * \brief Arena allocator that allocates
+ *  memory chunks and frees them all during destruction time.
+ */
+#ifndef TVM_COMMON_ARENA_H_
+#define TVM_COMMON_ARENA_H_
+
+#include <type_traits>
+
+namespace tvm {
+namespace common {
+
+const constexpr int kArenaPageSize = 16 << 10;
+
+/*!
+ * \brief Arena allocator that allocates memory from continuous
+ *  chunk and frees them all only during destruction.
+ */
+class Arena {
+ public:
+  Arena() {
+    // eagerly allocate the first page.
+    head_ = reinterpret_cast<PageHeader*>(new Page());
+    head_->next = nullptr;
+    head_->ptr = sizeof(PageHeader);
+  }
+  ~Arena() {
+    // delete all the allocated pages.
+    while (head_ != nullptr) {
+      Page* page = reinterpret_cast<Page*>(head_);
+      head_ = head_->next;
+      delete page;
+    }
+  }
+  /*!
+   * \brief Allocate a space from Arena for type T
+   * \param T the data type to be allocated
+   * \note The space of T is not initialized.
+   */
+  template<typename T>
+  T* allocate_() {
+    return static_cast<T*>(Alloc(sizeof(T), alignof(T)));
+  }
+  /*!
+   * \brief Create a new instance of type T.
+   * \param args The constructor argument.
+   * \tparam T the type to be created.
+   * \tparam Args Arguments to the constructor.
+   *
+   * \return The allocated object.
+   * \note The type T must be simple type, or only contain
+   *  memory allocated from the same arena.
+   *  Otherwise the destructor needs to be called explicitly.
+   */
+  template<typename T, typename... Args>
+  T* make(Args&&... args) {
+    T* ptr = allocate_<T>();
+    new (ptr) T(std::forward<Args>(args)...);
+    return ptr;
+  }
+
+ private:
+  // page size 16 KB
+  // The page data type;
+  using Page = std::aligned_storage<kArenaPageSize, 1024>::type;
+  /*! \brief Page header */
+  struct PageHeader {
+    /*! \brief points to the next page */
+    PageHeader* next;
+    /*! \brief memory allocator ptr inside page */
+    size_t ptr;
+  };
+  /* \brief The page header */
+  PageHeader* head_{nullptr};
+  /*!
+   * \brief Align ptr by upper bound.
+   * \param ptr The pointer value.
+   * \param align The alignment requirement.
+   */
+  size_t UpperAlign(size_t ptr, size_t align) {
+    return ptr + (align - (ptr % align)) % align;
+  }
+  /*!
+   * \brief Internal aligned alloc function.
+   * \param size The size of the memory.
+   * \param align The alignment requirement.
+   */
+  void* Alloc(size_t size, size_t align) {
+    size_t ptr = UpperAlign(head_->ptr, align);
+    if (ptr + size <= kArenaPageSize) {
+      head_->ptr = ptr + size;
+      return reinterpret_cast<char*>(head_) + ptr;
+    } else {
+      PageHeader* new_head = reinterpret_cast<PageHeader*>(new Page());
+      new_head->next = head_;
+      ptr = UpperAlign(sizeof(PageHeader), align);
+      CHECK_LE(ptr + size, kArenaPageSize);
+      new_head->ptr = ptr + size;
+      head_ = new_head;
+      return reinterpret_cast<char*>(head_) + ptr;
+    }
+  }
+};
+
+/*!
+ * \brief Link list node
+ * \tparam T the content data type
+ */
+template<typename T>
+struct LinkNode {
+  /*! \brief The content value */
+  T value;
+  /*! \brief pointer to the next location */
+  LinkNode<T>* next{nullptr};
+};
+/*!
+ * \brief LinkedList structure
+ * \tparam T the content data type
+ * \note This is a simple data structure that can be used together with the arena.
+ * \sa LinkNode
+ */
+template<typename T>
+struct LinkedList {
+  /*! \brief Head pointer */
+  LinkNode<T>* head{nullptr};
+  /*! \brief Tail pointer */
+  LinkNode<T>* tail{nullptr};
+  /*!
+   * \brief Push a new node to the end of the linked list.
+   * \param node The node to be pushed.
+   */
+  void Push(LinkNode<T>* node) {
+    node->next = nullptr;
+    if (this->tail != nullptr) {
+      this->tail->next = node;
+      this->tail = node;
+    } else {
+      head = tail = node;
+    }
+  }
+};
+
+}  // namespace common
+}  // namespace tvm
+#endif  // TVM_COMMON_ARENA_H_
diff --git a/src/common/base64.h b/src/common/base64.h
index 31b02d3ca2a3..3f530e10a7e5 100644
--- a/src/common/base64.h
+++ b/src/common/base64.h
@@ -58,10 +58,10 @@ class StreamBufferReader {
   /*!
    * \return allows quick read using get char
    */
-  char GetChar() {
+  int GetChar() {
     while (true) {
       if (read_ptr_ < read_len_) {
-        return buffer_[read_ptr_++];
+        return static_cast<int>(buffer_[read_ptr_++]);
       } else {
         read_len_ = stream_->Read(&buffer_[0], buffer_.length());
         if (read_len_ == 0) return EOF;
diff --git a/src/common/ring_buffer.h b/src/common/ring_buffer.h
index 421f19466957..dcec54d1823d 100644
--- a/src/common/ring_buffer.h
+++ b/src/common/ring_buffer.h
@@ -36,19 +36,31 @@ class RingBuffer {
    * \param n The size of capacity.
    */
   void Reserve(size_t n) {
-    if (ring_.size() >= n) return;
-    size_t old_size = ring_.size();
-    size_t new_size = ring_.size();
-    while (new_size < n) {
-      new_size *= 2;
-    }
-    ring_.resize(new_size);
-    if (head_ptr_ + bytes_available_ > old_size) {
-      // copy the ring overflow part into the tail.
-      size_t ncopy = head_ptr_ + bytes_available_ - old_size;
-      memcpy(&ring_[0] + old_size, &ring_[0], ncopy);
+    if (ring_.size() < n) {
+        size_t old_size = ring_.size();
+        size_t new_size = static_cast<size_t>(n * 1.2);
+        ring_.resize(new_size);
+        if (head_ptr_ + bytes_available_ > old_size) {
+          // copy the ring overflow part into the tail.
+          size_t ncopy = head_ptr_ + bytes_available_ - old_size;
+          memcpy(&ring_[0] + old_size, &ring_[0], ncopy);
+        }
+    } else if (ring_.size() > n * 8 && ring_.size() > kInitCapacity) {
+        // shrink too large temporary buffer to avoid out of memory on some embedded devices
+        size_t old_bytes = bytes_available_;
+
+        std::vector<char> tmp(old_bytes);
+
+        Read(&tmp[0], old_bytes);
+        ring_.resize(kInitCapacity);
+        ring_.shrink_to_fit();
+
+        memcpy(&ring_[0], &tmp[0], old_bytes);
+        head_ptr_ = 0;
+        bytes_available_ = old_bytes;
     }
   }
+
   /*!
    * \brief Peform a non-blocking read from buffer
    *  size must be smaller than this->bytes_available()
diff --git a/src/contrib/cblas/cblas.cc b/src/contrib/cblas/cblas.cc
index 24ed9deb97cd..7473d45562fd 100644
--- a/src/contrib/cblas/cblas.cc
+++ b/src/contrib/cblas/cblas.cc
@@ -5,6 +5,8 @@
 #include <tvm/runtime/registry.h>
 #include <tvm/runtime/util.h>
 #include <dmlc/logging.h>
+#include "gemm_common.h"
+
 
 extern "C" {
 #if USE_MKL_BLAS == 1
@@ -19,38 +21,56 @@ namespace contrib {
 
 using namespace runtime;
 
+inline CBLAS_TRANSPOSE BooleanToTranspose(bool trans) {
+  return trans ? CblasTrans : CblasNoTrans;
+}
+
+struct CblasSgemmOp {
+  typedef float TDatatype;
+  void operator()(bool ta, bool tb,
+                  int M, int N, int K,
+                  float alpha, float* A, int lda,
+                  float* B, int ldb,
+                  float beta, float* C, int ldc) {
+    cblas_sgemm(CblasColMajor,
+                BooleanToTranspose(ta),
+                BooleanToTranspose(tb),
+                M, N, K,
+                alpha, A, lda,
+                B, ldb,
+                beta, C, ldc);
+  }
+};
+
+struct CblasDgemmOp {
+  typedef double TDatatype;
+  void operator()(bool ta, bool tb,
+                  int M, int N, int K,
+                  double alpha, double* A, int lda,
+                  double* B, int ldb,
+                  double beta, double* C, int ldc) {
+    cblas_dgemm(CblasColMajor,
+                BooleanToTranspose(ta),
+                BooleanToTranspose(tb),
+                M, N, K,
+                alpha, A, lda,
+                B, ldb,
+                beta, C, ldc);
+  }
+};
+
+
 // matrix multiplication for row major
 TVM_REGISTER_GLOBAL("tvm.contrib.cblas.matmul")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
     DLTensor* A = args[0];
-    DLTensor* B = args[1];
-    DLTensor* C = args[2];
-    bool transa = args[3];
-    bool transb = args[4];
-    // call gemm for simple compact code.
-    CHECK_EQ(A->ndim, 2);
-    CHECK_EQ(B->ndim, 2);
-    CHECK_EQ(C->ndim, 2);
-    CHECK(C->strides == nullptr);
-    CHECK(B->strides == nullptr);
-    CHECK(A->strides == nullptr);
-    CHECK(TypeMatch(A->dtype, kDLFloat, 32));
-    CHECK(TypeMatch(B->dtype, kDLFloat, 32));
-    CHECK(TypeMatch(C->dtype, kDLFloat, 32));
-    cblas_sgemm(CblasColMajor,
-                transb ? CblasTrans : CblasNoTrans,
-                transa ? CblasTrans : CblasNoTrans,
-                transb ? B->shape[0] : B->shape[1],
-                transa ? A->shape[1] : A->shape[0],
-                transb ? B->shape[1] : B->shape[0],
-                1.0f,
-                reinterpret_cast<float*>(static_cast<char*>(B->data) + B->byte_offset),
-                B->shape[1],
-                reinterpret_cast<float*>(static_cast<char*>(A->data) + A->byte_offset),
-                A->shape[1],
-                0.0f,
-                reinterpret_cast<float*>(static_cast<char*>(C->data) + C->byte_offset),
-                C->shape[1]);
+    CHECK(TypeMatch(A->dtype, kDLFloat, 32) ||
+          TypeMatch(A->dtype, kDLFloat, 64));
+
+    if (TypeMatch(A->dtype, kDLFloat, 32))
+      CallGemm(args, ret, CblasSgemmOp());
+    else
+      CallGemm(args, ret, CblasDgemmOp());
   });
 }  // namespace contrib
 }  // namespace tvm
diff --git a/src/contrib/cblas/gemm_common.h b/src/contrib/cblas/gemm_common.h
new file mode 100644
index 000000000000..c69da5ea3e17
--- /dev/null
+++ b/src/contrib/cblas/gemm_common.h
@@ -0,0 +1,101 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/contrib/gemm.h
+ * \brief Shared implementation of gemm
+ */
+#ifndef TVM_CONTRIB_CBLAS_GEMM_COMMON_H_
+#define TVM_CONTRIB_CBLAS_GEMM_COMMON_H_
+#include <algorithm>
+
+namespace tvm {
+namespace contrib {
+
+using namespace runtime;
+
+inline int ColumnStride(DLTensor* tensor) {
+  // If the tensor itself is transposed then it will have strides
+  // backward from what we expect.  Regardless, the max of the strides
+  // (the other stride is 1) is the column stride.
+  if (tensor->strides) {
+    return std::max(tensor->strides[0], tensor->strides[1]);
+  } else {
+    return tensor->shape[1];
+  }
+}
+
+
+inline int ElementStride(DLTensor* tensor) {
+  if (tensor->strides) {
+    return std::min(tensor->strides[0], tensor->strides[1]);
+  } else {
+    return 1;
+  }
+}
+
+
+// Reversed strides indicates an in-place transpose operation.
+inline bool IsInPlaceTransposed(DLTensor* tensor) {
+  return tensor->strides && (tensor->strides[1] > tensor->strides[0]);
+}
+
+
+inline int RowCount(DLTensor* tensor, bool trans) {
+  return tensor->shape[trans ? 1 : 0];
+}
+
+
+inline int ColumnCount(DLTensor* tensor, bool trans) {
+  return tensor->shape[trans ? 0 : 1];
+}
+
+// Call a column major blas.  Note that data is stored in tvm as row
+// major, so this we switch the arguments.
+template<typename TGemmOp>
+inline void CallGemm(TVMArgs args, TVMRetValue *ret, TGemmOp op) {
+  DLTensor* A = args[0];
+  DLTensor* B = args[1];
+  DLTensor* C = args[2];
+  bool transa = args[3];
+  bool transb = args[4];
+  int bit_depth = sizeof(typename TGemmOp::TDatatype) * 8;
+  CHECK_EQ(A->ndim, 2);
+  CHECK_EQ(B->ndim, 2);
+  CHECK_EQ(C->ndim, 2);
+
+  CHECK_EQ(ElementStride(A), 1);
+  CHECK_EQ(ElementStride(B), 1);
+  CHECK_EQ(ElementStride(C), 1);
+
+  // C can never be transposed.
+  CHECK(!IsInPlaceTransposed(C));
+
+  // Reversed strides indicates an in-place transpose operation.
+  transa = IsInPlaceTransposed(A) ? !transa : transa;
+  transb = IsInPlaceTransposed(B) ? !transb : transb;
+
+  CHECK(TypeMatch(B->dtype, kDLFloat, bit_depth));
+  CHECK(TypeMatch(C->dtype, kDLFloat, bit_depth));
+  double alpha = args.size() > 5 ? args[5] : 1.0;
+  double beta = args.size() > 6 ? args[6] : 0.0;
+  op(transb,
+     transa,
+     ColumnCount(B, transb),
+     RowCount(A, transa),
+     ColumnCount(A, transa),
+     static_cast<float>(alpha),
+     reinterpret_cast<typename TGemmOp::TDatatype*>(static_cast<char*>(B->data)
+                                                    + B->byte_offset),
+     ColumnStride(B),
+     reinterpret_cast<typename TGemmOp::TDatatype*>(static_cast<char*>(A->data)
+                                                    + A->byte_offset),
+     ColumnStride(A),
+     static_cast<float>(beta),
+     reinterpret_cast<typename TGemmOp::TDatatype*>(static_cast<char*>(C->data)
+                                                    + C->byte_offset),
+     ColumnStride(C));
+}
+
+}  // namespace contrib
+}  // namespace tvm
+
+#endif  // TVM_CONTRIB_CBLAS_GEMM_COMMON_H_
diff --git a/src/contrib/cublas/cublas.cc b/src/contrib/cublas/cublas.cc
index 4171aadf6381..364129b7cba7 100644
--- a/src/contrib/cublas/cublas.cc
+++ b/src/contrib/cublas/cublas.cc
@@ -1,81 +1,81 @@
 /*!
- *  Copyright (c) 2017 by Contributors
+ *  Copyright (c) 2018 by Contributors
  * \file Use external cblas library call.
  */
 #include <tvm/runtime/registry.h>
 #include <tvm/runtime/util.h>
 #include <dmlc/logging.h>
+#include "../cblas/gemm_common.h"
+#include "cublas_utils.h"
 
-extern "C" {
-#include <cublas_v2.h>
-}
 
 namespace tvm {
 namespace contrib {
 
 using namespace runtime;
 
-#ifndef CHECK_CUBLAS_ERROR
-#define CHECK_CUBLAS_ERROR(error) \
-if (error != CUBLAS_STATUS_SUCCESS) { \
-  fprintf(stderr, "cuBLAS error: "); \
-  if (error == CUBLAS_STATUS_NOT_INITIALIZED) fprintf(stderr, "CUBLAS_STATUS_NOT_INITIALIZED"); \
-  if (error == CUBLAS_STATUS_ALLOC_FAILED) fprintf(stderr, "CUBLAS_STATUS_ALLOC_FAILED"); \
-  if (error == CUBLAS_STATUS_INVALID_VALUE) fprintf(stderr, "CUBLAS_STATUS_INVALID_VALUE"); \
-  if (error == CUBLAS_STATUS_ARCH_MISMATCH) fprintf(stderr, "CUBLAS_STATUS_ARCH_MISMATCH"); \
-  if (error == CUBLAS_STATUS_MAPPING_ERROR) fprintf(stderr, "CUBLAS_STATUS_MAPPING_ERROR"); \
-  if (error == CUBLAS_STATUS_EXECUTION_FAILED) fprintf(stderr, "CUBLAS_STATUS_EXECUTION_FAILED"); \
-  if (error == CUBLAS_STATUS_INTERNAL_ERROR) fprintf(stderr, "CUBLAS_STATUS_INTERNAL_ERROR"); \
-  if (error == CUBLAS_STATUS_NOT_SUPPORTED) fprintf(stderr, "CUBLAS_STATUS_NOT_SUPPORTED"); \
-  if (error == CUBLAS_STATUS_LICENSE_ERROR) fprintf(stderr, "CUBLAS_STATUS_LICENSE_ERROR"); \
-  fprintf(stderr, "\n"); \
-  exit(EXIT_FAILURE); \
+inline cublasOperation_t BooleanToTranspose(bool item) {
+  return item ? CUBLAS_OP_T : CUBLAS_OP_N;
 }
-#endif
+
+struct CublasSgemmOp {
+  typedef float TDatatype;
+  cublasHandle_t handle;
+  explicit CublasSgemmOp(cublasHandle_t hdl)
+    : handle(hdl)
+    {}
+
+  void operator()(bool ta, bool tb,
+                  int M, int N, int K,
+                  float alpha, float* A, int lda,
+                  float* B, int ldb,
+                  float beta, float* C, int ldc) {
+    CHECK_CUBLAS_ERROR(cublasSgemm(handle,
+                                   BooleanToTranspose(ta),
+                                   BooleanToTranspose(tb),
+                                   M, N, K,
+                                   &alpha, A, lda,
+                                   B, ldb,
+                                   &beta, C, ldc));
+  }
+};
+
+
+struct CublasDgemmOp {
+  typedef double TDatatype;
+  cublasHandle_t handle;
+  explicit CublasDgemmOp(cublasHandle_t hdl)
+    : handle(hdl)
+    {}
+  void operator()(bool ta, bool tb,
+                  int M, int N, int K,
+                  double alpha, double* A, int lda,
+                  double* B, int ldb,
+                  double beta, double* C, int ldc) {
+    CHECK_CUBLAS_ERROR(cublasDgemm(handle,
+                                   BooleanToTranspose(ta),
+                                   BooleanToTranspose(tb),
+                                   M, N, K,
+                                   &alpha, A, lda,
+                                   B, ldb,
+                                   &beta, C, ldc));
+  }
+};
 
 // matrix multiplication for row major
 TVM_REGISTER_GLOBAL("tvm.contrib.cublas.matmul")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
     DLTensor* A = args[0];
-    DLTensor* B = args[1];
-    DLTensor* C = args[2];
-    bool transa = args[3];
-    bool transb = args[4];
-    // call gemm for simple compact code.
-    CHECK_EQ(A->ndim, 2);
-    CHECK_EQ(B->ndim, 2);
-    CHECK_EQ(C->ndim, 2);
-    CHECK(C->strides == nullptr);
-    CHECK(B->strides == nullptr);
-    CHECK(A->strides == nullptr);
-    CHECK(TypeMatch(A->dtype, kDLFloat, 32));
-    CHECK(TypeMatch(B->dtype, kDLFloat, 32));
-    CHECK(TypeMatch(C->dtype, kDLFloat, 32));
 
-    cublasHandle_t handle;
-    CHECK_CUBLAS_ERROR(cublasCreate(&handle));
-    float alpha = 1.0;
-    float beta = 0.0;
-    float *A_ptr = reinterpret_cast<float*>(static_cast<char*>(B->data) + B->byte_offset);
-    float *B_ptr = reinterpret_cast<float*>(static_cast<char*>(A->data) + A->byte_offset);
-    float *C_ptr = reinterpret_cast<float*>(static_cast<char*>(C->data) + C->byte_offset);
+    CHECK(TypeMatch(A->dtype, kDLFloat, 32) ||
+          TypeMatch(A->dtype, kDLFloat, 64));
 
-    CHECK_CUBLAS_ERROR(cublasSgemm(handle,
-                                   transb ? CUBLAS_OP_T : CUBLAS_OP_N,
-                                   transa ? CUBLAS_OP_T : CUBLAS_OP_N,
-                                   transb ? B->shape[0] : B->shape[1],
-                                   transa ? A->shape[1] : A->shape[0],
-                                   transb ? B->shape[1] : B->shape[0],
-                                   &alpha,
-                                   A_ptr,
-                                   B->shape[1],
-                                   B_ptr,
-                                   A->shape[1],
-                                   &beta,
-                                   C_ptr,
-                                   C->shape[1]));
+    CuBlasThreadEntry* entry_ptr = CuBlasThreadEntry::ThreadLocal();
 
-    CHECK_CUBLAS_ERROR(cublasDestroy(handle));
+    if (TypeMatch(A->dtype, kDLFloat, 32))
+      CallGemm(args, ret, CublasSgemmOp(entry_ptr->handle));
+    else
+      CallGemm(args, ret, CublasDgemmOp(entry_ptr->handle));
 });
 }  // namespace contrib
 }  // namespace tvm
diff --git a/src/contrib/cublas/cublas_utils.cc b/src/contrib/cublas/cublas_utils.cc
new file mode 100644
index 000000000000..0011fe853d8d
--- /dev/null
+++ b/src/contrib/cublas/cublas_utils.cc
@@ -0,0 +1,39 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file Use external cudnn utils function
+ */
+#include "cublas_utils.h"
+#include <dmlc/thread_local.h>
+#include <tvm/runtime/registry.h>
+#include "../../runtime/cuda/cuda_common.h"
+
+namespace tvm {
+namespace contrib {
+
+
+CuBlasThreadEntry::CuBlasThreadEntry() {
+  CHECK_CUBLAS_ERROR(cublasCreate(&handle));
+}
+
+
+CuBlasThreadEntry::~CuBlasThreadEntry() {
+  if (handle) {
+    cublasDestroy(handle);
+    handle = 0;
+  }
+}
+
+
+typedef dmlc::ThreadLocalStore<CuBlasThreadEntry> CuBlasThreadStore;
+
+
+CuBlasThreadEntry* CuBlasThreadEntry::ThreadLocal() {
+  auto stream = runtime::CUDAThreadEntry::ThreadLocal()->stream;
+  CuBlasThreadEntry* retval = CuBlasThreadStore::Get();
+  CHECK_CUBLAS_ERROR(cublasSetStream(retval->handle, static_cast<cudaStream_t>(stream)));
+  return retval;
+}
+
+
+}  // namespace contrib
+}  // namespace tvm
diff --git a/src/contrib/cublas/cublas_utils.h b/src/contrib/cublas/cublas_utils.h
new file mode 100644
index 000000000000..2b0874757d98
--- /dev/null
+++ b/src/contrib/cublas/cublas_utils.h
@@ -0,0 +1,53 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file Use external cudnn utils function
+ */
+
+#ifndef TVM_CONTRIB_CUBLAS_CUBLAS_UTILS_H_
+#define TVM_CONTRIB_CUBLAS_CUBLAS_UTILS_H_
+
+#include <dmlc/logging.h>
+
+extern "C" {
+#include <cublas_v2.h>
+}
+
+namespace tvm {
+namespace contrib {
+
+inline const char* GetCublasErrorString(int error) {
+  switch (error) {
+  case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
+  case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
+  case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
+  case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
+  case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
+  case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
+  case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
+  case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
+  case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR";
+  }
+  return "Unrecognized error";
+}
+
+#ifndef CHECK_CUBLAS_ERROR
+#define CHECK_CUBLAS_ERROR(fn)                  \
+  do {                                          \
+    int error = static_cast<int>(fn);                      \
+    CHECK_EQ(error, CUBLAS_STATUS_SUCCESS) << "CUBLAS: " << GetCublasErrorString(error); \
+  } while (0)  // ; intentionally left off.
+#endif  // CHECK_CUBLAS_ERROR
+
+
+struct CuBlasThreadEntry {
+  CuBlasThreadEntry();
+  ~CuBlasThreadEntry();
+  cublasHandle_t handle{nullptr};
+  static CuBlasThreadEntry* ThreadLocal();
+};  // CuBlasThreadEntry
+
+
+}  // namespace contrib
+}  // namespace tvm
+
+#endif  // TVM_CONTRIB_CUBLAS_CUBLAS_UTILS_H_
diff --git a/src/contrib/nnpack/convolution.cc b/src/contrib/nnpack/convolution.cc
index 9ca02118aeb3..e600360c67f1 100644
--- a/src/contrib/nnpack/convolution.cc
+++ b/src/contrib/nnpack/convolution.cc
@@ -6,126 +6,214 @@
 #include <tvm/runtime/util.h>
 #include <dmlc/logging.h>
 #include <nnpack.h>
-#include "./nnpack_utils.h"
+#include "nnpack_utils.h"
 
 namespace tvm {
 namespace contrib {
 using namespace runtime;
 
 TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
-    NNPackThreadLocalEntry *entry = NNPackThreadLocalEntry::ThreadLocal();
-    nnp_initialize();
-    DLTensor* input  = args[0];
-    DLTensor* kernel = args[1];
-    DLTensor* bias   = args[2];
-    DLTensor* output = args[3];
-    uint64_t pad_top = args[4], pad_right = args[5], pad_bottom = args[6], pad_left = args[7];
-    nnp_padding input_padding{pad_top, pad_right, pad_bottom, pad_left};
-    uint64_t stride_width = args[8], stride_height = args[9];
-    nnp_size stride_size{stride_width, stride_height};
-    NNPackConfig(args[10]);
-
-    CHECK_EQ(input->ndim, 3);
-    CHECK_EQ(kernel->ndim, 4);
-    CHECK_EQ(bias->ndim, 1);
-    CHECK_EQ(output->ndim, 3);
-
-    CHECK_EQ(input->shape[0], kernel->shape[1]);
-    size_t input_channels = input->shape[0];
-    CHECK_EQ(output->shape[0], kernel->shape[0]);
-    CHECK_EQ(output->shape[0], bias->shape[0]);
-    size_t output_channels = output->shape[0];
-    nnp_size input_size{static_cast<size_t>(input->shape[1]),
-                        static_cast<size_t>(input->shape[2])};
-    nnp_size kernel_size{static_cast<size_t>(kernel->shape[2]),
-                         static_cast<size_t>(kernel->shape[3])};
-
-    CHECK(input->strides == nullptr);
-    CHECK(kernel->strides == nullptr);
-    CHECK(bias->strides == nullptr);
-
-    CHECK(TypeMatch(input->dtype, kDLFloat, 32));
-    CHECK(TypeMatch(kernel->dtype, kDLFloat, 32));
-    CHECK(TypeMatch(bias->dtype, kDLFloat, 32));
-    CHECK(TypeMatch(output->dtype, kDLFloat, 32));
-
-    nnp_convolution_inference(nnp_convolution_algorithm_auto,
-                              nnp_convolution_transform_strategy_block_based,
-                              input_channels,
-                              output_channels,
-                              input_size,
-                              input_padding,
-                              kernel_size,
-                              stride_size,
-                              static_cast<float*>(input->data),
-                              static_cast<float*>(kernel->data),
-                              static_cast<float*>(bias->data),
-                              static_cast<float*>(output->data),
-                              NULL,
-                              NULL,
-                              nnp_activation_identity,
-                              NULL,
-                              entry->threadpool,
-                              NULL);
-  });
-
-
-TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_output")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
-    NNPackThreadLocalEntry *entry = NNPackThreadLocalEntry::ThreadLocal();
-    nnp_initialize();
-    DLTensor* input  = args[0];
-    DLTensor* kernel = args[1];
-    DLTensor* bias   = args[2];
-    DLTensor* output = args[3];
-    uint64_t pad_top = args[4], pad_right = args[5], pad_bottom = args[6], pad_left = args[7];
-    nnp_padding input_padding{pad_top, pad_right, pad_bottom, pad_left};
-    NNPackConfig(args[8]);
-
-    CHECK_EQ(input->ndim, 4);
-    CHECK_EQ(kernel->ndim, 4);
-    CHECK_EQ(bias->ndim, 1);
-    CHECK_EQ(output->ndim, 4);
-
-    CHECK_EQ(input->shape[0], output->shape[0]);
-    size_t batch_size = input->shape[0];
-    CHECK_EQ(input->shape[1], kernel->shape[1]);
-    size_t input_channels = input->shape[1];
-    CHECK_EQ(output->shape[1], bias->shape[0]);
-    CHECK_EQ(output->shape[1], kernel->shape[0]);
-    size_t output_channels = output->shape[1];
-    nnp_size input_size{static_cast<size_t>(input->shape[2]),
-                        static_cast<size_t>(input->shape[3])};
-    nnp_size kernel_size{static_cast<size_t>(kernel->shape[2]),
-                         static_cast<size_t>(kernel->shape[3])};
-
-    CHECK(input->strides == nullptr);
-    CHECK(kernel->strides == nullptr);
-    CHECK(bias->strides == nullptr);
-
-    CHECK(TypeMatch(input->dtype, kDLFloat, 32));
-    CHECK(TypeMatch(kernel->dtype, kDLFloat, 32));
-    CHECK(TypeMatch(bias->dtype, kDLFloat, 32));
-    CHECK(TypeMatch(output->dtype, kDLFloat, 32));
-
-    nnp_convolution_output(nnp_convolution_algorithm_auto,
-                           batch_size,
-                           input_channels,
-                           output_channels,
-                           input_size,
-                           input_padding,
-                           kernel_size,
-                           static_cast<float*>(input->data),
-                           static_cast<float*>(kernel->data),
-                           static_cast<float*>(bias->data),
-                           static_cast<float*>(output->data),
-                           NULL,
-                           NULL,
-                           nnp_activation_identity,
-                           NULL,
-                           entry->threadpool,
-                           NULL);
-  });
+    .set_body([](TVMArgs args, TVMRetValue *ret) {
+      NNPackThreadLocalEntry *entry = NNPackThreadLocalEntry::ThreadLocal();
+      static std::once_flag flag;
+      std::call_once(flag,
+                     []() { CHECK_EQ(nnp_initialize(), nnp_status_success); });
+      DLTensor *input = args[0];
+      DLTensor *kernel = args[1];
+      DLTensor *bias = nullptr;
+      if (args[2].type_code() == kArrayHandle) {
+        bias = args[2];
+      }
+      DLTensor *output = args[3];
+      uint64_t pad_top = args[4], pad_right = args[5], pad_bottom = args[6],
+               pad_left = args[7];
+      nnp_padding input_padding{pad_top, pad_right, pad_bottom, pad_left};
+      uint64_t stride_width = args[8], stride_height = args[9];
+      nnp_size stride_size{stride_width, stride_height};
+      NNPackConfig(args[10]);
+
+      uint64_t algo_ = args[11];
+      nnp_convolution_algorithm algo =
+          static_cast<nnp_convolution_algorithm>(algo_);
+      CHECK_EQ(input->ndim, 4);
+      CHECK_EQ(kernel->ndim, 4);
+      if (bias) {
+        CHECK_EQ(bias->ndim, 1);
+      }
+      CHECK_EQ(output->ndim, 4);
+      CHECK_EQ(input->shape[1], kernel->shape[1]);
+      CHECK_EQ(input->shape[0], output->shape[0]);
+      size_t input_channels = input->shape[1];
+      CHECK_EQ(output->shape[1], kernel->shape[0]);
+      if (bias) {
+        CHECK_EQ(output->shape[1], bias->shape[0]);
+      }
+      size_t output_channels = output->shape[1];
+      nnp_size input_size{static_cast<size_t>(input->shape[2]),
+                          static_cast<size_t>(input->shape[3])};
+      nnp_size kernel_size{static_cast<size_t>(kernel->shape[2]),
+                           static_cast<size_t>(kernel->shape[3])};
+      CHECK(input->strides == nullptr);
+      CHECK(kernel->strides == nullptr);
+      if (bias) {
+        CHECK(bias->strides == nullptr);
+      }
+
+      CHECK(TypeMatch(input->dtype, kDLFloat, 32));
+      CHECK(TypeMatch(kernel->dtype, kDLFloat, 32));
+      if (bias) {
+        CHECK(TypeMatch(bias->dtype, kDLFloat, 32));
+      }
+      CHECK(TypeMatch(output->dtype, kDLFloat, 32));
+
+      // Allocate a zero-bias if we don't pass one in.
+      std::unique_ptr<std::vector<float>> zero_bias;
+      if (!bias) {
+        zero_bias.reset(new std::vector<float>(output->shape[1], 0.0));
+      }
+
+      for (auto n = 0; n < input->shape[0]; ++n) {
+        nnp_status status = nnp_convolution_inference(
+            algo, nnp_convolution_transform_strategy_compute, input_channels,
+            output_channels, input_size, input_padding, kernel_size,
+            stride_size,
+            static_cast<float *>(input->data) + n * input->shape[1] *
+                                                   input->shape[2] *
+                                                   input->shape[3],
+            static_cast<float *>(kernel->data),
+            bias ? static_cast<float *>(bias->data) : zero_bias->data(),
+            static_cast<float *>(output->data) + n * output->shape[1] *
+                                                    output->shape[2] *
+                                                    output->shape[3],
+            NULL, NULL, nnp_activation_identity, NULL, entry->threadpool, NULL);
+
+        CHECK_EQ(status, nnp_status_success);
+      }
+    });
+
+TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.convolution_inference_without_weight_transform")
+    .set_body([](TVMArgs args, TVMRetValue *ret) {
+      NNPackThreadLocalEntry *entry = NNPackThreadLocalEntry::ThreadLocal();
+      static std::once_flag flag;
+      std::call_once(flag,
+                     []() { CHECK_EQ(nnp_initialize(), nnp_status_success); });
+      DLTensor *input = args[0];
+      DLTensor *transformed_kernel = args[1];
+      DLTensor *bias = nullptr;
+      if (args[2].type_code() == kArrayHandle) {
+        bias = args[2];
+      }
+      DLTensor *output = args[3];
+      uint64_t pad_top = args[4], pad_right = args[5], pad_bottom = args[6],
+               pad_left = args[7];
+      nnp_padding input_padding{pad_top, pad_right, pad_bottom, pad_left};
+      uint64_t stride_width = args[8], stride_height = args[9];
+      nnp_size stride_size{stride_width, stride_height};
+      NNPackConfig(args[10]);
+
+      uint64_t algo_ = args[11];
+      nnp_convolution_algorithm algo =
+          static_cast<nnp_convolution_algorithm>(algo_);
+      CHECK_EQ(input->ndim, 4);
+      if (bias) {
+        CHECK_EQ(bias->ndim, 1);
+      }
+      CHECK_EQ(output->ndim, 4);
+      CHECK_EQ(input->shape[0], output->shape[0]);
+      size_t input_channels = input->shape[1];
+      if (bias) {
+        CHECK_EQ(output->shape[1], bias->shape[0]);
+      }
+      size_t output_channels = output->shape[1];
+      nnp_size input_size{static_cast<size_t>(input->shape[2]),
+                          static_cast<size_t>(input->shape[3])};
+      nnp_size kernel_size{3, 3};
+      CHECK(input->strides == nullptr);
+      CHECK(transformed_kernel->strides == nullptr);
+      if (bias) {
+        CHECK(bias->strides == nullptr);
+      }
+
+      CHECK(TypeMatch(input->dtype, kDLFloat, 32));
+      CHECK(TypeMatch(transformed_kernel->dtype, kDLFloat, 32));
+      if (bias) {
+        CHECK(TypeMatch(bias->dtype, kDLFloat, 32));
+      }
+      CHECK(TypeMatch(output->dtype, kDLFloat, 32));
+
+      // Allocate a zero-bias if we don't pass one in.
+      std::unique_ptr<std::vector<float>> zero_bias;
+      if (!bias) {
+        zero_bias.reset(new std::vector<float>(output->shape[1], 0.0));
+      }
+
+      for (auto n = 0; n < input->shape[0]; ++n) {
+      nnp_status status = nnp_convolution_inference(
+          algo, nnp_convolution_transform_strategy_reuse, input_channels, output_channels,
+          input_size, input_padding, kernel_size, stride_size,
+          static_cast<float *>(input->data) + n * input->shape[1] *
+                               input->shape[2] *
+                               input->shape[3],
+          static_cast<float *>(transformed_kernel->data),
+          bias ? static_cast<float *>(bias->data) : zero_bias->data(),
+          static_cast<float *>(output->data) + n * output->shape[1] *
+                               output->shape[2] *
+                               output->shape[3],
+          NULL, NULL,
+          nnp_activation_identity, NULL, entry->threadpool, NULL);
+      CHECK_EQ(status, nnp_status_success);
+      }
+    });
+
+TVM_REGISTER_GLOBAL(
+    "tvm.contrib.nnpack.convolution_inference_weight_transform")
+    .set_body([](TVMArgs args, TVMRetValue *ret) {
+      NNPackThreadLocalEntry *entry = NNPackThreadLocalEntry::ThreadLocal();
+      static std::once_flag flag;
+      std::call_once(flag,
+                     []() { CHECK_EQ(nnp_initialize(), nnp_status_success); });
+      DLTensor *kernel = args[0];
+      DLTensor *transformed_kernel = args[1];
+      // Dummy sizes
+      nnp_padding input_padding{1, 1, 1, 1};
+      nnp_size stride_size{1, 1};
+
+      nnp_size input_size{100, 100};
+
+      NNPackConfig(args[2]);
+
+      uint64_t algo_ = args[3];
+      nnp_convolution_algorithm algo =
+          static_cast<nnp_convolution_algorithm>(algo_);
+      CHECK_EQ(kernel->ndim, 4);
+      size_t input_channels = kernel->shape[1];
+      size_t output_channels = kernel->shape[0];
+      CHECK_EQ(kernel->shape[2], 3);
+      CHECK_EQ(kernel->shape[3], 3);
+      nnp_size kernel_size{static_cast<size_t>(kernel->shape[2]),
+                           static_cast<size_t>(kernel->shape[3])};
+      CHECK(kernel->strides == nullptr);
+      CHECK(TypeMatch(kernel->dtype, kDLFloat, 32));
+
+      size_t transformed_kernel_size = 0;
+      nnp_status status;
+      status = nnp_convolution_inference(
+          algo, nnp_convolution_transform_strategy_precompute, input_channels,
+          output_channels, input_size, input_padding, kernel_size, stride_size,
+          nullptr, nullptr, nullptr, nullptr, nullptr, &transformed_kernel_size,
+          nnp_activation_identity, nullptr, entry->threadpool, nullptr);
+      CHECK_EQ(status, nnp_status_success);
+
+      CHECK_LE(transformed_kernel_size, GetDataSize(*transformed_kernel));
+
+      status = nnp_convolution_inference(
+          algo, nnp_convolution_transform_strategy_precompute, input_channels,
+          output_channels, input_size, input_padding, kernel_size, stride_size,
+          nullptr, static_cast<float *>(kernel->data), nullptr, nullptr,
+          static_cast<float *>(transformed_kernel->data),
+          &transformed_kernel_size, nnp_activation_identity, nullptr,
+          entry->threadpool, nullptr);
+      CHECK_EQ(status, nnp_status_success);
+    });
 }  // namespace contrib
 }  // namespace tvm
diff --git a/src/contrib/nnpack/fully_connected.cc b/src/contrib/nnpack/fully_connected.cc
index df6356d933aa..80f981b29cf6 100644
--- a/src/contrib/nnpack/fully_connected.cc
+++ b/src/contrib/nnpack/fully_connected.cc
@@ -6,7 +6,7 @@
 #include <tvm/runtime/util.h>
 #include <dmlc/logging.h>
 #include <nnpack.h>
-#include "./nnpack_utils.h"
+#include "nnpack_utils.h"
 
 namespace tvm {
 namespace contrib {
@@ -43,38 +43,5 @@ TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.fully_connected_inference")
                                   entry->threadpool);
   });
 
-
-TVM_REGISTER_GLOBAL("tvm.contrib.nnpack.fully_connected_output")
-.set_body([](TVMArgs args, TVMRetValue *ret) {
-    NNPackThreadLocalEntry *entry = NNPackThreadLocalEntry::ThreadLocal();
-    nnp_initialize();
-    DLTensor* A = args[0];
-    DLTensor* B = args[1];
-    DLTensor* C = args[2];
-    NNPackConfig(args[3]);
-
-    CHECK_EQ(A->ndim, 2);
-    CHECK_EQ(B->ndim, 2);
-    CHECK_EQ(C->ndim, 2);
-    CHECK_EQ(B->shape[0], C->shape[1]);
-    CHECK_EQ(B->shape[1], A->shape[1]);
-    CHECK_EQ(A->shape[0], C->shape[0]);
-    CHECK(C->strides == nullptr);
-    CHECK(B->strides == nullptr);
-    CHECK(A->strides == nullptr);
-    CHECK(TypeMatch(A->dtype, kDLFloat, 32));
-    CHECK(TypeMatch(B->dtype, kDLFloat, 32));
-    CHECK(TypeMatch(C->dtype, kDLFloat, 32));
-
-    nnp_fully_connected_output(A->shape[0],
-                               B->shape[1],
-                               B->shape[0],
-                               static_cast<float*>(A->data),
-                               static_cast<float*>(B->data),
-                               static_cast<float*>(C->data),
-                               entry->threadpool,
-                               NULL);
-  });
-
 }  // namespace contrib
 }  // namespace tvm
diff --git a/src/contrib/nnpack/nnpack_utils.cc b/src/contrib/nnpack/nnpack_utils.cc
index 631f25b36647..12eb828cc7e6 100644
--- a/src/contrib/nnpack/nnpack_utils.cc
+++ b/src/contrib/nnpack/nnpack_utils.cc
@@ -2,7 +2,7 @@
  *  Copyright (c) 2017 by Contributors
  * \file Use external nnpack library call.
  */
-#include "./nnpack_utils.h"
+#include "nnpack_utils.h"
 
 namespace tvm {
 namespace contrib {
@@ -10,27 +10,38 @@ using namespace runtime;
 
 typedef dmlc::ThreadLocalStore<NNPackThreadLocalEntry> NNPackThreadLocalStore;
 
+
 NNPackThreadLocalEntry* NNPackThreadLocalEntry::ThreadLocal() {
   return NNPackThreadLocalStore::Get();
 }
 
 bool NNPackConfig(uint64_t nthreads) {
   NNPackThreadLocalEntry *entry = NNPackThreadLocalEntry::ThreadLocal();
-  if (entry->threadpool != NULL &&
-      pthreadpool_get_threads_count(entry->threadpool) != nthreads) {
+  if (entry->threadpool && pthreadpool_get_threads_count(entry->threadpool) == nthreads) {
+    CHECK_NE(nthreads, 1);
+    return true;
+  }
+  if (entry->threadpool) {
     pthreadpool_destroy(entry->threadpool);
-    entry->threadpool = NULL;
+    entry->threadpool = nullptr;
   }
-  if (entry->threadpool == NULL) {
-    entry->threadpool = pthreadpool_create(nthreads);
+
+  if (nthreads == 1) {
+    // a null threadpool means the function is invoked on the calling thread,
+    // which is the desired logic for nthreads == 1
+    CHECK(!entry->threadpool);
+    return true;
   }
+
+  entry->threadpool = pthreadpool_create(nthreads);
   return true;
 }
 
 
-TVM_REGISTER_GLOBAL("contrib.nnpack._Config")
+TVM_REGISTER_GLOBAL("contrib.nnpack._initialize")
 .set_body([](TVMArgs args, TVMRetValue *ret) {
-    CHECK(NNPackConfig(args[0]));
+    *ret = nnp_initialize();
   });
+
 }  // namespace contrib
 }  // namespace tvm
diff --git a/src/contrib/nnpack/nnpack_utils.h b/src/contrib/nnpack/nnpack_utils.h
index fe7420786bde..1d44adff16ef 100644
--- a/src/contrib/nnpack/nnpack_utils.h
+++ b/src/contrib/nnpack/nnpack_utils.h
@@ -15,7 +15,7 @@ namespace contrib {
 using namespace runtime;
 
 struct NNPackThreadLocalEntry {
-  pthreadpool_t threadpool{NULL};
+  pthreadpool_t threadpool{nullptr};
   static NNPackThreadLocalEntry* ThreadLocal();
 };
 
diff --git a/src/contrib/random/random.cc b/src/contrib/random/random.cc
index 27e2b065a01b..68821fe04124 100644
--- a/src/contrib/random/random.cc
+++ b/src/contrib/random/random.cc
@@ -8,9 +8,9 @@
 #include <dmlc/thread_local.h>
 #include <algorithm>
 #ifndef _LIBCPP_SGX_CONFIG
-#include "./mt_random_engine.cc"
+#include "mt_random_engine.cc"
 #else
-#include "./sgx_random_engine.cc"
+#include "sgx_random_engine.cc"
 #endif
 
 #define DLPACK_INTEGER_TYPE_SWITCH(type, DType, ...)    \
diff --git a/src/lang/api_registry.cc b/src/lang/api_registry.cc
new file mode 100644
index 000000000000..c9f84092f5da
--- /dev/null
+++ b/src/lang/api_registry.cc
@@ -0,0 +1,50 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file api_registry.cc
+ */
+#include <tvm/api_registry.h>
+
+namespace tvm {
+
+TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
+.set_dispatch<EnvFuncNode>([](const EnvFuncNode *op, IRPrinter *p) {
+    p->stream << "EnvFunc(" << op->name << ")";
+});
+
+NodePtr<EnvFuncNode> CreateEnvNode(const std::string& name) {
+  auto* f = runtime::Registry::Get(name);
+  CHECK(f != nullptr) << "Cannot find global function \'" << name << '\'';
+  NodePtr<EnvFuncNode> n = make_node<EnvFuncNode>();
+  n->func = *f;
+  n->name = name;
+  return n;
+}
+
+EnvFunc EnvFunc::Get(const std::string& name) {
+  return EnvFunc(CreateEnvNode(name));
+}
+
+TVM_REGISTER_API("_EnvFuncGet")
+.set_body_typed<EnvFunc(const std::string& name)>(EnvFunc::Get);
+
+TVM_REGISTER_API("_EnvFuncCall")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    EnvFunc env = args[0];
+    CHECK_GE(args.size(), 1);
+    env->func.CallPacked(TVMArgs(args.values + 1,
+                                 args.type_codes + 1,
+                                 args.size() - 1), rv);
+  });
+
+TVM_REGISTER_API("_EnvFuncGetPackedFunc")
+.set_body_typed<PackedFunc(const EnvFunc& n)>([](const EnvFunc&n) {
+    return n->func;
+  });
+
+TVM_REGISTER_NODE_TYPE(EnvFuncNode)
+.set_creator(CreateEnvNode)
+.set_global_key([](const Node* n) {
+    return static_cast<const EnvFuncNode*>(n)->name;
+  });
+
+}  // namespace tvm
diff --git a/src/lang/attr_functor.h b/src/lang/attr_functor.h
new file mode 100644
index 000000000000..69b7ec1f6e60
--- /dev/null
+++ b/src/lang/attr_functor.h
@@ -0,0 +1,207 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file attr_functor.h
+ * \brief A way to define arbitrary function signature
+ *        with dispatch on common attributes.
+ *
+ * Common attributes include:
+ *  - int, float, str constants
+ *  - array of attributes
+ *  - map of attributes
+ */
+#ifndef TVM_LANG_ATTR_FUNCTOR_H_
+#define TVM_LANG_ATTR_FUNCTOR_H_
+
+namespace tvm {
+
+template <typename FType>
+class AttrFunctor;
+
+#define ATTR_FUNCTOR_DEFAULT                                        \
+  { return VisitAttrDefault_(op, std::forward<Args>(args)...); }
+
+
+#define ATTR_FUNCTOR_DISPATCH(OP)                                       \
+  vtable.template set_dispatch<OP>(                                     \
+      [](const NodeRef& n, TSelf* self, Args... args) {                 \
+        return self->VisitAttr_(static_cast<const OP*>(n.node_.get()),  \
+                                std::forward<Args>(args)...);           \
+      });                                                               \
+
+// A functor for common attribute information.
+template <typename R, typename... Args>
+class AttrFunctor<R(const NodeRef& n, Args...)> {
+ private:
+  using TSelf = AttrFunctor<R(const NodeRef& n, Args...)>;
+  using FType = tvm::IRFunctor<R(const NodeRef& n, TSelf* self, Args...)>;
+
+ public:
+  /*! \brief the result type of this functor */
+  using result_type = R;
+  /*!
+   * \brief The functor call.
+   * \param n The expression node.
+   * \param args Additional arguments.
+   * \return The result of the call
+   */
+  virtual R VisitAttr(const NodeRef& n, Args... args) {
+    static FType vtable = InitVTable();
+    if (vtable.can_dispatch(n)) {
+      return vtable(n, this, std::forward<Args>(args)...);
+    } else {
+      return VisitAttrDefault_(n.get(), std::forward<Args>(args)...);
+    }
+  }
+  virtual R VisitAttrDefault_(const Node* node, Args... args) = 0;
+  virtual R VisitAttr_(const ArrayNode* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const StrMapNode* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::IntImm* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::UIntImm* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::FloatImm* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::StringImm* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  // deep comparison of symbolic integer expressions.
+  virtual R VisitAttr_(const Variable* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::Add* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::Sub* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::Mul* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::Div* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::Mod* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::Min* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::Max* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::GE* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::GT* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::LT* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::LE* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::EQ* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::NE* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::And* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::Or* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::Not* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::Cast* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::Call* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+  virtual R VisitAttr_(const ir::Select* op, Args... args) ATTR_FUNCTOR_DEFAULT;
+
+ private:
+  // initialize the vtable.
+  static FType InitVTable() {
+    using namespace ir;
+    FType vtable;
+    // Set dispatch
+    ATTR_FUNCTOR_DISPATCH(StrMapNode);
+    ATTR_FUNCTOR_DISPATCH(ArrayNode);
+    ATTR_FUNCTOR_DISPATCH(IntImm);
+    ATTR_FUNCTOR_DISPATCH(UIntImm);
+    ATTR_FUNCTOR_DISPATCH(FloatImm);
+    ATTR_FUNCTOR_DISPATCH(StringImm);
+    ATTR_FUNCTOR_DISPATCH(Variable);
+    ATTR_FUNCTOR_DISPATCH(Add);
+    ATTR_FUNCTOR_DISPATCH(Sub);
+    ATTR_FUNCTOR_DISPATCH(Mul);
+    ATTR_FUNCTOR_DISPATCH(Div);
+    ATTR_FUNCTOR_DISPATCH(Min);
+    ATTR_FUNCTOR_DISPATCH(Max);
+    ATTR_FUNCTOR_DISPATCH(GE);
+    ATTR_FUNCTOR_DISPATCH(GT);
+    ATTR_FUNCTOR_DISPATCH(LE);
+    ATTR_FUNCTOR_DISPATCH(LT);
+    ATTR_FUNCTOR_DISPATCH(EQ);
+    ATTR_FUNCTOR_DISPATCH(NE);
+    ATTR_FUNCTOR_DISPATCH(And);
+    ATTR_FUNCTOR_DISPATCH(Or);
+    ATTR_FUNCTOR_DISPATCH(Not);
+    ATTR_FUNCTOR_DISPATCH(Cast);
+    ATTR_FUNCTOR_DISPATCH(Call);
+    ATTR_FUNCTOR_DISPATCH(Select);
+    return vtable;
+  }
+};
+
+class AttrsEqualHandler :
+      protected AttrFunctor<bool(const NodeRef&, const NodeRef&)> {
+ public:
+  /*!
+   * \brief Check if lhs equals rhs
+   * \param lhs The left operand.
+   * \param rhs The right operand.
+   */
+  bool Equal(const NodeRef& lhs, const NodeRef& rhs);
+
+ protected:
+  bool VisitAttrDefault_(const Node* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ArrayNode* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const StrMapNode* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::IntImm* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::UIntImm* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::FloatImm* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::StringImm* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::Add* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::Sub* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::Mul* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::Div* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::Mod* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::Min* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::Max* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::GE* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::GT* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::LT* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::LE* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::EQ* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::NE* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::And* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::Or* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::Not* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::Cast* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::Call* lhs, const NodeRef& other) final;
+  bool VisitAttr_(const ir::Select* lhs, const NodeRef& other) final;
+};
+
+class AttrsHashHandler :
+      protected AttrFunctor<size_t(const NodeRef&)> {
+ public:
+  /*!
+   * \brief Get hash value of node
+   * \param node The node to be hashed.
+   */
+  size_t Hash(const NodeRef& node) {
+    if (!node.defined()) return 0;
+    return this->VisitAttr(node);
+  }
+
+ protected:
+  size_t VisitAttrDefault_(const Node* lhs) final;
+  size_t VisitAttr_(const ir::IntImm* lhs) final;
+  size_t VisitAttr_(const ir::UIntImm* lhs) final;
+  size_t VisitAttr_(const ir::FloatImm* lhs) final;
+  size_t VisitAttr_(const ir::StringImm* lhs) final;
+  size_t VisitAttr_(const ArrayNode* lhs) final;
+  size_t VisitAttr_(const StrMapNode* lhs) final;
+  size_t VisitAttr_(const ir::Add* op) final;
+  size_t VisitAttr_(const ir::Sub* op) final;
+  size_t VisitAttr_(const ir::Mul* op) final;
+  size_t VisitAttr_(const ir::Div* op) final;
+  size_t VisitAttr_(const ir::Mod* op) final;
+  size_t VisitAttr_(const ir::Min* op) final;
+  size_t VisitAttr_(const ir::Max* op) final;
+  size_t VisitAttr_(const ir::GE* op) final;
+  size_t VisitAttr_(const ir::GT* op) final;
+  size_t VisitAttr_(const ir::LE* op) final;
+  size_t VisitAttr_(const ir::LT* op) final;
+  size_t VisitAttr_(const ir::EQ* op) final;
+  size_t VisitAttr_(const ir::NE* op) final;
+  size_t VisitAttr_(const ir::And* op) final;
+  size_t VisitAttr_(const ir::Or* op) final;
+  size_t VisitAttr_(const ir::Not* op) final;
+  size_t VisitAttr_(const ir::Cast* op) final;
+  size_t VisitAttr_(const ir::Call* op) final;
+  size_t VisitAttr_(const ir::Select* op) final;
+  /*!
+   * \brief alias of dmlc::HashCombine
+   * \param lhs The first hash value.
+   * \param rhs The second hash value.
+   */
+  static size_t Combine(size_t lhs, size_t rhs) {
+    return dmlc::HashCombine(lhs, rhs);
+  }
+};
+}  // namespace tvm
+#endif  // TVM_LANG_ATTR_FUNCTOR_H_
diff --git a/src/lang/attrs.cc b/src/lang/attrs.cc
new file mode 100644
index 000000000000..1daf1e792553
--- /dev/null
+++ b/src/lang/attrs.cc
@@ -0,0 +1,330 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file attrs.cc
+ */
+#include <tvm/attrs.h>
+#include <tvm/api_registry.h>
+#include "attr_functor.h"
+
+namespace tvm {
+
+void DictAttrsNode::VisitAttrs(AttrVisitor* v)  {
+  v->Visit("__dict__", &dict);
+}
+
+void DictAttrsNode::VisitNonDefaultAttrs(AttrVisitor* v) {
+  v->Visit("__dict__", &dict);
+}
+
+void DictAttrsNode::InitByPackedArgs(
+    const runtime::TVMArgs& args, bool allow_unknown) {
+  for (int i = 0; i < args.size(); i += 2) {
+    std::string key = args[i];
+    runtime::TVMArgValue val = args[i + 1];
+    if (val.type_code() == kNodeHandle) {
+      dict.Set(key, val.operator NodeRef());
+    } else if (val.type_code() == kStr) {
+      dict.Set(key, Expr(val.operator std::string()));
+    } else {
+      dict.Set(key, val.operator Expr());
+    }
+  }
+}
+
+Array<AttrFieldInfo> DictAttrsNode::ListFieldInfo() const {
+  return {};
+}
+
+Attrs DictAttrsNode::make(Map<std::string, NodeRef> dict) {
+  NodePtr<DictAttrsNode> n = make_node<DictAttrsNode>();
+  n->dict = std::move(dict);
+  return Attrs(n);
+}
+
+TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
+.set_dispatch<DictAttrsNode>([](const DictAttrsNode *op, IRPrinter *p) {
+    p->stream << op->dict;
+});
+
+TVM_REGISTER_NODE_TYPE(DictAttrsNode);
+
+TVM_REGISTER_NODE_TYPE(AttrFieldInfoNode);
+
+
+using namespace ir;
+// Equal handler.
+bool AttrsEqualHandler::Equal(const NodeRef& lhs, const NodeRef& rhs) {
+  if (lhs.same_as(rhs)) return true;
+  if (!lhs.defined() || !rhs.defined()) return false;
+  return this->VisitAttr(lhs, rhs);
+}
+
+bool AttrsEqualHandler::VisitAttrDefault_(const Node* lhs, const NodeRef& other) {
+  if (lhs->derived_from<BaseAttrsNode>()) {
+    AttrsEqual equal;
+    equal.handler_ = this;
+    return static_cast<const BaseAttrsNode*>(lhs)->ContentEqual(
+        other.get(), equal);
+  }
+  return lhs == other.get();
+}
+
+bool AttrsEqualHandler::VisitAttr_(const IntImm* lhs, const NodeRef& other) {
+  if (const auto* rhs = other.as<IntImm>()) {
+    return lhs->value == rhs->value;
+  }
+  return false;
+}
+
+bool AttrsEqualHandler::VisitAttr_(const UIntImm* lhs, const NodeRef& other) {
+  if (const auto* rhs = other.as<UIntImm>()) {
+    return lhs->value == rhs->value;
+  }
+  return false;
+}
+
+bool AttrsEqualHandler::VisitAttr_(const FloatImm* lhs, const NodeRef& other) {
+  if (const auto* rhs = other.as<FloatImm>()) {
+    return lhs->value == rhs->value;
+  }
+  return false;
+}
+
+bool AttrsEqualHandler::VisitAttr_(const StringImm* lhs, const NodeRef& other) {
+  if (const auto* rhs = other.as<StringImm>()) {
+    return lhs->value == rhs->value;
+  }
+  return false;
+}
+
+bool AttrsEqualHandler::VisitAttr_(const ArrayNode* lhs, const NodeRef& other) {
+  if (const auto* rhs = other.as<ArrayNode>()) {
+    if (rhs->data.size() != lhs->data.size()) return false;
+    for (size_t  i = 0; i < lhs->data.size(); ++i) {
+      if (!Equal(NodeRef(lhs->data[i]), NodeRef(rhs->data[i]))) return false;
+    }
+  }
+  return true;
+}
+
+bool AttrsEqualHandler::VisitAttr_(const StrMapNode* lhs, const NodeRef& other) {
+  if (const auto* rhs = other.as<StrMapNode>()) {
+    if (rhs->data.size() != lhs->data.size()) return false;
+    for (const auto& kv : lhs->data) {
+      auto it = rhs->data.find(kv.first);
+      if (it == rhs->data.end()) return false;
+      if (!Equal(NodeRef(kv.second), NodeRef(it->second))) return false;
+    }
+  }
+  return true;
+}
+
+#define TVM_DEFINE_ATTRS_BINOP_EQUAL(NodeName)                          \
+  bool AttrsEqualHandler::VisitAttr_(const NodeName* lhs, const NodeRef& other) { \
+    if (const auto* rhs = other.as<NodeName>()) {                       \
+      if (!Equal(lhs->a, rhs->a)) return false;                         \
+      if (!Equal(lhs->b, rhs->b)) return false;                         \
+      return true;                                                      \
+    } else {                                                            \
+      return false;                                                     \
+    }                                                                   \
+  }                                                                     \
+
+TVM_DEFINE_ATTRS_BINOP_EQUAL(Add);
+TVM_DEFINE_ATTRS_BINOP_EQUAL(Sub);
+TVM_DEFINE_ATTRS_BINOP_EQUAL(Mul);
+TVM_DEFINE_ATTRS_BINOP_EQUAL(Div);
+TVM_DEFINE_ATTRS_BINOP_EQUAL(Mod);
+TVM_DEFINE_ATTRS_BINOP_EQUAL(Max);
+TVM_DEFINE_ATTRS_BINOP_EQUAL(Min);
+TVM_DEFINE_ATTRS_BINOP_EQUAL(GE);
+TVM_DEFINE_ATTRS_BINOP_EQUAL(GT);
+TVM_DEFINE_ATTRS_BINOP_EQUAL(LE);
+TVM_DEFINE_ATTRS_BINOP_EQUAL(LT);
+TVM_DEFINE_ATTRS_BINOP_EQUAL(EQ);
+TVM_DEFINE_ATTRS_BINOP_EQUAL(NE);
+TVM_DEFINE_ATTRS_BINOP_EQUAL(And);
+TVM_DEFINE_ATTRS_BINOP_EQUAL(Or);
+
+bool AttrsEqualHandler::VisitAttr_(const Not* lhs, const NodeRef& other) {
+  if (const auto* rhs = other.as<Not>()) {
+    return Equal(lhs->a, rhs->a);
+  } else {
+    return false;
+  }
+}
+
+bool AttrsEqualHandler::VisitAttr_(const Cast* lhs, const NodeRef& other) {
+  if (const auto* rhs = other.as<Cast>()) {
+    if (lhs->type != rhs->type) return false;
+    return Equal(lhs->value, rhs->value);
+  } else {
+    return false;
+  }
+}
+
+bool AttrsEqualHandler::VisitAttr_(const Call* lhs, const NodeRef& other) {
+  if (const auto* rhs = other.as<Call>()) {
+    return
+        lhs->name == rhs->name &&
+        lhs->type == rhs->type &&
+        lhs->call_type == rhs->call_type &&
+        Equal(lhs->args, rhs->args);
+  } else {
+    return false;
+  }
+}
+
+bool AttrsEqualHandler::VisitAttr_(const Select* lhs, const NodeRef& other) {
+  if (const auto* rhs = other.as<Select>()) {
+    return
+        Equal(lhs->condition, rhs->condition) &&
+        Equal(lhs->true_value, rhs->true_value) &&
+        Equal(lhs->false_value, rhs->false_value);
+  } else {
+    return false;
+  }
+}
+
+// Hash Handler.
+size_t AttrsHashHandler::VisitAttrDefault_(const Node* value) {
+  if (value->derived_from<BaseAttrsNode>()) {
+    AttrsHash hasher;
+    hasher.handler_ = this;
+    return static_cast<const BaseAttrsNode*>(value)->ContentHash(hasher);
+  } else {
+    return NodeHash()(GetRef<NodeRef>(value));
+  }
+}
+
+size_t AttrsHashHandler::VisitAttr_(const IntImm* op) {
+  return std::hash<int64_t>()(op->value);
+}
+
+size_t AttrsHashHandler::VisitAttr_(const UIntImm* op) {
+  return std::hash<uint64_t>()(op->value);
+}
+
+size_t AttrsHashHandler::VisitAttr_(const FloatImm* op) {
+  return std::hash<double>()(op->value);
+}
+
+size_t AttrsHashHandler::VisitAttr_(const StringImm* op) {
+  return std::hash<std::string>()(op->value);
+}
+
+size_t AttrsHashHandler::VisitAttr_(const ArrayNode* op) {
+  size_t result = op->data.size();
+  for (size_t  i = 0; i < op->data.size(); ++i) {
+    result = Combine(result, this->Hash(NodeRef(op->data[i])));
+  }
+  return result;
+}
+
+size_t AttrsHashHandler::VisitAttr_(const StrMapNode* lhs) {
+    using Entry = std::pair<std::string, NodePtr<Node> >;
+    std::vector<Entry> data(lhs->data.begin(), lhs->data.end());
+    std::sort(data.begin(), data.end(), [](const Entry& a, const Entry& b) {
+        return a.first < b.first;
+      });
+    size_t result = 0;
+    for (const Entry& kv : data) {
+      result = Combine(result, std::hash<std::string>()(kv.first));
+      result = Combine(result, this->Hash(NodeRef(kv.second)));
+    }
+    return result;
+}
+
+
+#define TVM_DEFINE_ATTRS_BINOP_HASH(NodeName)                           \
+  size_t AttrsHashHandler::VisitAttr_(const NodeName* op) {             \
+    static size_t key = std::hash<std::string>()(NodeName::_type_key);  \
+    return Combine(key, Combine(Hash(op->a), Hash(op->b)));             \
+  }                                                                     \
+
+TVM_DEFINE_ATTRS_BINOP_HASH(Add);
+TVM_DEFINE_ATTRS_BINOP_HASH(Sub);
+TVM_DEFINE_ATTRS_BINOP_HASH(Mul);
+TVM_DEFINE_ATTRS_BINOP_HASH(Div);
+TVM_DEFINE_ATTRS_BINOP_HASH(Mod);
+TVM_DEFINE_ATTRS_BINOP_HASH(Max);
+TVM_DEFINE_ATTRS_BINOP_HASH(Min);
+TVM_DEFINE_ATTRS_BINOP_HASH(GE);
+TVM_DEFINE_ATTRS_BINOP_HASH(GT);
+TVM_DEFINE_ATTRS_BINOP_HASH(LE);
+TVM_DEFINE_ATTRS_BINOP_HASH(LT);
+TVM_DEFINE_ATTRS_BINOP_HASH(EQ);
+TVM_DEFINE_ATTRS_BINOP_HASH(NE);
+TVM_DEFINE_ATTRS_BINOP_HASH(And);
+TVM_DEFINE_ATTRS_BINOP_HASH(Or);
+
+size_t AttrsHashHandler::VisitAttr_(const Not* op) {
+  static size_t key = std::hash<std::string>()(Not::_type_key);
+  return Combine(key, Hash(op->a));
+}
+
+size_t AttrsHashHandler::VisitAttr_(const Cast* op) {
+  static size_t key = std::hash<std::string>()(Cast::_type_key);
+  AttrsHash hasher;
+  size_t res = key;
+  res = Combine(res, hasher(op->type));
+  res = Combine(res, Hash(op->value));
+  return res;
+}
+
+size_t AttrsHashHandler::VisitAttr_(const Call* op) {
+  static size_t key = std::hash<std::string>()(Call::_type_key);
+  AttrsHash hasher;
+  size_t res = key;
+  res = Combine(res, hasher(op->name));
+  res = Combine(res, hasher(op->type));
+  res = Combine(res, Hash(op->args));
+  return res;
+}
+
+size_t AttrsHashHandler::VisitAttr_(const Select* op) {
+  static size_t key = std::hash<std::string>()(Select::_type_key);
+  size_t res = key;
+  res = Combine(res, Hash(op->condition));
+  res = Combine(res, Hash(op->true_value));
+  res = Combine(res, Hash(op->false_value));
+  return res;
+}
+
+
+// Default case
+bool AttrsEqual::operator()(const NodeRef& lhs, const NodeRef& rhs) const {
+  if (lhs.same_as(rhs)) return true;
+  if (handler_ == nullptr) {
+    return AttrsEqualHandler().Equal(lhs, rhs);
+  } else {
+    return handler_->Equal(lhs, rhs);
+  }
+}
+
+size_t AttrsHash::operator()(const NodeRef& node) const {
+  if (!node.defined()) return 0;
+  if (handler_ == nullptr) {
+    return AttrsHashHandler().Hash(node);
+  } else {
+    return handler_->Hash(node);
+  }
+}
+
+size_t DictAttrsNode::ContentHash(AttrsHash hasher) const {
+  return hasher(this->dict);
+}
+
+bool DictAttrsNode::ContentEqual(const Node* other, AttrsEqual equal) const {
+  if (this == other) return true;
+  if (other == nullptr) return false;
+  if (this->type_index() != other->type_index()) return false;
+  return equal(this->dict, static_cast<const DictAttrsNode*>(other)->dict);
+}
+
+TVM_REGISTER_API("_AttrsListFieldInfo")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+  *ret = args[0].operator Attrs()->ListFieldInfo();
+});
+
+}  // namespace tvm
diff --git a/src/lang/buffer.cc b/src/lang/buffer.cc
index 3f23c2d480bf..524cad2eeac6 100644
--- a/src/lang/buffer.cc
+++ b/src/lang/buffer.cc
@@ -226,16 +226,12 @@ inline Expr ElemOffset(const BufferNode* n, Array<Expr> index) {
   Expr base = n->elem_offset;
   if (n->strides.size() == 0) {
     CHECK_EQ(n->shape.size(), index.size());
-    if (n->shape.size() != 0) {
-      if (is_zero(base)) {
-        base = index[0];
-      } else {
-        base = base + index[0];
+    if (index.size() > 0) {
+      Expr offset = index[0];
+      for (size_t i = 1; i < index.size(); ++i) {
+        offset = MergeMulMod(offset * n->shape[i] + index[i]);
       }
-    }
-    base = MergeMulMod(base);
-    for (size_t i = 1; i < index.size(); ++i) {
-      base = MergeMulMod(base * n->shape[i] + index[i]);
+      base = base + offset;
     }
   } else {
     CHECK_EQ(n->strides.size(), index.size());
@@ -264,32 +260,49 @@ inline Expr BufferOffset(const BufferNode* n, Array<Expr> index, Type dtype) {
 }
 
 Expr Buffer::vload(Array<Expr> begin, Type dtype) const {
+  // specially handle bool, stored as Int(8)
   const BufferNode* n = operator->();
   CHECK(dtype.element_of() == n->dtype.element_of() &&
         dtype.lanes() % n->dtype.lanes() == 0)
       << "Cannot load " << dtype
       << " from buffer of " << n->dtype;
-  return ir::Load::make(
-      dtype, n->data, BufferOffset(n, begin, dtype),
-      const_true(dtype.lanes()));
+  if (dtype == Bool()) {
+    return ir::Cast::make(
+        Bool(),
+        ir::Load::make(
+            Int(8), n->data, BufferOffset(n, begin, Int(8)),
+            const_true()));
+  } else {
+    return ir::Load::make(
+        dtype, n->data, BufferOffset(n, begin, dtype),
+        const_true(dtype.lanes()));
+  }
 }
 
 Stmt Buffer::vstore(Array<Expr> begin, Expr value) const {
+  // specially handle bool, stored as Int(8)
   const BufferNode* n = operator->();
   Type dtype = value.type();
   CHECK(dtype.element_of() == n->dtype.element_of() &&
         dtype.lanes() % n->dtype.lanes() == 0)
       << "Cannot load " << dtype
       << " from buffer of " << n->dtype;
-  return ir::Store::make(n->data, value, BufferOffset(n, begin, dtype),
-                         const_true(dtype.lanes()));
+  if (value.type() == Bool()) {
+    return ir::Store::make(n->data,
+                           ir::Cast::make(Int(8), value),
+                           BufferOffset(n, begin, Int(8)),
+                           const_true());
+  } else {
+    return ir::Store::make(n->data, value, BufferOffset(n, begin, dtype),
+                           const_true(dtype.lanes()));
+  }
 }
 
 Buffer Buffer::MakeStrideView() const {
   if ((*this)->strides.size() != 0) return *this;
   if ((*this)->shape.size() == 0) return *this;
   std::vector<Expr> temp;
-  auto n = std::make_shared<BufferNode>(*operator->());
+  auto n = make_node<BufferNode>(*operator->());
   Expr acc = make_const(n->DefaultIndexType(), 1);
   for (size_t i = n->shape.size(); i != 0 ; --i) {
     temp.push_back(acc);
@@ -344,9 +357,9 @@ Expr Buffer::access_ptr(int access_mask, Type ptr_type, int content_lanes, Expr
   } else if (self->strides.size() == self->shape.size()) {
     int highest_dim = 0;
     extent = arith::ComputeExpr<ir::Mul>(
-        self->strides[highest_dim], self->shape[highest_dim]);
+        self->strides[highest_dim], self->shape[highest_dim]) - offset;
   } else {
-    extent = arith::ComputeReduce<ir::Mul>(self->shape, Expr());
+    extent = arith::ComputeReduce<ir::Mul>(self->shape, Expr()) - offset;
   }
   Expr elem_offset = self->elem_offset + offset;
   if (content_lanes > 1) {
@@ -373,7 +386,7 @@ Buffer BufferNode::make(Var data,
                         std::string scope,
                         int data_alignment,
                         int offset_factor) {
-  auto n = std::make_shared<BufferNode>();
+  auto n = make_node<BufferNode>();
   n->data = std::move(data);
   n->dtype = dtype;
   n->shape = std::move(shape);
diff --git a/src/lang/channel.cc b/src/lang/channel.cc
index dd850becf956..dcc44a0d0611 100644
--- a/src/lang/channel.cc
+++ b/src/lang/channel.cc
@@ -7,7 +7,7 @@
 namespace tvm {
 
 Channel ChannelNode::make(Var handle_var, Type dtype) {
-  auto n = std::make_shared<ChannelNode>();
+  auto n = make_node<ChannelNode>();
   n->handle_var = handle_var;
   n->dtype = dtype;
   return Channel(n);
diff --git a/src/lang/expr.cc b/src/lang/expr.cc
index 684211079e94..7ac0e372371c 100644
--- a/src/lang/expr.cc
+++ b/src/lang/expr.cc
@@ -5,6 +5,7 @@
 #include <tvm/base.h>
 #include <tvm/expr.h>
 #include <tvm/ir.h>
+#include <tvm/ir_operator.h>
 #include <ir/IRPrinter.h>
 #include <memory>
 
@@ -13,18 +14,18 @@ namespace tvm {
 using HalideIR::IR::RangeNode;
 
 Range::Range(Expr begin, Expr end)
-    : Range(std::make_shared<RangeNode>(
+    : Range(make_node<RangeNode>(
           begin,
           is_zero(begin) ? end : (end - begin))) {
 }
 
 Range Range::make_by_min_extent(Expr min, Expr extent) {
-  return Range(std::make_shared<HalideIR::IR::RangeNode>(min, extent));
+  return Range(make_node<HalideIR::IR::RangeNode>(min, extent));
 }
 
 IterVar IterVarNode::make(Range dom, Var var,
                           IterVarType t, std::string thread_tag) {
-  std::shared_ptr<IterVarNode> n = std::make_shared<IterVarNode>();
+  NodePtr<IterVarNode> n = make_node<IterVarNode>();
   n->dom = dom;
   n->var = var;
   n->iter_type = t;
@@ -47,6 +48,10 @@ std::ostream& operator<<(std::ostream& os, const NodeRef& n) {  // NOLINT(*)
   return os;
 }
 
+void Dump(const NodeRef& n) {
+  std::cerr << n << "\n";
+}
+
 Var var(const std::string& name_hint, Type t) {
   return Var(name_hint, t);
 }
diff --git a/src/lang/ir.cc b/src/lang/ir.cc
index 1e0a6e5065f4..875258540584 100644
--- a/src/lang/ir.cc
+++ b/src/lang/ir.cc
@@ -52,7 +52,7 @@ CommReducer CommReducerNode::make(Array<Var> lhs,
                                   Array<Var> rhs,
                                   Array<Expr> result,
                                   Array<Expr> identity_element) {
-  auto node = std::make_shared<CommReducerNode>();
+  auto node = make_node<CommReducerNode>();
   node->lhs = lhs;
   node->rhs = rhs;
   node->result = result;
@@ -83,7 +83,7 @@ Expr Reduce::make(CommReducer combiner, Array<Expr> source,
   if (!condition.defined()) {
     condition = const_true();
   }
-  auto n = std::make_shared<Reduce>();
+  auto n = make_node<Reduce>();
   CHECK(source.defined());
   for (size_t i = 0; i < axis.size(); ++i) {
     CHECK(axis[i].defined());
diff --git a/src/lang/ir_operator.cc b/src/lang/ir_operator.cc
index ded27bbdce7e..9ae2912901be 100644
--- a/src/lang/ir_operator.cc
+++ b/src/lang/ir_operator.cc
@@ -8,8 +8,414 @@
 
 namespace tvm {
 
+/*!
+ * \brief Check whether type is used to represent index.
+ *
+ * Index types are frequently used in shape computation
+ * and need to be aggressively constant-folded.
+ *
+ * \param type The type to represent index.
+ * \return the checked result.
+ */
+inline bool IsIndexType(const Type& type) {
+  return type.is_int() && type.lanes() == 1 &&
+      (type.bits() == 32 || type.bits() == 64);
+}
+
+// simple cast that only checks if type matches and cast
+inline Expr SimpleCast(const Type& t, Expr value) {
+  if (value.type() == t) return value;
+  return ir::Cast::make(t, value);
+}
+
+// The public function with a quick checking path.
+void BinaryOpMatchTypes(Expr& lhs, Expr& rhs) {  // NOLINT(*)
+  if (lhs.type() == rhs.type()) return;
+  Type ltype = lhs.type();
+  Type rtype = rhs.type();
+  if (ltype.lanes() == 1 && rtype.lanes() != 1) {
+    lhs = ir::Broadcast::make(lhs, rtype.lanes());
+  } else if (rtype.lanes() == 1 && ltype.lanes() != 1) {
+    rhs = ir::Broadcast::make(rhs, ltype.lanes());
+  } else {
+    CHECK(ltype.lanes() == rtype.lanes())
+        << "Cannot match type " << ltype << " vs " << rtype;
+  }
+  if (lhs.type() == rhs.type()) return;
+  // Only do very simple type coversion
+  // int->float, int(32)->int(64)
+  // require the types to be relatively consistent
+  // This will the reduce amount code generated by operators
+  // and also help user to find potential type conversion problems.
+  if (!lhs.type().is_float() && rhs.type().is_float()) {
+    // int->float
+    lhs = ir::Cast::make(rhs.type(), lhs);
+  } else if (lhs.type().is_float() && !rhs.type().is_float()) {
+    // int->float
+    rhs = ir::Cast::make(lhs.type(), rhs);
+  } else if ((lhs.type().is_int() && rhs.type().is_int()) ||
+             (lhs.type().is_uint() && rhs.type().is_uint())) {
+    // promote int to higher bits
+    if (lhs.type().bits() < rhs.type().bits()) {
+      lhs = ir::Cast::make(rhs.type(), lhs);
+    } else {
+      rhs = ir::Cast::make(lhs.type(), rhs);
+    }
+  } else if ((lhs.type().is_int() && rhs.type().is_uint()) ||
+             (lhs.type().is_uint() && rhs.type().is_int())) {
+    int bits = std::max(lhs.type().bits(), rhs.type().bits());
+    lhs = SimpleCast(Int(bits, lhs.type().lanes()), lhs);
+    rhs = SimpleCast(Int(bits, rhs.type().lanes()), rhs);
+  } else {
+    LOG(FATAL) << "Cannot match type " << ltype << " vs " << rtype;
+  }
+}
+
+
+template<typename ValueType>
+inline bool ConstPowerHelper(ValueType val, int *shift) {
+  if (val <= 0) return false;
+  shift[0] = 0;
+  while (val != 0) {
+    if (val & 1) {
+      return (val == 1);
+    }
+    ++shift[0];
+    val = val >> 1;
+  }
+  return true;
+}
+
+bool is_const_power_of_two_integer(const Expr& x, int* shift) {
+  if (const auto* op = x.as<ir::IntImm>()) {
+    return ConstPowerHelper(op->value, shift);
+  } else if (const auto* op = x.as<ir::UIntImm>()) {
+    return ConstPowerHelper(op->value, shift);
+  } else {
+    return false;
+  }
+}
+
+Expr cast(const Type& t, Expr value) {
+  using ir::IntImm;
+  if (value.type() == t) return value;
+  // const fold IntImm as they are used in index computations
+  if (t.lanes() == 1) {
+    if (const IntImm* op = value.as<IntImm>()) {
+      return make_const(t, op->value);
+    }
+    return ir::Cast::make(t, value);
+  } else {
+    if (value.type().lanes() == 1) {
+      // manually unroll cast
+      Type vtype = t.element_of();
+      if (value.type() != vtype) {
+        if (const IntImm* op = value.as<IntImm>()) {
+          value = make_const(vtype, op->value);
+        } else {
+          value = ir::Cast::make(vtype, value);
+        }
+      }
+      return ir::Broadcast::make(value, t.lanes());
+    } else {
+      CHECK(value.type().lanes() == t.lanes());
+      return ir::Cast::make(t, value);
+    }
+  }
+}
+
+Expr reinterpret(const Type& t, Expr value) {
+  if (value.type() == t) return value;
+  return ir::Call::make(t, ir::Call::reinterpret, { value }, ir::Call::PureIntrinsic);
+}
+
+#define TVM_CONST_PROPAGATION(BODY)                                     \
+  using ir::IntImm;                                                     \
+  using ir::UIntImm;                                                    \
+  const IntImm* pa = a.as<IntImm>();                                    \
+  const IntImm* pb = b.as<IntImm>();                                    \
+  const Type& ta = a.type();                                            \
+  const Type& tb = b.type();                                            \
+  if (IsIndexType(ta) && IsIndexType(tb)) {                             \
+    BODY;                                                               \
+  }                                                                     \
+  BinaryOpMatchTypes(a, b);
+
+
+Expr operator+(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      Type rtype = ta.bits() >= tb.bits() ? ta : tb;
+      if (pa && pb) return IntImm::make(rtype, pa->value + pb->value);
+      if (pa && pa->value == 0) return SimpleCast(rtype, b);
+      if (pb && pb->value == 0) return SimpleCast(rtype, a);
+    });
+  return ir::Add::make(a, b);
+}
+
+Expr operator-(Expr a) {
+  using ir::IntImm;
+  const IntImm* pa = a.as<IntImm>();
+  if (pa) {
+    return ir::IntImm::make(a.type(), -pa->value);
+  }
+  return make_zero(a.type()) - a;
+}
+
+Expr operator-(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      Type rtype = ta.bits() >= tb.bits() ? ta : tb;
+      if (pa && pb) return IntImm::make(rtype, pa->value - pb->value);
+      if (pb && pb->value == 0) return SimpleCast(rtype, a);
+    });
+  return ir::Sub::make(a, b);
+}
+
+Expr operator*(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      Type rtype = ta.bits() >= tb.bits() ? ta : tb;
+      if (pa && pb) return IntImm::make(rtype, pa->value * pb->value);
+      if (pa) {
+        if (pa->value == 1) return SimpleCast(rtype, b);
+        if (pa->value == 0) return SimpleCast(rtype, a);
+      }
+      if (pb) {
+        if (pb->value == 1) return SimpleCast(rtype, a);
+        if (pb->value == 0) return SimpleCast(rtype, b);
+      }
+    });
+  return ir::Mul::make(a, b);
+}
+
+Expr operator/(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      Type rtype = ta.bits() >= tb.bits() ? ta : tb;
+      // due to division and mod can have different modes
+      // only constant fold positive number where rule is fixed.
+      if (pa && pb && pa->value >= 0 && pb->value > 0) {
+        return IntImm::make(rtype, pa->value / pb->value);
+      }
+      if (pa) {
+        if (pa->value == 0) return SimpleCast(rtype, a);
+      }
+      if (pb) {
+        if (pb->value == 1) return SimpleCast(rtype, a);
+        CHECK_NE(pb->value, 0) << "Divide by zero";
+      }
+    });
+  return ir::Div::make(a, b);
+}
+
+Expr operator%(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      Type rtype = ta.bits() >= tb.bits() ? ta : tb;
+      // due to division and mod can have different modes
+      // only constant fold positive number where rule is fixed.
+      if (pa && pb && pa->value >= 0 && pb->value > 0) {
+        return IntImm::make(rtype, pa->value % pb->value);
+      }
+      if (pa) {
+        if (pa->value == 0) return SimpleCast(rtype, a);
+      }
+      if (pb) {
+        if (pb->value == 1) return make_zero(rtype);
+        CHECK_NE(pb->value, 0) << "Divide by zero";
+      }
+    });
+  return ir::Mod::make(a, b);
+}
+
+Expr min(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      Type rtype = ta.bits() >= tb.bits() ? ta : tb;
+      if (pa && pb) return IntImm::make(rtype, std::min(pa->value, pb->value));
+    });
+  return ir::Min::make(a, b);
+}
+
+Expr max(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      Type rtype = ta.bits() >= tb.bits() ? ta : tb;
+      if (pa && pb) return IntImm::make(rtype, std::max(pa->value, pb->value));
+    });
+  return ir::Max::make(a, b);
+}
+
+Expr select(Expr cond, Expr true_value, Expr false_value) {
+  using ir::IntImm;
+  using ir::UIntImm;
+  CHECK(cond.type().is_bool());
+  BinaryOpMatchTypes(true_value, false_value);
+  if (const UIntImm* op = cond.as<UIntImm>()) {
+    if (op->value != 0) {
+      return true_value;
+    } else {
+      return false_value;
+    }
+  } else if (const IntImm* op = cond.as<IntImm>()) {
+    if (op->value != 0) {
+      return true_value;
+    } else {
+      return false_value;
+    }
+  }
+  return ir::Select::make(cond, true_value, false_value);
+}
+
+Expr likely(Expr cond) {
+  if (is_const(cond)) return cond;
+  return ir::Call::make(cond.type(), ir::Call::likely, { cond }, ir::Call::PureIntrinsic);
+}
+
+Expr operator>(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      if (pa && pb) return UIntImm::make(UInt(1), pa->value > pb->value);
+    });
+  return ir::GT::make(a, b);
+}
+
+Expr operator>=(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      if (pa && pb) return UIntImm::make(UInt(1), pa->value >= pb->value);
+    });
+  return ir::GE::make(a, b);
+}
+
+Expr operator<(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      if (pa && pb) return UIntImm::make(UInt(1), pa->value < pb->value);
+    });
+  return ir::LT::make(a, b);
+}
+
+Expr operator<=(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      if (pa && pb) return UIntImm::make(UInt(1), pa->value <= pb->value);
+    });
+  return ir::LE::make(a, b);
+}
+
+Expr operator==(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      if (pa && pb) return UIntImm::make(UInt(1), pa->value == pb->value);
+    });
+  return ir::EQ::make(a, b);
+}
+
+Expr operator!=(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      if (pa && pb) return UIntImm::make(UInt(1), pa->value != pb->value);
+    });
+  return ir::NE::make(a, b);
+}
+
+Expr operator&&(Expr a, Expr b) {
+  using ir::UIntImm;
+  if (a.type().is_bool() && b.type().is_bool()) {
+    const UIntImm* pa = a.as<UIntImm>();
+    const UIntImm* pb = b.as<UIntImm>();
+    if (pa && pa->value) return b;
+    if (pa && !pa->value) return a;
+    if (pb && pb->value) return a;
+    if (pb && !pb->value) return b;
+  }
+  return ir::And::make(a, b);
+}
+
+Expr operator||(Expr a, Expr b) {
+  using ir::UIntImm;
+  if (a.type().is_bool() && b.type().is_bool()) {
+    const UIntImm* pa = a.as<UIntImm>();
+    const UIntImm* pb = b.as<UIntImm>();
+    if (pa && pa->value) return a;
+    if (pa && !pa->value) return b;
+    if (pb && pb->value) return b;
+    if (pb && !pb->value) return a;
+  }
+  return ir::Or::make(a, b);
+}
+
+Expr operator!(Expr a) {
+  using ir::UIntImm;
+  const UIntImm* pa = a.as<UIntImm>();
+  if (pa) {
+    return UIntImm::make(UInt(1), !(pa->value));
+  }
+  return ir::Not::make(a);
+}
+
+Expr operator>>(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      Type rtype = ta.bits() >= tb.bits() ? ta : tb;
+      if (pa && pb) return IntImm::make(rtype, (pa->value >> pb->value));
+      if (pb) {
+        if (pb->value == 0) return SimpleCast(rtype, a);
+      }
+    });
+  return ir::Call::make(a.type(), ir::Call::shift_right, { a, b }, ir::Call::PureIntrinsic);
+}
+
+Expr operator<<(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      Type rtype = ta.bits() >= tb.bits() ? ta : tb;
+      if (pa && pb) return IntImm::make(rtype, (pa->value << pb->value));
+      if (pb) {
+        if (pb->value == 0) return SimpleCast(rtype, a);
+      }
+    });
+  return ir::Call::make(a.type(), ir::Call::shift_left, { a, b }, ir::Call::PureIntrinsic);
+}
+
+Expr operator&(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      Type rtype = ta.bits() >= tb.bits() ? ta : tb;
+      if (pa && pb) return IntImm::make(rtype, (pa->value & pb->value));
+    });
+  return ir::Call::make(a.type(), ir::Call::bitwise_and, { a, b }, ir::Call::PureIntrinsic);
+}
+
+Expr operator|(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      Type rtype = ta.bits() >= tb.bits() ? ta : tb;
+      if (pa && pb) return IntImm::make(rtype, (pa->value | pb->value));
+    });
+  return ir::Call::make(a.type(), ir::Call::bitwise_or, { a, b }, ir::Call::PureIntrinsic);
+}
+
+Expr operator^(Expr a, Expr b) {
+  TVM_CONST_PROPAGATION({
+      Type rtype = ta.bits() >= tb.bits() ? ta : tb;
+      if (pa && pb) return IntImm::make(rtype, (pa->value ^ pb->value));
+    });
+  return ir::Call::make(a.type(), ir::Call::bitwise_xor, { a, b }, ir::Call::PureIntrinsic);
+}
+
+Expr operator~(Expr a) {
+  CHECK(a.type().is_int() || a.type().is_uint());
+  return ir::Call::make(a.type(), ir::Call::bitwise_not, { a }, ir::Call::PureIntrinsic);
+}
+
+Expr pow(Expr x, Expr y) {
+  BinaryOpMatchTypes(x, y);
+  CHECK(x.type().is_float()) << "power only applies to float";
+  return ir::Call::make(x.type(), "pow", { x, y }, ir::Call::PureIntrinsic);
+}
+
+Expr abs(Expr x) {
+  if (x.type().is_int()) {
+    return select(x >= make_zero(x.type()), x, -x);
+  } else if (x.type().is_float()) {
+    return ir::Call::make(x.type(), "fabs", {x}, ir::Call::PureIntrinsic);
+  } else if (x.type().is_uint()) {
+    return x;
+  } else {
+    LOG(FATAL) << "Data type " << x.type()
+               <<" not supported for absolute op. Skipping absolute op...";
+    return x;
+  }
+}
+
 Expr sum(Expr source, Array<IterVar> rdom) {
-  Var x("x"), y("y");
+  Var x("x", source.type()), y("y", source.type());
   Expr result = ir::Add::make(x, y);
   Expr identity_element = make_zero(source.type());
   ir::CommReducer combiner =
@@ -18,7 +424,7 @@ Expr sum(Expr source, Array<IterVar> rdom) {
 }
 
 Expr max(Expr source, Array<IterVar> rdom) {
-  Var x("x"), y("y");
+  Var x("x", source.type()), y("y", source.type());
   Expr result = ir::Max::make(x, y);
   Expr identity_element = source.type().min();
   ir::CommReducer combiner =
@@ -27,7 +433,7 @@ Expr max(Expr source, Array<IterVar> rdom) {
 }
 
 Expr min(Expr source, Array<IterVar> rdom) {
-  Var x("x"), y("y");
+  Var x("x", source.type()), y("y", source.type());
   Expr result = ir::Min::make(x, y);
   Expr identity_element = source.type().max();
   ir::CommReducer combiner =
@@ -35,4 +441,19 @@ Expr min(Expr source, Array<IterVar> rdom) {
   return ir::Reduce::make(combiner, {source}, rdom, make_const(Bool(1), true), 0);
 }
 
+Expr prod(Expr source, Array<IterVar> rdom) {
+  Var x("x", source.type()), y("y", source.type());
+  Expr result = ir::Mul::make(x, y);
+  Expr identity_element = make_const(source.type(), 1);
+  ir::CommReducer combiner =
+    ir::CommReducerNode::make({x}, {y}, {result}, {identity_element});
+  return ir::Reduce::make(combiner, {source}, rdom, make_const(Bool(1), true), 0);
+}
+
+Expr fmod(Expr x, Expr y) {
+  BinaryOpMatchTypes(x, y);
+  CHECK(x.type().is_float()) << "fmod only applies to float";
+  return ir::Call::make(x.type(), "fmod", { x, y }, ir::Call::PureIntrinsic);
+}
+
 }  // namespace tvm
diff --git a/src/lang/reflection.cc b/src/lang/reflection.cc
index 7c4e862f0abb..86a11a7e5b42 100644
--- a/src/lang/reflection.cc
+++ b/src/lang/reflection.cc
@@ -5,9 +5,11 @@
  */
 #include <tvm/base.h>
 #include <tvm/expr.h>
-#include <tvm/container.h>
+#include <tvm/attrs.h>
+#include <tvm/node/container.h>
 #include <tvm/packed_func_ext.h>
 #include <tvm/runtime/ndarray.h>
+#include <tvm/runtime/packed_func.h>
 #include <dmlc/json.h>
 #include <dmlc/memory_io.h>
 #include <string>
@@ -19,35 +21,17 @@ DMLC_REGISTRY_ENABLE(::tvm::NodeFactoryReg);
 
 namespace tvm {
 
+::dmlc::Registry<NodeFactoryReg>* NodeFactoryReg::Registry() {
+  return ::dmlc::Registry<NodeFactoryReg>::Get();
+}
+
 inline std::string Type2String(const Type& t) {
-  if (t.code()  ==Type::Handle) return "handle";
-  std::ostringstream os;
-  os << t;
-  return os.str();
+  return runtime::TVMType2String(Type2TVMType(t));
 }
 
 
 inline Type String2Type(std::string s) {
-  std::istringstream is(s);
-  halideir_type_code_t code = Type::Int;
-  if (s.substr(0, 3) == "int") {
-    code = Type::Int; s = s.substr(3);
-  } else if (s.substr(0, 4) == "uint") {
-    code = Type::UInt; s = s.substr(4);
-  } else if (s.substr(0, 5) == "float") {
-    code = Type::Float; s = s.substr(5);
-  } else if (s.substr(0, 5) == "float") {
-    code = Type::Float; s = s.substr(5);
-  } else if (s == "handle") {
-    return Handle();
-  } else {
-    LOG(FATAL) << "unknown type " << s;
-  }
-  int bits = 32, lanes = 1;
-  if (sscanf(s.c_str(), "%dx%d", &bits, &lanes) == 0) {
-    LOG(FATAL) << "unknown type " << s;
-  }
-  return Type(code, bits, lanes);
+  return TVMType2Type(runtime::String2TVMType(s));
 }
 
 
@@ -114,6 +98,8 @@ using AttrMap = std::map<std::string, std::string>;
 struct JSONNode {
   // The type key of the data
   std::string type_key;
+  // The global key for global object
+  std::string global_key;
   // the attributes
   AttrMap attrs;
   // container keys
@@ -124,6 +110,9 @@ struct JSONNode {
   void Save(dmlc::JSONWriter *writer) const {
     writer->BeginObject();
     writer->WriteObjectKeyValue("type_key", type_key);
+    if (global_key.size() != 0) {
+      writer->WriteObjectKeyValue("global_key", global_key);
+    }
     if (attrs.size() != 0) {
       writer->WriteObjectKeyValue("attrs", attrs);
     }
@@ -139,9 +128,11 @@ struct JSONNode {
   void Load(dmlc::JSONReader *reader) {
     attrs.clear();
     data.clear();
+    global_key.clear();
     type_key.clear();
     dmlc::JSONObjectReadHelper helper;
     helper.DeclareOptionalField("type_key", &type_key);
+    helper.DeclareOptionalField("global_key", &global_key);
     helper.DeclareOptionalField("attrs", &attrs);
     helper.DeclareOptionalField("keys", &keys);
     helper.DeclareOptionalField("data", &data);
@@ -194,6 +185,14 @@ class JSONAttrGetter : public AttrVisitor {
       return;
     }
     node_->type_key = node->type_key();
+    // sepcially handle global object
+    auto* f = dmlc::Registry<NodeFactoryReg>::Find(node_->type_key);
+    CHECK(f != nullptr)
+        << "Node type \'" << node_->type_key << "\' is not registered in TVM";
+    if (f->fglobal_key != nullptr) {
+      node_->global_key = f->fglobal_key(node);
+      return;
+    }
     node_->attrs.clear();
     node_->data.clear();
     if (node->is_type<ArrayNode>()) {
@@ -218,6 +217,11 @@ class JSONAttrGetter : public AttrVisitor {
             node_index_->at(kv.second.get()));
       }
     } else {
+      // do not need to recover content of global singleton object
+      // they are registered via the environment
+      auto* f = dmlc::Registry<NodeFactoryReg>::Find(node->type_key());
+      if (f != nullptr && f->fglobal_key != nullptr) return;
+      // recursively index normal object.
       node->VisitAttrs(this);
     }
   }
@@ -225,7 +229,7 @@ class JSONAttrGetter : public AttrVisitor {
 
 class JSONAttrSetter : public AttrVisitor {
  public:
-  const std::vector<std::shared_ptr<Node> >* node_list_;
+  const std::vector<NodePtr<Node> >* node_list_;
   const std::vector<runtime::NDArray>* tensor_list_;
   JSONNode* node_;
 
@@ -378,13 +382,13 @@ std::string SaveJSON(const NodeRef& n) {
   return os.str();
 }
 
-std::shared_ptr<Node> LoadJSON_(std::string json_str) {
+NodePtr<Node> LoadJSON_(std::string json_str) {
   std::istringstream is(json_str);
   dmlc::JSONReader reader(&is);
   JSONGraph jgraph;
   // load in json graph.
   jgraph.Load(&reader);
-  std::vector<std::shared_ptr<Node> > nodes;
+  std::vector<NodePtr<Node> > nodes;
   std::vector<runtime::NDArray> tensors;
   // load in tensors
   for (const std::string& blob : jgraph.b64ndarrays) {
@@ -402,9 +406,9 @@ std::shared_ptr<Node> LoadJSON_(std::string json_str) {
       auto* f = dmlc::Registry<NodeFactoryReg>::Find(jnode.type_key);
       CHECK(f != nullptr)
           << "Node type \'" << jnode.type_key << "\' is not registered in TVM";
-      nodes.emplace_back(f->body());
+      nodes.emplace_back(f->fcreator(jnode.global_key));
     } else {
-      nodes.emplace_back(std::shared_ptr<Node>());
+      nodes.emplace_back(NodePtr<Node>());
     }
   }
   CHECK_EQ(nodes.size(), jgraph.nodes.size());
@@ -414,7 +418,11 @@ std::shared_ptr<Node> LoadJSON_(std::string json_str) {
 
   for (size_t i = 0; i < nodes.size(); ++i) {
     setter.node_ = &jgraph.nodes[i];
-    setter.Set(nodes[i].get());
+    // do not need to recover content of global singleton object
+    // they are registered via the environment
+    if (setter.node_->global_key.length() == 0) {
+      setter.Set(nodes[i].get());
+    }
   }
   return nodes.at(jgraph.root);
 }
@@ -467,22 +475,15 @@ class NodeAttrSetter : public AttrVisitor {
   }
 };
 
-// API function to make node.
-// args format:
-//    type_key, key1, value1, ..., key_n, value_n
-void MakeNode(runtime::TVMArgs args, runtime::TVMRetValue* rv) {
+
+void InitNodeByPackedArgs(Node* n, const TVMArgs& args) {
   NodeAttrSetter setter;
-  setter.type_key = args[0].operator std::string();
-  CHECK_EQ(args.size() % 2, 1);
-  for (int i = 1; i < args.size(); i += 2) {
-    setter.attrs.emplace(
-        args[i].operator std::string(),
-        runtime::TVMArgValue(args.values[i + 1], args.type_codes[i + 1]));
-  }
-  auto* f = dmlc::Registry<NodeFactoryReg>::Find(setter.type_key);
-  CHECK(f != nullptr)
-      << "Node type \'" << setter.type_key << "\' is not registered in TVM";
-  std::shared_ptr<Node> n = f->body();
+  setter.type_key = n->type_key();
+  CHECK_EQ(args.size() % 2, 0);
+  for (int i = 0; i < args.size(); i += 2) {
+    setter.attrs.emplace(args[i].operator std::string(),
+                         args[i + 1]);
+  }
   n->VisitAttrs(&setter);
   if (setter.attrs.size() != 0) {
     std::ostringstream os;
@@ -492,10 +493,29 @@ void MakeNode(runtime::TVMArgs args, runtime::TVMRetValue* rv) {
     }
     LOG(FATAL) << os.str();
   }
+}
+
+// API function to make node.
+// args format:
+//   key1, value1, ..., key_n, value_n
+void MakeNode(const TVMArgs& args, TVMRetValue* rv) {
+  std::string type_key = args[0];
+  std::string empty_str;
+  auto* f = dmlc::Registry<NodeFactoryReg>::Find(type_key);
+  CHECK(f != nullptr)
+      << "Node type \'" << type_key << "\' is not registered in TVM";
+  TVMArgs kwargs(args.values + 1, args.type_codes + 1, args.size() - 1);
+  CHECK(f->fglobal_key == nullptr)
+      << "Cannot make node type \'" << type_key << "\' with global_key.";
+  NodePtr<Node> n = f->fcreator(empty_str);
+  if (n->derived_from<BaseAttrsNode>()) {
+    static_cast<BaseAttrsNode*>(n.get())->InitByPackedArgs(kwargs);
+  } else {
+    InitNodeByPackedArgs(n.get(), kwargs);
+  }
   *rv = NodeRef(n);
 }
 
 TVM_REGISTER_GLOBAL("make._Node")
 .set_body(MakeNode);
-
 }  // namespace tvm
diff --git a/src/lang/tensor.cc b/src/lang/tensor.cc
index 5db4f45e799f..9b1a58abcee4 100644
--- a/src/lang/tensor.cc
+++ b/src/lang/tensor.cc
@@ -10,6 +10,8 @@
 
 namespace tvm {
 
+// Tensor
+
 Expr Tensor::operator()(Array<Var> indices) const {
   Array<Expr> arr(indices.begin(), indices.end());
   return operator()(arr);
@@ -26,11 +28,20 @@ Expr Tensor::operator()(Array<Expr> indices) const {
   return n;
 }
 
+Tensor Operation::output(size_t i) const {
+  auto node = make_node<TensorNode>();
+  node->op = *this;
+  node->value_index = i;
+  node->dtype = (*this)->output_dtype(i);
+  node->shape = (*this)->output_shape(i);
+  return Tensor(node);
+}
+
 Tensor TensorNode::make(Array<Expr> shape,
                         Type dtype,
                         Operation op,
                         int value_index) {
-  auto n = std::make_shared<TensorNode>();
+  auto n = make_node<TensorNode>();
   n->shape = std::move(shape);
   n->dtype = dtype;
   n->op = op;
@@ -46,14 +57,8 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
 
 TVM_REGISTER_NODE_TYPE(TensorNode);
 
-Tensor Operation::output(size_t i) const {
-  auto node = std::make_shared<TensorNode>();
-  node->op = *this;
-  node->value_index = i;
-  node->dtype = (*this)->output_dtype(i);
-  node->shape = (*this)->output_shape(i);
-  return Tensor(node);
-}
+
+// TensorIntrin
 
 TensorIntrin TensorIntrinNode::make(std::string name,
                                     Operation op,
@@ -62,7 +67,7 @@ TensorIntrin TensorIntrinNode::make(std::string name,
                                     Stmt body,
                                     Stmt reduce_init,
                                     Stmt reduce_update) {
-  auto n = std::make_shared<TensorIntrinNode>();
+  auto n = make_node<TensorIntrinNode>();
   n->name = std::move(name);
   n->op = std::move(op);
   n->inputs = std::move(inputs);
@@ -79,4 +84,27 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
   });
 
 TVM_REGISTER_NODE_TYPE(TensorIntrinNode);
+
+
+// TensorIntrinCall
+
+TensorIntrinCall TensorIntrinCallNode::make(TensorIntrin intrin,
+                                            Array<Tensor> tensors,
+                                            Array<Region> regions,
+                                            Array<IterVar> reduce_axis) {
+  auto n = make_node<TensorIntrinCallNode>();
+  n->intrin = std::move(intrin);
+  n->tensors = std::move(tensors);
+  n->regions = std::move(regions);
+  n->reduce_axis = std::move(reduce_axis);
+  return TensorIntrinCall(n);
+}
+
+TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
+.set_dispatch<TensorIntrinCallNode>([](const TensorIntrinCallNode *n, IRPrinter *p) {
+    p->stream << "TensorIntrinCall(intrin=" << n->intrin << ", " << n << ")";
+  });
+
+TVM_REGISTER_NODE_TYPE(TensorIntrinCallNode);
+
 }  // namespace tvm
diff --git a/src/op/compute_op.cc b/src/op/compute_op.cc
index 366ea2c78fe6..d4cb2b4c632b 100644
--- a/src/op/compute_op.cc
+++ b/src/op/compute_op.cc
@@ -9,9 +9,11 @@
 #include <tvm/ir_visitor.h>
 #include <tvm/ir_pass.h>
 #include <unordered_set>
-#include "./compute_op.h"
-#include "./op_util.h"
+#include <string>
+#include "compute_op.h"
+#include "op_util.h"
 #include "../schedule/message_passing.h"
+#include "../arithmetic/compute_expr.h"
 
 namespace tvm {
 
@@ -68,7 +70,7 @@ Tensor compute(Array<Expr> shape,
                std::string name,
                std::string tag,
                Map<std::string, NodeRef> attrs) {
-  auto op_node = std::make_shared<ComputeOpNode>();
+  auto op_node = make_node<ComputeOpNode>();
   // compute dimension.
   size_t ndim = shape.size();
   std::vector<IterVar> axis;
@@ -90,7 +92,7 @@ Array<Tensor> compute(Array<Expr> shape,
                       std::string name,
                       std::string tag,
                       Map<std::string, NodeRef> attrs) {
-  auto op_node = std::make_shared<ComputeOpNode>();
+  auto op_node = make_node<ComputeOpNode>();
   // compute dimension.
   size_t ndim = shape.size();
   std::vector<IterVar> axis;
@@ -116,7 +118,10 @@ Operation ComputeOpNode::make(std::string name,
                               Map<std::string, NodeRef> attrs,
                               Array<IterVar> axis,
                               Array<Expr> body) {
-  auto n = std::make_shared<ComputeOpNode>();
+  if (!attrs.defined()) {
+    attrs = Map<std::string, NodeRef>();
+  }
+  auto n = make_node<ComputeOpNode>();
   n->name = std::move(name);
   n->tag = std::move(tag);
   n->attrs = std::move(attrs);
@@ -162,7 +167,7 @@ Operation ComputeOpNode::ReplaceInputs(
     if (!new_reduce.same_as(this->body[0])) {
       const ir::Reduce* r = new_reduce.as<ir::Reduce>();
       for (size_t k = 0; k < this->body.size(); ++k) {
-        std::shared_ptr<ir::Reduce> n = std::make_shared<ir::Reduce>(*r);
+        auto n = make_node<ir::Reduce>(*r);
         n->value_index = static_cast<int>(k);
         n->type = r->source[k].type();
         arr.push_back(Expr(n));
@@ -316,27 +321,32 @@ Stmt MakeComputeStmt(const ComputeOpNode* self,
       source.push_back(stage->op.output(i));
     }
     MakeReduction(self, source, &init, &provide);
-    init = op::Substitute(init, n.init_vmap);
     init = MergeNest(n.init_nest, init);
+    init = op::Substitute(init, n.init_vmap);
     // common nest
     std::vector<std::vector<Stmt> > common(
         n.main_nest.begin(), n.main_nest.begin() + n.num_common_loop + 1);
     std::vector<std::vector<Stmt> > reduce(
         n.main_nest.begin() + n.num_common_loop + 1, n.main_nest.end());
-    provide = op::Substitute(provide, n.main_vmap);
     provide = MergeNest(reduce, provide);
     if (debug_keep_trivial_loop) {
-      return MergeNest(common, provide);
+      provide = MergeNest(common, provide);
     } else {
-      return MergeNest(common, Block::make(init, provide));
+      provide = MergeNest(common, Block::make(init, provide));
     }
+    // run substitution in the on the full nest, because  loop condition
+    // could depend on outer loops.
+    return op::Substitute(provide, n.main_vmap);
   } else {
     std::vector<Stmt> provides;
     for (size_t i = 0; i < self->body.size(); ++i) {
       provides.emplace_back(MakeProvide(self, stage->op.output(i)));
     }
-    Stmt provide = op::Substitute(Block::make(provides), n.main_vmap);
-    return MergeNest(n.main_nest, provide);
+    Stmt provide = Block::make(provides);
+    provide = MergeNest(n.main_nest, provide);
+    // run substitution in the on the full nest, because  loop condition
+    // could depend on outer loops.
+    return op::Substitute(provide, n.main_vmap);
   }
 }
 
@@ -541,4 +551,38 @@ static void VerifyComputeOp(const ComputeOpNode* op) {
   v.Run();
 }
 
+Stmt TransformUpdate(const Stage& stage,
+                     const std::unordered_map<IterVar, Range>& dom_map,
+                     const ComputeLoopNest& n,
+                     Stmt body,
+                     Stmt update) {
+  Array<Expr> conds;
+  std::unordered_set<const Variable*> banned;
+  for (size_t i = 0; i < stage->leaf_iter_vars.size(); ++i) {
+    IterVar iv = stage->leaf_iter_vars[i];
+    auto iit = stage->iter_var_attrs.find(iv);
+    if (iit != stage->iter_var_attrs.end()) {
+      const IterVarAttr& attr = (*iit).second;
+      if (attr->iter_type == kTensorized) {
+        break;
+      }
+    }
+    if (iv->iter_type == kCommReduce) {
+      auto vit = dom_map.find(iv);
+      CHECK(vit != dom_map.end());
+      const Range& vrange = vit->second;
+      conds.push_back(likely(iv->var > vrange->min));
+      banned.insert(iv->var.get());
+    }
+  }
+  for (const Expr& pred : n.main_predicates) {
+    if (ir::ExprUseVar(pred, banned)) {
+      LOG(FATAL) << "Tensorize update transform failed, the condition "
+                 << pred << " has a conflict with the reset condition";
+    }
+  }
+
+  return IfThenElse::make(arith::ComputeReduce<ir::Or>(conds, const_true(1)),
+                          update, body);
+}
 }  // namespace tvm
diff --git a/src/op/compute_op.h b/src/op/compute_op.h
index 996764c6cdc1..87b0814c1ad9 100644
--- a/src/op/compute_op.h
+++ b/src/op/compute_op.h
@@ -14,7 +14,7 @@
 
 namespace tvm {
 // loop nest structure for general compute
-// This the the loop nest structured used in compute.
+// This the loop nest structured used in compute.
 // Does not include the loop body.
 struct ComputeLoopNest {
   // The common number of loops between init and main
@@ -73,6 +73,21 @@ Stmt MakeTensorize(const ComputeOpNode* self,
                    const Stage& stage,
                    const std::unordered_map<IterVar, Range>& dom_map,
                    bool debug_keep_trivial_loop);
+
+/*!
+ * \brief Transform the update part when there is no init func in tensorizing
+ * \param stage The stage for tensorizing.
+ * \param dom_map The range of each iter var.
+ * \param n The loop nest structured used in compute.
+ * \param body The body func in tensorize intrin
+ * \param update The update func in tensorize intrin
+ * \return Transformed result.
+ */
+Stmt TransformUpdate(const Stage& stage,
+                     const std::unordered_map<IterVar, Range>& dom_map,
+                     const ComputeLoopNest& n,
+                     Stmt body,
+                     Stmt update);
 }  // namespace tvm
 
 #endif  // TVM_OP_COMPUTE_OP_H_
diff --git a/src/op/cross_thread_reduction.cc b/src/op/cross_thread_reduction.cc
index eb320388860a..c4599dee9bd8 100644
--- a/src/op/cross_thread_reduction.cc
+++ b/src/op/cross_thread_reduction.cc
@@ -4,8 +4,8 @@
  * \file cross_thread_reduction.cc
  */
 #include <tvm/ir_pass.h>
-#include "./compute_op.h"
-#include "./op_util.h"
+#include "compute_op.h"
+#include "op_util.h"
 
 namespace tvm {
 using namespace ir;
diff --git a/src/op/extern_op.cc b/src/op/extern_op.cc
index 759e258e90ef..cc6d57092f2a 100644
--- a/src/op/extern_op.cc
+++ b/src/op/extern_op.cc
@@ -7,7 +7,7 @@
 #include <tvm/arithmetic.h>
 #include <tvm/ir.h>
 #include <unordered_set>
-#include "./op_util.h"
+#include "op_util.h"
 
 namespace tvm {
 using namespace ir;
@@ -43,7 +43,10 @@ Operation ExternOpNode::make(std::string name,
                              Array<Buffer> input_placeholders,
                              Array<Buffer> output_placeholders,
                              Stmt body) {
-  auto n = std::make_shared<ExternOpNode>();
+  if (!attrs.defined()) {
+    attrs = Map<std::string, NodeRef>();
+  }
+  auto n = make_node<ExternOpNode>();
   n->name = std::move(name);
   n->tag = std::move(tag);
   n->attrs = std::move(attrs);
@@ -68,7 +71,7 @@ Operation ExternOpNode::ReplaceInputs(
     const Operation& self,
     const std::unordered_map<Tensor, Tensor>& rmap) const {
   CHECK_EQ(self.operator->(), this);
-  auto n = std::make_shared<ExternOpNode>(*this);
+  auto n = make_node<ExternOpNode>(*this);
   n->body = op::ReplaceTensor(this->body, rmap);
   for (size_t i = 0; i < n->inputs.size(); ++i) {
     Tensor t = n->inputs[i];
diff --git a/src/op/hybrid_op.cc b/src/op/hybrid_op.cc
new file mode 100644
index 000000000000..4dbb2c0b964f
--- /dev/null
+++ b/src/op/hybrid_op.cc
@@ -0,0 +1,189 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \brief Hybrid computation rule.
+ * \file hybrid_op.cc
+ */
+#include <tvm/operation.h>
+#include <tvm/arithmetic.h>
+#include <tvm/ir.h>
+#include <tvm/ir_mutator.h>
+#include <unordered_set>
+#include "op_util.h"
+
+namespace tvm {
+using namespace ir;
+// HybridOpNode
+TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
+.set_dispatch<HybridOpNode>([](const HybridOpNode *op, IRPrinter *p) {
+    p->stream << "hybrid(" << op->name << ", " << op << ")";
+  });
+
+TVM_REGISTER_NODE_TYPE(HybridOpNode);
+
+int HybridOpNode::num_outputs() const {
+  return static_cast<int>(outputs.size());
+}
+
+Array<IterVar> HybridOpNode::root_iter_vars() const {
+  return {};
+}
+
+Type HybridOpNode::output_dtype(size_t i) const {
+  return outputs[i]->dtype;
+}
+
+Array<Expr> HybridOpNode::output_shape(size_t i) const {
+  return outputs[i]->shape;
+}
+
+
+Operation HybridOpNode::make(std::string name,
+                             std::string tag,
+                             Map<std::string, NodeRef> attrs,
+                             Array<Tensor> inputs,
+                             Array<Tensor> outputs,
+                             Stmt body) {
+  if (!attrs.defined()) {
+    attrs = Map<std::string, NodeRef>();
+  }
+  auto n = make_node<HybridOpNode>();
+  n->name = std::move(name);
+  n->tag = std::move(tag);
+  n->attrs = std::move(attrs);
+  n->inputs = std::move(inputs);
+  n->outputs = std::move(outputs);
+  n->body = std::move(body);
+  Operation res = Operation(n);
+  return res;
+}
+
+Array<Tensor> HybridOpNode::InputTensors() const {
+  return inputs;
+}
+
+Operation HybridOpNode::ReplaceInputs(
+    const Operation& self,
+    const std::unordered_map<Tensor, Tensor>& rmap) const {
+  CHECK_EQ(self.operator->(), this);
+  auto n = make_node<HybridOpNode>(*this);
+  n->body = op::ReplaceTensor(this->body, rmap);
+  for (size_t i = 0; i < n->inputs.size(); ++i) {
+    Tensor t = n->inputs[i];
+    if (rmap.count(t)) {
+      n->inputs.Set(i, rmap.at(t));
+    }
+  }
+
+  if (body.same_as(n->body) &&
+      inputs.same_as(n->inputs)) {
+    return self;
+  } else {
+    return Operation(n);
+  }
+}
+
+void HybridOpNode::PropBoundToInputs(
+    const Operation& self,
+    const std::unordered_map<const Variable*, IntSet>& dom_map,
+    std::unordered_map<Tensor, TensorDom>* out_dom_map) const {
+  for (Tensor t : this->inputs) {
+    auto it = out_dom_map->find(t);
+    if (it == out_dom_map->end()) continue;
+    TensorDom& dom = it->second;
+    for (size_t i = 0; i < t->shape.size(); ++i) {
+      dom.data[i].emplace_back(IntSet::range(
+          Range::make_by_min_extent(
+              make_const(t->shape[i].type(), 0), t->shape[i])));
+    }
+  }
+}
+
+void HybridOpNode::GatherBound(
+    const Operation& self,
+    const std::unordered_map<Tensor, TensorDom>& tensor_dom,
+    std::unordered_map<IterVar, Range>* out_dom_map) const {
+}
+
+Stmt HybridOpNode::BuildRealize(
+    const Stage& stage,
+    const std::unordered_map<IterVar, Range>& realize_map,
+    const Stmt& body) const {
+  CHECK_EQ(stage->op.get(), this);
+  Stmt realize_body = body;
+  for (int k = 0; k < num_outputs(); ++k) {
+    Tensor t = stage->op.output(k);
+    HalideIR::Internal::Region bounds;
+    for (size_t i = 0; i < t->shape.size(); ++i) {
+      bounds.push_back(
+          Range::make_by_min_extent(
+              make_const(t->shape[i].type(), 0), t->shape[i]));
+    }
+    realize_body = ir::Realize::make(
+        t->op, t->value_index, t->dtype,
+        bounds, const_true(), realize_body);
+  }
+  return realize_body;
+}
+
+Stmt HybridOpNode::BuildProvide(
+    const Stage& stage,
+    const std::unordered_map<IterVar, Range>& dom_map,
+    bool debug_keep_trivial_loop) const {
+  CHECK_EQ(stage->op.operator->(), this);
+  Stmt ret = AttrStmt::make(make_zero(Int(32)), attr::extern_scope, 0, this->body);
+  auto f_push_bind = [&ret](Buffer buffer, Tensor tensor) {
+    Array<NodeRef> bind_spec;
+    Array<Expr> tuple;
+    bind_spec.push_back(buffer);
+    bind_spec.push_back(tensor);
+    for (size_t k = 0; k < buffer->shape.size(); ++k) {
+      tuple.push_back(make_const(buffer->shape[k].type(), 0));
+      tuple.push_back(buffer->shape[k]);
+    }
+    ret = AttrStmt::make(
+        bind_spec, attr::buffer_bind_scope,
+        Call::make(Handle(), intrinsic::tvm_tuple, tuple, Call::Intrinsic), ret);
+  };
+  for (int i = static_cast<int>(outputs.size()) - 1; i >= 0; --i) {
+    Buffer buffer = decl_buffer(
+      outputs[i]->shape,
+      outputs[i]->dtype);
+    f_push_bind(buffer, stage->op.output(i));
+  }
+  for (int i = static_cast<int>(inputs.size()) - 1; i >= 0; --i) {
+    Buffer buffer = decl_buffer(
+      inputs[i]->shape,
+      inputs[i]->dtype);
+    f_push_bind(buffer, inputs[i]);
+  }
+
+  std::unordered_map<Tensor, Tensor> rmap;
+  for (int i = 0; i < this->num_outputs(); ++i) {
+    rmap[outputs[i]] = stage->op.output(i);
+  }
+  auto n = make_node<HybridOpNode>(*this);
+  /*
+   * These two lines of codes replace tensors' reads & writes.
+   * This is the simplest way I (@were) can come up with to glue
+   * hybrid scripts to the structure of TVM op.
+   * NAMING CONFLICT: In hybrid script all the tensors have their own 
+   * names specified by the users. However, In TVM op, all the output
+   * tensors' names are the same as the op's name. I cannot change the
+   * name to the op's name in the function body after the op node is
+   * formed, because:
+   *   1. Output tensors all point to the corresponding op node. 
+   *   2. Once OpNode is wrapped up by an Operation node, it can
+   *      no longer be changed.
+   * This is a chiken-egg paradox. It is impossible to put the output
+   * tensors into the function body without forming the op node. The
+   * function body is immutable after the node is formed.
+   *
+   * Finally, I decided to resolve this issue "lazily". During the
+   * pipeline of compilation, these tensors will be replaced when
+   * forming the function body and passing to next stage of compilation.
+   * */
+  ret = op::ReplaceTensor(ret, rmap);
+  ret = op::ReplaceProvideTensor(ret, rmap);
+  return ret;
+}
+}  // namespace tvm
diff --git a/src/op/op_util.cc b/src/op/op_util.cc
index 4f34d8d972ce..886f7c912303 100644
--- a/src/op/op_util.cc
+++ b/src/op/op_util.cc
@@ -7,7 +7,8 @@
 #include <tvm/ir_pass.h>
 #include <tvm/operation.h>
 #include <tvm/ir_mutator.h>
-#include "./op_util.h"
+#include <string>
+#include "op_util.h"
 #include "../schedule/message_passing.h"
 #include "../arithmetic/compute_expr.h"
 
@@ -163,6 +164,37 @@ std::vector<Stmt> MakeIfNest(const std::vector<Expr>& predicates) {
   return nest;
 }
 
+// replacer to replace tensors' usage in Provide
+class ProviderReplacer : public ir::IRMutator {
+ public:
+  explicit ProviderReplacer(const std::unordered_map<Tensor, Tensor>& vmap)
+      : vmap_(vmap) {}
+
+  Stmt Mutate_(const ir::Provide* op, const Stmt& s) {
+    Tensor t = Operation(op->func.node_).output(op->value_index);
+    auto it = vmap_.find(t);
+    if (it != vmap_.end()) {
+      Stmt ret = ir::Provide::make(
+        it->second->op, it->second->value_index, op->value, op->args);
+      found = true;
+      return IRMutator::Mutate_(ret.as<ir::Provide>(), ret);
+    }
+    return IRMutator::Mutate_(op, s);
+  }
+
+  // whether it is found.
+  bool found{false};
+
+ private:
+  const std::unordered_map<Tensor, Tensor>& vmap_;
+};
+
+Stmt ReplaceProvideTensor(Stmt stmt,
+                   const std::unordered_map<Tensor, Tensor>& replace) {
+  ProviderReplacer repl(replace);
+  Stmt ret = repl.Mutate(stmt);
+  return repl.found ? ret : stmt;
+}
 
 // replacer to replace tensors
 class TensorReplacer : public ir::IRMutator {
diff --git a/src/op/op_util.h b/src/op/op_util.h
index 558e8d4e7324..6971f14eef73 100644
--- a/src/op/op_util.h
+++ b/src/op/op_util.h
@@ -49,14 +49,22 @@ MakeLoopNest(const Stage& stage,
 std::vector<Stmt> MakeIfNest(const std::vector<Expr>& predicates);
 
 /*!
- * \brief Replace the tensor reference in stmt by the replace map.
+ * \brief Replace the tensor reference (especially in Provide's) in stmt by the replace map.
+ * \param stmt The statement to be processed.
+ * \param replace The replacement rule.
+ */
+Stmt ReplaceProvideTensor(Stmt stmt,
+                   const std::unordered_map<Tensor, Tensor>& replace);
+
+/*!
+ * \brief Replace the tensor reference (especially in Call's) in stmt by the replace map.
  * \param stmt The statement to be processed.
  * \param replace The replacement rule.
  */
 Stmt ReplaceTensor(Stmt stmt,
                    const std::unordered_map<Tensor, Tensor>& replace);
 /*!
- * \brief Replace the tensor reference in expr by the replace map.
+ * \brief Replace the tensor reference (especially in Call's) in stmt by the replace map.
  * \param expr The expression to be processed.
  * \param replace The replacement rule.
  */
diff --git a/src/op/placeholder_op.cc b/src/op/placeholder_op.cc
index a2cd0eb2d81f..fcd5993dafa5 100644
--- a/src/op/placeholder_op.cc
+++ b/src/op/placeholder_op.cc
@@ -36,7 +36,7 @@ Array<Expr> PlaceholderOpNode::output_shape(size_t i) const {
 Operation PlaceholderOpNode::make(std::string name,
                                   Array<Expr> shape,
                                   Type dtype) {
-  auto n = std::make_shared<PlaceholderOpNode>();
+  auto n = make_node<PlaceholderOpNode>();
   n->name = name;
   n->shape = shape;
   n->dtype = dtype;
diff --git a/src/op/scan_op.cc b/src/op/scan_op.cc
index 626c8eba46b4..610d4619390d 100644
--- a/src/op/scan_op.cc
+++ b/src/op/scan_op.cc
@@ -6,7 +6,7 @@
 #include <tvm/operation.h>
 #include <tvm/ir.h>
 #include <tvm/ir_pass.h>
-#include "./op_util.h"
+#include "op_util.h"
 #include "../schedule/graph.h"
 
 namespace tvm {
@@ -51,7 +51,10 @@ Operation ScanOpNode::make(std::string name,
                            Array<Tensor> update,
                            Array<Tensor> state_placeholder,
                            Array<Tensor> inputs) {
-  auto n = std::make_shared<ScanOpNode>();
+  if (!attrs.defined()) {
+    attrs = Map<std::string, NodeRef>();
+  }
+  auto n = make_node<ScanOpNode>();
   CHECK_EQ(init.size(), update.size());
   CHECK_EQ(init.size(), state_placeholder.size());
 
@@ -135,7 +138,7 @@ Operation ScanOpNode::ReplaceInputs(
     const Operation& self,
     const std::unordered_map<Tensor, Tensor>& rmap) const {
   CHECK_EQ(self.operator->(), this);
-  std::shared_ptr<ScanOpNode> n = std::make_shared<ScanOpNode>(*this);
+  auto n = make_node<ScanOpNode>(*this);
   for (size_t i = 0; i < n->init.size(); ++i) {
     if (rmap.count(n->init[i])) {
       n->init.Set(i, rmap.at(n->init[i]));
diff --git a/src/op/tensor_compute_op.cc b/src/op/tensor_compute_op.cc
new file mode 100644
index 000000000000..f9b8188d4685
--- /dev/null
+++ b/src/op/tensor_compute_op.cc
@@ -0,0 +1,361 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \brief Tensor Compute Op.
+ * \file tensor_compute_op.cc
+ */
+#include <tvm/operation.h>
+#include <tvm/arithmetic.h>
+#include <tvm/ir.h>
+#include <tvm/ir_visitor.h>
+#include <tvm/ir_pass.h>
+#include <unordered_set>
+#include "./op_util.h"
+#include "./compute_op.h"
+#include "../arithmetic/compute_expr.h"
+
+namespace tvm {
+using namespace ir;
+// TensorComputeOpNode
+TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
+.set_dispatch<TensorComputeOpNode>([](const TensorComputeOpNode *op,
+                                      IRPrinter *p) {
+    p->stream << "tensor_compute_op(" << op->name << ", " << op << ")";
+  });
+
+TVM_REGISTER_NODE_TYPE(TensorComputeOpNode);
+
+int TensorComputeOpNode::num_outputs() const {
+  return static_cast<int>(this->intrin->buffers.size() - this->inputs.size());
+}
+
+Array<IterVar> TensorComputeOpNode::root_iter_vars() const {
+  Array<IterVar> ret = axis;
+  for (IterVar iv : reduce_axis) {
+    ret.push_back(iv);
+  }
+  return ret;
+}
+
+Type TensorComputeOpNode::output_dtype(size_t i) const {
+  return this->intrin->buffers[this->inputs.size() + i]->dtype;
+}
+
+Array<Expr> TensorComputeOpNode::output_shape(size_t i) const {
+  Array<Expr> shape;
+  for (const auto& ivar : this->axis) {
+    shape.push_back(ivar->dom->extent);
+  }
+  return shape;
+}
+
+
+Operation TensorComputeOpNode::make(std::string name,
+                                    std::string tag,
+                                    Array<IterVar> axis,
+                                    Array<IterVar> reduce_axis,
+                                    int schedulable_ndim,
+                                    TensorIntrin intrin,
+                                    Array<Tensor> tensors,
+                                    Array<Region> regions) {
+  auto n = make_node<TensorComputeOpNode>();
+  n->name = std::move(name);
+  n->tag = std::move(tag);
+  n->axis = std::move(axis);
+  n->reduce_axis = std::move(reduce_axis);
+  n->schedulable_ndim = std::move(schedulable_ndim);
+  n->intrin = std::move(intrin);
+  n->inputs = std::move(tensors);
+  n->input_regions = std::move(regions);
+  return Operation(n);
+}
+
+Array<Tensor> TensorComputeOpNode::InputTensors() const {
+  return inputs;
+}
+
+Operation TensorComputeOpNode::ReplaceInputs(
+    const Operation& self,
+    const std::unordered_map<Tensor, Tensor>& rmap) const {
+  CHECK_EQ(self.operator->(), this);
+  auto n = make_node<TensorComputeOpNode>(*this);
+  auto intrin = make_node<TensorIntrinNode>(*(this->intrin.operator->()));
+  intrin->body = op::ReplaceTensor(this->intrin->body, rmap);
+  if (intrin->reduce_init.defined()) {
+    intrin->reduce_init = op::ReplaceTensor(this->intrin->reduce_init, rmap);
+  }
+  if (intrin->reduce_update.defined()) {
+    intrin->reduce_update = op::ReplaceTensor(this->intrin->reduce_update, rmap);
+  }
+  for (size_t i = 0; i < n->inputs.size(); ++i) {
+    Tensor t = n->inputs[i];
+    if (rmap.count(t)) {
+      n->inputs.Set(i, rmap.at(t));
+    }
+  }
+
+  if (intrin->body.same_as(n->intrin->body) &&
+      intrin->reduce_init.same_as(n->intrin->reduce_init) &&
+      intrin->reduce_update.same_as(n->intrin->reduce_update) &&
+      inputs.same_as(n->inputs)) {
+    return self;
+  } else {
+    n->intrin = TensorIntrin(intrin);
+    return Operation(n);
+  }
+}
+
+void TensorComputeOpNode::PropBoundToInputs(
+    const Operation& self,
+    const std::unordered_map<const Variable*, IntSet>& dom_map,
+    std::unordered_map<Tensor, TensorDom>* out_dom_map) const {
+  for (size_t i = 0; i < this->inputs.size(); ++i) {
+    Tensor t = this->inputs[i];
+    Region region = input_regions[i];
+
+    auto it = out_dom_map->find(t);
+    if (it == out_dom_map->end()) continue;
+    TensorDom& dom = it->second;
+    for (size_t j = 0; j < t.ndim(); ++j) {
+      dom.data[j].emplace_back(EvalSet(region[j], dom_map));
+    }
+  }
+}
+
+void TensorComputeOpNode::GatherBound(
+    const Operation& self,
+    const std::unordered_map<Tensor, TensorDom>& tensor_dom,
+    std::unordered_map<IterVar, Range>* out_dom_map) const {
+  const TensorDom& tdom = tensor_dom.at(self.output(0));
+  for (size_t i = 0; i < this->axis.size(); ++i) {
+    Range r = arith::Union(tdom.data.at(i)).cover_range(this->axis[i]->dom);
+    CHECK(!out_dom_map->count(this->axis[i]));
+    (*out_dom_map)[this->axis[i]] = r;
+  }
+  for (size_t i = 0; i < this->reduce_axis.size(); ++i) {
+    CHECK(!out_dom_map->count(this->reduce_axis[i]));
+    (*out_dom_map)[this->reduce_axis[i]] = this->reduce_axis[i]->dom;
+  }
+}
+
+Stmt TensorComputeOpNode::BuildRealize(
+    const Stage& stage,
+    const std::unordered_map<IterVar, Range>& realize_map,
+    const Stmt& body) const {
+  CHECK_EQ(stage->op.get(), this);
+  HalideIR::Internal::Region bounds;
+  for (IterVar iv : this->axis) {
+    bounds.push_back(realize_map.at(iv));
+  }
+  Stmt realize = body;
+  for (int i = this->num_outputs(); i > 0; --i) {
+    Tensor t = stage->op.output(i-1);
+    realize = ir::Realize::make(t->op, t->value_index,
+      t->dtype, bounds, const_true(), realize);
+    // alignment requirement, only useful for compute
+    for (int i = 0; i < schedulable_ndim; ++i) {
+      auto it = stage->iter_var_attrs.find(this->axis[i]);
+      if (it != stage->iter_var_attrs.end()) {
+        IterVarAttr attr = (*it).second;
+        if (attr->dim_align_factor != 0) {
+          Array<Expr> tuple = {static_cast<int>(i),
+                               attr->dim_align_factor,
+                               attr->dim_align_offset};
+          realize = ir::AttrStmt::make(
+              t, ir::attr::buffer_dim_align,
+              Call::make(Handle(), ir::intrinsic::tvm_tuple, tuple, Call::Intrinsic),
+              realize);
+        }
+      }
+    }
+  }
+  return realize;
+}
+
+ComputeLoopNest MakeLoopNest(
+    const TensorComputeOpNode* self,
+    const Stage& stage,
+    const std::unordered_map<IterVar, Range>& dom_map,
+    bool debug_keep_trivial_loop) {
+  CHECK_EQ(stage->op.operator->(), self);
+  ComputeLoopNest ret;
+  // make main loop nest
+  ret.main_nest = op::MakeLoopNest(
+      stage, dom_map, 0, false, std::unordered_set<IterVar>(), &ret.main_vmap,
+      debug_keep_trivial_loop);
+  ret.main_predicates = schedule::MakeBoundCheck(
+      stage, dom_map, ret.main_vmap, false,
+      std::unordered_set<IterVar>());
+  for (auto& e : ret.main_predicates) {
+    e = likely(e);
+  }
+  if (stage->store_predicate.defined()) {
+    ret.main_predicates.push_back(stage->store_predicate);
+  }
+  if (self->reduce_axis.size() != 0) {
+    // try to find the location to insert the initialization.
+    // Fuse the initialization and provide loop when possible.
+    std::unordered_map<IterVar, int> update_state;
+    for (IterVar iv : self->reduce_axis) {
+      update_state[iv] = 2;
+    }
+    for (int i = 0; i < self->schedulable_ndim; ++i) {
+      update_state[self->axis[i]] = 1;
+    }
+    // find which iter var is related to reduction and which is related to axis.
+    schedule::PassDownBitMaskOr(stage, &update_state);
+    auto leaf_iter_vars = stage->leaf_iter_vars;
+    // first first loop that is related to reduction.
+    size_t begin_loop = leaf_iter_vars.size();
+    for (size_t i = 0; i < leaf_iter_vars.size(); ++i) {
+      auto iv = leaf_iter_vars[i];
+      int flag = update_state.at(iv);
+      if ((flag & 2) != 0) {
+        begin_loop = i; break;
+      }
+      ret.init_vmap[iv] = ret.main_vmap.at(iv);
+    }
+    ret.num_common_loop = begin_loop;
+    // skip loops that does not relates to axis.
+    std::unordered_set<IterVar> skip_iter;
+    for (auto kv : update_state) {
+      int flag = kv.second;
+      if ((flag & 1) == 0) skip_iter.insert(kv.first);
+    }
+    ret.init_nest = op::MakeLoopNest(
+        stage, dom_map, begin_loop, true,
+        skip_iter, &(ret.init_vmap), debug_keep_trivial_loop);
+    ret.init_predicates = schedule::MakeBoundCheck(
+        stage, dom_map, ret.init_vmap, true, skip_iter);
+    for (auto& e : ret.init_predicates) {
+      e = likely(e);
+    }
+  } else {
+    CHECK_EQ(ret.main_nest.size(), stage->leaf_iter_vars.size() + 1);
+    ret.num_common_loop = stage->leaf_iter_vars.size();
+  }
+  // copy elison here.
+  return ret;
+}
+
+
+Stmt TensorComputeOpNode::BuildProvide(
+    const Stage& stage,
+    const std::unordered_map<IterVar, Range>& dom_map,
+    bool debug_keep_trivial_loop) const {
+  CHECK_EQ(stage->op.operator->(), this);
+
+  // Start bind data.
+  Stmt nop = Evaluate::make(0);
+  std::vector<Stmt> input_bind_nest, output_bind_nest;
+  Array<Tensor> inputs = this->InputTensors();
+
+  // input binding
+  size_t num_inputs = inputs.size();
+  for (size_t i = 0; i < num_inputs; ++i) {
+    Tensor tensor = inputs[i];
+    Region region = this->input_regions[i];
+    Buffer buffer = this->intrin->buffers[i];
+    Array<NodeRef> bind_spec{buffer, tensor};
+
+    Array<Expr> tuple;
+    for (size_t i = 0; i < region.size(); ++i) {
+      tuple.push_back(region[i]->min);
+      tuple.push_back(region[i]->extent);
+    }
+    input_bind_nest.emplace_back(AttrStmt::make(
+        bind_spec, ir::attr::buffer_bind_scope,
+        Call::make(Handle(), ir::intrinsic::tvm_tuple, tuple, Call::Intrinsic), nop));
+  }
+
+  // output binding
+  for (int i = 0; i < this->num_outputs(); ++i) {
+    Tensor tensor = stage->op.output(i);
+    Buffer buffer = this->intrin->buffers[num_inputs + i];
+    Array<NodeRef> bind_spec{buffer, tensor};
+
+    Array<Expr> tuple;
+    for (size_t i = 0; i < this->axis.size(); ++i) {
+      auto ivar = this->axis[i];
+      if (i < static_cast<size_t>(this->schedulable_ndim)) {
+        tuple.push_back(ivar->var);
+        tuple.push_back(1);
+      } else {
+        Range dom = ivar->dom;
+        tuple.push_back(dom->min);
+        tuple.push_back(dom->extent);
+      }
+    }
+
+    output_bind_nest.emplace_back(AttrStmt::make(
+        bind_spec, ir::attr::buffer_bind_scope,
+        Call::make(Handle(), ir::intrinsic::tvm_tuple, tuple, Call::Intrinsic), nop));
+  }
+
+  // Check variable remap
+  std::unordered_map<const Variable*, Expr> vmap;
+  ir::ArgBinder binder(&vmap);
+
+  size_t tloc = stage->leaf_iter_vars.size();
+  ComputeLoopNest n = MakeLoopNest(this, stage, dom_map, debug_keep_trivial_loop);
+
+  if (this->reduce_axis.size() == 0) {
+    std::vector<std::vector<Stmt> > nest(
+        n.main_nest.begin(), n.main_nest.begin() + tloc + 1);
+    nest.emplace_back(op::MakeIfNest(n.main_predicates));
+    CHECK_EQ(n.init_predicates.size(), 0U);
+    CHECK(this->intrin->body.defined())
+        << "Normal store op for intrin " << this << " is not defined";
+    Stmt body = MergeNest(output_bind_nest, this->intrin->body);
+    body = MergeNest(input_bind_nest, body);
+    body = ir::Substitute(body, vmap);
+    body = MergeNest(binder.asserts(), body);
+    body = op::Substitute(body, n.main_vmap);
+    Stmt ret =  MergeNest(nest, body);
+    return ret;
+  } else {
+    // Need to split reduction
+    CHECK(this->intrin->reduce_update.defined())
+        << "Reduction update op is not defined";
+    // Need init and update steps
+    CHECK_NE(this->reduce_axis.size(), 0U);
+    std::vector<std::vector<Stmt> > common(
+        n.main_nest.begin(), n.main_nest.begin() + n.num_common_loop + 1);
+    std::vector<std::vector<Stmt> > update_nest(
+        n.main_nest.begin() + n.num_common_loop + 1, n.main_nest.begin() + tloc + 1);
+    update_nest.emplace_back(op::MakeIfNest(n.main_predicates));
+
+    if (this->intrin->reduce_init.defined()) {
+      // init nest
+      std::vector<std::vector<Stmt> > init_nest(
+          n.init_nest.begin(), n.init_nest.begin() + tloc + 1);
+      init_nest.emplace_back(op::MakeIfNest(n.init_predicates));
+      Stmt init = MergeNest(output_bind_nest, this->intrin->reduce_init);
+      init = op::Substitute(init, n.init_vmap);
+      init = MergeNest(init_nest, init);
+      // The update
+      Stmt update = MergeNest(output_bind_nest, this->intrin->reduce_update);
+      update = MergeNest(input_bind_nest, update);
+      update = ir::Substitute(update, vmap);
+      update = MergeNest(binder.asserts(), update);
+      update = op::Substitute(update, n.main_vmap);
+      update = MergeNest(update_nest, update);
+      return MergeNest(common, Block::make(init, update));
+    } else {
+      // When init op is not available, use body op for reset in the first iter.
+      CHECK(this->intrin->body.defined())
+          << "Normal body op is not defined";
+      Stmt update = TransformUpdate(stage, dom_map, n,
+                                    this->intrin->body,
+                                    this->intrin->reduce_update);
+      update = MergeNest(output_bind_nest, update);
+      update = MergeNest(input_bind_nest, update);
+      update = ir::Substitute(update, vmap);
+      update = MergeNest(binder.asserts(), update);
+      update = op::Substitute(update, n.main_vmap);
+      update = MergeNest(update_nest, update);
+      return MergeNest(common, update);
+    }
+  }
+}
+
+}  // namespace tvm
diff --git a/src/op/tensorize.cc b/src/op/tensorize.cc
index 148ad0f90fe7..a61aac422284 100644
--- a/src/op/tensorize.cc
+++ b/src/op/tensorize.cc
@@ -7,10 +7,9 @@
 #include <tvm/ir_mutator.h>
 #include <tvm/ir_pass.h>
 #include <tvm/api_registry.h>
-#include "./op_util.h"
-#include "./compute_op.h"
+#include "op_util.h"
+#include "compute_op.h"
 #include "../schedule/message_passing.h"
-#include "../arithmetic/compute_expr.h"
 
 namespace tvm {
 
@@ -52,10 +51,10 @@ size_t InferTensorizeRegion(
       const IterVarAttr& attr = (*iit).second;
       if (!found_point) {
         CHECK(!attr->bind_thread.defined())
-            << "Donot allow thread in tensorize scope";
+            << "Do not allow thread in tensorize scope";
       }
       if (attr->iter_type == kTensorized) {
-        CHECK(!found_point) << "Donot allow two tensorized point";
+        CHECK(!found_point) << "Do not allow two tensorized point";
         found_point = true;
         loc_scope = i - 1;
       }
@@ -323,50 +322,6 @@ void VerifyTensorizeBody(
   }
 }
 
-/*!
- * \brief Transform the update part when there is no init func in tensorizing
- * \param stage The stage for tensorizing.
- * \param dom_map The range of each iter var.
- * \param n The loop nest structured used in compute. 
- * \param body The body func in tensorize intrin
- * \param update The update func in tensorize intrin
- * \return Transformed result.
- */
-Stmt TransformUpdate(const Stage& stage,
-                     const std::unordered_map<IterVar, Range>& dom_map,
-                     const ComputeLoopNest& n,
-                     Stmt body,
-                     Stmt update) {
-  Array<Expr> conds;
-  std::unordered_set<const Variable*> banned;
-  for (size_t i = 0; i < stage->leaf_iter_vars.size(); ++i) {
-    IterVar iv = stage->leaf_iter_vars[i];
-    auto iit = stage->iter_var_attrs.find(iv);
-    if (iit != stage->iter_var_attrs.end()) {
-      const IterVarAttr& attr = (*iit).second;
-      if (attr->iter_type == kTensorized) {
-        break;
-      }
-    }
-    if (iv->iter_type == kCommReduce) {
-      auto vit = dom_map.find(iv);
-      CHECK(vit != dom_map.end());
-      const Range& vrange = vit->second;
-      conds.push_back(likely(iv->var > vrange->min));
-      banned.insert(iv->var.get());
-    }
-  }
-  for (const Expr& pred : n.main_predicates) {
-    if (ir::ExprUseVar(pred, banned)) {
-      LOG(FATAL) << "Tensorize update transform failed, the condition "
-                 << pred << " has a conflict with the reset condition";
-    }
-  }
-
-  return IfThenElse::make(arith::ComputeReduce<ir::Or>(conds, const_true(1)),
-                          update, body);
-}
-
 Stmt MakeTensorize(const ComputeOpNode* self,
                    const Stage& stage,
                    const std::unordered_map<IterVar, Range>& dom_map,
diff --git a/src/pass/arg_binder.cc b/src/pass/arg_binder.cc
index 390c918d9692..623886c31b86 100644
--- a/src/pass/arg_binder.cc
+++ b/src/pass/arg_binder.cc
@@ -6,8 +6,8 @@
 #include <tvm/ir.h>
 #include <tvm/ir_pass.h>
 #include <tvm/runtime/device_api.h>
-#include "./ir_util.h"
-#include "./arg_binder.h"
+#include "ir_util.h"
+#include "arg_binder.h"
 #include "../arithmetic/compute_expr.h"
 
 namespace tvm {
@@ -91,7 +91,9 @@ void ArgBinder::BindBuffer(const Buffer& arg,
   // bind pointer and offset.
   if (is_zero(arg->elem_offset)) {
     CHECK(is_zero(value->elem_offset))
-        << "Trying to bind a Buffer with offset into one without offset";
+        << "Trying to bind a Buffer with offset into one without offset "
+        << " required elem_offset=" << arg->elem_offset
+        << ", provided elem_offset=" << value->elem_offset;
   }
 
   this->Bind(arg->data, value->data, arg_name + ".data");
diff --git a/src/pass/bound_checker.cc b/src/pass/bound_checker.cc
new file mode 100644
index 000000000000..a7c03d0d1d60
--- /dev/null
+++ b/src/pass/bound_checker.cc
@@ -0,0 +1,195 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file bounds_checker.cc
+ */
+// Instrument checkers for out of the bounds access.
+
+#include <tvm/ir.h>
+#include <tvm/ir_mutator.h>
+#include <tvm/ir_pass.h>
+#include <tvm/ir_visitor.h>
+#include <vector>
+#include <unordered_map>
+#include <utility>
+
+namespace tvm {
+namespace ir {
+
+class BoundCollector : public IRVisitor {
+ public:
+  BoundCollector() {}
+
+  void Visit_(const AttrStmt *op) {
+    if (op->attr_key == ir::attr::buffer_bound) {
+      if (const Variable *key = op->node.as<Variable>()) {
+        mem_to_shape[key] = op->value;
+      }
+    }
+    IRVisitor::Visit_(op);
+  }
+  // Hashtable which maps buffer_var to shape.
+  std::unordered_map<const Variable *, Expr> mem_to_shape;
+};
+
+class BoundChecker : public IRMutator {
+ public:
+  explicit BoundChecker(
+      const std::unordered_map<const Variable *, Expr> &mem_to_shape)
+      : mem_to_shape_(mem_to_shape) {}
+
+  Stmt Mutate_(const Allocate *op, const Stmt &s) final {
+    // If the shape was updated we should update the hashtable.
+    if (UpdateIsNeeded(op->buffer_var)) {
+      Update(op->buffer_var, op->extents, op->type);
+    }
+    return IRMutator::Mutate_(op, s);
+  }
+
+  Expr Mutate_(const Call *op, const Expr &ex) final {
+    if (process_store_ && op->is_intrinsic(intrinsic::tvm_if_then_else)) {
+      unsafe_rewritten_ = true;
+    }
+    return IRMutator::Mutate_(op, ex);
+  }
+
+  Stmt Mutate_(const Store *op, const Stmt &s) final {
+    store_scope_bound_collector_.clear();
+    process_store_ = true;
+    unsafe_rewritten_ = false;
+    IRMutator::Mutate_(op, s);
+    process_store_ = false;
+    if (CanInstrument(op->index, op->buffer_var)) {
+      Collect(op->index, op->buffer_var);
+    }
+    // The collector should has at least one item.
+    if (store_scope_bound_collector_.size()) {
+      Expr condition = MakeCondition();
+      if (!condition.as<StringImm>()) {
+        Stmt nop = Evaluate::make(1);
+        Stmt then_case =
+            Store::make(op->buffer_var, op->value, op->index, op->predicate);
+        Stmt else_case =
+            AssertStmt::make(condition, StringImm::make(error_message_), nop);
+        Stmt body = IfThenElse::make(condition, then_case, else_case);
+        return body;
+      }
+    }
+    return s;
+  }
+
+  Expr Mutate_(const Load *op, const Expr &ex) final {
+    if (CanInstrument(op->index, op->buffer_var)) {
+      Collect(op->index, op->buffer_var);
+    }
+    return IRMutator::Mutate_(op, ex);
+  }
+
+ private:
+  bool UpdateIsNeeded(const VarExpr &buffer_var) const {
+    return (buffer_var.defined() && mem_to_shape_.count(buffer_var.get()));
+  }
+
+  void Update(const VarExpr &buffer_var, const Array<Expr> &new_shape,
+              const Type &type) {
+    // Sanity check at first.
+    if (!new_shape.size()) {
+      return;
+    }
+
+    for (size_t i = 0; i < new_shape.size(); ++i) {
+      if (!new_shape[0].defined() || !new_shape[i].type().is_scalar() ||
+          is_negative_const(new_shape[i])) {
+        return;
+      }
+    }
+
+    // Scalarize the shape.
+    Expr shape = Mul::make(make_const(UInt(64), type.lanes()),
+                           Cast::make(UInt(64), new_shape[0]));
+    for (size_t i = 1; i < new_shape.size(); ++i) {
+      // Cast to unsigned to avoid integer overlow at frist.
+      shape = Mul::make(shape, Mul::make(make_const(UInt(64), type.lanes()),
+                                         Cast::make(UInt(64), new_shape[i])));
+    }
+    mem_to_shape_[buffer_var.get()] = shape;
+  }
+
+  bool IndexIsValid(const Expr &index) const {
+    if (!index.defined()) {
+      return false;
+    }
+
+    if (const Ramp *ramp_index = index.as<Ramp>()) {
+      return ramp_index->base.defined() &&
+             ramp_index->base.type().is_scalar() &&
+             ramp_index->stride.defined() &&
+             ramp_index->stride.type().is_scalar() && (ramp_index->lanes > 0);
+    }
+    return true;
+  }
+
+  bool CanInstrument(const Expr &index, const VarExpr &buffer_var) const {
+    return buffer_var.defined() && mem_to_shape_.count(buffer_var.get()) &&
+           IndexIsValid(index) && !unsafe_rewritten_;
+  }
+
+  void Collect(Expr index, VarExpr buffer_var) {
+    store_scope_bound_collector_.push_back(
+        std::make_pair(index, mem_to_shape_[buffer_var.get()]));
+  }
+
+  Expr MakeCondition() {
+    Expr condition;
+    for (size_t i = 0; i < store_scope_bound_collector_.size(); ++i) {
+      std::pair<Expr, Expr> buffer_to_mem = store_scope_bound_collector_[i];
+      Expr index = buffer_to_mem.first;
+      Expr upper_bound = buffer_to_mem.second;
+
+      if (const Ramp *ramp_index = index.as<Ramp>()) {
+        // In case index is base + stride * i.
+        // Non inclusive range.
+        index = Add::make(
+            ramp_index->base,
+            Mul::make(ramp_index->stride, make_const(ramp_index->stride.type(),
+                                                     ramp_index->lanes - 1)));
+      }
+
+      // Try to simplify index and bound.
+      index = ir::Simplify(index);
+      upper_bound = ir::Simplify(upper_bound);
+
+      // Cast to the same type - signed, to be able to check lower bound.
+      index = Cast::make(Int(64), index);
+      upper_bound = Cast::make(Int(64), upper_bound);
+
+      // Looks like a lower bound should always be zero after normalization.
+      Expr lower_bound = make_zero(Int(64));
+
+      Expr current_condition =
+          And::make(GE::make(index, lower_bound), LT::make(index, upper_bound));
+      condition =
+          !i ? current_condition : And::make(condition, current_condition);
+    }
+    return condition;
+  }
+
+  // Whether we process store value recursively.
+  bool process_store_{false};
+  // Whether we face tvm_if_then_else intrinsic.
+  bool unsafe_rewritten_{false};
+  // Pool which collects the pair of index and shape for specific store/load.
+  std::vector<std::pair<Expr, Expr>> store_scope_bound_collector_;
+  // Error message.
+  const char *const error_message_ = "OUT OF THE BOUNDS";
+  // Hashtable which maps buffer_var to shape.
+  std::unordered_map<const Variable *, Expr> mem_to_shape_;
+};
+
+Stmt InstrumentBoundCheckers(Stmt stmt) {
+  BoundCollector bound_collector;
+  // At first walk recursively and collect bound attributes.
+  bound_collector.Visit(stmt);
+  return BoundChecker(bound_collector.mem_to_shape).Mutate(stmt);
+}
+}  // namespace ir
+}  // namespace tvm
diff --git a/src/pass/combine_context_call.cc b/src/pass/combine_context_call.cc
index dff91e6690f2..d60256bcfcf0 100644
--- a/src/pass/combine_context_call.cc
+++ b/src/pass/combine_context_call.cc
@@ -90,7 +90,7 @@ class ContextCallCombiner final : public IRMutator {
 };
 
 LoweredFunc CombineContextCall(LoweredFunc f) {
-  auto n = std::make_shared<LoweredFuncNode>(*f.operator->());
+  auto n = make_node<LoweredFuncNode>(*f.operator->());
   n->body = ContextCallCombiner().Combine(n->body);
   return LoweredFunc(n);
 }
diff --git a/src/pass/coproc_sync.cc b/src/pass/coproc_sync.cc
index b3e64a989702..13dfef107e87 100644
--- a/src/pass/coproc_sync.cc
+++ b/src/pass/coproc_sync.cc
@@ -8,8 +8,8 @@
 #include <tvm/ir_visitor.h>
 #include <unordered_map>
 #include <unordered_set>
-#include "./ir_util.h"
-#include "./storage_access.h"
+#include "ir_util.h"
+#include "storage_access.h"
 
 namespace tvm {
 namespace ir {
diff --git a/src/pass/detect_device.cc b/src/pass/detect_device.cc
new file mode 100644
index 000000000000..c5fb0dd1b8f3
--- /dev/null
+++ b/src/pass/detect_device.cc
@@ -0,0 +1,21 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file detect_device.cc
+ */
+
+#include <tvm/ir_pass.h>
+#include <tvm/ir_mutator.h>
+#include "../pass/ir_util.h"
+
+namespace tvm {
+namespace ir {
+Stmt DecorateDeviceScope(Stmt stmt) {
+  Stmt body = AttrStmt::make(make_zero(Int(32)),
+                             ir::attr::device_scope,
+                             0,
+                             stmt);
+  return body;
+}
+
+}  // namespace ir
+}  // namespace tvm
diff --git a/src/pass/inject_double_buffer.cc b/src/pass/inject_double_buffer.cc
index 03ffdb01e107..1384ea1a89ac 100644
--- a/src/pass/inject_double_buffer.cc
+++ b/src/pass/inject_double_buffer.cc
@@ -7,7 +7,7 @@
 #include <tvm/ir_pass.h>
 #include <tvm/ir_visitor.h>
 #include <tvm/ir_mutator.h>
-#include "./ir_util.h"
+#include "ir_util.h"
 #include "../arithmetic/compute_expr.h"
 
 namespace tvm {
diff --git a/src/pass/inject_virtual_thread.cc b/src/pass/inject_virtual_thread.cc
index 833513756053..3fc2e24fb4f1 100644
--- a/src/pass/inject_virtual_thread.cc
+++ b/src/pass/inject_virtual_thread.cc
@@ -321,7 +321,7 @@ class VTInjector : public IRMutator {
     CHECK_EQ(max_loop_depth_, 0);
     Stmt then_case = this->Mutate(op->then_case);
     Stmt else_case;
-    if (else_case.defined()) {
+    if (op->else_case.defined()) {
       int temp = max_loop_depth_;
       max_loop_depth_ = 0;
       else_case = this->Mutate(op->else_case);
@@ -430,7 +430,8 @@ class VTInjector : public IRMutator {
     } else {
       // insert a for loop
       Var idx(var_->name_hint + ".s", var_->type);
-      stmt = Substitute(stmt, {{var_, idx}});
+      Map<Var, Expr> values{{var_, idx}};
+      stmt = Substitute(stmt, values);
       return For::make(idx, make_zero(idx.type()),
                        make_const(idx.type(), num_threads_),
                        ForType::Serial, DeviceAPI::None, stmt);
diff --git a/src/pass/ir_deep_compare.cc b/src/pass/ir_deep_compare.cc
index 8a1b09e49339..2c0168ea5460 100644
--- a/src/pass/ir_deep_compare.cc
+++ b/src/pass/ir_deep_compare.cc
@@ -418,6 +418,19 @@ bool Equal(const Stmt& lhs, const Stmt& rhs) {
 }
 
 bool Equal(const Expr& lhs, const Expr& rhs) {
+  // quick pass for constant expressions.
+  if (const int64_t *a = as_const_int(lhs)) {
+    if (const int64_t *b = as_const_int(rhs)) {
+      return a[0] == b[0];
+    }
+  }
+  if (!lhs.defined()) {
+    if (rhs.defined()) return false;
+    if (!rhs.defined()) return true;
+  } else {
+    if (!rhs.defined()) return false;
+  }
+  // deep comparison.
   return IRDeepCompare().Equal(lhs, rhs);
 }
 
diff --git a/src/pass/ir_mutator.cc b/src/pass/ir_mutator.cc
index 9ca9ccd190ff..e82c4f554be0 100644
--- a/src/pass/ir_mutator.cc
+++ b/src/pass/ir_mutator.cc
@@ -5,7 +5,7 @@
 #include <tvm/ir.h>
 #include <tvm/ir_mutator.h>
 #include <tvm/packed_func_ext.h>
-#include "./ir_util.h"
+#include "ir_util.h"
 
 namespace tvm {
 namespace ir {
diff --git a/src/pass/ir_util.cc b/src/pass/ir_util.cc
index 579706ca9964..89426f982ba8 100644
--- a/src/pass/ir_util.cc
+++ b/src/pass/ir_util.cc
@@ -3,7 +3,7 @@
  * \file ir_util.cc
  * \brief Helper functions to construct and compose IR nodes.
  */
-#include "./ir_util.h"
+#include "ir_util.h"
 
 namespace tvm {
 namespace ir {
@@ -13,38 +13,38 @@ Stmt MergeNest(const std::vector<Stmt>& nest, Stmt body) {
   for (auto ri = nest.rbegin(); ri != nest.rend(); ++ri) {
     Stmt s = *ri;
     if (s.as<For>()) {
-      auto n = std::make_shared<For>(*s.as<For>());
+      auto n = make_node<For>(*s.as<For>());
       CHECK(is_no_op(n->body));
       n->body = body;
       body = Stmt(n);
     } else if (s.as<LetStmt>()) {
-      auto n = std::make_shared<LetStmt>(*s.as<LetStmt>());
+      auto n = make_node<LetStmt>(*s.as<LetStmt>());
       CHECK(is_no_op(n->body));
       n->body = body;
       body = Stmt(n);
     } else if (s.as<AttrStmt>()) {
-      auto n = std::make_shared<AttrStmt>(*s.as<AttrStmt>());
+      auto n = make_node<AttrStmt>(*s.as<AttrStmt>());
       CHECK(is_no_op(n->body));
       n->body = body;
       body = Stmt(n);
     } else if (s.as<IfThenElse>()) {
-      auto n = std::make_shared<IfThenElse>(*s.as<IfThenElse>());
+      auto n = make_node<IfThenElse>(*s.as<IfThenElse>());
       CHECK(is_no_op(n->then_case));
       CHECK(!n->else_case.defined());
       n->then_case = body;
       body = Stmt(n);
     } else if (s.as<Block>()) {
-      auto n = std::make_shared<Block>(*s.as<Block>());
+      auto n = make_node<Block>(*s.as<Block>());
       CHECK(is_no_op(n->rest));
       n->rest = body;
       body = Stmt(n);
     } else if (s.as<AssertStmt>()) {
-      auto n = std::make_shared<AssertStmt>(*s.as<AssertStmt>());
+      auto n = make_node<AssertStmt>(*s.as<AssertStmt>());
       CHECK(is_no_op(n->body));
       n->body = body;
       body = Stmt(n);
     } else if (s.as<Allocate>()) {
-      auto n = std::make_shared<Allocate>(*s.as<Allocate>());
+      auto n = make_node<Allocate>(*s.as<Allocate>());
       CHECK(is_no_op(n->body));
       n->body = body;
       body = Stmt(n);
diff --git a/src/pass/ir_util.h b/src/pass/ir_util.h
index f871133fb74f..3cef4486ee1b 100644
--- a/src/pass/ir_util.h
+++ b/src/pass/ir_util.h
@@ -7,6 +7,7 @@
 #define TVM_PASS_IR_UTIL_H_
 
 #include <tvm/ir.h>
+#include <tvm/ir_operator.h>
 #include <tvm/runtime/device_api.h>
 #include <vector>
 
@@ -75,7 +76,7 @@ inline Expr TVMStructGet(
   Array<Expr> args ={
     handle,
     make_const(Int(32), index),
-    make_const(Int(32), kind)};
+    make_const(Int(32), static_cast<int>(kind))};
   return Call::make(dtype, intrinsic::tvm_struct_get, args, Call::PureIntrinsic);
 }
 
@@ -125,7 +126,7 @@ inline Stmt TVMStructSet(
   Array<Expr> args ={
     handle,
     make_const(Int(32), index),
-    make_const(Int(32), kind),
+    make_const(Int(32), static_cast<int>(kind)),
     value};
   return Evaluate::make(
       Call::make(Int(32), intrinsic::tvm_struct_set, args, Call::Intrinsic));
diff --git a/src/pass/lift_attr_scope.cc b/src/pass/lift_attr_scope.cc
index a3a60aaac4d1..d5fd53812b99 100644
--- a/src/pass/lift_attr_scope.cc
+++ b/src/pass/lift_attr_scope.cc
@@ -7,7 +7,7 @@
  */
 #include <tvm/ir_pass.h>
 #include <tvm/ir_mutator.h>
-#include "./ir_util.h"
+#include "ir_util.h"
 
 namespace tvm {
 namespace ir {
diff --git a/src/pass/loop_partition.cc b/src/pass/loop_partition.cc
index 0de8a88edb00..95ce130785d7 100644
--- a/src/pass/loop_partition.cc
+++ b/src/pass/loop_partition.cc
@@ -239,11 +239,16 @@ class ThreadPartitionInserter : public IRMutator {
 // Try to do partition at the candidate IRs
 class LoopPartitioner : public IRMutator {
  public:
-  explicit LoopPartitioner(std::unordered_set<const Node*> candidates)
-    : candidates_(candidates) {}
+  explicit LoopPartitioner(bool split_const_loop)
+      : selector(CandidateSelector(split_const_loop)) {}
+
+  Stmt VisitAndMutate(const Stmt& stmt) {
+    selector.Visit(stmt);
+    return Mutate(stmt);
+  }
 
   Stmt Mutate_(const For* op, const Stmt& stmt) {
-    if (candidates_.count(op)) {
+    if (selector.candidates.count(op)) {
       Stmt s = TryPartition(op, stmt, op->loop_var,
           op->min, op->min + op->extent - 1, op->body, false);
       if (s.defined()) return s;
@@ -266,7 +271,7 @@ class LoopPartitioner : public IRMutator {
     const IterVarNode *iv = op->node.as<IterVarNode>();
     CHECK(iv);
     Var var = iv->var;
-    if (candidates_.count(op)) {
+    if (selector.candidates.count(op)) {
       Stmt s = TryPartition(op, stmt, var, 0, op->value - 1, op->body, true);
       if (s.defined()) return s;
     }
@@ -295,9 +300,9 @@ class LoopPartitioner : public IRMutator {
   inline Stmt MakeFor(const Node* op, Expr extent, Stmt body);
 
   /* Candidate IRs that may be partitioned potentially */
-  std::unordered_set<const Node*> candidates_;
   std::unordered_map<const Variable*, IntSet> hint_map_;
   std::unordered_map<const Variable*, IntSet> relax_map_;
+  CandidateSelector selector;
 };
 
 Stmt LoopPartitioner::TryPartition(const Node* node,
@@ -322,7 +327,7 @@ Stmt LoopPartitioner::TryPartition(const Node* node,
   Expr body_begin;
   Stmt pre_stmt;
   if (true_itrv.as<arith::IntervalSet>()->i.has_lower_bound()) {
-    body_begin = true_itrv.min();
+    body_begin = ir::Simplify(true_itrv.min());
     if (!can_prove(body_begin == min)) {
       Expr cond = (body_begin - min >= 0);
       if (!can_prove(cond)) {
@@ -343,7 +348,7 @@ Stmt LoopPartitioner::TryPartition(const Node* node,
   Expr post_doubt_begin;
   Stmt post_stmt;
   if (true_itrv.as<arith::IntervalSet>()->i.has_upper_bound()) {
-    post_doubt_begin = true_itrv.max() + 1;
+    post_doubt_begin = ir::Simplify(true_itrv.max() + 1);
     if (!can_prove(true_itrv.max() == max)) {
       // require the extent to be non-negative
       Expr cond = (max - post_doubt_begin + 1 >= 0);
@@ -354,8 +359,17 @@ Stmt LoopPartitioner::TryPartition(const Node* node,
       }
       // [post_doubt_begin, max]
       if (!partition_thread_scope) {
-        Stmt post_body = Substitute(body, {{Var{var}, var + post_doubt_begin}});
-        post_stmt = MakeFor(node, max - post_doubt_begin + 1, post_body);
+        Stmt post_body;
+        // If the loop is going from 0 to 1, replace the loop var with min value
+        if (as_const_int(max) && as_const_int(post_doubt_begin)) {
+            if (*as_const_int(max) == *as_const_int(post_doubt_begin)) {
+                post_body = Substitute(body, {{Var{var}, post_doubt_begin}});
+                post_stmt = post_body;
+            }
+        } else {
+            post_body = Substitute(body, {{Var{var}, var + post_doubt_begin}});
+            post_stmt = MakeFor(node, max - post_doubt_begin + 1, post_body);
+        }
       }
     }
   } else {
@@ -368,8 +382,15 @@ Stmt LoopPartitioner::TryPartition(const Node* node,
     Stmt simplified_body = ConditionEliminator(partitions).Mutate(body);
     Stmt new_body = Substitute(simplified_body, {{Var{var}, var + body_begin}});
     s = MakeFor(node, post_doubt_begin - body_begin, new_body);
-    if (pre_stmt.defined())  s = Block::make(pre_stmt, s);
-    if (post_stmt.defined()) s = Block::make(s, post_stmt);
+
+    if (!(pre_stmt.defined() && post_stmt.defined())) s = VisitAndMutate(s);
+    if (pre_stmt.defined()) s = Block::make(pre_stmt, s);
+    if (post_stmt.defined()) {
+      if (as_const_int(max) && as_const_int(post_doubt_begin)) {
+        post_stmt = VisitAndMutate(post_stmt);
+      }
+      s = Block::make(s, post_stmt);
+    }
   } else {
     Expr cond = const_true();
     if (!can_prove(body_begin == min)) cond = cond && (var >= body_begin);
@@ -402,9 +423,7 @@ class RemoveLikelyTags : public IRMutator {
 };
 
 Stmt LoopPartition(Stmt stmt, bool split_const_loop) {
-  CandidateSelector selector(split_const_loop);
-  selector.Visit(stmt);
-  stmt = LoopPartitioner(selector.candidates).Mutate(stmt);
+  stmt = LoopPartitioner(split_const_loop).VisitAndMutate(stmt);
   stmt = RemoveLikelyTags().Mutate(stmt);
   return stmt;
 }
diff --git a/src/pass/lower_intrin.cc b/src/pass/lower_intrin.cc
index 33ac6a94ecf7..1a9caf4b591e 100644
--- a/src/pass/lower_intrin.cc
+++ b/src/pass/lower_intrin.cc
@@ -8,7 +8,7 @@
 #include <tvm/ir_pass.h>
 #include <tvm/api_registry.h>
 #include <unordered_set>
-#include "./ir_util.h"
+#include "ir_util.h"
 
 namespace tvm {
 namespace ir {
@@ -104,7 +104,7 @@ class IntrinInjecter : public IRMutator {
 
 LoweredFunc
 LowerIntrin(LoweredFunc f, const std::string& target) {
-  auto n = std::make_shared<LoweredFuncNode>(*f.operator->());
+  auto n = make_node<LoweredFuncNode>(*f.operator->());
   n->body = IntrinInjecter(target).Mutate(n->body);
   return LoweredFunc(n);
 }
diff --git a/src/pass/lower_thread_allreduce.cc b/src/pass/lower_thread_allreduce.cc
index 8c0eb037d953..2f700ed9112d 100644
--- a/src/pass/lower_thread_allreduce.cc
+++ b/src/pass/lower_thread_allreduce.cc
@@ -7,7 +7,7 @@
 #include <tvm/ir_mutator.h>
 #include <tvm/ir_pass.h>
 #include <unordered_set>
-#include "./ir_util.h"
+#include "ir_util.h"
 #include "../arithmetic/compute_expr.h"
 #include "../runtime/thread_storage_scope.h"
 
@@ -317,7 +317,7 @@ class ThreadAllreduceBuilder final : public IRMutator {
 LoweredFunc
 LowerThreadAllreduce(LoweredFunc f, int warp_size) {
   CHECK_NE(f->func_type, kHostFunc);
-  auto n = std::make_shared<LoweredFuncNode>(*f.operator->());
+  auto n = make_node<LoweredFuncNode>(*f.operator->());
   n->body = ThreadAllreduceBuilder(warp_size).Mutate(n->body);
   return LoweredFunc(n);
 }
diff --git a/src/pass/lower_tvm_builtin.cc b/src/pass/lower_tvm_builtin.cc
index a63fef07bd12..cf3d9f7eeeb1 100644
--- a/src/pass/lower_tvm_builtin.cc
+++ b/src/pass/lower_tvm_builtin.cc
@@ -7,7 +7,7 @@
 #include <tvm/ir_mutator.h>
 #include <tvm/ir_pass.h>
 #include <unordered_set>
-#include "./ir_util.h"
+#include "ir_util.h"
 #include "../arithmetic/compute_expr.h"
 
 namespace tvm {
@@ -288,7 +288,7 @@ class BuiltinLower : public IRMutator {
 };
 
 LoweredFunc LowerTVMBuiltin(LoweredFunc f) {
-  auto n = std::make_shared<LoweredFuncNode>(*f.operator->());
+  auto n = make_node<LoweredFuncNode>(*f.operator->());
   n->body = BuiltinLower().Build(n->body);
   return LoweredFunc(n);
 }
diff --git a/src/pass/lower_warp_memory.cc b/src/pass/lower_warp_memory.cc
index 8f153fd61188..01ab2b51752e 100644
--- a/src/pass/lower_warp_memory.cc
+++ b/src/pass/lower_warp_memory.cc
@@ -13,7 +13,7 @@
 #include <tvm/ir_visitor.h>
 #include <tvm/ir_pass.h>
 #include <unordered_set>
-#include "./ir_util.h"
+#include "ir_util.h"
 #include "../arithmetic/compute_expr.h"
 #include "../runtime/thread_storage_scope.h"
 
@@ -93,7 +93,7 @@ class WarpStoreCoeffFinder : private IRVisitor {
         arith::DetectLinearEquation(index, {warp_index_});
     CHECK_EQ(m.size(), 2U)
         << "LowerWarpMemory failed due to store index=" << index;
-    int coeff;
+    int coeff = 0;
     Expr mcoeff = ir::Simplify(m[0]);
 
     CHECK(arith::GetConstInt(mcoeff, &coeff) && coeff > 0)
@@ -317,7 +317,7 @@ class WarpMemoryRewriter : private IRMutator {
 LoweredFunc
 LowerWarpMemory(LoweredFunc f, int warp_size) {
   CHECK_EQ(f->func_type, kDeviceFunc);
-  auto n = std::make_shared<LoweredFuncNode>(*f.operator->());
+  auto n = make_node<LoweredFuncNode>(*f.operator->());
   n->body = WarpMemoryRewriter(warp_size).Rewrite(n->body);
   return LoweredFunc(n);
 }
diff --git a/src/pass/make_api.cc b/src/pass/make_api.cc
index 206bd95010ce..41f92ad24085 100644
--- a/src/pass/make_api.cc
+++ b/src/pass/make_api.cc
@@ -12,8 +12,8 @@
 #include <utility>
 #include <unordered_set>
 
-#include "./ir_util.h"
-#include "./arg_binder.h"
+#include "ir_util.h"
+#include "arg_binder.h"
 #include "../arithmetic/compute_expr.h"
 
 namespace tvm {
@@ -132,7 +132,7 @@ LoweredFunc MakeAPI(Stmt body,
     }
   }
 
-  std::shared_ptr<LoweredFuncNode> n = std::make_shared<LoweredFuncNode>();
+  NodePtr<LoweredFuncNode> n = make_node<LoweredFuncNode>();
   n->name = name;
   n->args = args;
   n->handle_data_type = binder.def_handle_dtype();
@@ -197,7 +197,7 @@ class DeviceTypeBinder: public IRMutator {
 
 LoweredFunc BindDeviceType(LoweredFunc f,
                            int device_type) {
-  auto n = std::make_shared<LoweredFuncNode>(*f.operator->());
+  auto n = make_node<LoweredFuncNode>(*f.operator->());
   n->body = DeviceTypeBinder(device_type).Mutate(n->body);
   return LoweredFunc(n);
 }
diff --git a/src/pass/narrow_channel_access.cc b/src/pass/narrow_channel_access.cc
index 733eeffb632e..7faf7d1b173e 100644
--- a/src/pass/narrow_channel_access.cc
+++ b/src/pass/narrow_channel_access.cc
@@ -11,7 +11,7 @@
 #include <tvm/ir_mutator.h>
 #include <tvm/arithmetic.h>
 #include <tvm/channel.h>
-#include "./ir_util.h"
+#include "ir_util.h"
 
 namespace tvm {
 namespace ir {
diff --git a/src/pass/remap_thread_axis.cc b/src/pass/remap_thread_axis.cc
index 94e4819a1d71..08a62b25e2c4 100644
--- a/src/pass/remap_thread_axis.cc
+++ b/src/pass/remap_thread_axis.cc
@@ -67,7 +67,7 @@ RemapThreadAxis(LoweredFunc f, Map<Expr, IterVar> thread_map) {
   }
 
   CHECK_EQ(f->func_type, kDeviceFunc);
-  auto n = std::make_shared<LoweredFuncNode>(*f.operator->());
+  auto n = make_node<LoweredFuncNode>(*f.operator->());
   // replace the thread axis
   for (size_t i = 0; i < n->thread_axis.size(); ++i) {
     auto it = tmap.find(n->thread_axis[i]->thread_tag);
diff --git a/src/pass/split_host_device.cc b/src/pass/split_host_device.cc
index c7b20e137638..4cfbc7c90d8c 100644
--- a/src/pass/split_host_device.cc
+++ b/src/pass/split_host_device.cc
@@ -153,7 +153,8 @@ class HostDeviceSplitter : public IRMutator {
 
   Stmt Mutate_(const AttrStmt *op, const Stmt& s) final {
     if (op->attr_key == attr::thread_extent ||
-        op->attr_key == attr::pipeline_exec_scope) {
+        op->attr_key == attr::pipeline_exec_scope ||
+        op->attr_key == attr::device_scope) {
       return SplitDeviceFunc(s);
     }
     return IRMutator::Mutate_(op, s);
@@ -165,8 +166,8 @@ class HostDeviceSplitter : public IRMutator {
       handle_data_type_[kv.first.get()] = kv.second;
     }
     name_ = f->name;
-    std::shared_ptr<LoweredFuncNode> n =
-        std::make_shared<LoweredFuncNode>(*f.operator->());
+    NodePtr<LoweredFuncNode> n =
+        make_node<LoweredFuncNode>(*f.operator->());
     n->body = this->Mutate(f->body);
     n->func_type = kHostFunc;
     Array<LoweredFunc> ret{LoweredFunc(n)};
@@ -180,7 +181,7 @@ class HostDeviceSplitter : public IRMutator {
   Stmt SplitDeviceFunc(Stmt body) {
     std::ostringstream os;
     os << name_ << "_kernel" << device_funcs_.size();
-    std::shared_ptr<LoweredFuncNode> n = std::make_shared<LoweredFuncNode>();
+    NodePtr<LoweredFuncNode> n = make_node<LoweredFuncNode>();
     // isolate the device function.
     IRUseDefAnalysis m;
     m.visit_thread_extent_ = false;
diff --git a/src/pass/split_pipeline.cc b/src/pass/split_pipeline.cc
index 38bd5f86fd68..c143a0d19153 100644
--- a/src/pass/split_pipeline.cc
+++ b/src/pass/split_pipeline.cc
@@ -11,7 +11,7 @@
 #include <tvm/channel.h>
 #include <unordered_map>
 #include <unordered_set>
-#include "./ir_util.h"
+#include "ir_util.h"
 
 namespace tvm {
 namespace ir {
@@ -102,9 +102,8 @@ class MarkChannelAccess : public IRMutator {
     } else {
       alloc_size = op->extents[0];
       for (size_t i = 1; i < op->extents.size(); ++i) {
-        alloc_size *= op->extents[i];
+        alloc_size = alloc_size * op->extents[i];
       }
-      alloc_size = ir::Simplify(alloc_size);
     }
 
     if (rw.write_count) {
diff --git a/src/pass/storage_access.cc b/src/pass/storage_access.cc
index 09be1a53da42..e7adcc75854f 100644
--- a/src/pass/storage_access.cc
+++ b/src/pass/storage_access.cc
@@ -5,8 +5,9 @@
 #include <tvm/ir_pass.h>
 #include <tvm/ir_mutator.h>
 #include <tvm/target_info.h>
-#include "./ir_util.h"
-#include "./storage_access.h"
+#include <string>
+#include "ir_util.h"
+#include "storage_access.h"
 #include "../arithmetic/compute_expr.h"
 
 namespace tvm {
diff --git a/src/pass/storage_flatten.cc b/src/pass/storage_flatten.cc
index f5cb98495ff9..488d44544c31 100644
--- a/src/pass/storage_flatten.cc
+++ b/src/pass/storage_flatten.cc
@@ -14,8 +14,8 @@
 #include <tvm/target_info.h>
 #include <tvm/runtime/device_api.h>
 #include <unordered_map>
-#include "./ir_util.h"
-#include "./arg_binder.h"
+#include "ir_util.h"
+#include "arg_binder.h"
 #include "../arithmetic/compute_expr.h"
 #include "../runtime/thread_storage_scope.h"
 
@@ -31,7 +31,8 @@ using intrinsic::tvm_address_of;
 class StorageFlattener : public IRMutator {
  public:
   explicit StorageFlattener(Map<Tensor, Buffer> extern_buffer,
-                            int cache_line_size) {
+                            int cache_line_size, bool create_bound_attributes)
+      : create_bound_attributes_(create_bound_attributes) {
     for (auto kv : extern_buffer) {
       BufferEntry e;
       e.buffer = kv.second;
@@ -59,7 +60,8 @@ class StorageFlattener : public IRMutator {
     if (op->attr_key == attr::realize_scope) {
       storage_scope_[op->node.get()] = op->value.as<StringImm>()->value;
       return this->Mutate(op->body);
-    } else if (op->attr_key == attr::double_buffer_scope) {
+    } else if (op->attr_key == attr::double_buffer_scope &&
+               op->node.node_->derived_from<OperationNode>()) {
       Operation func(op->node.node_);
       Stmt body = Mutate(op->body);
       for (int i = 0; i < func->num_outputs(); ++i) {
@@ -100,6 +102,8 @@ class StorageFlattener : public IRMutator {
   }
 
   Stmt Mutate_(const Provide* op, const Stmt& s) final {
+    if (create_bound_attributes_)
+      shape_collector_.clear();
     Stmt stmt = IRMutator::Mutate_(op, s);
     op = stmt.as<Provide>();
     TensorKey key{op->func, op->value_index};
@@ -116,7 +120,20 @@ class StorageFlattener : public IRMutator {
           {e.buffer->data, op->value},
           Call::Intrinsic));
     } else {
-      return e.buffer.vstore(e.RelIndex(op->args), op->value);
+      Stmt body = e.buffer.vstore(e.RelIndex(op->args), op->value);
+      if (create_bound_attributes_ && ShapeIsValid(e.buffer->shape)) {
+        shape_collector_.push_back(
+            std::make_pair(e.buffer->data, e.buffer->shape));
+      }
+      // To create bound attribute collector should has at least one item.
+      if (create_bound_attributes_ && shape_collector_.size()) {
+        for (size_t i = 0; i < shape_collector_.size(); ++i) {
+          body = AttrStmt::make(
+              shape_collector_[i].first, ir::attr::buffer_bound,
+              MakeBound(e.buffer->dtype, shape_collector_[i].second), body);
+        }
+      }
+      return body;
     }
   }
 
@@ -191,10 +208,16 @@ class StorageFlattener : public IRMutator {
       buf_map_[key].released = true;
       Stmt ret;
 
+      Type storage_type = e.buffer->dtype;
+      // specially handle bool, lower its storage
+      // type to be Int(8)(byte)
+      if (storage_type == Bool()) {
+        storage_type = Int(8);
+      }
       if (strides.size() != 0) {
         int first_dim = 0;
         ret = Allocate::make(
-            e.buffer->data, e.buffer->dtype,
+            e.buffer->data, storage_type,
             {arith::ComputeExpr<Mul>(e.buffer->strides[first_dim], e.buffer->shape[first_dim])},
             make_const(Bool(e.buffer->dtype.lanes()), true), body);
       } else {
@@ -203,12 +226,17 @@ class StorageFlattener : public IRMutator {
           shape.push_back(make_const(Int(32), 1));
         }
         ret = Allocate::make(
-            e.buffer->data, e.buffer->dtype, shape,
+            e.buffer->data, storage_type, shape,
             make_const(Bool(e.buffer->dtype.lanes()), true), body);
       }
       ret = AttrStmt::make(
           e.buffer->data, attr::storage_scope,
           StringImm::make(e.buffer->scope), ret);
+
+      if (create_bound_attributes_ && ShapeIsValid(e.buffer->shape)) {
+        ret = AttrStmt::make(e.buffer->data, ir::attr::buffer_bound,
+                             MakeBound(e.buffer->dtype, e.buffer->shape), ret);
+      }
       return ret;
     }
   }
@@ -247,6 +275,11 @@ class StorageFlattener : public IRMutator {
       const BufferEntry& e = it->second;
       CHECK(!e.released)
           << "Read a buffer that is already out of scope";
+
+      if (create_bound_attributes_ && ShapeIsValid(e.buffer->shape)) {
+        shape_collector_.push_back(
+            std::make_pair(e.buffer->data, e.buffer->shape));
+      }
       return e.buffer.vload(e.RelIndex(op->args), e.buffer->dtype);
     } else {
       return expr;
@@ -422,6 +455,31 @@ class StorageFlattener : public IRMutator {
       }
     }
   };
+
+  bool ShapeIsValid(const Array<Expr> &shape) {
+    // Zero-dimensional tensor does not need boundary check.
+    if (!shape.size())
+      return false;
+
+    for (size_t i = 0; i < shape.size(); ++i) {
+      if (!shape[i].defined() || !shape[i].type().is_scalar() ||
+          is_negative_const(shape[i])) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  Expr MakeBound(const Type &type, const Array<Expr> &shape) {
+    // We have already checked the shape size to be greater then 0.
+    Expr bound = Mul::make(make_const(shape[0].type(), type.lanes()), shape[0]);
+    for (size_t i = 1; i < shape.size(); ++i) {
+      bound = Mul::make(
+          bound, Mul::make(make_const(bound.type(), type.lanes()), shape[i]));
+    }
+    return bound;
+  }
+
   // The buffer assignment map
   // Variable remap
   std::unordered_map<const Variable*, Expr> var_remap_;
@@ -433,16 +491,21 @@ class StorageFlattener : public IRMutator {
   std::unordered_map<const Node*, std::string> storage_scope_;
   // The current thread scope.
   std::vector<ThreadScope> curr_thread_scope_;
+  // Collects shapes.
+  std::vector<std::pair<VarExpr, Array<Expr>>> shape_collector_;
   // The size of cacheline
   int cache_line_size_;
   // The current stage is an OpenGL shader.
   bool is_opengl_{false};
+  // Whether to mark load/store with theirs bounds.
+  bool create_bound_attributes_{false};
 };
 
-Stmt StorageFlatten(Stmt stmt,
-                    Map<Tensor, Buffer> extern_buffer,
-                    int cache_line_size) {
-  stmt = StorageFlattener(extern_buffer, cache_line_size).Mutate(stmt);
+Stmt StorageFlatten(Stmt stmt, Map<Tensor, Buffer> extern_buffer,
+                    int cache_line_size, bool create_bound_attributes) {
+  stmt =
+      StorageFlattener(extern_buffer, cache_line_size, create_bound_attributes)
+          .Mutate(stmt);
   return stmt;
 }
 
diff --git a/src/pass/storage_rewrite.cc b/src/pass/storage_rewrite.cc
index 0170499e1491..54f5010f1461 100644
--- a/src/pass/storage_rewrite.cc
+++ b/src/pass/storage_rewrite.cc
@@ -12,7 +12,7 @@
 #include <map>
 #include <unordered_set>
 #include <unordered_map>
-#include "./ir_util.h"
+#include "ir_util.h"
 #include "../arithmetic/compute_expr.h"
 #include "../runtime/thread_storage_scope.h"
 
@@ -578,12 +578,18 @@ class StoragePlanRewriter : public IRMutator {
           combo_size = combo_size / type_bits;
           // round up for can not divided
           if (!divided) {
-             combo_size += make_const(Int(32), 1);
+             combo_size = combo_size + make_const(Int(32), 1);
           }
           combo_size = ir::Simplify(combo_size);
           e->new_alloc = Allocate::make(
               e->alloc_var, alloc_type, {combo_size}, const_true(),
               Evaluate::make(0));
+          if (e->scope.tag.length() != 0) {
+            MemoryInfo info = GetMemoryInfo(e->scope.to_string());
+            uint64_t total_elem = e->const_nbits / e->elem_type.bits();
+            CHECK_LE(total_elem * e->elem_type.bits(), info->max_num_bits)
+                << "Allocation exceed bound of memory tag " << e->scope.to_string();
+          }
         }
       }
     }
@@ -944,8 +950,7 @@ class VectorAllocRewriter : public IRMutator {
 
 
 LoweredFunc PointerValueTypeRewrite(LoweredFunc f) {
-  std::shared_ptr<LoweredFuncNode> n =
-      std::make_shared<LoweredFuncNode>(*f.operator->());
+  auto n = make_node<LoweredFuncNode>(*f.operator->());
   VectorAllocRewriter rewriter;
   n->body = rewriter.Mutate(n->body);
   for (Var arg : f->args) {
diff --git a/src/pass/storage_sync.cc b/src/pass/storage_sync.cc
index 6e2d1020a6b5..6f7fc886fd8c 100644
--- a/src/pass/storage_sync.cc
+++ b/src/pass/storage_sync.cc
@@ -8,8 +8,8 @@
 #include <tvm/ir_visitor.h>
 #include <unordered_map>
 #include <unordered_set>
-#include "./ir_util.h"
-#include "./storage_access.h"
+#include "ir_util.h"
+#include "storage_access.h"
 #include "../runtime/thread_storage_scope.h"
 
 namespace tvm {
@@ -329,7 +329,7 @@ Stmt ThreadSync(Stmt stmt, std::string storage_scope) {
 
 LoweredFunc ThreadSync(LoweredFunc f, std::string storage_scope) {
   CHECK_NE(f->func_type, kHostFunc);
-  auto n = std::make_shared<LoweredFuncNode>(*f.operator->());
+  auto n = make_node<LoweredFuncNode>(*f.operator->());
   n->body = ThreadSync(f->body, storage_scope);
   return LoweredFunc(n);
 }
diff --git a/src/pass/unroll_loop.cc b/src/pass/unroll_loop.cc
index 6c0ac517553f..d4481e86c0fc 100644
--- a/src/pass/unroll_loop.cc
+++ b/src/pass/unroll_loop.cc
@@ -76,7 +76,9 @@ class LoopUnroller : public IRMutator {
       normal_loop_depth_ += 1;
     }
 
-    if (auto_unroll && explicit_unroll_) {
+    if ((auto_unroll && explicit_unroll_) ||
+        // unroll loops with extent = 1, no matter how many steps in body
+        (value <= auto_max_extent_ && auto_max_extent_ == 1)) {
       return Unroll(op);
     } else {
       if (auto_unroll) {
diff --git a/src/pass/vectorize_loop.cc b/src/pass/vectorize_loop.cc
index 206b75ed068d..19874a803657 100644
--- a/src/pass/vectorize_loop.cc
+++ b/src/pass/vectorize_loop.cc
@@ -355,7 +355,8 @@ class Vectorizer : public IRMutator {
   // scalarize the statment
   Stmt Scalarize(Stmt stmt) {
     Var idx(var_->name_hint + ".s", var_->type);
-    stmt = Substitute(stmt, {{var_, idx}});
+    Map<Var, Expr> values{{var_, idx}};
+    stmt = Substitute(stmt, values);
     return For::make(idx, 0, var_lanes_, ForType::Serial, DeviceAPI::None, stmt);
   }
 
@@ -437,7 +438,6 @@ class LoopVectorizer : public IRMutator {
   Stmt Mutate_(const For* op, const Stmt& s) final {
     if (op->for_type == ForType::Vectorized) {
       CHECK(is_zero(op->min));
-      CHECK(is_positive_const(op->extent));
       int lanes = 0;
       bool succ = arith::GetConstInt(op->extent, &lanes);
       if (!succ || lanes < 1) {
diff --git a/src/pass/verify_gpu_code.cc b/src/pass/verify_gpu_code.cc
index 363b7c4cf7cc..70908eb43d6b 100644
--- a/src/pass/verify_gpu_code.cc
+++ b/src/pass/verify_gpu_code.cc
@@ -86,17 +86,29 @@ class GPUCodeVerifier : public IRVisitor {
       // record the number of threads in a block
       std::string name = var.get()->name_hint;
       if (name == "threadIdx.x" || name == "threadIdx.y" || name == "threadIdx.z") {
+        size_t length = static_cast<size_t>(extent->value);
         if (!visited_threads_.count(name)) {
           visited_threads_.insert(name);
-          size_t length = static_cast<size_t>(extent->value);
           thread_per_block_ *= length;
 
           if (name == "threadIdx.x") {
             valid_ &= length <= max_thread_x_;
+            thread_x_extent_ = length;
           } else if (name == "threadIdx.y") {
             valid_ &= length <= max_thread_y_;
+            thread_y_extent_ = length;
           } else if (name == "threadIdx.z") {
             valid_ &= length <= max_thread_z_;
+            thread_z_extent_ = length;
+          }
+        } else {
+          // the thread should be bound to axes with the same length
+          if (name == "threadIdx.x") {
+            valid_ &= length == thread_x_extent_;
+          } else if (name == "threadIdx.y") {
+            valid_ &= length == thread_y_extent_;
+          } else if (name == "threadIdx.z") {
+            valid_ &= length == thread_z_extent_;
           }
         }
       }
@@ -111,6 +123,8 @@ class GPUCodeVerifier : public IRVisitor {
   std::unordered_set<const tvm::Variable *> visited_shared_buffers_;
   std::unordered_set<std::string> visited_threads_;
 
+  size_t thread_x_extent_, thread_y_extent_, thread_z_extent_;
+
   size_t local_memory_per_block_;
   size_t shared_memory_per_block_;
   size_t thread_per_block_;
diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc
new file mode 100644
index 000000000000..e36d916f5498
--- /dev/null
+++ b/src/relay/backend/compile_engine.cc
@@ -0,0 +1,382 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file relay/backend/compile_engine.cc
+ * \brief Internal compialtion engine.
+ */
+#include <tvm/schedule.h>
+#include <tvm/packed_func_ext.h>
+#include <tvm/operation.h>
+#include <tvm/runtime/registry.h>
+#include <tvm/relay/pass.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/op_attr_types.h>
+#include <utility>
+#include <limits>
+#include <mutex>
+#include <functional>
+#include "compile_engine.h"
+
+namespace tvm {
+namespace relay {
+
+CCacheKey CCacheKeyNode::make(Function source_func, Target target) {
+  auto n = make_node<CCacheKeyNode>();
+  n->source_func = std::move(source_func);
+  n->target = std::move(target);
+  return CCacheKey(n);
+}
+
+// The getter to get schedule from compile engine.
+// Get schedule from functor.
+class ScheduleGetter :
+      public ExprFunctor<Array<Tensor>(const Expr&)> {
+ public:
+  explicit ScheduleGetter(Target target)
+      : target_(target) {}
+
+  Array<IndexExpr> GetShape(const Array<IndexExpr>& shape) {
+    // for now, we always use int32 shape when possible
+    // even if the result of shape inference becomes int64.
+    Array<IndexExpr> res;
+    for (IndexExpr val : shape) {
+      const int64_t* pval = as_const_int(val);
+      if (pval != nullptr) {
+        CHECK_LE(pval[0], std::numeric_limits<int32_t>::max());
+        CHECK_GE(pval[0], std::numeric_limits<int32_t>::min());
+        res.push_back(ir::IntImm::make(Int(32), *pval));
+      } else {
+        res.push_back(val);
+      }
+    }
+    return res;
+  }
+
+  std::pair<Schedule, CachedFunc> Create(const Function& prim_func) {
+    static auto fschedule =
+        Op::GetAttr<FTVMSchedule>("FTVMSchedule");
+    auto cache_node = make_node<CachedFuncNode>();
+    cache_node->target = target_;
+    for (Var param : prim_func->params) {
+      Array<tvm::Tensor> inputs;
+      if (const auto* ttype = param->checked_type().as<TensorTypeNode>()) {
+        tvm::Tensor tensor = tvm::placeholder(
+            GetShape(ttype->shape), ttype->dtype);
+        cache_node->inputs.push_back(tensor);
+        inputs.push_back(tensor);
+      } else {
+        // flatten tuple of tensor type.
+        const auto* tuple_type = param->type_as<TupleTypeNode>();
+        for (Type field : tuple_type->fields) {
+          const auto* ttype = field.as<TensorTypeNode>();
+          CHECK(ttype != nullptr);
+          tvm::Tensor tensor = tvm::placeholder(
+              GetShape(ttype->shape), ttype->dtype);
+          cache_node->inputs.push_back(tensor);
+          inputs.push_back(tensor);
+        }
+      }
+      memo_[param] = inputs;
+    }
+    readable_name_stream_ << "fused";
+    cache_node->outputs = this->VisitExpr(prim_func->body);
+    cache_node->func_name = readable_name_stream_.str();
+    CachedFunc cfunc(cache_node);
+    CHECK(master_op_.defined());
+    Schedule schedule = fschedule[master_op_](
+        master_attrs_, cache_node->outputs, target_);
+    for (const auto& scalar : scalars_) {
+      schedule[scalar].compute_inline();
+    }
+    return std::make_pair(schedule, cfunc);
+  }
+
+  Array<Tensor> VisitExpr(const Expr& expr) {
+    auto it = memo_.find(expr);
+    if (it != memo_.end()) {
+      return it->second;
+    } else {
+      Array<Tensor> res = ExprFunctor::VisitExpr(expr);
+      memo_[expr] = res;
+      return res;
+    }
+  }
+
+  Array<Tensor> VisitExpr_(const VarNode* op) final {
+    LOG(FATAL) << "Free variable " << op->name_hint();
+    return {};
+  }
+
+  Array<Tensor> VisitExpr_(const ConstantNode* op) final {
+    CHECK(op->is_scalar());
+    void* data = op->data->data;
+    DataType dtype = TVMType2Type(op->data->dtype);
+    Tensor value = tvm::compute({}, [&](const Array<tvm::Var>&) {
+        if (dtype == Int(32)) {
+          return make_const(dtype, static_cast<const int32_t*>(data)[0]);
+        } else if (dtype == Int(64)) {
+          return make_const(dtype, static_cast<const int64_t*>(data)[0]);
+        } else if (dtype == Float(32)) {
+          return make_const(dtype, static_cast<const float*>(data)[0]);
+        } else if (dtype == Float(64)) {
+          return make_const(dtype, static_cast<const double*>(data)[0]);
+        } else if (dtype == Bool()) {
+          return make_const(dtype, static_cast<const uint8_t*>(data)[0]);
+        } else {
+          LOG(FATAL) << "not handled";
+          return tvm::Expr();
+        }
+      });
+    scalars_.push_back(value->op);
+    return {value};
+  }
+
+  Array<Tensor> VisitExpr_(const CallNode* call_node) final {
+    static auto fcompute =
+        Op::GetAttr<FTVMCompute>("FTVMCompute");
+    static auto fpattern =
+        Op::GetAttr<TOpPattern>("TOpPattern");
+
+    Array<Tensor> inputs;
+    int count_tuple = 0;
+    for (Expr arg : call_node->args) {
+      if (arg->checked_type().as<TupleTypeNode>()) {
+        ++count_tuple;
+      }
+      for (Tensor tensor : VisitExpr(arg)) {
+        inputs.push_back(tensor);
+      }
+    }
+    if (count_tuple) {
+      CHECK_EQ(call_node->args.size(), 1U)
+          << "Only allow function with a single tuple input";
+    }
+    CHECK(call_node->op.as<OpNode>())
+        << "Primitive function only allows call into primitive ops";
+    Op op = Downcast<Op>(call_node->op);
+    Array<Tensor> outputs = fcompute[op](
+        call_node->attrs,
+        inputs,
+        call_node->checked_type(),
+        target_);
+
+    int op_pattern = fpattern[op];
+    if (op_pattern >= kCommReduce) {
+      CHECK(!master_op_.defined() || master_op_pattern_ < kCommReduce)
+          << "Two complicated op in a primitive function "
+          << " master=" << master_op_ << " current=" << op;
+    }
+    if (op_pattern >= master_op_pattern_) {
+      master_op_ = op;
+      master_attrs_ = call_node->attrs;
+      master_op_pattern_ = op_pattern;
+    }
+    if (outputs.size() != 1) {
+      const auto* tuple_type =
+          call_node->checked_type().as<TupleTypeNode>();
+      CHECK(tuple_type) << "Expect output to be a tuple type";
+      CHECK_EQ(tuple_type->fields.size(), outputs.size());
+    }
+    readable_name_stream_ << '_' << op->name;
+    return outputs;
+  }
+
+  Array<Tensor> VisitExpr_(const FunctionNode* op) final {
+    LOG(FATAL) << "Do not support sub function";
+    return Array<Tensor>();
+  }
+
+  Array<Tensor> VisitExpr_(const LetNode* op) final {
+    Array<Tensor> val = VisitExpr(op->value);
+    CHECK(!memo_.count(op->var));
+    memo_[op->var] = val;
+    return VisitExpr(op->body);
+  }
+
+  Array<Tensor> VisitExpr_(const TupleNode* op) final {
+    Array<Tensor> fields;
+    for (Expr field : op->fields) {
+      CHECK(field->checked_type().as<TensorTypeNode>())
+          << "Only allow Tuple of Tensor";
+      Array<Tensor> res = VisitExpr(field);
+      CHECK_EQ(res.size(), 1);
+      fields.push_back(res[0]);
+    }
+    return fields;
+  }
+
+  Array<Tensor> VisitExpr_(const TupleGetItemNode* op) final {
+    const auto* tuple_type = op->tuple->type_as<TupleTypeNode>();
+    Array<Tensor> tuple = VisitExpr(op->tuple);
+    CHECK_EQ(tuple_type->fields.size(), tuple.size());
+    CHECK_GE(op->index, 0);
+    CHECK_LT(static_cast<size_t>(op->index), tuple.size());
+    return {tuple[op->index]};
+  }
+
+ private:
+  tvm::Target target_;
+  Op master_op_;
+  Attrs master_attrs_;
+  int master_op_pattern_{0};
+  std::ostringstream readable_name_stream_;
+  std::unordered_map<Expr, Array<Tensor>, NodeHash, NodeEqual> memo_;
+  Array<Operation> scalars_;
+};
+
+
+class CompileEngineImpl : public CompileEngineNode {
+ public:
+  // Lower the function.
+  CachedFunc Lower(const CCacheKey& key)  {
+    return LowerInternal(key)->cached_func;
+  }
+
+  // For now, build one module per function.
+  PackedFunc JIT(const CCacheKey& key) final {
+    CCacheValue value = LowerInternal(key);
+    if (value->packed_func != nullptr) return value->packed_func;
+    // build the function.
+    if (const auto* f = runtime::Registry::Get("relay.backend.build")) {
+      tvm::runtime::Module m = (*f)(value->cached_func->funcs, key->target);
+      value->packed_func = m.GetFunction(value->cached_func->func_name);
+    } else {
+      LOG(FATAL) << "relay.backend.build is not registered";
+    }
+    return value->packed_func;
+  }
+  void Clear() final {
+    cache_.clear();
+  }
+  // List all items in the cache.
+  Array<NodeRef> ListItems() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    Array<NodeRef> items;
+    for (auto& kv : cache_) {
+      items.push_back(kv.first);
+      items.push_back(kv.second);
+    }
+    return items;
+  }
+  /*!
+   * \brief Create schedule for target.
+   * \param source_func The primitive function to be lowered.
+   * \param target The target we want to create schedule for.
+   * \return Pair of schedule and cache.
+   *  The funcs field in cache is not yet populated.
+   */
+  std::pair<Schedule, CachedFunc> CreateSchedule(
+      const Function& source_func, const Target& target) {
+    return ScheduleGetter(target).Create(source_func);
+  }
+
+ private:
+  // implement lowered func
+  CCacheValue LowerInternal(const CCacheKey& key)  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    CCacheValue value;
+    auto it = cache_.find(key);
+    if (it != cache_.end()) {
+      it->second->use_count += 1;
+      if (it->second->cached_func.defined()) return it->second;
+      value = it->second;
+    } else {
+      value = CCacheValue(make_node<CCacheValueNode>());
+      value->use_count = 0;
+      cache_[key] = value;
+    }
+    // Enforce use the target.
+    TargetContext target_ctx(key->target);
+
+    CHECK(!value->cached_func.defined());
+    auto spair = CreateSchedule(key->source_func, key->target);
+    auto cache_node = make_node<CachedFuncNode>(
+        *(spair.second.operator->()));
+    cache_node->func_name = GetUniqeName(cache_node->func_name);
+    // NOTE: array will copy on write.
+    Array<Tensor> all_args = cache_node->inputs;
+    for (Tensor arg : cache_node->outputs) {
+      all_args.push_back(arg);
+    }
+    // lower the function
+    if (const auto* f = runtime::Registry::Get("relay.backend.lower")) {
+      cache_node->funcs = (*f)(
+          spair.first, all_args, cache_node->func_name, key->source_func);
+    } else {
+      LOG(FATAL) << "relay.backend._lower is not registred";
+    }
+    value->cached_func = CachedFunc(cache_node);
+    return value;
+  }
+  /*!
+   * \brief Get unique name from name.
+   * \param name The orginal name.
+   * \return Updated name which is unique.
+   */
+  std::string GetUniqeName(std::string name) {
+    for (size_t i = 0; i < name.length(); ++i) {
+      if (name[i] == '.') name[i] = '_';
+    }
+    while (true) {
+      auto it = name_map_.find(name);
+      if (it == name_map_.end()) {
+        name_map_[name] = 1;
+        return name;
+      } else {
+        std::ostringstream os;
+        os << name << "_" << it->second;
+        ++(it->second);
+        name = os.str();
+      }
+    }
+    return name;
+  }
+  /*! \brief compiler cache lock*/
+  std::mutex mutex_;
+  /*! \brief internal name map to get an unique name */
+  std::unordered_map<std::string, int> name_map_;
+  /*! \brief internal compiler cache */
+  std::unordered_map<CCacheKey, CCacheValue> cache_;
+};
+
+/*! \brief The global compile engine */
+const CompileEngine& CompileEngine::Global() {
+  // intentionally allocate raw pointer to avoid
+  // free during destructuion.
+  static CompileEngine* inst = new CompileEngine(
+      make_node<CompileEngineImpl>());
+  return *inst;
+}
+
+
+TVM_REGISTER_GLOBAL("relay.backend._make_CCacheKey")
+.set_body_typed<CCacheKey(Function, Target)>(CCacheKeyNode::make);
+
+TVM_REGISTER_GLOBAL("relay.backend._CompileEngineGlobal")
+.set_body_typed<CompileEngine()>([]() {
+    return CompileEngine::Global();
+  });
+
+TVM_REGISTER_GLOBAL("relay.backend._CompileEngineClear")
+.set_body_typed<void(const CompileEngine&)>([](CompileEngine self) {
+    self->Clear();
+  });
+
+TVM_REGISTER_GLOBAL("relay.backend._CompileEngineLower")
+.set_body_typed<CachedFunc(CompileEngine, CCacheKey)>(
+    [](CompileEngine self, CCacheKey key) {
+      return self->Lower(key);
+    });
+
+TVM_REGISTER_GLOBAL("relay.backend._CompileEngineJIT")
+.set_body_typed<PackedFunc(CompileEngine, CCacheKey)>(
+    [](CompileEngine self, CCacheKey key) {
+      return self->JIT(key);
+    });
+
+TVM_REGISTER_GLOBAL("relay.backend._CompileEngineListItems")
+.set_body_typed<Array<NodeRef>(CompileEngine)>(
+    [](CompileEngine self){
+      return static_cast<CompileEngineImpl*>(self.operator->())->ListItems();
+    });
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/backend/compile_engine.h b/src/relay/backend/compile_engine.h
new file mode 100644
index 000000000000..40b53ab31e5e
--- /dev/null
+++ b/src/relay/backend/compile_engine.h
@@ -0,0 +1,206 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file relay/backend/compile_engine.h
+ * \brief Internal compialtion engine handle function cache.
+ *  and interface to low level code generation.
+ */
+#ifndef TVM_RELAY_BACKEND_COMPILE_ENGINE_H_
+#define TVM_RELAY_BACKEND_COMPILE_ENGINE_H_
+
+#include <tvm/lowered_func.h>
+#include <tvm/relay/expr.h>
+#include <string>
+#include <functional>
+
+namespace tvm {
+namespace relay {
+
+/*! \brief Node container to represent a cached function. */
+struct CachedFuncNode : public Node {
+  /* \brief compiled target */
+  tvm::Target target;
+  /*! \brief Function name */
+  std::string func_name;
+  /* \brief The inputs to the function */
+  tvm::Array<Tensor> inputs;
+  /* \brief The outputs to the function */
+  tvm::Array<Tensor> outputs;
+  /*! \brief The lowered functions to support the function. */
+  tvm::Array<tvm::LoweredFunc> funcs;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("target", &target);
+    v->Visit("func_name", &func_name);
+    v->Visit("inputs", &inputs);
+    v->Visit("outputs", &outputs);
+    v->Visit("funcs", &funcs);
+  }
+
+  static constexpr const char* _type_key = "relay.CachedFunc";
+  TVM_DECLARE_NODE_TYPE_INFO(CachedFuncNode, Node);
+};
+
+TVM_DEFINE_NODE_REF(CachedFunc, CachedFuncNode);
+
+
+class CCacheKey;
+/*! \brief Compile cache key */
+class CCacheKeyNode : public Node {
+ public:
+  /*! \brief The source function to be lowered. */
+  Function source_func;
+  /*! \brief The hardware target.*/
+  Target target;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("source_func", &source_func);
+    v->Visit("target", &target);
+  }
+  /*! \return The hash value of CCacheKey. */
+  inline size_t Hash() const;
+  /*!
+   * \brief check content equality
+   * \param other The other value.
+   * \return The result of equality check.
+   */
+  inline bool Equal(const CCacheKeyNode* other) const;
+  /*!
+   * \brief create a cache key.
+   * \param source_func The source function.
+   * \param target The target device.
+   * \return the created key.
+   */
+  TVM_DLL static CCacheKey make(Function source_func,
+                                Target target);
+
+  static constexpr const char* _type_key = "relay.CCacheKey";
+  TVM_DECLARE_NODE_TYPE_INFO(CCacheKeyNode, tvm::Node);
+
+ private:
+  /*!
+   * \brief internal cached hash value.
+   */
+  mutable size_t hash_{0};
+};
+
+/*! \brief cache entry used in compile engine */
+class CCacheKey : public NodeRef {
+ public:
+  CCacheKey() {}
+  explicit CCacheKey(NodePtr<Node> n) : NodeRef(n) {}
+  const CCacheKeyNode* operator->() const {
+    return static_cast<CCacheKeyNode*>(node_.get());
+  }
+  // comparator
+  inline bool operator==(const CCacheKey& other) const {
+    CHECK(defined() && other.defined());
+    return (*this)->Equal(other.operator->());
+  }
+  using ContainerType = CCacheKeyNode;
+};
+
+/*! \brief Node container for compile cache. */
+class CCacheValueNode : public Node {
+ public:
+  /*! \brief The corresponding function */
+  CachedFunc cached_func;
+  /*! \brief Result of Packed function generated by JIT */
+  PackedFunc packed_func;
+  /*! \brief usage statistics */
+  int use_count{0};
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("cached_func", &cached_func);
+    v->Visit("use_count", &use_count);
+  }
+  static constexpr const char* _type_key = "relay.CCacheValue";
+  TVM_DECLARE_NODE_TYPE_INFO(CCacheValueNode, tvm::Node);
+};
+
+/*! \brief cache entry used in compile engine */
+class CCacheValue : public NodeRef {
+ public:
+  CCacheValue() {}
+  explicit CCacheValue(NodePtr<Node> n) : NodeRef(n) {}
+  CCacheValueNode* operator->() {
+    return static_cast<CCacheValueNode*>(node_.get());
+  }
+  const CCacheValueNode* operator->() const {
+    return static_cast<const CCacheValueNode*>(node_.get());
+  }
+  using ContainerType = CCacheValueNode;
+};
+
+/*!
+ * \brief Backend compilation engine for
+ *        low level code generation.
+ */
+class CompileEngineNode : public Node {
+ public:
+  /*!
+   * \brief Get lowered result.
+   * \param key The key to the cached function.
+   * \return The result.
+   */
+  virtual CachedFunc Lower(const CCacheKey& key) = 0;
+  /*!
+   * \brief Just in time compile to get a PackedFunc.
+   * \param key The key to the cached function.
+   * \return The result.
+   */
+  virtual PackedFunc JIT(const CCacheKey& key) = 0;
+  /*! \brief clear the cache. */
+  virtual void Clear() = 0;
+
+  // VisitAttrs
+  void VisitAttrs(AttrVisitor*) final {}
+
+  static constexpr const char* _type_key = "relay.CompileEngine";
+  TVM_DECLARE_NODE_TYPE_INFO(CompileEngineNode, Node);
+};
+
+/*! \brier cache entry used in compile engine */
+class CompileEngine : public NodeRef {
+ public:
+  CompileEngine() {}
+  explicit CompileEngine(NodePtr<Node> n) : NodeRef(n) {}
+  CompileEngineNode* operator->() {
+    return static_cast<CompileEngineNode*>(node_.get());
+  }
+  using ContainerType = CompileEngineNode;
+  /*! \brief The global compile engine. */
+  TVM_DLL static const CompileEngine& Global();
+};
+
+// implementations
+inline size_t CCacheKeyNode::Hash() const {
+  if (hash_ != 0) return hash_;
+  // do structral hash, avoid 0.
+  hash_ = StructuralHash()(this->source_func);
+  hash_ = dmlc::HashCombine(
+      hash_, std::hash<std::string>()(target->str()));
+  if (hash_ == 0) hash_ = 1;
+  return hash_;
+}
+
+inline bool CCacheKeyNode::Equal(
+    const CCacheKeyNode* other) const {
+  if (Hash() != other->Hash()) return false;
+  return this->target->str() == other->target->str() &&
+      AlphaEqual(this->source_func, other->source_func);
+}
+
+}  // namespace relay
+}  // namespace tvm
+
+namespace std {
+// overload hash
+template<>
+struct hash<::tvm::relay::CCacheKey> {
+  size_t operator()(const ::tvm::relay::CCacheKey& key) const {
+    CHECK(key.defined());
+    return key->Hash();
+  }
+};
+}  // namespace std
+#endif  // TVM_RELAY_BACKEND_COMPILE_ENGINE_H_
diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc
new file mode 100644
index 000000000000..e17c7a6839ea
--- /dev/null
+++ b/src/relay/backend/graph_plan_memory.cc
@@ -0,0 +1,355 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file relay/backend/graph_mem_alloca.cc
+ * \brief Memory index assignment pass for executing
+ *   the program in the graph runtime.
+ */
+#include <tvm/relay/expr.h>
+#include <tvm/relay/expr_functor.h>
+#include "../../common/arena.h"
+
+namespace tvm {
+namespace relay {
+
+struct StorageToken {
+  /*! \brief Reference counter */
+  int ref_counter{0};
+  /*! \brief number of bytes */
+  size_t max_bytes{0};
+  /*! \brief The corresponding tensor type node. */
+  const TensorTypeNode* ttype{nullptr};
+  /*! \brief virtual device index */
+  int device_id{0};
+  /*! \brief The storage id */
+  int64_t storage_id{-1};
+};
+
+class StorageAllocaBaseVisitor : public ExprVisitor {
+ public:
+  // run the visitor on a function.
+  void Run(const Function& func) {
+    for (Var param : func->params) {
+      CreateToken(param.operator->(), false);
+    }
+    // must always keep output alive.
+    for (StorageToken* tok : GetToken(func->body)) {
+      tok->ref_counter += 1;
+    }
+  }
+
+  void VisitExpr_(const ConstantNode* op) final {
+    this->CreateToken(op, false);
+  }
+
+  void VisitExpr_(const VarNode* op) final {
+    // Do nothing.
+  }
+
+  void VisitExpr_(const FunctionNode* op) final {
+    // do not recursive into sub function.
+  }
+
+  void VisitExpr_(const GlobalVarNode* op) final {
+    // Do nothing.
+  }
+
+  void VisitExpr_(const OpNode* op) final {
+    // Do nothing.
+  }
+
+  void VisitExpr_(const TupleNode* op) final {
+    std::vector<StorageToken*> fields;
+    for (Expr field : op->fields) {
+      auto tok = GetToken(field);
+      CHECK_EQ(tok.size(), 1U);
+      fields.push_back(tok[0]);
+    }
+    token_map_[op] = fields;
+  }
+
+  void VisitExpr_(const TupleGetItemNode* op) final {
+    const auto& tok = GetToken(op->tuple);
+    CHECK_LT(static_cast<size_t>(op->index), tok.size());
+    token_map_[op] = {tok[op->index]};
+  }
+
+  void VisitExpr_(const IfNode* op) final {
+    LOG(FATAL) << "if is not supported.";
+  }
+
+  void VisitExpr_(const LetNode* op) final {
+    auto token = GetToken(op->value);
+    token_map_[op->var.operator->()] = token;
+    token_map_[op] = GetToken(op->body);
+  }
+
+ protected:
+  /*! \brief internal token map */
+  std::unordered_map<const ExprNode*, std::vector<StorageToken*> > token_map_;
+
+  /*!
+   * \brief Get the necessary token.
+   * \param expr The expression.
+   * \return The corresponding token.
+   */
+  const std::vector<StorageToken*>& GetToken(const Expr& expr) {
+    this->VisitExpr(expr);
+    auto it = token_map_.find(expr.operator->());
+    CHECK(it != token_map_.end());
+    return it->second;
+  }
+  /*!
+   * \brief Populate the token map to set op's tokens
+   * \param op The node to be processed.
+   * \param can_realloc Whether we can re-allocate the memory.
+   */
+  virtual void CreateToken(const ExprNode* op, bool can_realloc) = 0;
+};
+
+
+class StorageAllocaInit : protected StorageAllocaBaseVisitor {
+ public:
+  explicit StorageAllocaInit(common::Arena* arena)
+      : arena_(arena) {}
+
+
+  /*! \return The internal token map */
+  std::unordered_map<const ExprNode*, std::vector<StorageToken*> >
+  GetInitTokenMap(const Function& func) {
+    this->Run(func);
+    return std::move(token_map_);
+  }
+
+
+ protected:
+  using StorageAllocaBaseVisitor::VisitExpr_;
+
+  void CreateToken(const ExprNode* op, bool can_realloc)  final {
+    CHECK(!token_map_.count(op));
+    std::vector<StorageToken*> tokens;
+    if (const auto* tuple_type = op->checked_type().as<TupleTypeNode>()) {
+      for (Type t : tuple_type->fields) {
+        const auto* ttype = t.as<TensorTypeNode>();
+        CHECK(ttype);
+        StorageToken* token = arena_->make<StorageToken>();
+        token->ttype = ttype;
+        tokens.push_back(token);
+      }
+    } else {
+      const auto* ttype = op->checked_type().as<TensorTypeNode>();
+      CHECK(ttype);
+      StorageToken* token = arena_->make<StorageToken>();
+      token->ttype = ttype;
+      tokens.push_back(token);
+    }
+    token_map_[op] = tokens;
+  }
+
+  void VisitExpr_(const CallNode* op) final {
+    // create token for the call node.
+    CreateToken(op, true);
+    // for each input, visit argument token.
+    for (Expr arg : op->args) {
+      for (StorageToken* tok : GetToken(arg)) {
+        tok->ref_counter += 1;
+      }
+    }
+  }
+
+ private:
+  // allocator
+  common::Arena* arena_;
+};
+
+
+class StorageAllocator : public StorageAllocaBaseVisitor {
+ public:
+  /*!
+   * \return totoal number of bytes allocated
+   */
+  size_t TotalAllocBytes() const {
+    size_t total = 0;
+    for (const auto* p : data_) {
+      total += p->max_bytes;
+    }
+    return total;
+  }
+
+  // Run storage allocation for a function.
+  Map<Expr, Array<Integer> > Plan(const Function& func) {
+    prototype_ = StorageAllocaInit(&arena_).GetInitTokenMap(func);
+    this->Run(func);
+
+    Map<Expr, Array<Integer> > smap;
+
+    for (const auto& kv : token_map_) {
+      Array<Integer> vec;
+      for (StorageToken* tok : kv.second) {
+        vec.push_back(tok->storage_id);
+      }
+      smap.Set(GetRef<Expr>(kv.first), vec);
+    }
+    return smap;
+  }
+
+
+ protected:
+  using StorageAllocaBaseVisitor::VisitExpr_;
+  // override create token by getting token as prototype requirements.
+  void CreateToken(const ExprNode* op, bool can_realloc) final {
+    CHECK(!token_map_.count(op));
+    auto it = prototype_.find(op);
+    CHECK(it != prototype_.end());
+    std::vector<StorageToken*> tokens;
+    for (StorageToken* tok : it->second) {
+      if (can_realloc) {
+        tokens.push_back(Request(tok));
+      } else {
+        // Allocate a new token,
+        StorageToken* allocated_tok = Alloc(tok, GetMemorySize(tok));
+        // ensure it never get de-allocated.
+        allocated_tok->ref_counter += 1;
+        tokens.push_back(allocated_tok);
+      }
+    }
+    token_map_[op] = tokens;
+  }
+  // The call map
+  void VisitExpr_(const CallNode* op) final {
+    std::vector<StorageToken*> args;
+    // for each input, visit argument token.
+    for (Expr arg : op->args) {
+      for (StorageToken* tok : GetToken(arg)) {
+        args.push_back(tok);
+      }
+    }
+    // create token for the call node.
+    CreateToken(op, true);
+    // check if there is orphaned output that can be released immediately.
+    for (StorageToken* tok : token_map_.at(op)) {
+      CheckForRelease(tok);
+    }
+    for (StorageToken* tok : args) {
+      tok->ref_counter -= 1;
+      CheckForRelease(tok);
+    }
+  }
+  /*!
+   * \brief ceil(size/word_size) to get number of words.
+   * \param size The original size.
+   * \param word_size The element size.
+   */
+  static size_t DivRoundUp(size_t size, size_t word_size) {
+    return (size + word_size - 1) / word_size;
+  }
+  /*!
+   * \brief Get the memory requirement.
+   * \param prototype The prototype token.
+   * \return The required memory size.
+   */
+  size_t GetMemorySize(StorageToken* prototype) {
+    const TensorTypeNode* ttype = prototype->ttype;
+    CHECK(ttype != nullptr);
+    size_t size = 1;
+    for (IndexExpr dim : ttype->shape) {
+      const int64_t* pval = as_const_int(dim);
+      CHECK(pval != nullptr)
+          << "Cannot allocate memory symbolic tensor shape "
+          << ttype->shape;
+      CHECK_GE(*pval, 0)
+          << "Cannot allocate memory for tensor with negative shape"
+          << *pval;
+      size *= static_cast<size_t>(pval[0]);
+    }
+    size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8);
+    return size;
+  }
+  /*!
+   * \brief Request a storage token for a given prototype.
+   * \param prototype. The prototype storage token.
+   * \return The result token.
+   */
+  StorageToken* Request(StorageToken* prototype) {
+    // calculate the size;
+    size_t size = GetMemorySize(prototype);
+    // search memory block in [size / match_range_, size * match_range_)
+    if (match_range_ == 0) {
+      return this->Alloc(prototype, size);
+    }
+    auto begin = free_.lower_bound(size / match_range_);
+    auto mid = free_.lower_bound(size);
+    auto end = free_.upper_bound(size * match_range_);
+    // search for memory blocks larger than requested
+    for (auto it = mid; it != end; ++it) {
+      StorageToken *tok = it->second;
+      if (tok->device_id != prototype->device_id) continue;
+      CHECK_EQ(tok->ref_counter, 0);
+      // Use exect matching strategy
+      tok->max_bytes = std::max(size, tok->max_bytes);
+      tok->ref_counter = prototype->ref_counter;
+      // find a exact match, erase from map and return
+      free_.erase(it);
+      return tok;
+    }
+    // then search for memory blocks smaller than requested space
+    for (auto it = mid; it != begin;) {
+      --it;
+      StorageToken *tok = it->second;
+      if (tok->device_id != prototype->device_id) continue;
+      CHECK_EQ(tok->ref_counter, 0);
+      // Use exect matching strategy
+      tok->max_bytes = std::max(size, tok->max_bytes);
+      tok->ref_counter = prototype->ref_counter;
+      // erase from map and return
+      free_.erase(it);
+      return tok;
+    }
+    // cannot find anything return a new one.
+    return this->Alloc(prototype, size);
+  }
+  /*!
+   * \brief Allocate a storage token by consuming prototype
+   * \param prototype The prototype token.
+   * \param size The size of memory being requested.
+   */
+  StorageToken* Alloc(StorageToken* prototype, size_t size) {
+    prototype->max_bytes = size;
+    prototype->storage_id = static_cast<int64_t>(data_.size());
+    data_.push_back(prototype);
+    return prototype;
+  }
+  /*!
+   * \brief Check if we can release token.
+   * \tok The token to be released.
+   */
+  void CheckForRelease(StorageToken* tok) {
+    CHECK_GE(tok->storage_id, 0);
+    CHECK_GE(tok->ref_counter, 0);
+    if (tok->ref_counter == 0) {
+      free_.insert({tok->max_bytes, tok});
+    }
+  }
+
+ private:
+  // allocator
+  common::Arena arena_;
+  // scale used for rough match
+  size_t match_range_{16};
+  // free list of storage entry
+  std::multimap<size_t, StorageToken*> free_;
+  // all the storage resources available
+  std::vector<StorageToken*> data_;
+  /*! \brief internal prototype token map */
+  std::unordered_map<const ExprNode*, std::vector<StorageToken*> > prototype_;
+};
+
+
+Map<Expr, Array<Integer> > GraphPlanMemory(const Function& func) {
+  return StorageAllocator().Plan(func);
+}
+
+TVM_REGISTER_GLOBAL("relay.backend.GraphPlanMemory")
+.set_body_typed<Map<Expr, Array<Integer> >(const Function&)>(GraphPlanMemory);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/backend/interpreter.cc b/src/relay/backend/interpreter.cc
new file mode 100644
index 000000000000..734180c53759
--- /dev/null
+++ b/src/relay/backend/interpreter.cc
@@ -0,0 +1,477 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file src/tvm/relay/interpreter.cc
+ * \brief An interpreter for the Relay IR.
+ */
+#include <tvm/packed_func_ext.h>
+#include <tvm/runtime/device_api.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/interpreter.h>
+#include <tvm/relay/pass.h>
+#include <tvm/relay/attrs/debug.h>
+#include "compile_engine.h"
+
+namespace tvm {
+namespace relay {
+
+using namespace runtime;
+
+inline const PackedFunc& GetPackedFunc(const std::string& name) {
+  const PackedFunc* pf = tvm::runtime::Registry::Get(name);
+  CHECK(pf != nullptr) << "Cannot find function " << name << " in registry";
+  return *pf;
+}
+
+/* Value Implementation */
+Closure ClosureNode::make(tvm::Map<Var, Value> env, Function func) {
+  NodePtr<ClosureNode> n = make_node<ClosureNode>();
+  n->env = std::move(env);
+  n->func = std::move(func);
+  return Closure(n);
+}
+
+TVM_REGISTER_API("relay._make.Closure")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = ClosureNode::make(args[0], args[1]);
+  });
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<ClosureNode>([](const ClosureNode* node, tvm::IRPrinter* p) {
+    p->stream << "ClosureNode(" << node->func << ")";
+  });
+
+TupleValue TupleValueNode::make(tvm::Array<Value> value) {
+  NodePtr<TupleValueNode> n = make_node<TupleValueNode>();
+  n->fields = value;
+  return TupleValue(n);
+}
+
+TVM_REGISTER_API("relay._make.TupleValue")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = TupleValueNode::make(args[0]);
+  });
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<TupleValueNode>([](const TupleValueNode* node, tvm::IRPrinter* p) {
+    p->stream << "TupleValueNode(" << node->fields << ")";
+  });
+
+TensorValue TensorValueNode::make(runtime::NDArray data) {
+  NodePtr<TensorValueNode> n = make_node<TensorValueNode>();
+  n->data = std::move(data);
+  return TensorValue(n);
+}
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<TensorValueNode>([](const TensorValueNode* node, tvm::IRPrinter* p) {
+    auto to_str = GetPackedFunc("relay._tensor_value_repr");
+    std::string data_str = to_str(GetRef<TensorValue>(node));
+    p->stream << "TensorValueNode(" << data_str << ")";
+  });
+
+TVM_REGISTER_API("relay._make.TensorValue")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    runtime::NDArray data = args[0];
+    *ret = TensorValueNode::make(data);
+  });
+
+/*!
+ * \brief A stack frame in the Relay interpreter.
+ *
+ * Contains a mapping from relay::Var to relay::Value.
+ */
+struct Frame {
+  /*! \brief The set of local variables and arguments for the frame. */
+  tvm::Map<Var, Value> locals;
+
+  explicit Frame(tvm::Map<Var, Value> locals) : locals(locals) {}
+};
+
+/*!
+ * \brief The call stack in the Relay interpreter.
+ *
+ * Contains a stack of frames; each corresponding to
+ * a function call.
+ */
+struct Stack {
+  /*! \brief The stack frames. */
+  std::vector<Frame> frames;
+  Stack() : frames() { frames.push_back(Frame({})); }
+
+  Frame& current_frame() { return frames.back(); }
+
+  Value Lookup(const Var& local) {
+    for (auto frame = frames.rbegin(); frame != frames.rend(); frame++) {
+      auto elem = frame->locals.find(local);
+      if (elem != frame->locals.end()) {
+        return (*elem).second;
+      }
+    }
+
+    LOG(FATAL) << "could not find variable binding for " << local
+               << "address= " << local.operator->();
+    return Value();
+  }
+  /*!
+   * A wrapper around Frame to add RAII semantics to pushing and popping
+   * stack frames.
+   */
+  struct LocalFrame {
+    Stack& st;
+    explicit LocalFrame(Stack& st, const Frame& fr) : st(st) {
+      st.frames.push_back(fr);
+    }
+    ~LocalFrame() { st.frames.pop_back(); }
+  };
+};
+
+/*! \brief A representation of the interpreter state which can be passed back to Python. */
+class InterpreterState;
+
+/*! \brief A container capturing the state of the interpreter. */
+class InterpreterStateNode : public Node {
+ public:
+  using Frame = tvm::Map<Var, Value>;
+  using Stack = tvm::Array<Frame>;
+
+  /*! \brief The current expression under evaluation. */
+  Expr current_expr;
+
+  /*! \brief The call stack of the interpreter. */
+  Stack stack;
+
+  void VisitAttrs(tvm::AttrVisitor* v) final {
+    v->Visit("current_expr", &current_expr);
+    v->Visit("stack", &stack);
+  }
+
+  TVM_DLL static InterpreterState make(Expr current_expr, Stack stack);
+
+  static constexpr const char* _type_key = "relay.InterpreterState";
+  TVM_DECLARE_NODE_TYPE_INFO(InterpreterStateNode, Node);
+};
+
+RELAY_DEFINE_NODE_REF(InterpreterState, InterpreterStateNode, NodeRef);
+
+InterpreterState InterpreterStateNode::make(Expr current_expr, Stack stack) {
+  NodePtr<InterpreterStateNode> n = make_node<InterpreterStateNode>();
+  n->current_expr = std::move(current_expr);
+  n->stack = std::move(stack);
+  return InterpreterState(n);
+}
+
+// NOTE: the current interpreter assumes A-normal form.
+// which is better for execution.
+//
+// It will run duplicated computations when taking program that
+// contains DAG in dataflow-form.
+//
+// Conversion to ANF is recommended before running the interpretation.
+class Interpreter :
+      public ExprFunctor<Value(const Expr& n)> {
+ public:
+  Interpreter(Module mod,
+              DLContext context,
+              Target target)
+      : mod_(mod), context_(context), target_(target) {
+    engine_ = CompileEngine::Global();
+  }
+
+  template <typename T>
+  T WithFrame(const Frame& fr, const std::function<T()>& f) {
+    Stack::LocalFrame lf(stack_, fr);
+    return f();
+  }
+
+  void extend(const Var& id, Value v) {
+    stack_.current_frame().locals.Set(id, v);
+  }
+
+  inline Value Lookup(const Var& local) {
+    return stack_.Lookup(local);
+  }
+
+  Value Eval(const Expr& expr) {
+    return (*this)(expr);
+  }
+
+  Value VisitExpr(const Expr& expr) final {
+    auto ret = ExprFunctor<Value(const Expr& n)>::VisitExpr(expr);
+    return ret;
+  }
+
+  Value VisitExpr_(const VarNode* var_node) final {
+    return Lookup(GetRef<Var>(var_node));
+  }
+
+  Value VisitExpr_(const GlobalVarNode* op) final {
+    return Eval(mod_->Lookup(GetRef<GlobalVar>(op)));
+  }
+
+  Value VisitExpr_(const OpNode* id) override {
+    // TODO(@jroesch): Eta-expand and return in this case.
+    LOG(FATAL) << "internal error, need to wrap intrinsic into call synthetic call node "
+               << "in "
+               << "this case, eta expand";
+    return Value();
+  }
+
+  Value VisitExpr_(const ConstantNode* op) final {
+    return TensorValueNode::make(op->data.CopyTo(context_));
+  }
+
+  Value VisitExpr_(const TupleNode* op) final {
+    std::vector<Value> values;
+
+    for (const auto& field : op->fields) {
+      Value field_value = Eval(field);
+      values.push_back(field_value);
+    }
+
+    return TupleValueNode::make(values);
+  }
+
+  Value VisitExpr_(const FunctionNode* func_node) final {
+    auto func = GetRef<Function>(func_node);
+    tvm::Map<Var, Value> captured_mod;
+    Array<Var> free_vars = FreeVars(func);
+
+    for (const auto& var : free_vars) {
+      captured_mod.Set(var, Eval(var));
+    }
+
+    return ClosureNode::make(captured_mod, func);
+  }
+
+  Value InvokePrimitiveOp(Function func,
+                          const Array<Value>& args) {
+    auto call_node = func->body.as<CallNode>();
+
+    if (call_node && call_node->op == Op::Get("debug")) {
+      auto dattrs = call_node->attrs.as<DebugAttrs>();
+      auto interp_state = this->get_state(call_node->args[0]);
+
+      if (dattrs->debug_func.defined()) {
+        dattrs->debug_func(interp_state);
+      } else {
+        RELAY_DEBUG(interp_state);
+      }
+
+      return args[0];
+    }
+
+    // Marshal the arguments.
+    // Handle tuple input/output by flattening them.
+    size_t arg_len = 0;
+    for (size_t i = 0; i < args.size(); ++i) {
+      if (args[i].as<TensorValueNode>()) {
+        ++arg_len;
+      } else {
+        const auto* tvalue = args[i].as<TupleValueNode>();
+        arg_len += tvalue->fields.size();
+      }
+    }
+    size_t num_inputs = arg_len;
+    if (const auto* tuple_type = func->body->checked_type().as<TupleTypeNode>()) {
+      arg_len += tuple_type->fields.size();
+    } else {
+      CHECK(func->body->checked_type().as<TensorTypeNode>());
+      arg_len += 1;
+    }
+    std::vector<TVMValue> values(arg_len);
+    std::vector<int> codes(arg_len);
+    TVMArgsSetter setter(values.data(), codes.data());
+
+    auto fset_input = [&](size_t i, Value val) {
+      const TensorValueNode* tv = val.as<TensorValueNode>();
+      CHECK(tv != nullptr) << "expect Tensor argument";
+      setter(i, tv->data);
+      DLContext arg_ctx = tv->data->ctx;
+      CHECK(arg_ctx.device_type ==  context_.device_type &&
+            arg_ctx.device_id == context_.device_id)
+        << "Interpreter expect context to be "
+        << context_ << ", but get " << arg_ctx;
+    };
+
+    int arg_counter = 0;
+    for (Value arg : args) {
+      if (arg.as<TensorValueNode>()) {
+        fset_input(arg_counter++,  arg);
+      } else {
+        const TupleValueNode* tuple = arg.as<TupleValueNode>();
+        CHECK(tuple != nullptr);
+        for (size_t i = 0; i < tuple->fields.size(); ++i) {
+          fset_input(arg_counter++, tuple->fields[i]);
+        }
+      }
+    }
+
+    // TVM's calling convention is that the final argument is the output
+    // buffer. To preserve the illusion of being a functional language
+    // we need to allocate space for the output buffer based on the
+    // return type.
+    auto fset_output = [&](size_t i, Type val_type) {
+      const TensorTypeNode* rtype = val_type.as<TensorTypeNode>();
+      CHECK(rtype != nullptr);
+      // Allocate output tensor.
+      std::vector<int64_t> shape;
+      for (auto dim : rtype->shape) {
+        const auto* ivalue = as_const_int(dim);
+        CHECK(ivalue) << "expected concrete dimensions";
+        shape.push_back(ivalue[0]);
+      }
+      DLDataType dtype = Type2TVMType(rtype->dtype);
+      auto out_tensor = TensorValueNode::make(
+          NDArray::Empty(shape, dtype, context_));
+      setter(num_inputs + i, out_tensor->data);
+      return out_tensor;
+    };
+
+    PackedFunc packed_func = engine_->JIT(CCacheKeyNode::make(func, target_));
+    TVMRetValue rv;
+    if (const TupleTypeNode* rtype = func->body->checked_type().as<TupleTypeNode>()) {
+      Array<Value> fields;
+      for (size_t i = 0; i < rtype->fields.size(); ++i) {
+        fields.push_back(fset_output(i, rtype->fields[i]));
+      }
+      packed_func.CallPacked(TVMArgs(values.data(), codes.data(), arg_len), &rv);
+      return TupleValueNode::make(fields);
+    } else {
+      Value out_tensor = fset_output(0, func->body->checked_type());
+      packed_func.CallPacked(TVMArgs(values.data(), codes.data(), arg_len), &rv);
+      return out_tensor;
+    }
+  }
+
+  // Invoke the closure
+  Value Invoke(const Closure& closure, const tvm::Array<Value>& args) {
+    // Get a reference to the function inside the closure.
+    if (closure->func->IsPrimitive()) {
+      return InvokePrimitiveOp(closure->func, args);
+    }
+    auto func = closure->func;
+    // Allocate a frame with the parameters and free variables.
+    tvm::Map<Var, Value> locals;
+
+    CHECK_EQ(func->params.size(), args.size());
+
+    for (size_t i = 0; i < func->params.size(); i++) {
+      CHECK_EQ(locals.count(func->params[i]), 0);
+      locals.Set(func->params[i], args[i]);
+    }
+
+    // Add the var to value mappings from the Closure's modironment.
+    for (auto it = closure->env.begin(); it != closure->env.end(); ++it) {
+      CHECK_EQ(locals.count((*it).first), 0);
+      locals.Set((*it).first, (*it).second);
+    }
+
+    return WithFrame<Value>(Frame(locals), [&]() { return Eval(func->body); });
+  }
+
+  Value VisitExpr_(const CallNode* call) final {
+    tvm::Array<Value> args;
+    for (auto arg : call->args) {
+      args.push_back(Eval(arg));
+    }
+    // We should not find operators after running fusion,
+    // and operator lowering.
+    //
+    // We have some functions cotaining chunks of operators
+    // which will be loaded into operator map.
+    if (auto op_node = call->op.as<OpNode>()) {
+      LOG(FATAL) << "found " << op_node->name
+                 << "; operators should be removed by future passes; try "
+                    "fusing and lowering";
+    }
+    // Now we just evaluate and expect to find a closure.
+    Value fn_val = Eval(call->op);
+    if (const ClosureNode* closure_node = fn_val.as<ClosureNode>()) {
+      auto closure = GetRef<Closure>(closure_node);
+      return this->Invoke(closure, args);
+    } else {
+      LOG(FATAL) << "internal error: type error, expected function value in the call "
+                 << "position";
+      return Value();
+    }
+  }
+
+  Value VisitExpr_(const LetNode* op) final {
+    auto value = Eval(op->value);
+    this->extend(op->var, value);
+    return Eval(op->body);
+  }
+
+  Value VisitExpr_(const TupleGetItemNode* op) final {
+    Value val = Eval(op->tuple);
+    auto product_node = val.as<TupleValueNode>();
+    CHECK(product_node)
+      << "interal error: when evaluating TupleGetItem expected a tuple value";
+    CHECK_LT(static_cast<size_t>(op->index), product_node->fields.size())
+        << "internal error: index out of bounds";
+    return product_node->fields[op->index];
+  }
+
+  Value VisitExpr_(const IfNode* op) final {
+    Value v = Eval(op->cond);
+    if (const TensorValueNode* bv = v.as<TensorValueNode>()) {
+      DLContext cpu_ctx;
+      cpu_ctx.device_type = kDLCPU;
+      cpu_ctx.device_id = 0;
+      NDArray cpu_array = bv->data.CopyTo(cpu_ctx);
+      CHECK_EQ(TVMType2Type(cpu_array->dtype), Bool());
+      // TODO(@jroesch, @MK): Refactor code into helper from DCE.
+      if (reinterpret_cast<uint8_t*>(cpu_array->data)[0]) {
+        return Eval(op->true_branch);
+      } else {
+        return Eval(op->false_branch);
+      }
+    } else {
+      LOG(FATAL) << "type error, type system should have caught this";
+      return Value();
+    }
+  }
+
+  InterpreterState get_state(Expr e = Expr()) const {
+    InterpreterStateNode::Stack stack;
+    for (auto fr : this->stack_.frames) {
+      InterpreterStateNode::Frame frame = fr.locals;
+      stack.push_back(frame);
+    }
+    auto state = InterpreterStateNode::make(e, stack);
+    return state;
+  }
+
+ private:
+  // module
+  Module mod_;
+  // For simplicity we only run the interpreter on a single context.
+  // Context to run the interpreter on.
+  DLContext context_;
+  // Target parameter being used by the interpreter.
+  Target target_;
+  // value stack.
+  Stack stack_;
+  // Backend compile engine.
+  CompileEngine engine_;
+};
+
+
+TypedPackedFunc<Value(Expr)>
+CreateInterpreter(
+    Module mod,
+    DLContext context,
+    Target target) {
+  auto intrp = std::make_shared<Interpreter>(mod, context, target);
+  auto packed = [intrp](Expr expr) {
+    return intrp->Eval(expr);
+  };
+  return TypedPackedFunc<Value(Expr)>(packed);
+}
+
+TVM_REGISTER_API("relay.backend.CreateInterpreter")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = CreateInterpreter(args[0], args[1], args[2]);
+  });
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/ir/alpha_equal.cc b/src/relay/ir/alpha_equal.cc
new file mode 100644
index 000000000000..064343c834ea
--- /dev/null
+++ b/src/relay/ir/alpha_equal.cc
@@ -0,0 +1,395 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file src/tvm/relay/ir/alpha_equal.cc
+ * \brief Alpha equality check by deep comparing two nodes.
+ */
+#include <tvm/ir_pass.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/runtime/ndarray.h>
+#include <tvm/relay/pass.h>
+#include "type_functor.h"
+#include "../../lang/attr_functor.h"
+
+namespace tvm {
+namespace relay {
+
+// Alpha Equal handler for Relay.
+class AlphaEqualHandler:
+      public AttrsEqualHandler,
+      public TypeFunctor<bool(const Type&, const Type&)>,
+      public ExprFunctor<bool(const Expr&, const Expr&)> {
+ public:
+  explicit AlphaEqualHandler(bool map_free_var)
+      : map_free_var_(map_free_var) {}
+
+  /*!
+   * Check equality of two nodes.
+   * \param lhs The left hand operand.
+   * \param rhs The right hand operand.
+   * \return The comparison result.
+   */
+  bool Equal(const NodeRef& lhs, const NodeRef& rhs) {
+    if (lhs.same_as(rhs)) return true;
+    if (!lhs.defined() || !rhs.defined()) return false;
+    if (lhs->derived_from<TypeNode>()) {
+      if (!rhs->derived_from<TypeNode>()) return false;
+      return TypeEqual(Downcast<Type>(lhs), Downcast<Type>(rhs));
+    }
+    if (lhs->derived_from<ExprNode>()) {
+      if (!rhs->derived_from<ExprNode>()) return false;
+      return ExprEqual(Downcast<Expr>(lhs), Downcast<Expr>(rhs));
+    }
+    return AttrEqual(lhs, rhs);
+  }
+
+  /*!
+   * Check equality of two attributes.
+   * \param lhs The left hand operand.
+   * \param rhs The right hand operand.
+   * \return The comparison result.
+   */
+  bool AttrEqual(const NodeRef& lhs, const NodeRef& rhs) {
+    return AttrsEqualHandler::Equal(lhs, rhs);
+  }
+  /*!
+   * Check equality of two types.
+   * \param lhs The left hand operand.
+   * \param rhs The right hand operand.
+   * \return the comparison result.
+   */
+  bool TypeEqual(const Type& lhs, const Type& rhs) {
+    if (lhs.same_as(rhs)) return true;
+    if (!lhs.defined() || !rhs.defined()) return false;
+    return this->VisitType(lhs, rhs);
+  }
+  /*!
+   * Check equality of two expressions.
+   *
+   * \note We run graph structural equality checking when comparing two Exprs.
+   *   This means that AlphaEqualHandler can only be used once for each pair.
+   *   The equality checker checks data-flow equvalence of the Expr DAG.
+   *   This function also runs faster as it memomizes equal_map.
+   *
+   * \param lhs The left hand operand.
+   * \param rhs The right hand operand.
+   * \return The comparison result.
+   */
+  bool ExprEqual(const Expr& lhs, const Expr& rhs) {
+    if (lhs.same_as(rhs)) return true;
+    if (!lhs.defined() || !rhs.defined()) return false;
+    auto it = equal_map_.find(lhs);
+    if (it != equal_map_.end()) {
+      return it->second.same_as(rhs);
+    }
+    if (this->VisitExpr(lhs, rhs)) {
+      equal_map_[lhs] = rhs;
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+ protected:
+  /*!
+   * \brief Check if data type equals each other.
+   * \param lhs The left hand operand.
+   * \param rhs The right hand operand.
+   * \return The compare result.
+   */
+  bool DataTypeEqual(const DataType& lhs, const DataType& rhs) {
+    return lhs == rhs;
+  }
+  /*!
+   * \brief Check Equality of leaf node of the graph.
+   *  if map_free_var_ is set to true, try to map via equal node.
+   * \param lhs The left hand operand.
+   * \param rhs The right hand operand.
+   * \return The compare result.
+   */
+  bool LeafNodeEqual(const NodeRef& lhs, const NodeRef& rhs) {
+    if (lhs.same_as(rhs)) return true;
+    auto it = equal_map_.find(lhs);
+    if (it != equal_map_.end()) {
+      return it->second.same_as(rhs);
+    } else {
+      if (map_free_var_) {
+        if (lhs->type_index() != rhs->type_index()) return false;
+        equal_map_[lhs] = rhs;
+        return true;
+      } else {
+        return false;
+      }
+    }
+  }
+  using AttrsEqualHandler::VisitAttr_;
+  bool VisitAttr_(const Variable* lhs, const NodeRef& other) final {
+    return LeafNodeEqual(GetRef<NodeRef>(lhs), other);
+  }
+
+  // Type equality
+  bool VisitType_(const TensorTypeNode* lhs, const Type& other) final {
+    if (const TensorTypeNode* rhs = other.as<TensorTypeNode>()) {
+      return (lhs->dtype == rhs->dtype &&
+              AttrEqual(lhs->shape, rhs->shape));
+    } else {
+      return false;
+    }
+  }
+
+  bool VisitType_(const IncompleteTypeNode* lhs, const Type& other) final {
+    return LeafNodeEqual(GetRef<NodeRef>(lhs), other);
+  }
+
+  bool VisitType_(const TypeVarNode* lhs, const Type& other) final {
+    if (const TypeVarNode* rhs = other.as<TypeVarNode>()) {
+      if (lhs->kind != rhs->kind) return false;
+      return LeafNodeEqual(GetRef<NodeRef>(lhs), other);
+    } else {
+      return false;
+    }
+  }
+
+  bool VisitType_(const FuncTypeNode* lhs, const Type& other) final {
+    if (const FuncTypeNode* rhs = other.as<FuncTypeNode>()) {
+      if (lhs->arg_types.size() != rhs->arg_types.size()) return false;
+      if (lhs->type_params.size() != rhs->type_params.size()) return false;
+      if (lhs->type_constraints.size() != rhs->type_constraints.size()) return false;
+      for (size_t i = 0; i < lhs->type_params.size(); ++i) {
+        if (lhs->type_params[i]->kind != rhs->type_params[i]->kind) {
+          return false;
+        }
+        equal_map_[lhs->type_params[i]] = rhs->type_params[i];
+        // set up type parameter equal
+        if (lhs->type_params[i]->kind == TypeVarNode::Kind::kShapeVar) {
+          // map variable
+          equal_map_[lhs->type_params[i]->var] = rhs->type_params[i]->var;
+        }
+      }
+      for (size_t i = 0; i < lhs->arg_types.size(); i++) {
+        if (!TypeEqual(lhs->arg_types[i], rhs->arg_types[i])) return false;
+      }
+      if (!TypeEqual(lhs->ret_type, rhs->ret_type)) return false;
+      for (size_t i = 0; i < lhs->type_constraints.size(); i++) {
+        if (!TypeEqual(lhs->type_constraints[i],
+                       rhs->type_constraints[i])) {
+          return false;
+        }
+      }
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  bool VisitType_(const TypeRelationNode* lhs, const Type& other) final {
+    if (const TypeRelationNode* rhs = other.as<TypeRelationNode>()) {
+      if (lhs->func->name != rhs->func->name) return false;
+      if (lhs->num_inputs != rhs->num_inputs) return false;
+      if (!this->AttrEqual(lhs->attrs, rhs->attrs)) return false;
+      if (lhs->args.size() != rhs->args.size()) return false;
+      for (size_t i = 0; i < lhs->args.size(); ++i) {
+        if (!TypeEqual(lhs->args[i], rhs->args[i])) return false;
+      }
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  bool VisitType_(const TupleTypeNode* lhs, const Type& other) final {
+    if (const TupleTypeNode* rhs = other.as<TupleTypeNode>()) {
+      if (lhs->fields.size() != rhs->fields.size()) return false;
+      for (size_t i = 0; i < lhs->fields.size(); ++i) {
+        if (!TypeEqual(lhs->fields[i], rhs->fields[i])) return false;
+      }
+      return true;
+    } else {
+      return false;
+    }
+  }
+  // Expr equal checking.
+  bool NDArrayEqual(const runtime::NDArray& lhs,
+                    const runtime::NDArray& rhs) {
+    if (lhs.defined() != rhs.defined()) {
+      return false;
+    } else if (lhs.same_as(rhs)) {
+      return true;
+    } else {
+      auto ldt = lhs->dtype;
+      auto rdt = rhs->dtype;
+      CHECK_EQ(lhs->ctx.device_type, kDLCPU) << "can only compare CPU tensor";
+      CHECK_EQ(rhs->ctx.device_type, kDLCPU) << "can only compare CPU tensor";
+      if (ldt.code == rdt.code && ldt.lanes == rdt.lanes && ldt.bits == rdt.bits) {
+        size_t data_size = runtime::GetDataSize(*lhs.operator->());
+        return std::memcmp(lhs->data, rhs->data, data_size) == 0;
+      } else {
+        return false;
+      }
+    }
+  }
+  // merge declaration of two variables together.
+  bool MergeVarDecl(const Var& lhs, const Var& rhs) {
+    if (lhs.same_as(rhs)) return true;
+    if (!lhs.defined() || !rhs.defined()) return false;
+    if (!TypeEqual(lhs->type_annotation,
+                   rhs->type_annotation)) return false;
+    CHECK(!equal_map_.count(lhs))
+        << "Duplicated declaration of variable " <<  lhs;
+    equal_map_[lhs] = rhs;
+    return true;
+  }
+
+  bool VisitExpr_(const VarNode* lhs, const Expr& other) final {
+    // This function will only be triggered if we are matching free variables.
+    if (const VarNode* rhs = other.as<VarNode>()) {
+      if (lhs->name_hint() != rhs->name_hint()) return false;
+      if (!TypeEqual(lhs->type_annotation, rhs->type_annotation)) return false;
+      return LeafNodeEqual(GetRef<NodeRef>(lhs), other);
+    } else {
+      return false;
+    }
+  }
+
+  bool VisitExpr_(const GlobalVarNode* lhs, const Expr& other) final {
+    if (const GlobalVarNode* rhs = other.as<GlobalVarNode>()) {
+      // use name equality for global var for now.
+      if (lhs->name_hint != rhs->name_hint) return false;
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  bool VisitExpr_(const TupleNode* lhs, const Expr& other) final {
+    if (const TupleNode* rhs = other.as<TupleNode>()) {
+      if (lhs->fields.size() != rhs->fields.size()) return false;
+      for (size_t i = 0; i < lhs->fields.size(); ++i) {
+        if (!ExprEqual(lhs->fields[i], rhs->fields[i])) return false;
+      }
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  bool VisitExpr_(const FunctionNode* lhs, const Expr& other) final {
+    if (const FunctionNode* rhs = other.as<FunctionNode>()) {
+      if (lhs->params.size() != rhs->params.size()) return false;
+      if (lhs->type_params.size() != rhs->type_params.size()) return false;
+      // map type parameter to be the same
+      for (size_t i = 0; i < lhs->type_params.size(); ++i) {
+        if (lhs->type_params[i]->kind != rhs->type_params[i]->kind) return false;
+        equal_map_[lhs->type_params[i]] = rhs->type_params[i];
+      }
+      // check parameter type annotations
+      for (size_t i = 0; i < lhs->params.size(); ++i) {
+        if (!MergeVarDecl(lhs->params[i], rhs->params[i])) return false;
+      }
+      // check return types.
+      if (!TypeEqual(lhs->ret_type, rhs->ret_type)) return false;
+      return ExprEqual(lhs->body, rhs->body);
+    } else {
+      return false;
+    }
+  }
+
+  bool VisitExpr_(const CallNode* lhs, const Expr& other) final {
+    if (const CallNode* rhs = other.as<CallNode>()) {
+      if (!ExprEqual(lhs->op, rhs->op)) return false;
+      if (lhs->args.size() != rhs->args.size()) return false;
+      // skip type_args check for primitive ops.
+      bool is_primitive = IsPrimitiveOp(lhs->op);
+      if (!is_primitive) {
+        if (lhs->type_args.size() != rhs->type_args.size()) {
+          return false;
+        }
+      }
+      for (size_t i = 0; i < lhs->args.size(); ++i) {
+        if (!ExprEqual(lhs->args[i], rhs->args[i])) {
+          return false;
+        }
+      }
+
+      if (!is_primitive) {
+        for (size_t i = 0; i < lhs->type_args.size(); ++i) {
+          if (!TypeEqual(lhs->type_args[i], rhs->type_args[i])) return false;
+        }
+      }
+      return AttrEqual(lhs->attrs, rhs->attrs);
+    } else {
+      return false;
+    }
+  }
+
+  bool VisitExpr_(const LetNode* lhs, const Expr& other) final {
+    if (const LetNode* rhs = other.as<LetNode>()) {
+      if (!ExprEqual(lhs->value, rhs->value)) return false;
+      if (!MergeVarDecl(lhs->var, rhs->var)) return false;
+      return ExprEqual(lhs->body, rhs->body);
+    } else {
+      return false;
+    }
+  }
+
+  bool VisitExpr_(const IfNode* lhs, const Expr& other) final {
+    if (const IfNode* rhs = other.as<IfNode>()) {
+      return ExprEqual(lhs->cond, rhs->cond) &&
+          ExprEqual(lhs->true_branch, rhs->true_branch) &&
+          ExprEqual(lhs->false_branch, rhs->false_branch);
+    } else {
+      return false;
+    }
+  }
+
+  bool VisitExpr_(const OpNode* op, const Expr& other) final {
+    return op == other.get();
+  }
+
+  bool VisitExpr_(const ConstantNode* lhs, const Expr& other) final {
+    if (const ConstantNode* rhs = other.as<ConstantNode>()) {
+      return NDArrayEqual(lhs->data, rhs->data);
+    } else {
+      return false;
+    }
+  }
+
+  bool VisitExpr_(const TupleGetItemNode* lhs, const Expr& other) final {
+    if (const TupleGetItemNode* rhs = other.as<TupleGetItemNode>()) {
+      return ExprEqual(lhs->tuple, rhs->tuple) && lhs->index == rhs->index;
+    } else {
+      return false;
+    }
+  }
+
+ private:
+  // whether to map open terms.
+  bool map_free_var_{false};
+  // renaming of NodeRef to indicate two nodes equals to each other
+  std::unordered_map<NodeRef, NodeRef, NodeHash, NodeEqual> equal_map_;
+};
+
+bool AlphaEqual(const Type& lhs, const Type& rhs) {
+  return AlphaEqualHandler(false).TypeEqual(lhs, rhs);
+}
+
+bool AlphaEqual(const Expr& lhs, const Expr& rhs) {
+  return AlphaEqualHandler(false).ExprEqual(lhs, rhs);
+}
+
+// TODO(@jroesch): move to correct namespace?
+TVM_REGISTER_API("relay._make._alpha_equal")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = AlphaEqualHandler(false).Equal(args[0], args[1]);
+  });
+
+TVM_REGISTER_API("relay._make._type_alpha_equal")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = AlphaEqualHandler(false).TypeEqual(args[0], args[1]);
+  });
+
+TVM_REGISTER_API("relay._make._graph_equal")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = AlphaEqualHandler(true).Equal(args[0], args[1]);
+  });
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/ir/base.cc b/src/relay/ir/base.cc
new file mode 100644
index 000000000000..06593b6420f5
--- /dev/null
+++ b/src/relay/ir/base.cc
@@ -0,0 +1,70 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file base.cc
+ * \brief The core base types for Relay.
+ */
+#include <tvm/api_registry.h>
+#include <tvm/relay/base.h>
+
+namespace tvm {
+namespace relay {
+
+using tvm::IRPrinter;
+using namespace tvm::runtime;
+
+NodePtr<SourceNameNode> GetSourceNameNode(const std::string& name) {
+  // always return pointer as the reference can change as map re-allocate.
+  // or use another level of indirection by creating a unique_ptr
+  static std::unordered_map<std::string, NodePtr<SourceNameNode> > source_map;
+
+  auto sn = source_map.find(name);
+  if (sn == source_map.end()) {
+    NodePtr<SourceNameNode> n = make_node<SourceNameNode>();
+    n->name = std::move(name);
+    source_map[name] = n;
+    return n;
+  } else {
+    return sn->second;
+  }
+}
+
+SourceName SourceName::Get(const std::string& name) {
+  return SourceName(GetSourceNameNode(name));
+}
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<SourceNameNode>([](const SourceNameNode* node, tvm::IRPrinter* p) {
+    p->stream << "SourceName(" << node->name << ", " << node << ")";
+  });
+
+TVM_REGISTER_NODE_TYPE(SourceNameNode)
+.set_creator(GetSourceNameNode)
+.set_global_key([](const Node* n) {
+    return static_cast<const SourceNameNode*>(n)->name;
+  });
+
+Span SpanNode::make(SourceName source, int lineno, int col_offset) {
+  auto n = make_node<SpanNode>();
+  n->source = std::move(source);
+  n->lineno = lineno;
+  n->col_offset = col_offset;
+  return Span(n);
+}
+
+TVM_REGISTER_NODE_TYPE(SpanNode);
+
+TVM_REGISTER_API("relay._make.Span")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = SpanNode::make(args[0], args[1], args[2]);
+  });
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<SpanNode>([](const SpanNode* node, tvm::IRPrinter* p) {
+    p->stream << "SpanNode(" << node->source << ", " << node->lineno << ", "
+              << node->col_offset << ")";
+  });
+
+TVM_REGISTER_NODE_TYPE(IdNode);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/ir/expr.cc b/src/relay/ir/expr.cc
new file mode 100644
index 000000000000..cdb2a32a0009
--- /dev/null
+++ b/src/relay/ir/expr.cc
@@ -0,0 +1,283 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file src/tvm/ir/expr.cc
+ * \brief The expression AST nodes of Relay.
+ */
+#include <tvm/relay/expr.h>
+
+namespace tvm {
+namespace relay {
+
+using tvm::IRPrinter;
+using namespace tvm::runtime;
+
+Constant ConstantNode::make(runtime::NDArray data) {
+  NodePtr<ConstantNode> n = make_node<ConstantNode>();
+  n->data = std::move(data);
+  return Constant(n);
+}
+
+TVM_REGISTER_NODE_TYPE(ConstantNode);
+
+TVM_REGISTER_API("relay._make.Constant")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = ConstantNode::make(args[0]);
+  });
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<ConstantNode>([](const ConstantNode* node, tvm::IRPrinter* p) {
+    const PackedFunc* fprint = Registry::Get("relay._constant_repr");
+    CHECK(fprint) << "unable to find printing function for constants";
+    std::string data = (*fprint)(GetRef<Constant>(node));
+    p->stream << "Constant(" << data << ")";
+  });
+
+TensorType ConstantNode::tensor_type() const {
+  auto dtype = TVMType2Type(data->dtype);
+  Array<tvm::Expr> shape;
+  for (int i = 0; i < data->ndim; i++) {
+    CHECK_LE(data->shape[i], std::numeric_limits<int32_t>::max());
+    CHECK_GE(data->shape[i], std::numeric_limits<int32_t>::min());
+    shape.push_back(
+        tvm::ir::IntImm::make(Int(32), data->shape[i]));
+  }
+
+  return TensorTypeNode::make(shape, dtype);
+}
+
+Tuple TupleNode::make(tvm::Array<relay::Expr> fields) {
+  NodePtr<TupleNode> n = make_node<TupleNode>();
+  n->fields = std::move(fields);
+  return Tuple(n);
+}
+
+TVM_REGISTER_NODE_TYPE(TupleNode);
+
+TVM_REGISTER_API("relay._make.Tuple")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = TupleNode::make(args[0]);
+  });
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<TupleNode>([](const TupleNode* node, tvm::IRPrinter* p) {
+    p->stream << "Tuple(" << node->fields << ")";
+  });
+
+
+Var VarNode::make(Id vid, Type type_annotation) {
+  NodePtr<VarNode> n = make_node<VarNode>();
+  n->vid = std::move(vid);
+  n->type_annotation = std::move(type_annotation);
+  return Var(n);
+}
+
+Var VarNode::make(std::string name_hint, Type type_annotation) {
+  NodePtr<IdNode> n = make_node<IdNode>();
+  n->name_hint = std::move(name_hint);
+  return VarNode::make(Id(n), type_annotation);
+}
+
+TVM_REGISTER_NODE_TYPE(VarNode);
+
+TVM_REGISTER_API("relay._make.Var")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = VarNode::make(args[0].operator std::string(), args[1]);
+  });
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<VarNode>([](const VarNode* node, tvm::IRPrinter* p) {
+    p->stream << "Var(" << node->name_hint();
+    if (node->type_annotation.defined()) {
+      p->stream << ", ty=";
+      p->print(node->type_annotation);
+    }
+    p->stream << ")";
+  });
+
+GlobalVar GlobalVarNode::make(std::string name_hint) {
+  NodePtr<GlobalVarNode> n = make_node<GlobalVarNode>();
+  n->name_hint = std::move(name_hint);
+  return GlobalVar(n);
+}
+
+TVM_REGISTER_NODE_TYPE(GlobalVarNode);
+
+TVM_REGISTER_API("relay._make.GlobalVar")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = GlobalVarNode::make(args[0]);
+  });
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<GlobalVarNode>([](const GlobalVarNode* node, tvm::IRPrinter* p) {
+    p->stream << "GlobalVar(" << node->name_hint << ")";
+  });
+
+
+Function FunctionNode::make(tvm::Array<Var> params,
+                            Expr body,
+                            Type ret_type,
+                            tvm::Array<TypeVar> type_params,
+                            tvm::Attrs attrs) {
+  NodePtr<FunctionNode> n = make_node<FunctionNode>();
+  n->params = std::move(params);
+  n->body = std::move(body);
+  n->ret_type = std::move(ret_type);
+  n->type_params = std::move(type_params);
+  n->attrs = std::move(attrs);
+  return Function(n);
+}
+
+FuncType FunctionNode::func_type_annotation() const {
+  Array<Type> param_types;
+  for (auto param : this->params) {
+    param_types.push_back(param->type_annotation);
+  }
+  return FuncTypeNode::make(param_types, this->ret_type, this->type_params, {});
+}
+
+bool FunctionNode::IsPrimitive() const {
+  NodeRef res = FunctionGetAttr(GetRef<Function>(this), "Primitive");
+  const ir::IntImm* pval = res.as<ir::IntImm>();
+  return pval && pval->value != 0;
+}
+
+NodeRef FunctionGetAttr(const Function& func, const std::string& key) {
+  if (!func->attrs.defined()) { return NodeRef(); }
+
+  const DictAttrsNode* dict_attrs = func->attrs.as<DictAttrsNode>();
+  CHECK(dict_attrs);
+  auto it = dict_attrs->dict.find(key);
+  if (it != dict_attrs->dict.end()) {
+    return (*it).second;
+  } else {
+    return NodeRef();
+  }
+}
+
+Function FunctionSetAttr(const Function& func, const std::string& key, const NodeRef& data) {
+  const DictAttrsNode* dattrs = func->attrs.as<DictAttrsNode>();
+  Attrs func_attrs;
+  if (dattrs) {
+    Map<std::string, NodeRef> dict = dattrs->dict;
+    dict.Set(key, data);
+    func_attrs = DictAttrsNode::make(dict);
+  } else {
+    Map<std::string, NodeRef> dict = {{key, data}};
+    func_attrs = DictAttrsNode::make(dict);
+  }
+
+  return FunctionNode::make(
+    func->params,
+    func->body,
+    func->ret_type,
+    func->type_params,
+    func_attrs);
+}
+
+TVM_REGISTER_NODE_TYPE(FunctionNode);
+
+TVM_REGISTER_API("relay._make.Function")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+  *ret = FunctionNode::make(args[0], args[1], args[2], args[3], args[4]);
+});
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<FunctionNode>([](const FunctionNode* node,
+                                   tvm::IRPrinter* p) {
+      p->stream << "FunctionNode(" << node->params << ", " << node->ret_type
+                << ", " << node->body << ", " << node->type_params << ", "
+                << node->attrs << ")";
+});
+
+Call CallNode::make(Expr op, Array<Expr> args, Attrs attrs,
+                    Array<Type> type_args) {
+  NodePtr<CallNode> n = make_node<CallNode>();
+  n->op = std::move(op);
+  n->args = std::move(args);
+  n->attrs = std::move(attrs);
+  n->type_args = std::move(type_args);
+  return Call(n);
+}
+
+TVM_REGISTER_NODE_TYPE(CallNode);
+
+TVM_REGISTER_API("relay._make.Call")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+  *ret = CallNode::make(args[0], args[1], args[2], args[3]);
+});
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<CallNode>([](const CallNode* node, tvm::IRPrinter* p) {
+  p->stream << "CallNode(" << node->op << ", " << node->args << ", "
+    << node->attrs << ", " << node->type_args << ")";
+});
+
+Let LetNode::make(Var var, Expr value, Expr body) {
+  NodePtr<LetNode> n = make_node<LetNode>();
+  n->var = std::move(var);
+  n->value = std::move(value);
+  n->body = std::move(body);
+  return Let(n);
+}
+
+TVM_REGISTER_NODE_TYPE(LetNode);
+
+TVM_REGISTER_API("relay._make.Let")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = LetNode::make(args[0], args[1], args[2]);
+  });
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<LetNode>([](const LetNode* node, tvm::IRPrinter* p) {
+  p->stream << "LetNode(" << node->var << ", " << node->value
+            << ", " << node->body << ")";
+});
+
+If IfNode::make(Expr cond, Expr true_branch, Expr false_branch) {
+  NodePtr<IfNode> n = make_node<IfNode>();
+  n->cond = std::move(cond);
+  n->true_branch = std::move(true_branch);
+  n->false_branch = std::move(false_branch);
+  return If(n);
+}
+
+TVM_REGISTER_NODE_TYPE(IfNode);
+
+TVM_REGISTER_API("relay._make.If").set_body([](TVMArgs args, TVMRetValue* ret) {
+  *ret = IfNode::make(args[0], args[1], args[2]);
+});
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<IfNode>([](const IfNode* node, tvm::IRPrinter* p) {
+  p->stream << "IfNode(" << node->cond << ", " << node->true_branch
+            << ", " << node->false_branch << ")";
+});
+
+TupleGetItem TupleGetItemNode::make(Expr tuple, int index) {
+  NodePtr<TupleGetItemNode> n = make_node<TupleGetItemNode>();
+  n->tuple = std::move(tuple);
+  n->index = index;
+  return TupleGetItem(n);
+}
+
+TVM_REGISTER_NODE_TYPE(TupleGetItemNode);
+
+TVM_REGISTER_API("relay._make.TupleGetItem").set_body([](TVMArgs args, TVMRetValue* ret) {
+  *ret = TupleGetItemNode::make(args[0], args[1]);
+});
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<TupleGetItemNode>([](const TupleGetItemNode* node, tvm::IRPrinter* p) {
+  p->stream << "TupleGetItemNode(" << node->tuple << ", " << node->index << ")";
+});
+
+
+TVM_REGISTER_API("relay._expr.TempExprRealize")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    TempExpr temp = args[0];
+    *ret = temp->Realize();
+});
+
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/ir/expr_functor.cc b/src/relay/ir/expr_functor.cc
new file mode 100644
index 000000000000..c1719e81a6c6
--- /dev/null
+++ b/src/relay/ir/expr_functor.cc
@@ -0,0 +1,331 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file src/tvm/relay/expr_mutator.cc
+ * \brief A wrapper around ExprFunctor which functionally updates the AST.
+ *
+ * ExprMutator uses memoization and self return in order to amortize
+ * the cost of using functional updates.
+ */
+#include <tvm/relay/expr_functor.h>
+#include "type_functor.h"
+
+namespace tvm {
+namespace relay {
+
+Expr ExprMutator::VisitExpr(const Expr& expr) {
+  auto it = this->memo_.find(expr);
+  if (it != this->memo_.end()) {
+    return it->second;
+  } else {
+    Expr new_expr = ExprFunctor::VisitExpr(expr);
+    memo_[expr] = new_expr;
+    return new_expr;
+  }
+}
+
+Expr ExprMutator::VisitExpr_(const VarNode* op) {
+  // NOTE: var will only be mutated once
+  // Thanks to the memo and reused during rewriting if necessary.
+  // It is safe to assume that the
+  if (op->type_annotation.defined()) {
+    auto type = this->VisitType(op->type_annotation);
+    if (!op->type_annotation.same_as(type)) {
+      return VarNode::make(op->vid, type);
+    }
+  }
+  // default case return self.
+  return GetRef<Expr>(op);
+}
+
+Expr ExprMutator::VisitExpr_(const ConstantNode* op) {
+  return GetRef<Expr>(op);
+}
+
+Expr ExprMutator::VisitExpr_(const GlobalVarNode* op) {
+  return GetRef<Expr>(op);
+}
+
+Expr ExprMutator::VisitExpr_(const OpNode* op) {
+  return GetRef<Expr>(op);
+}
+
+Expr ExprMutator::VisitExpr_(const TupleNode* op) {
+  tvm::Array<Expr> fields;
+  bool all_fields_unchanged = true;
+  for (auto field : op->fields) {
+    auto new_field = this->Mutate(field);
+    fields.push_back(new_field);
+    all_fields_unchanged &= new_field.same_as(field);
+  }
+
+  if (all_fields_unchanged) {
+    return GetRef<Expr>(op);
+  } else {
+    return TupleNode::make(fields);
+  }
+}
+
+Expr ExprMutator::VisitExpr_(const FunctionNode* op) {
+  tvm::Array<TypeVar> ty_params;
+  bool all_ty_params_changed = true;
+
+  for (auto ty_param : op->type_params) {
+    TypeVar new_ty_param = Downcast<TypeVar>(VisitType(ty_param));
+    ty_params.push_back(new_ty_param);
+    all_ty_params_changed &= new_ty_param.same_as(ty_param);
+  }
+
+  tvm::Array<Var> params;
+  bool all_params_changed = true;
+  for (auto param : op->params) {
+    Var new_param = Downcast<Var>(this->Mutate(param));
+    params.push_back(new_param);
+    all_params_changed &= param.same_as(new_param);
+  }
+
+  auto ret_type = this->VisitType(op->ret_type);
+  auto body = this->Mutate(op->body);
+
+  if (ty_params.same_as(op->type_params) &&
+      params.same_as(op->params) &&
+      ret_type.same_as(op->ret_type) &&
+      body.same_as(op->body)) {
+    return GetRef<Expr>(op);
+  } else {
+    return FunctionNode::make(params, body, ret_type, ty_params, op->attrs);
+  }
+}
+
+Expr ExprMutator::VisitExpr_(const CallNode* call_node) {
+  auto new_op = this->Mutate(call_node->op);
+  bool unchanged = call_node->op.same_as(new_op);
+
+  tvm::Array<Type> ty_args;
+  for (auto ty_arg : call_node->type_args) {
+    auto new_ty_arg = this->VisitType(ty_arg);
+    ty_args.push_back(new_ty_arg);
+    unchanged &= new_ty_arg.same_as(ty_arg);
+  }
+
+  tvm::Array<Expr> call_args;
+  for (auto arg : call_node->args) {
+    auto new_arg = this->Mutate(arg);
+    call_args.push_back(new_arg);
+    unchanged &= new_arg.same_as(arg);
+  }
+
+  if (unchanged) {
+    return GetRef<Expr>(call_node);
+  } else {
+    return CallNode::make(new_op, call_args, call_node->attrs, ty_args);
+  }
+}
+
+Expr ExprMutator::VisitExpr_(const LetNode* op) {
+  Var var = Downcast<Var>(this->Mutate(op->var));
+  auto value = this->Mutate(op->value);
+  auto body = this->Mutate(op->body);
+
+  if (var.same_as(op->var) &&
+      value.same_as(op->value) &&
+      body.same_as(op->body)) {
+    return GetRef<Expr>(op);
+  } else {
+    return LetNode::make(var, value, body);
+  }
+}
+
+Expr ExprMutator::VisitExpr_(const IfNode* op) {
+  auto guard = this->Mutate(op->cond);
+  auto true_b = this->Mutate(op->true_branch);
+  auto false_b = this->Mutate(op->false_branch);
+  if (op->cond.same_as(guard) &&
+      op->true_branch.same_as(true_b) &&
+      op->false_branch.same_as(false_b)) {
+    return GetRef<Expr>(op);;
+  } else {
+    return IfNode::make(guard, true_b, false_b);
+  }
+}
+
+Expr ExprMutator::VisitExpr_(const TupleGetItemNode* g) {
+  auto t = this->Mutate(g->tuple);
+  if (g->tuple == t) {
+    return GetRef<Expr>(g);
+  } else {
+    return TupleGetItemNode::make(t, g->index);
+  }
+}
+
+Type ExprMutator::VisitType(const Type& t) { return t; }
+
+void ExprVisitor::VisitExpr(const Expr& expr) {
+  auto it = visit_counter_.find(expr.get());
+  if (it != visit_counter_.end()) {
+    ++it->second;
+  } else {
+    using TParent = ExprFunctor<void(const Expr&)>;
+    TParent::VisitExpr(expr);
+    visit_counter_.insert({expr.get(), 1});
+  }
+}
+
+void ExprVisitor::ExprVisitor::VisitExpr_(const VarNode* op) {
+  if (op->type_annotation.defined()) {
+    this->VisitType(op->type_annotation);
+  }
+}
+
+void ExprVisitor::ExprVisitor::VisitExpr_(const GlobalVarNode* op) {
+}
+
+void ExprVisitor::ExprVisitor::VisitExpr_(const ConstantNode* op) {
+}
+
+void ExprVisitor::ExprVisitor::VisitExpr_(const TupleNode* op) {
+  for (auto field : op->fields) {
+    this->VisitExpr(field);
+  }
+}
+
+void ExprVisitor::ExprVisitor::VisitExpr_(const FunctionNode* op) {
+  for (auto param : op->params) {
+    this->VisitExpr(param);
+  }
+
+  this->VisitExpr(op->body);
+}
+
+void ExprVisitor::VisitExpr_(const CallNode* op) {
+  this->VisitExpr(op->op);
+
+  for (auto ty_arg : op->type_args) {
+    this->VisitType(ty_arg);
+  }
+
+  for (auto arg : op->args) {
+    this->VisitExpr(arg);
+  }
+}
+
+void ExprVisitor::VisitExpr_(const LetNode* op) {
+  this->VisitExpr(op->value);
+  this->VisitExpr(op->var);
+  this->VisitExpr(op->body);
+}
+
+void ExprVisitor::VisitExpr_(const IfNode* op) {
+  this->VisitExpr(op->cond);
+  this->VisitExpr(op->true_branch);
+  this->VisitExpr(op->false_branch);
+}
+
+void ExprVisitor::VisitExpr_(const OpNode* op) { return; }
+
+void ExprVisitor::VisitExpr_(const TupleGetItemNode* op) {
+  this->VisitExpr(op->tuple);
+}
+
+void ExprVisitor::VisitType(const Type& t) { return; }
+
+
+// visitor to implement apply
+class ExprApplyVisit : public ExprVisitor {
+ public:
+  explicit ExprApplyVisit(std::function<void(const Expr&)> f) : f_(f) {}
+  void VisitExpr(const Expr& e) final {
+    if (visited_.count(e.get()) != 0) return;
+    visited_.insert(e.get());
+    ExprVisitor::VisitExpr(e);
+    f_(e);
+  }
+
+ private:
+  std::function<void(const Expr&)> f_;
+  std::unordered_set<const Node*> visited_;
+};
+
+void PostOrderVisit(const Expr& e, std::function<void(const Expr&)> fvisit) {
+  ExprApplyVisit(fvisit).VisitExpr(e);
+}
+
+TVM_REGISTER_API("relay._ir_pass.post_order_visit")
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    PackedFunc f = args[1];
+    PostOrderVisit(args[0], [f](const Expr& n) {
+        f(n);
+      });
+  });
+
+
+// Implement bind.
+class ExprBinder : public ExprMutator {
+ public:
+  explicit ExprBinder(const tvm::Map<Var, Expr>& args_map)
+    : args_map_(args_map) {
+  }
+
+  Expr VisitExpr_(const LetNode* op) final {
+    CHECK(!args_map_.count(op->var))
+        << "Cannot bind an internel variable in let";
+    return ExprMutator::VisitExpr_(op);
+  }
+
+  Expr VisitExpr_(const FunctionNode* op) final {
+    for (Var param : op->params) {
+      CHECK(!args_map_.count(param))
+          << "Cannnot bind an internal function parameter";
+    }
+    return ExprMutator::VisitExpr_(op);
+  }
+
+  Expr VisitExpr_(const VarNode* op) final {
+    auto id = GetRef<Var>(op);
+    auto it = args_map_.find(id);
+    if (it != args_map_.end()) {
+      return (*it).second;
+    } else {
+      return id;
+    }
+  }
+
+ private:
+  const tvm::Map<Var, Expr>& args_map_;
+};
+
+Expr Bind(const Expr& expr, const tvm::Map<Var, Expr>& args_map) {
+  if (const FunctionNode* func = expr.as<FunctionNode>()) {
+    Expr new_body = ExprBinder(args_map).Mutate(func->body);
+    Array<Var> new_params;
+    for (Var param : func->params) {
+      if (!args_map.count(param)) {
+        new_params.push_back(param);
+      }
+    }
+    if (new_body.same_as(func->body) &&
+        new_params.size() == func->params.size()) {
+      return expr;
+    }
+    return FunctionNode::make(new_params,
+                              new_body,
+                              func->ret_type,
+                              func->type_params,
+                              func->attrs);
+  } else {
+    return ExprBinder(args_map).Mutate(expr);
+  }
+}
+
+
+TVM_REGISTER_API("relay._expr.Bind")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    NodeRef input = args[0];
+    if (input->derived_from<ExprNode>()) {
+      *ret = Bind(Downcast<Expr>(input), args[1]);
+    } else {
+      CHECK(input->derived_from<TypeNode>());
+      *ret = Bind(Downcast<Type>(input), args[1]);
+    }
+  });
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/ir/hash.cc b/src/relay/ir/hash.cc
new file mode 100644
index 000000000000..d7a8df98fa3f
--- /dev/null
+++ b/src/relay/ir/hash.cc
@@ -0,0 +1,308 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file src/tvm/relay/ir/hash.cc
+ * \brief Hash functions for Relay types and expressions.
+ */
+#include <tvm/ir_pass.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/runtime/ndarray.h>
+#include <tvm/relay/pass.h>
+#include <tvm/attrs.h>
+#include "type_functor.h"
+#include "../../lang/attr_functor.h"
+
+namespace tvm {
+namespace relay {
+
+// Hash handler for Relay.
+class RelayHashHandler:
+      public AttrsHashHandler,
+      public TypeFunctor<size_t(const Type&)>,
+      public ExprFunctor<size_t(const Expr&)> {
+ public:
+  explicit RelayHashHandler() {}
+
+  /*!
+   * Compute hash of a node.
+   * \param ref The node to hash.
+   * \return the hash value.
+   */
+  size_t Hash(const NodeRef& ref) {
+    if (!ref.defined()) return ref.hash();
+
+    if (ref->derived_from<TypeNode>()) {
+      return TypeHash(Downcast<Type>(ref));
+    }
+    if (ref->derived_from<ExprNode>()) {
+      return ExprHash(Downcast<Expr>(ref));
+    }
+    return AttrHash(ref);
+  }
+
+  /*!
+   * Compute hash of the attributes.
+   * \param ref The attributes.
+   * \return the hash value
+   */
+  size_t AttrHash(const NodeRef& ref) {
+    if (!ref.defined()) { return ref.hash(); }
+    return AttrsHashHandler::Hash(ref);
+  }
+  /*!
+   * Compute hash of a Relay type.
+   * \param ref The type to hash.
+   * \param rhs The right hand operand.
+   * \return the hash value.
+   */
+  size_t TypeHash(const Type& type) {
+    if (!type.defined()) { return type.hash(); }
+    auto found = hash_map_.find(type);
+    if (found != hash_map_.end()) {
+      return found->second;
+    } else {
+      auto hash = this->VisitType(type);
+      hash_map_.insert({type, hash});
+      return hash;
+    }
+  }
+  /*!
+   * Compute the hash of an expression.
+   *
+   * \note We run graph structural equality checking when comparing two Exprs.
+   *   This means that AlphaEqualHandler can only be used once for each pair.
+   *   The equality checker checks data-flow equvalence of the Expr DAG.
+   *   This function also runs faster as it memomizes equal_map.
+   *
+   * \param expr The expression to hash.
+   * \return the hash value.
+   */
+  size_t ExprHash(const Expr& expr) {
+    if (!expr.defined()) return expr.hash();
+    auto found = hash_map_.find(expr);
+    if (found != hash_map_.end()) {
+      return found->second;
+    } else {
+      auto hash = this->VisitExpr(expr);
+      hash_map_.insert({expr, hash});
+      return hash;
+    }
+  }
+
+ protected:
+  /*!
+   * \brief Hash a DataType.
+   * \param dtype The dtype to hash.
+   * \return the hash value.
+   */
+  size_t DataTypeHash(const DataType& dtype) {
+    return ::tvm::AttrsHash()(dtype);
+  }
+
+  using AttrsHashHandler::VisitAttr_;
+  size_t VisitAttr_(const Variable* var) final {
+    size_t hash = std::hash<std::string>()(Variable::_type_key);
+    auto it = hash_map_.find(GetRef<VarExpr>(var));
+    if (it != hash_map_.end()) {
+      return it->second;
+    }
+    return Combine(hash, std::hash<std::string>()(var->name_hint));
+  }
+
+  // Type hashing
+  size_t VisitType_(const TensorTypeNode* tensor_type) final {
+    size_t hash = std::hash<std::string>()(TensorTypeNode::_type_key);
+    hash = Combine(hash, DataTypeHash(tensor_type->dtype));
+    hash = Combine(hash, Hash(tensor_type->shape));
+    return hash;
+  }
+
+  size_t VisitType_(const IncompleteTypeNode* incomplete) final {
+    size_t hash = std::hash<std::string>()(IncompleteTypeNode::_type_key);
+    return Combine(hash, std::hash<int>()(incomplete->kind));
+  }
+
+  size_t VisitType_(const TypeVarNode* tyvar) final {
+    /*
+      TypeVar/Var/Variable have two locations where they are hashed:
+
+        The declaration site of a function, let, or function type.
+        The first occurence in the term.
+
+      We will only reach this code if the TypeVar itself is unbound, we assign
+      a free variable index to it, meaning this hashing function implements
+      structural equality for both open (i.e graph equality) and closed terms
+      (i.e alpha_equality).
+    */
+    return BindVar(GetRef<TypeVar>(tyvar));
+  }
+
+  size_t VisitType_(const FuncTypeNode* func_type) final {
+    size_t hash = std::hash<std::string>()(FuncTypeNode::_type_key);
+
+    for (auto type_param : func_type->type_params) {
+      hash = Combine(hash, BindVar(type_param));
+    }
+
+    for (auto arg : func_type->arg_types) {
+      hash = Combine(hash, TypeHash(arg));
+    }
+
+    hash = Combine(hash, TypeHash(func_type->ret_type));
+    for (auto cs : func_type->type_constraints) {
+      hash = Combine(hash, TypeHash(cs));
+    }
+
+    return hash;
+  }
+
+  size_t VisitType_(const TypeRelationNode* type_rel) final {
+    size_t hash = std::hash<std::string>()(TypeRelationNode::_type_key);
+    hash = Combine(hash, std::hash<std::string>()(type_rel->func->name));
+    hash = Combine(hash, AttrHash(type_rel->attrs));
+
+    for (auto arg : type_rel->args) {
+      hash = Combine(hash, TypeHash(arg));
+    }
+
+    return hash;
+  }
+
+  size_t VisitType_(const TupleTypeNode* tuple_type) final {
+    size_t hash = std::hash<std::string>()(TupleTypeNode::_type_key);
+    for (size_t i = 0; i < tuple_type->fields.size(); i++) {
+      hash = Combine(hash, TypeHash(tuple_type->fields[i]));
+    }
+    return hash;
+  }
+
+  // Expr hashing.
+  size_t NDArrayHash(const runtime::NDArray& array) {
+    size_t hash = std::hash<uint8_t>()(array->dtype.code);
+    hash = Combine(hash, std::hash<uint8_t>()(array->dtype.bits));
+    hash = Combine(hash, std::hash<uint16_t>()(array->dtype.lanes));
+    CHECK_EQ(array->ctx.device_type, kDLCPU) << "can only compare CPU tensor";
+    size_t data_size = runtime::GetDataSize(*array.operator->());
+    uint8_t * data = reinterpret_cast<uint8_t*>(array->data);
+    for (size_t i = 0; i < data_size; i++) {
+      hash = Combine(hash, std::hash<uint8_t>()(data[i]));
+    }
+    return hash;
+  }
+
+  size_t BindVar(const NodeRef& var) {
+    size_t hash = std::hash<int>()(var_counter++);
+    CHECK_EQ(hash_map_.count(var), 0);
+    hash_map_[var] = hash;
+
+    const auto* ty_param = var.as<TypeVarNode>();
+    if (ty_param && ty_param->kind == TypeVarNode::Kind::kShapeVar) {
+      hash_map_[ty_param->var] = hash;
+    }
+    return hash;
+  }
+
+  size_t VisitExpr_(const VarNode* var) final {
+    // hash free variable
+    size_t name_hash = std::hash<const Node*>()(var->vid.get());
+    return Combine(name_hash, TypeHash(var->type_annotation));
+  }
+
+  size_t VisitExpr_(const GlobalVarNode* global) final {
+    return std::hash<std::string>()(global->name_hint);
+  }
+
+  size_t VisitExpr_(const TupleNode* tuple) final {
+    size_t hash = std::hash<std::string>()(TupleNode::_type_key);
+    for (size_t i = 0; i < tuple->fields.size(); i++) {
+      hash = Combine(hash, ExprHash(tuple->fields[i]));
+    }
+    return hash;
+  }
+
+  size_t VisitExpr_(const FunctionNode* func) final {
+    size_t hash = std::hash<std::string>()(FunctionNode::_type_key);
+    for (auto type_param : func->type_params) {
+      hash = Combine(hash, BindVar(type_param));
+    }
+
+    for (auto param : func->params) {
+      hash = Combine(hash, BindVar(param));
+    }
+
+    hash = Combine(hash, TypeHash(func->ret_type));
+    hash =  Combine(hash, ExprHash(func->body));
+
+    return hash;
+  }
+
+  size_t VisitExpr_(const CallNode* call) final {
+    size_t hash = std::hash<std::string>()(CallNode::_type_key);
+    hash = Combine(hash, ExprHash(call->op));
+
+    for (auto arg : call->args) {
+      hash = Combine(hash, ExprHash(arg));
+    }
+
+    hash = Combine(hash, AttrHash(call->attrs));
+
+    return hash;
+  }
+
+  size_t VisitExpr_(const LetNode* let) final {
+    size_t hash = std::hash<std::string>()(LetNode::_type_key);
+    hash = Combine(hash, BindVar(let->var));
+    hash = Combine(hash, ExprHash(let->value));
+    hash = Combine(hash, ExprHash(let->body));
+    return hash;
+  }
+
+  size_t VisitExpr_(const IfNode* ite) final {
+    size_t key = std::hash<std::string>()(IfNode::_type_key);
+    size_t hash = key;
+    hash = Combine(hash, ExprHash(ite->cond));
+    hash = Combine(hash, ExprHash(ite->true_branch));
+    hash = Combine(hash, ExprHash(ite->false_branch));
+    return hash;
+  }
+
+  size_t VisitExpr_(const OpNode* op) final {
+    return GetRef<Op>(op).hash();
+  }
+
+  size_t VisitExpr_(const ConstantNode* rconst) final {
+    return NDArrayHash(rconst->data);
+  }
+
+  size_t VisitExpr_(const TupleGetItemNode* get_item) final {
+    size_t hash = std::hash<std::string>()(TupleGetItemNode::_type_key);
+    hash = Combine(hash, ExprHash(get_item->tuple));
+    hash = Combine(hash, std::hash<int>()(get_item->index));
+    return hash;
+  }
+
+ private:
+  // renaming of NodeRef to indicate two nodes equals to each other
+  std::unordered_map<NodeRef, size_t, NodeHash, NodeEqual> hash_map_;
+  int var_counter = 0;
+};
+
+size_t StructuralHash::operator()(const Type& type) const {
+  return RelayHashHandler().TypeHash(type);
+}
+
+size_t StructuralHash::operator()(const Expr& expr) const {
+  return RelayHashHandler().ExprHash(expr);
+}
+
+TVM_REGISTER_API("relay._ir_pass._expr_hash")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = static_cast<int64_t>(RelayHashHandler().Hash(args[0]));
+  });
+
+TVM_REGISTER_API("relay._ir_pass._type_hash")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = static_cast<int64_t>(RelayHashHandler().TypeHash(args[0]));
+  });
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/ir/module.cc b/src/relay/ir/module.cc
new file mode 100644
index 000000000000..4443ed50783e
--- /dev/null
+++ b/src/relay/ir/module.cc
@@ -0,0 +1,141 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file  module.cc
+ * \brief The global module in Relay.
+ */
+#include <tvm/relay/module.h>
+#include <tvm/relay/pass.h>
+#include <sstream>
+
+namespace tvm {
+namespace relay {
+
+using tvm::IRPrinter;
+using namespace runtime;
+
+Module ModuleNode::make(tvm::Map<GlobalVar, Function> global_funcs) {
+  auto n = make_node<ModuleNode>();
+  n->functions = std::move(global_funcs);
+
+  for (const auto& kv : n->functions) {
+    // set gloval var map
+    CHECK(!n->global_var_map_.count(kv.first->name_hint))
+        << "Duplicate global function name " << kv.first->name_hint;
+    n->global_var_map_.Set(kv.first->name_hint, kv.first);
+  }
+  return Module(n);
+}
+
+GlobalVar ModuleNode::GetGlobalVar(const std::string& name) {
+  auto it = global_var_map_.find(name);
+  CHECK(it != global_var_map_.end())
+      << "Cannot find global var " << name << " in the Module";
+  return (*it).second;
+}
+
+void ModuleNode::Add(const GlobalVar& var,
+                          const Function& func,
+                          bool update) {
+  // Type check the item before we add it to the modironment.
+  auto mod = GetRef<Module>(this);
+  Function checked_func = InferType(func, mod, var);
+  auto type = checked_func->checked_type();
+  CHECK(type.as<IncompleteTypeNode>() == nullptr);
+  if (functions.find(var) != functions.end()) {
+    CHECK(update)
+        << "Already have definition for " << var->name_hint;
+    auto old_type = functions[var].as<FunctionNode>()->checked_type();
+    CHECK(AlphaEqual(type, old_type))
+        << "Module#update changes type, not possible in this mode.";
+  }
+  this->functions.Set(var, checked_func);
+
+  auto it = global_var_map_.find(var->name_hint);
+  if (it != global_var_map_.end()) {
+    CHECK_EQ((*it).second, var);
+  } else {
+    // set global var map
+    CHECK(!global_var_map_.count(var->name_hint))
+        << "Duplicate global function name " << var->name_hint;
+  }
+
+  global_var_map_.Set(var->name_hint, var);
+}
+
+void ModuleNode::Update(const GlobalVar& var, const Function& func) {
+  this->Add(var, func, true);
+}
+
+void ModuleNode::Remove(const GlobalVar& var) {
+  auto functions_node = this->functions.CopyOnWrite();
+  functions_node->data.erase(var.node_);
+  auto gvar_node = global_var_map_.CopyOnWrite();
+  gvar_node->data.erase(var->name_hint);
+}
+
+Function ModuleNode::Lookup(const GlobalVar& var) {
+  auto it = functions.find(var);
+  CHECK(it != functions.end())
+      << "There is no definition of " << var->name_hint;
+  return (*it).second;
+}
+
+Function ModuleNode::Lookup(const std::string& name) {
+  GlobalVar id = this->GetGlobalVar(name);
+  return this->Lookup(id);
+}
+
+void ModuleNode::Update(const Module& mod) {
+  for (auto pair : mod->functions) {
+    this->Update(pair.first, pair.second);
+  }
+}
+
+TVM_REGISTER_NODE_TYPE(ModuleNode);
+
+TVM_REGISTER_API("relay._make.Module")
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    *ret = ModuleNode::make(args[0]);
+  });
+
+TVM_REGISTER_API("relay._module.Module_Add")
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    Module mod = args[0];
+    mod->Add(args[1], args[2], args[3]);
+  });
+
+TVM_REGISTER_API("relay._module.Module_GetGlobalVar")
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    Module mod = args[0];
+    *ret = mod->GetGlobalVar(args[1]);
+  });
+
+TVM_REGISTER_API("relay._module.Module_Lookup")
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    Module mod = args[0];
+    GlobalVar var = args[1];
+    *ret = mod->Lookup(var);
+  });
+
+TVM_REGISTER_API("relay._module.Module_Lookup_str")
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    Module mod = args[0];
+    std::string var_name = args[1];
+    auto var = mod->GetGlobalVar(var_name);
+    *ret = mod->Lookup(var);
+  });
+
+TVM_REGISTER_API("relay._module.Module_Update")
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    Module mod = args[0];
+    mod->Update(args[1]);
+  });
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<ModuleNode>(
+    [](const ModuleNode *node, tvm::IRPrinter *p) {
+      p->stream << "ModuleNode( " << node->functions << ")";
+    });
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/ir/op.cc b/src/relay/ir/op.cc
new file mode 100644
index 000000000000..bc9955251a7e
--- /dev/null
+++ b/src/relay/ir/op.cc
@@ -0,0 +1,159 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file src/tvm/relay/op.cc
+ * \brief Resolve incomplete types to complete types.
+ */
+#include <tvm/relay/op.h>
+#include <tvm/relay/type.h>
+#include <tvm/runtime/module.h>
+#include <tvm/runtime/packed_func.h>
+
+#include <memory>
+#include <mutex>
+
+namespace dmlc {
+// enable registry
+DMLC_REGISTRY_ENABLE(::tvm::relay::OpRegistry);
+}  // namespace dmlc
+
+namespace tvm {
+namespace relay {
+
+::dmlc::Registry<OpRegistry>* OpRegistry::Registry() {
+  return ::dmlc::Registry<OpRegistry>::Get();
+}
+
+// single manager of operator information.
+struct OpManager {
+  // mutex to avoid registration from multiple threads.
+  std::mutex mutex;
+  // global operator counter
+  std::atomic<int> op_counter{0};
+  // storage of additional attribute table.
+  std::unordered_map<std::string, std::unique_ptr<GenericOpMap>> attr;
+  // frontend functions
+  std::vector<PackedFunc*> frontend_funcs;
+  // get singleton of the op manager
+  static OpManager* Global() {
+    static OpManager inst;
+    return &inst;
+  }
+};
+
+// find operator by name
+const Op& Op::Get(const std::string& name) {
+  const OpRegistry* reg = dmlc::Registry<OpRegistry>::Find(name);
+  CHECK(reg != nullptr) << "Operator " << name << " is not registered";
+  return reg->op();
+}
+
+OpRegistry::OpRegistry() {
+  OpManager* mgr = OpManager::Global();
+  NodePtr<OpNode> n = make_node<OpNode>();
+  n->index_ = mgr->op_counter++;
+  op_ = Op(n);
+}
+
+// Get attribute map by key
+const GenericOpMap& Op::GetGenericAttr(const std::string& key) {
+  OpManager* mgr = OpManager::Global();
+  std::lock_guard<std::mutex> lock(mgr->mutex);
+  auto it = mgr->attr.find(key);
+  if (it == mgr->attr.end()) {
+    LOG(FATAL) << "Operator attribute \'" << key << "\' is not registered";
+  }
+  return *it->second.get();
+}
+
+void OpRegistry::UpdateAttr(const std::string& key,
+                            TVMRetValue value,
+                            int plevel) {
+  OpManager* mgr = OpManager::Global();
+  std::lock_guard<std::mutex> lock(mgr->mutex);
+  std::unique_ptr<GenericOpMap>& op_map = mgr->attr[key];
+  if (op_map == nullptr) {
+    op_map.reset(new GenericOpMap());
+    op_map->attr_name_ = key;
+  }
+  uint32_t index = op_->index_;
+  if (op_map->data_.size() <= index) {
+    op_map->data_.resize(index + 1, std::make_pair(TVMRetValue(), 0));
+  }
+  std::pair<TVMRetValue, int>& p = op_map->data_[index];
+  CHECK(p.second != plevel)
+      << "Attribute " << key << " of operator " << this->name
+      << " is already registered with same plevel=" << plevel;
+  if (p.second < plevel) {
+    op_map->data_[index] = std::make_pair(value, plevel);
+  }
+}
+
+// Frontend APIs
+TVM_REGISTER_API("relay.op._ListOpNames")
+.set_body_typed<Array<tvm::Expr>()>([]() {
+    Array<tvm::Expr> ret;
+    for (const std::string& name :
+             dmlc::Registry<OpRegistry>::ListAllNames()) {
+      ret.push_back(tvm::Expr(name));
+    }
+    return ret;
+  });
+
+TVM_REGISTER_API("relay.op._GetOp").set_body_typed<Op(std::string)>(Op::Get);
+
+TVM_REGISTER_API("relay.op._OpGetAttr")
+    .set_body([](TVMArgs args, TVMRetValue* rv) {
+      Op op = args[0];
+      std::string attr_name = args[1];
+      auto op_map = Op::GetAttr<TVMRetValue>(attr_name);
+      if (op_map.count(op)) {
+        *rv = op_map[op];
+      }
+    });
+
+TVM_REGISTER_API("relay.op._Register")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    std::string op_name = args[0];
+    std::string attr_key = args[1];
+    runtime::TVMArgValue value = args[2];
+    int plevel = args[3];
+    auto& reg =
+        OpRegistry::Registry()->__REGISTER_OR_GET__(op_name).set_name();
+    // enable resgiteration and override of certain properties
+    if (attr_key == "num_inputs" && plevel > 128) {
+      reg.set_num_inputs(value);
+    } else if (attr_key == "attrs_type_key" && plevel > 128) {
+      reg.set_attrs_type_key(value);
+    } else {
+      // normal attr table override.
+      if (args[2].type_code() == kFuncHandle) {
+        // do an eager copy of the PackedFunc
+        PackedFunc f = args[2];
+        // If we get a function from frontend, avoid deleting it.
+        OpManager::Global()->frontend_funcs.push_back(new PackedFunc(f));
+        reg.set_attr(attr_key, f, plevel);
+      } else {
+        reg.set_attr(attr_key, args[2], plevel);
+      }
+    }
+  });
+
+NodePtr<Node> CreateOp(const std::string& name) {
+  auto op = Op::Get(name);
+  CHECK(op.defined()) << "Cannot find op \'" << name << '\'';
+  return op.node_;
+}
+
+TVM_REGISTER_NODE_TYPE(OpNode)
+.set_creator(CreateOp)
+.set_global_key([](const Node* n) {
+    return static_cast<const OpNode*>(n)->name;
+  });
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<OpNode>([](const OpNode* node, tvm::IRPrinter* p) {
+    p->stream << "Op(" << node->name << ")";
+  });
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/ir/text_printer.cc b/src/relay/ir/text_printer.cc
new file mode 100644
index 000000000000..46b0d25b3d7d
--- /dev/null
+++ b/src/relay/ir/text_printer.cc
@@ -0,0 +1,818 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file text_printer.cc
+ * \brief Text printer to print relay in text form.
+ */
+#include <tvm/relay/module.h>
+#include <tvm/relay/expr_functor.h>
+#include <sstream>
+#include "type_functor.h"
+#include "../../lang/attr_functor.h"
+
+namespace tvm {
+namespace relay {
+
+/*!
+ * \brief the text value used in text printer.
+ * Defined as a struct for future compatibility reason
+ */
+struct TextValue {
+  /*! \brief The str representation */
+  std::string name;
+  // constructor
+  TextValue() {}
+  // constructor
+  explicit TextValue(std::string name) : name(name) {}
+};
+
+// operator overloading
+inline std::ostream& operator<<(std::ostream& os, const TextValue& val) {  // NOLINT(*)
+  return os << val.name;
+}
+
+/*!
+ * \brief Meta data context for TextPrinter.
+ *
+ * This is an important part to enable bi-directional serializability.
+ * We use tvm's Node system to build the current IR.
+ * It can be hard to design a text format for all the possible nodes
+ * as the set of nodes can grow when we do more extensions.
+ *
+ * Instead of trying to design readable text format for every node,
+ * we support a meta-data section in the text format.
+ * We allow the text format to refer to a node in the meta-data section.
+ *
+ * The meta-data section is a json serialized string of an Array<NodeRef>.
+ * Each element in the meta-data section can be referenced by the text format.
+ * Each meta data node is printed in the following format.
+ *
+ * meta.<type-key-of-node>(<index-in-meta-section>)
+ *
+ * Specifically, consider the following IR(constructed by python).
+ *
+ * \code
+ *
+ * n = tvm.var("n")
+ * x = tvm.relay.var("x", shape=(n, 1))
+ * f = tvm.relay.Function([x], x)
+ * print(f.astext())
+ *
+ * \endcode
+ *
+ * The corresponding text format is shown in the following code block.
+ *
+ * \code
+ *
+ * fn (%x: Tensor[(meta.Variable(id=0),), float32]) {
+ *   %x
+ * }
+ * # Meta data section is a json-serialized string
+ * # of the following array.
+ * # [tvm.var("n")]
+ *
+ * \endcode
+ *
+ * Note that we store tvm.var("n") in the meta data section.
+ * Since it is stored in the index-0 in the meta-data section,
+ * we print it as meta.Variable(0).
+ *
+ * The text parser can recover this object by loading from the corresponding
+ * location in the meta data section.
+ *
+ * This is is a design trade-off.
+ * It allows us to embedded any meta-data in the text format,
+ * while still being able to tweak the text part of the printed IR easily.
+ */
+class TextMetaDataContext {
+ public:
+  /*!
+   * \brief Get text representation of meta node.
+   * \param node The node to be converted to meta node.
+   * \return A string representation of the meta node.
+   */
+  std::string GetMetaNode(const NodeRef& node) {
+    std::ostringstream os;
+    auto it = meta_index_.find(node);
+    int64_t index;
+    if (it != meta_index_.end()) {
+      index = it->second;
+    } else {
+      index = static_cast<int64_t>(meta_data_.size());
+      meta_data_.push_back(node);
+      meta_index_[node] = index;
+    }
+    os << "meta." << node->type_key() << "(id=" << index << ")";
+    return os.str();
+  }
+  /*!
+   * \brief Get the metadata section in json format.
+   * \return the meta datastring.
+   */
+  std::string GetMetaSection() const {
+    if (meta_data_.size() == 0) return std::string();
+    return SaveJSON(Array<NodeRef>(meta_data_));
+  }
+
+  /*! \return whether the meta data context is empty. */
+  bool empty() const {
+    return meta_data_.empty();
+  }
+
+ private:
+  /*! \brief additional metadata stored in TVM json format */
+  std::vector<NodeRef> meta_data_;
+  /*! \brief map from meta data into its index */
+  std::unordered_map<NodeRef, int64_t, NodeHash, NodeEqual> meta_index_;
+};
+
+class TextPrinter :
+    public ExprFunctor<TextValue(const Expr&)>,
+    public TypeFunctor<void (const Type&, std::ostream& os)>,  // NOLINT(*)
+    public AttrFunctor<void (const NodeRef&, std::ostream& os)> { // NOLINT(*)
+ public:
+  explicit TextPrinter(bool show_meta_data,
+                       runtime::TypedPackedFunc<std::string(Expr)> annotate)
+      : show_meta_data_(show_meta_data), annotate_(annotate) {}
+  /*!
+   * \brief Print a node to string.
+   * \param node.
+   * \return The string representation.
+   */
+  std::string Print(const NodeRef& node) {
+    if (node.as<FunctionNode>()) {
+      this->PrintFunc(Downcast<Function>(node));
+    } else if (node.as<ModuleNode>()) {
+      this->PrintEnv(Downcast<Module>(node));
+    } else if (node.as_derived<TypeNode>()) {
+      this->PrintType(Downcast<Type>(node), stream_);
+    } else if (node.as_derived<ExprNode>()) {
+      this->PrintExpr(Downcast<Expr>(node));
+    } else {
+      stream_ << node;
+    }
+    if (!meta_.empty()) {
+      if (show_meta_data_) {
+        std::string meta_json = meta_.GetMetaSection();
+        // append meta data in the end.
+        stream_ << "# meta data\n"
+                << "r\"\"\"\n"
+                << meta_json << "\n"
+                << "\"\"\"";
+      } else {
+        stream_ << "# meta data omitted. you can use show_meta_data=True to include meta-data\n";
+      }
+    }
+    return stream_.str();
+  }
+
+  void PrintFunc(const Function& func) {
+    this->PrintFuncInternal("fn ", func);
+    stream_ << "\n";
+  }
+
+  void PrintEnv(const Module& mod) {
+    int counter = 0;
+    for (const auto& kv : mod->functions) {
+      std::ostringstream os;
+      if (counter++ != 0) {
+        stream_ << "\n";
+      }
+      os << "def @" << kv.first->name_hint;
+      this->PrintFuncInternal(os.str(), kv.second);
+      stream_ << "\n";
+    }
+  }
+
+  void PrintExpr(const Expr& expr) {
+    TextValue val = GetValue(expr);
+    stream_ << val << "\n";
+  }
+
+  /*!
+   * \brief Get text representation of expr.
+   *
+   * This function may generate additional instructions
+   * in order to compute the final result id of expr.
+   *
+   * When trying to recursively print out an Expr.
+   * The caller should always call GetValue of its children first.
+   * Then the caller can print out to stream_ using the obtained value.
+   *
+   * This is to avoid the call of subsequent GetValue print out
+   * additional instructions which get mixed with the partial instruction
+   * printed by the caller.
+   *
+   * \param expr The input expression.
+   * \return The text value of Expr.
+   */
+  TextValue GetValue(const Expr& expr) {
+    auto it = memo_.find(expr);
+    if (it != memo_.end()) return it->second;
+    TextValue val = this->VisitExpr(expr);
+    memo_[expr] = val;
+    return val;
+  }
+  //------------------------------------
+  // Overload of Expr printing functions
+  //------------------------------------
+  TextValue VisitExpr_(const ConstantNode* op) final {
+    // Print out simple scalar directly.
+    if (op->is_scalar()) {
+      std::ostringstream os;
+      DataType dtype = TVMType2Type(op->data->dtype);
+      CHECK_EQ(op->data->ctx.device_type, kDLCPU);
+      if (dtype == Int(32)) {
+        return ConstScalar(dtype, static_cast<const int32_t*>(op->data->data));
+      } else if (dtype == Int(64)) {
+        return ConstScalar(dtype, static_cast<const int64_t*>(op->data->data));
+      } else if (dtype == Float(32)) {
+        return ConstScalar(dtype, static_cast<const float*>(op->data->data));
+      } else if (dtype == Float(64)) {
+        return ConstScalar(dtype, static_cast<const double*>(op->data->data));
+      } else if (dtype == Bool()) {
+        return ConstScalar(dtype, static_cast<const uint8_t*>(op->data->data));
+      }
+    }
+    // default fall-back, record it as meta node.
+    TextValue id = this->AllocTempVar();
+    this->PrintIndent();
+    stream_ << id << " = " << meta_.GetMetaNode(GetRef<NodeRef>(op));
+    this->PrintEndInst("");
+    this->PrintOptionalInfo(GetRef<Expr>(op));
+    stream_ << '\n';
+    return id;
+  }
+
+  TextValue VisitExpr_(const TupleNode* op) final {
+    std::vector<TextValue> fields;
+    for (Expr field : op->fields) {
+      fields.push_back(GetValue(field));
+    }
+    // NOTE: always recursively visit to get ids,
+    // before print out the current line
+    TextValue id = this->AllocTempVar();
+    this->PrintIndent();
+    stream_ << id << " = (";
+    for (size_t i = 0; i < fields.size(); ++i) {
+      stream_ << fields[i];
+      if (i + 1 != fields.size()) {
+        stream_ << ", ";
+      }
+    }
+    if (fields.size() == 1) {
+      stream_ << ',';
+    }
+    stream_ << ')';
+    this->PrintEndInst("\n");
+    return id;
+  }
+
+  TextValue VisitExpr_(const VarNode* op) final {
+    Var var = GetRef<Var>(op);
+    // This is an unbounded var.
+    TextValue val = AllocVarName(var);
+    this->PrintIndent();
+    stream_ << "free_var ";
+    this->PrintVarDecl(var, stream_);
+    this->PrintEndInst("\n");
+    return val;
+  }
+
+  TextValue VisitExpr_(const GlobalVarNode* op) final {
+    return TextValue('@' + op->name_hint);
+  }
+
+  TextValue VisitExpr_(const FunctionNode* op) final {
+    TextValue id = AllocTempVar();
+    std::ostringstream os;
+    os << id << " = fn";
+    this->PrintFuncInternal(os.str(), GetRef<Function>(op));
+    this->PrintEndInst("\n");
+    return id;
+  }
+
+  TextValue VisitExpr_(const CallNode* op) final {
+    // possibly through meta-data
+    std::vector<TextValue> args;
+    for (Expr arg : op->args) {
+      args.emplace_back(GetValue(arg));
+    }
+    TextValue call_op = GetValue(op->op);
+    TextValue id = this->AllocTempVar();
+    this->PrintIndent();
+
+    stream_ << id << " = " << call_op;
+
+    auto type_args = op->type_args;
+
+    if (!IsPrimitiveOp(op->op) && type_args.size() > 0U) {
+      stream_ << "<";
+      for (size_t i = 0; i < op->type_args.size(); ++i) {
+        this->PrintType(type_args[i], stream_);
+        if (i + 1 != type_args.size()) {
+          stream_ << ", ";
+        }
+      }
+      stream_ << ">";
+    }
+
+    stream_ << "(";
+    for (size_t i = 0; i < args.size(); ++i) {
+      stream_ << args[i];
+      if (i + 1 != args.size()) {
+        stream_ << ", ";
+      }
+    }
+    this->PrintCallAttrs(op->op, op->attrs, stream_);
+    stream_ << ")";
+    this->PrintEndInst("");
+    this->PrintOptionalInfo(GetRef<Expr>(op));
+    stream_ << '\n';
+    return id;
+  }
+
+  TextValue VisitExpr_(const LetNode* op) final {
+    TextValue id = this->AllocTempVar();
+    this->PrintIndent();
+    stream_ << id << " = ";
+    this->PrintScope(GetRef<Expr>(op));
+    this->PrintEndInst("\n");
+    return id;
+  }
+
+  TextValue VisitExpr_(const IfNode* op) final {
+    TextValue id = this->AllocTempVar();
+    this->PrintIndent();
+    stream_ << id << " = ";
+    this->PrintScope(GetRef<Expr>(op));
+    this->PrintEndInst("\n");
+    return id;
+  }
+
+  TextValue VisitExpr_(const OpNode* op) final {
+    return TextValue(op->name);
+  }
+
+  TextValue VisitExpr_(const TupleGetItemNode* op) final {
+    TextValue tuple = GetValue(op->tuple);
+    TextValue id = this->AllocTempVar();
+    this->PrintIndent();
+    stream_ << id << " = " << tuple << "." << op->index << "";
+    this->PrintEndInst("\n");
+    return id;
+  }
+
+  /*!
+   * \brief Print the type to os
+   * \param type The type to be printed.
+   * \param os The output type.
+   */
+  void PrintType(const Type& type, std::ostream& os) {  // NOLINT(*)
+    this->VisitType(type, os);
+  }
+  //------------------------------------
+  // Overload of Expr printing functions
+  //------------------------------------
+  void VisitType_(const TensorTypeNode* node, std::ostream& os) final {  // NOLINT(*)
+    // scalar type
+    if (node->shape.size() == 0) {
+      os << runtime::TVMType2String(Type2TVMType(node->dtype));
+      return;
+    }
+    os << "Tensor[(";
+    for (size_t i = 0; i < node->shape.size(); ++i) {
+      this->PrintAttr(node->shape[i], os);
+      if (i + 1 != node->shape.size()) {
+        os << ", ";
+      }
+    }
+    // conform to python tuple format (1,)
+    if (node->shape.size() == 1) {
+      os << ",";
+    }
+    os << "), " << runtime::TVMType2String(Type2TVMType(node->dtype)) << "]";
+  }
+
+  void VisitType_(const TupleTypeNode* node, std::ostream& os) final {  // NOLINT(*)
+    os << "Tuple[";
+    for (size_t i = 0; i < node->fields.size(); ++i) {
+      this->PrintType(node->fields[i], os);
+      if (i + 1 != node->fields.size()) {
+        os << ", ";
+      }
+    }
+    os << "]";
+  }
+
+  void VisitTypeDefault_(const Node* node, std::ostream& os) final {  // NOLINT(*)
+    // by default always print as meta-data
+    os << meta_.GetMetaNode(GetRef<NodeRef>(node));
+  }
+
+  /*!
+   * \brief Print an attribute value to os.
+   * \param value The value to be printed.
+   * \param os The output type.
+   */
+  void PrintAttr(const NodeRef& value, std::ostream& os) {  // NOLINT(*)
+    if (value.defined()) {
+      this->VisitAttr(value, os);
+    } else {
+      os << "None";
+    }
+  }
+  //------------------------------------
+  // Overload of Attr printing functions
+  //------------------------------------
+  void VisitAttr_(const ArrayNode* op, std::ostream& os) final {  // NOLINT(*)
+    os << "[";
+    for (size_t i = 0; i < op->data.size(); ++i) {
+      this->PrintAttr(NodeRef(op->data[i]), os);
+      if (i + 1 != op->data.size()) {
+        os << ", ";
+      }
+    }
+    os << "]";
+  }
+  void VisitAttrDefault_(const Node* op, std::ostream& os) final { // NOLINT(*)
+    os << meta_.GetMetaNode(GetRef<NodeRef>(op));
+  }
+
+  void VisitAttr_(const ir::IntImm* op, std::ostream& os) final {  // NOLINT(*)
+    this->PrintConstScalar(op->type, &(op->value), os);
+  }
+
+  void VisitAttr_(const ir::UIntImm* op, std::ostream& os) final {  // NOLINT(*)
+    this->PrintConstScalar(op->type, &(op->value), os);
+  }
+
+  void VisitAttr_(const ir::FloatImm* op, std::ostream& os) final {  // NOLINT(*)
+    this->PrintConstScalar(op->type, &(op->value), os);
+  }
+
+  void VisitAttr_(const ir::StringImm* op, std::ostream& os) final {  // NOLINT(*)
+    this->PrintString(op->value, os);
+  }
+
+ protected:
+  /*!
+   * \brief Print attributes after call.
+   * \param op The operator to be called.
+   * \param attrs The attributes.
+   * \param os The output stream.
+   */
+  void PrintCallAttrs(const Expr& op, const Attrs& attrs, std::ostream& os);  // NOLINT(*)
+
+  /*!
+   * \brief Print the a new scopr.
+   * \param body The body.
+   */
+  void PrintScope(Expr body) {
+    stream_ << "{\n";
+    int sid = this->BeginScope();
+    this->PrintScopeBody(body);
+    this->EndScope(sid);
+    this->PrintIndent();
+    stream_ << "}";
+  }
+  /*!
+   * \brief Print the body of a new scope without {}
+   *
+   * This function will keep printing continuous sequence
+   * of let/if scope without introducing a new scope in the text.
+   *
+   * \param body The body.
+   */
+  void PrintScopeBody(Expr body) {
+    if (const LetNode* let = body.as<LetNode>()) {
+      TextValue value = GetValue(let->value);
+      AllocVarName(let->var);
+      // let var = value;
+      this->PrintIndent();
+      stream_ << "let ";
+      this->PrintVarDecl(let->var, stream_);
+      stream_ << " = " << value;
+      this->PrintEndInst("\n");
+      this->PrintScopeBody(let->body);
+    } else if (const IfNode* ifnode = body.as<IfNode>()) {
+      TextValue cond = GetValue(ifnode->cond);
+      this->PrintIndent();
+      stream_ << "if (" << cond << ") ";
+      this->PrintScope(ifnode->true_branch);
+      this->PrintIndent();
+      stream_ << "else ";
+      this->PrintScope(ifnode->false_branch);
+      this->PrintEndInst("\n");
+    } else {
+      TextValue value = GetValue(body);
+      this->PrintIndent();
+      stream_ << value;
+      this->PrintEndInst("\n");
+    }
+  }
+
+  /*!
+   * \brief Internal function to print a function argument list and its body.
+   * \param prefix The prefix before argument list.
+   * \param fn The function to be printed.
+   */
+  void PrintFuncInternal(std::string prefix, const Function& fn) {
+    // TODO(tqchen, M.K.) support generic function
+    // Possibly through meta-data
+    CHECK_EQ(fn->type_params.size(), 0U)
+        << "generic fn not yet supported";
+    this->PrintIndent();
+    stream_ << prefix << "(";
+    size_t decl_indent = prefix.length() + 1;
+    for (size_t i = 0; i < fn->params.size(); ++i) {
+      if (i != 0) {
+        this->PrintIndent(decl_indent);
+      }
+      AllocVarName(fn->params[i]);
+      this->PrintVarDecl(fn->params[i], stream_);
+      if (i + 1 != fn->params.size()) {
+        stream_ << ",\n";
+      }
+    }
+    stream_ << ')';
+    if (fn->ret_type.defined()) {
+      stream_ << '\n';
+      this->PrintIndent(decl_indent);
+      stream_ << "-> ";
+      this->PrintType(fn->ret_type, stream_);
+    }
+    stream_ << ' ';
+    this->PrintScope(fn->body);
+  }
+  /*!
+   * \brief Print additional info about expr in comment.
+   * \param expr The expression.
+   */
+  void PrintOptionalInfo(const Expr& expr) {
+    // additional information in comment.
+    if (annotate_ != nullptr) {
+      stream_ << " # " << annotate_(expr);
+    } else if (expr->checked_type_.defined()) {
+      stream_ << " # ty=";
+      this->PrintType(expr->checked_type(), stream_);
+    }
+  }
+  /*!
+   * \brief print var_name[:type]
+   * \param var The variable to be printed
+   * \param os The output stream
+   */
+  void PrintVarDecl(const Var& var, std::ostream& os) {  // NOLINT(*)
+    TextValue v = GetValue(var);
+    os << v;
+    if (var->type_annotation.defined()) {
+      os << ": ";
+      this->PrintType(var->type_annotation, os);
+    }
+  }
+  /*!
+   * \brief Get a constant scalar value.
+   * \param dtype The data type.
+   * \param data The pointer to the data.
+   * \tparam T the content data type holding the data.
+   */
+  template<typename T>
+  TextValue ConstScalar(DataType dtype, const T* data) {
+    std::ostringstream os;
+    PrintConstScalar(dtype, data, os);
+    return TextValue(os.str());
+  }
+  /*!
+   * \brief special method to print out const scalar
+   * \param dtype The data type
+   * \param data The pointer to hold the data.
+   * \param os The output stream.
+   */
+  template<typename T>
+  void PrintConstScalar(DataType dtype, const T* data, std::ostream& os) {  // NOLINT(*)
+    if (dtype == Int(32)) {
+      os << data[0];
+    } else if (dtype == Float(32)) {
+      os << data[0] << 'f';
+    } else if (dtype == Bool()) {
+      PrintBool(data[0] != 0, os);
+    } else {
+      os << dtype << "(" << data[0] << ")";
+    }
+  }
+  /*!
+   * \brief Print constant bool value.
+   * \param value The value to be printed.
+   * \param os The output stream
+   */
+  void PrintBool(bool value, std::ostream& os) { // NOLINT(*)
+    if (value) {
+      os << "True";
+    } else {
+      os << "False";
+    }
+  }
+  /*!
+   * \brief Print constant string.
+   * \param value The value to be printed.
+   * \param os The output stream
+   */
+  void PrintString(const std::string& value, std::ostream& os) { // NOLINT(*)
+    // TODO(M.K.): add escape.
+    os << "\"" << value << "\"";
+  }
+  /*!
+   * \brief get a unique name with the corresponding prefix
+   * \param prefix The prefix of the name
+   * \return The returned name.
+   */
+  std::string GetUniqueName(std::string prefix) {
+    auto it = name_alloc_map_.find(prefix);
+    if (it != name_alloc_map_.end()) {
+      while (true) {
+        std::ostringstream os;
+        os << prefix << (++it->second);
+        std::string name = os.str();
+        if (name_alloc_map_.count(name) == 0) {
+          prefix = name;
+          break;
+        }
+      }
+    }
+    name_alloc_map_[prefix] = 0;
+    return prefix;
+  }
+  /*!
+   * \brief mark the beginning of a new scope
+   * \return The scope id.
+   */
+  int BeginScope() {
+    int sid = static_cast<int>(scope_valid_.size());
+    scope_valid_.push_back(true);
+    indent_ += 2;
+    return sid;
+  }
+  /*!
+   * \brief mark the end of an old scope.
+   * \param scope_id The scope id to be ended.
+   */
+  void EndScope(int scope_id) {
+    scope_valid_[scope_id] = false;
+    indent_ -= 2;
+  }
+  /*!
+   * \brief Print the indent to the stream.
+   * \param more_indent More indentation besides the current one.
+   */
+  void PrintIndent(int64_t more_indent = 0) {
+    for (int i = 0; i < indent_ + more_indent; ++i) {
+      stream_ << ' ';
+    }
+  }
+  /*!
+   * \brief print end of the line.
+   */
+  void PrintEndInst(const char* suffix) {
+    stream_ << suffix;
+  }
+  /*!
+   * \brief Allocate temporary value
+   * \return A new text value.
+   */
+  TextValue AllocTempVar() {
+    std::ostringstream os;
+    os << '%' << temp_var_counter_++;
+    return TextValue(os.str());
+  }
+  /*!
+   * \brief Allocate name to a variable.
+   * \param var The input variable.
+   * \return The corresponding name.
+   */
+  TextValue AllocVarName(const Var& var) {
+    std::string name = var->name_hint();
+    // always make sure first name is alpha
+    if (name.length() != 0 && !std::isalpha(name[0])) {
+      name = "%v" + name;
+    } else {
+      name = "%" + name;
+    }
+    TextValue val(GetUniqueName(name));
+    // still print if ir is malformed, but show the error.
+    if (memo_.count(var)) {
+      memo_[var] = TextValue(val.name + "-malformed-ir");
+    }
+    memo_[var] = val;
+    return val;
+  }
+
+ private:
+  class AttrPrinter;
+  friend class AttrPrinter;
+  /*! \brief Whether to print meta data. */
+  bool show_meta_data_;
+  /*! \brief additional comment function */
+  runtime::TypedPackedFunc<std::string(Expr)> annotate_;
+  /*! \brief meta data context */
+  TextMetaDataContext meta_;
+  /*! \brief Check whether scope is still valid */
+  std::vector<bool> scope_valid_;
+  /*! \brief The current indentation value */
+  int indent_{0};
+  /*! \brief name allocation map */
+  std::unordered_map<std::string, int> name_alloc_map_;
+  /*! \brief Map from expression to its text value */
+  std::unordered_map<Expr, TextValue, NodeHash, NodeEqual> memo_;
+  /*! \brief counter of temporary variable */
+  int64_t temp_var_counter_{0};
+  /*! \brief Output stream */
+  std::ostringstream stream_;
+};
+
+/*!
+ * \brief Attribute printer which prints the attributes in the call.
+ */
+class TextPrinter::AttrPrinter: public AttrVisitor {
+ public:
+  AttrPrinter(std::ostream& stream, TextPrinter* parent)  // NOLINT(*)
+      : stream_(stream), parent_(parent) {}
+
+  void Visit(const char* key, double* value) final {
+    PrintSep();
+    stream_ << key << "=" << value[0];
+  }
+  void Visit(const char* key, int64_t* value) final {
+    PrintSep();
+    stream_ << key << "=" << value[0];
+  }
+  void Visit(const char* key, uint64_t* value) final {
+    PrintSep();
+    stream_ << key << "=" << value[0];
+  }
+  void Visit(const char* key, int* value) final {
+    PrintSep();
+    stream_ << key << "=" << value[0];
+  }
+  void Visit(const char* key, bool* value) final {
+    PrintSep();
+    stream_ << key << "=";
+    parent_->PrintBool(value[0], stream_);
+  }
+  void Visit(const char* key, std::string* value) final {
+    PrintSep();
+    stream_ << key << "=";
+    parent_->PrintString(value[0], stream_);
+  }
+  void Visit(const char* key, void** value) final {
+    LOG(FATAL) << "do not allow void as argument";
+  }
+  void Visit(const char* key, DataType* value) final {
+    PrintSep();
+    stream_ << key << "=";
+    parent_->PrintString(runtime::TVMType2String(Type2TVMType(value[0])), stream_);
+  }
+  void Visit(const char* key, NodeRef* value) final {
+    PrintSep();
+    stream_ << key << "=";
+    parent_->PrintAttr(value[0], stream_);
+  }
+  void Visit(const char* key, runtime::NDArray* value) final {
+    LOG(FATAL) << "do not allow NDarray as argument";
+  }
+
+ private:
+  void PrintSep() {
+    stream_ << ", ";
+  }
+  std::ostream& stream_;  // NOLINT(*)
+  TextPrinter* parent_;
+};
+
+void TextPrinter::PrintCallAttrs(const Expr& op,
+                                 const Attrs& attrs,
+                                 std::ostream& os) {  // NOLINT(*)
+  if (!attrs.defined()) return;
+  if (const auto* op_node = op.as<OpNode>()) {
+    if (attrs->type_index() == op_node->attrs_type_index) {
+      AttrPrinter printer(os, this);
+      const_cast<BaseAttrsNode*>(attrs.operator->())
+          ->VisitNonDefaultAttrs(&printer);
+      return;
+    }
+  }
+  os << ", " << meta_.GetMetaNode(attrs);
+}
+
+std::string RelayPrint(const NodeRef& node,
+                       bool show_meta_data,
+                       runtime::TypedPackedFunc<std::string(Expr)> annotate) {
+  return TextPrinter(show_meta_data, annotate).Print(node);
+}
+
+TVM_REGISTER_API("relay._expr.RelayPrint")
+.set_body_typed<std::string(
+    const NodeRef&, bool,
+    runtime::TypedPackedFunc<std::string(Expr)>)>(RelayPrint);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/ir/type.cc b/src/relay/ir/type.cc
new file mode 100644
index 000000000000..bbe6472609df
--- /dev/null
+++ b/src/relay/ir/type.cc
@@ -0,0 +1,168 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file src/tvm/ir/type.cc
+ * \brief The type system AST nodes of Relay.
+ */
+#include <tvm/relay/type.h>
+
+namespace tvm {
+namespace relay {
+
+using tvm::IRPrinter;
+using namespace tvm::runtime;
+
+TensorType TensorTypeNode::make(Array<IndexExpr> shape, DataType dtype) {
+  NodePtr<TensorTypeNode> n = make_node<TensorTypeNode>();
+  n->shape = std::move(shape);
+  n->dtype = std::move(dtype);
+  return TensorType(n);
+}
+
+TensorType TensorTypeNode::Scalar(DataType dtype) {
+  return TensorTypeNode::make({}, dtype);
+}
+
+IndexExpr TensorTypeNode::Size() const {
+  if (shape.size() == 0) {
+    return make_const(Int(64), 1);
+  }
+
+  IndexExpr size = shape[0];
+  for (size_t i = 1; i < shape.size(); ++i) {
+    size *= shape[i];
+  }
+  return size;
+}
+
+TVM_REGISTER_NODE_TYPE(TensorTypeNode);
+
+TVM_REGISTER_API("relay._make.TensorType")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+  Array<IndexExpr> shape = args[0];
+  *ret = TensorTypeNode::make(shape, args[1]);
+});
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<TensorTypeNode>([](const TensorTypeNode* node,
+                                 tvm::IRPrinter* p) {
+  p->stream << "TensorType(" << node->shape << ", " << node->dtype << ")";
+});
+
+TypeVar TypeVarNode::make(std::string name, TypeVarNode::Kind kind) {
+  NodePtr<TypeVarNode> n = make_node<TypeVarNode>();
+  n->var = tvm::Var(name);
+  n->kind = std::move(kind);
+  return TypeVar(n);
+}
+
+TVM_REGISTER_NODE_TYPE(TypeVarNode);
+
+TVM_REGISTER_API("relay._make.TypeVar")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+  int kind = args[1];
+  *ret =
+    TypeVarNode::make(args[0], static_cast<TypeVarNode::Kind>(kind));
+    });
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<TypeVarNode>([](const TypeVarNode* node,
+                                    tvm::IRPrinter* p) {
+  p->stream << "TypeVarNode(" << node->var->name_hint << ", "
+    << node->kind << ")";
+});
+
+IncompleteType IncompleteTypeNode::make(TypeVarNode::Kind kind) {
+  auto n = make_node<IncompleteTypeNode>();
+  n->kind = std::move(kind);
+  return IncompleteType(n);
+}
+
+TVM_REGISTER_NODE_TYPE(IncompleteTypeNode);
+
+TVM_REGISTER_API("relay._make.IncompleteType")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    int kind = args[0];
+    *ret = IncompleteTypeNode::make(static_cast<TypeVarNode::Kind>(kind));
+  });
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<IncompleteTypeNode>(
+    [](const IncompleteTypeNode* node,
+       tvm::IRPrinter* p) {
+      p->stream << "IncompleteTypeNode(" << node->kind << ", " << node << ")";
+    });
+
+FuncType FuncTypeNode::make(tvm::Array<Type> arg_types,
+                            Type ret_type,
+                            tvm::Array<TypeVar> type_params,
+                            tvm::Array<TypeConstraint> type_constraints) {
+  NodePtr<FuncTypeNode> n = make_node<FuncTypeNode>();
+  n->arg_types = std::move(arg_types);
+  n->ret_type = std::move(ret_type);
+  n->type_params = std::move(type_params);
+  n->type_constraints = std::move(type_constraints);
+  return FuncType(n);
+}
+
+TVM_REGISTER_NODE_TYPE(FuncTypeNode);
+
+TVM_REGISTER_API("relay._make.FuncType")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+  *ret = FuncTypeNode::make(args[0], args[1], args[2], args[3]);
+});
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<FuncTypeNode>([](const FuncTypeNode* node,
+                                   tvm::IRPrinter* p) {
+  p->stream << "FuncTypeNode(" << node->type_params << ", "
+            << node->arg_types << ", " << node->ret_type << ", "
+            << node->type_constraints << ")";
+});
+
+TypeRelation TypeRelationNode::make(TypeRelationFn func,
+                                    Array<Type> args,
+                                    int num_inputs,
+                                    Attrs attrs) {
+  NodePtr<TypeRelationNode> n = make_node<TypeRelationNode>();
+  n->func = std::move(func);
+  n->args = std::move(args);
+  n->num_inputs = num_inputs;
+  n->attrs = std::move(attrs);
+  return TypeRelation(n);
+}
+
+TVM_REGISTER_NODE_TYPE(TypeRelationNode);
+
+TVM_REGISTER_API("relay._make.TypeRelation")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = TypeRelationNode::make(args[0], args[1], args[2], args[3]);
+});
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<TypeRelationNode>([](const TypeRelationNode* node, tvm::IRPrinter* p) {
+    p->stream << "TypeRelationNode("
+              << node->func->name
+              << ", " << node->args << ")";
+});
+
+TupleType TupleTypeNode::make(Array<Type> fields) {
+  NodePtr<TupleTypeNode> n = make_node<TupleTypeNode>();
+  n->fields = std::move(fields);
+  return TupleType(n);
+}
+
+TVM_REGISTER_NODE_TYPE(TupleTypeNode);
+
+TVM_REGISTER_API("relay._make.TupleType")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = TupleTypeNode::make(args[0]);
+});
+
+TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable)
+.set_dispatch<TupleTypeNode>([](const TupleTypeNode* node,
+                                tvm::IRPrinter* p) {
+  p->stream << "TupleTypeNode(" << node->fields << ")";
+});
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/ir/type_functor.cc b/src/relay/ir/type_functor.cc
new file mode 100644
index 000000000000..0ef1743cbbc4
--- /dev/null
+++ b/src/relay/ir/type_functor.cc
@@ -0,0 +1,159 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file type_functor.cc
+ * \brief Implementations of type functors.
+ */
+#include "type_functor.h"
+
+namespace tvm {
+namespace relay {
+
+void TypeVisitor::VisitType_(const TypeVarNode* op) {
+}
+
+void TypeVisitor::VisitType_(const TensorTypeNode* op) {
+}
+
+void TypeVisitor::VisitType_(const IncompleteTypeNode* op) {
+}
+
+void TypeVisitor::VisitType_(const FuncTypeNode* op) {
+  for (auto type_param : op->type_params) {
+    this->VisitType(type_param);
+  }
+
+  for (auto type_cs : op->type_constraints) {
+    this->VisitType(type_cs);
+  }
+
+  for (auto arg_type : op->arg_types) {
+    this->VisitType(arg_type);
+  }
+  this->VisitType(op->ret_type);
+}
+
+void TypeVisitor::VisitType_(const TupleTypeNode* op) {
+  for (const Type& t : op->fields) {
+    this->VisitType(t);
+  }
+}
+
+void TypeVisitor::VisitType_(const TypeRelationNode* op) {
+  for (const Type& t : op->args) {
+    this->VisitType(t);
+  }
+}
+
+
+// Type Mutator.
+Array<Type> TypeMutator::MutateArray(Array<Type> arr) {
+  // The array will do copy on write
+  // If no changes are made, the original array will be returned.
+  for (size_t i = 0; i < arr.size(); ++i) {
+    Type ty = arr[i];
+    Type new_ty = VisitType(ty);
+    if (!ty.same_as(new_ty)) {
+      arr.Set(i, new_ty);
+    }
+  }
+  return arr;
+}
+
+Type TypeMutator::VisitType_(const TypeVarNode* op) {
+  return GetRef<TypeVar>(op);
+}
+
+Type TypeMutator::VisitType_(const TensorTypeNode* op) {
+  // TODO(tvm-team) recursively visit to replace Var
+  return GetRef<Type>(op);
+}
+
+Type TypeMutator::VisitType_(const IncompleteTypeNode* op) {
+  return GetRef<Type>(op);
+}
+
+Type TypeMutator::VisitType_(const FuncTypeNode* op) {
+  bool changed = false;
+  Array<TypeVar> type_params;
+  for (auto type_param : op->type_params) {
+    auto new_type_param = VisitType(type_param);
+    changed = changed || !new_type_param.same_as(type_param);
+    if (const TypeVarNode* tin = new_type_param.as<TypeVarNode>()) {
+      type_params.push_back(GetRef<TypeVar>(tin));
+    } else {
+      LOG(FATAL) << new_type_param << std::endl;
+    }
+  }
+
+  Array<TypeConstraint> type_constraints;
+  for (auto type_cs : op->type_constraints) {
+    auto new_type_cs = VisitType(type_cs);
+    changed = changed || !new_type_cs.same_as(type_cs);
+    if (const TypeConstraintNode* tin =
+        new_type_cs.as_derived<TypeConstraintNode>()) {
+      type_constraints.push_back(GetRef<TypeConstraint>(tin));
+    } else {
+      LOG(FATAL) << new_type_cs << std::endl;
+    }
+  }
+
+  Array<Type> new_args = MutateArray(op->arg_types);
+  changed = changed || !new_args.same_as(op->arg_types);
+
+  Type new_ret_type = VisitType(op->ret_type);
+  changed = changed || !new_ret_type.same_as(op->ret_type);
+
+  if (!changed) return GetRef<Type>(op);
+  return FuncTypeNode::make(new_args,
+                            new_ret_type,
+                            type_params,
+                            type_constraints);
+}
+
+Type TypeMutator::VisitType_(const TupleTypeNode* op) {
+  Array<Type> new_fields = MutateArray(op->fields);
+  if (new_fields.same_as(op->fields)) {
+    return GetRef<Type>(op);
+  } else {
+    return TupleTypeNode::make(new_fields);
+  }
+}
+
+Type TypeMutator::VisitType_(const TypeRelationNode* type_rel) {
+  Array<Type> new_args = MutateArray(type_rel->args);
+  if (new_args.same_as(type_rel->args)) {
+    return GetRef<Type>(type_rel);
+  } else {
+    return TypeRelationNode::make(type_rel->func,
+                                  new_args,
+                                  type_rel->num_inputs,
+                                  type_rel->attrs);
+  }
+}
+
+// Implements bind.
+class TypeBinder : public TypeMutator {
+ public:
+  explicit TypeBinder(const tvm::Map<TypeVar, Type>& args_map)
+    : args_map_(args_map) {}
+
+  Type VisitType_(const TypeVarNode* op) override {
+    auto id = GetRef<TypeVar>(op);
+    auto it = args_map_.find(id);
+    if (it != args_map_.end()) {
+      return (*it).second;
+    } else {
+      return id;
+    }
+  }
+
+ private:
+  const tvm::Map<TypeVar, Type>& args_map_;
+};
+
+Type Bind(const Type& type, const tvm::Map<TypeVar, Type>& args_map) {
+  return TypeBinder(args_map).VisitType(type);
+}
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/ir/type_functor.h b/src/relay/ir/type_functor.h
new file mode 100644
index 000000000000..e8dfd2b7cd7c
--- /dev/null
+++ b/src/relay/ir/type_functor.h
@@ -0,0 +1,129 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file type_functor.h
+ * \brief A way to defined arbitrary function signature with dispatch on types.
+ */
+#ifndef TVM_RELAY_IR_TYPE_FUNCTOR_H_
+#define TVM_RELAY_IR_TYPE_FUNCTOR_H_
+
+#include <tvm/node/ir_functor.h>
+#include <tvm/relay/expr.h>
+#include <string>
+#include <vector>
+
+namespace tvm {
+namespace relay {
+
+template <typename FType>
+class TypeFunctor;
+
+// functions to be overriden.
+#define TYPE_FUNCTOR_DEFAULT \
+  { return VisitTypeDefault_(op, std::forward<Args>(args)...); }
+
+
+#define RELAY_TYPE_FUNCTOR_DISPATCH(OP)                                   \
+  vtable.template set_dispatch<OP>(                                       \
+      [](const NodeRef& n, TSelf* self, Args... args) {                   \
+        return self->VisitType_(static_cast<const OP*>(n.node_.get()),    \
+                                std::forward<Args>(args)...);             \
+      });
+
+template <typename R, typename... Args>
+class TypeFunctor<R(const Type& n, Args...)> {
+ private:
+  using TSelf = TypeFunctor<R(const Type& n, Args...)>;
+  using FType = tvm::IRFunctor<R(const NodeRef& n, TSelf* self, Args...)>;
+
+ public:
+  /*! \brief the result type of this functor */
+  using result_type = R;
+  /*! \brief virtual destructor */
+  virtual ~TypeFunctor() {}
+  /*!
+   * \brief Same as call.
+   * \param n The expression node.
+   * \param args Additional arguments.
+   * \return The result of the call
+   */
+  R operator()(const Type& n, Args... args) {
+    return VisitType(n, std::forward<Args>(args)...);
+  }
+  /*!
+   * \brief The functor call.
+   * \param n The expression node.
+   * \param args Additional arguments.
+   * \return The result of the call
+   */
+  virtual R VisitType(const Type& n, Args... args) {
+    static FType vtable = InitVTable();
+    return vtable(n, this, std::forward<Args>(args)...);
+  }
+  // Functions that can be overriden by subclass
+  virtual R VisitType_(const TensorTypeNode* op,
+                       Args... args) TYPE_FUNCTOR_DEFAULT;
+  virtual R VisitType_(const TypeVarNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
+  virtual R VisitType_(const TypeConstraintNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
+  virtual R VisitType_(const FuncTypeNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
+  virtual R VisitType_(const TypeRelationNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
+  virtual R VisitType_(const TupleTypeNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
+  virtual R VisitType_(const IncompleteTypeNode* op, Args... args) TYPE_FUNCTOR_DEFAULT;
+
+  virtual R VisitTypeDefault_(const Node* op, Args...) {
+    LOG(FATAL) << "Do not have a default for " << op->type_key();
+    throw;  // unreachable, written to stop compiler warning
+  }
+
+ private:
+  // initialize the vtable.
+  static FType InitVTable() {
+    FType vtable;
+    // Set dispatch
+    RELAY_TYPE_FUNCTOR_DISPATCH(TensorTypeNode);
+    RELAY_TYPE_FUNCTOR_DISPATCH(TypeVarNode);
+    RELAY_TYPE_FUNCTOR_DISPATCH(TypeConstraintNode);
+    RELAY_TYPE_FUNCTOR_DISPATCH(FuncTypeNode);
+    RELAY_TYPE_FUNCTOR_DISPATCH(TypeRelationNode);
+    RELAY_TYPE_FUNCTOR_DISPATCH(TupleTypeNode);
+    RELAY_TYPE_FUNCTOR_DISPATCH(IncompleteTypeNode);
+    return vtable;
+  }
+};
+
+/*!
+ * \brief A type visitor that recursively visit types.
+ */
+class TypeVisitor : public TypeFunctor<void(const Type& n)> {
+ public:
+  void VisitType_(const TypeVarNode* op) override;
+  void VisitType_(const IncompleteTypeNode* op) override;
+  void VisitType_(const TensorTypeNode* op) override;
+  void VisitType_(const FuncTypeNode* op) override;
+  void VisitType_(const TupleTypeNode* op) override;
+  void VisitType_(const TypeRelationNode* op) override;
+};
+
+// Mutator that transform a type to another one.
+class TypeMutator : public TypeFunctor<Type(const Type& n)> {
+ public:
+  Type VisitType_(const TypeVarNode* op) override;
+  Type VisitType_(const TensorTypeNode* op) override;
+  Type VisitType_(const IncompleteTypeNode* op) override;
+  Type VisitType_(const FuncTypeNode* op) override;
+  Type VisitType_(const TupleTypeNode* op) override;
+  Type VisitType_(const TypeRelationNode* type_rel) override;
+
+ private:
+  Array<Type> MutateArray(Array<Type> arr);
+};
+
+/*!
+ * \brief Bind free type variables in the type.
+ * \param type The type to be updated.
+ * \param args_map The binding map.
+ */
+Type Bind(const Type& type, const Map<TypeVar, Type>& args_map);
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_IR_TYPE_FUNCTOR_H_
diff --git a/src/relay/op/debug.cc b/src/relay/op/debug.cc
new file mode 100644
index 000000000000..4c9b0a5ca83e
--- /dev/null
+++ b/src/relay/op/debug.cc
@@ -0,0 +1,54 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file nn.cc
+ * \brief Property def of nn operators.
+ */
+
+#include <tvm/relay/op.h>
+#include <tvm/relay/attrs/debug.h>
+#include <topi/elemwise.h>
+#include <vector>
+#include "./type_relations.h"
+#include "./op_common.h"
+#include "./layout.h"
+
+namespace tvm {
+namespace relay {
+
+Array<Tensor> DebugCompute(const Attrs& attrs,
+                               const Array<Tensor>& inputs,
+                               const Type& out_type,
+                               const Target& target) {
+  return Array<Tensor>{ topi::identity(inputs[0]) };
+}
+
+RELAY_REGISTER_OP("debug")
+.describe(R"code(Enter the interpreter's debugger.
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("program", "Tuple", "The program to execute before debugging.")
+.set_support_level(1)
+.add_type_rel("Debug", IdentityRel)
+.set_attr<TOpPattern>("TOpPattern", kOpaque)
+.set_attr<FTVMCompute>("FTVMCompute", DebugCompute);
+
+Expr MakeDebug(Expr expr, std::string name) {
+  auto dattrs = make_node<DebugAttrs>();
+  if (name.size() > 0) {
+    dattrs->debug_func = EnvFunc::Get(name);
+  } else {
+    dattrs->debug_func = EnvFunc();
+  }
+  static const Op& op = Op::Get("debug");
+  return CallNode::make(op, {expr}, Attrs(dattrs), {});
+}
+
+TVM_REGISTER_API("relay.op._make.debug")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeDebug, args, rv);
+  });
+
+}  // namespace relay
+}  // namespace tvm
+
diff --git a/src/relay/op/image/resize.cc b/src/relay/op/image/resize.cc
new file mode 100644
index 000000000000..e6efcb8ce459
--- /dev/null
+++ b/src/relay/op/image/resize.cc
@@ -0,0 +1,116 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file resize.cc
+ * \brief Image operators
+ */
+#include <tvm/relay/op.h>
+#include <tvm/relay/attrs/image.h>
+#include <topi/elemwise.h>
+#include <topi/image/resize.h>
+#include "../layout.h"
+#include "../op_common.h"
+
+namespace tvm {
+namespace relay {
+
+TVM_REGISTER_NODE_TYPE(ResizeAttrs);
+
+bool ResizeRel(const Array<Type>& types,
+               int num_inputs,
+               const Attrs& attrs,
+               const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+
+  static const Layout kNCHW("NCHW");
+
+  const ResizeAttrs* param = attrs.as<ResizeAttrs>();
+  CHECK(param != nullptr);
+  const Layout in_layout(param->layout);
+  CHECK(in_layout.Convertible(kNCHW))
+    << "Resize only support input layouts that are convertible from NCHW."
+    << " But got " << in_layout;
+
+  auto oshape = ConvertLayout(data->shape, in_layout, kNCHW);
+  oshape[2] = param->size[0];
+  oshape[3] = param->size[1];
+
+  // assign output type
+  reporter->Assign(types[1],
+                   TensorTypeNode::make(ConvertLayout(oshape, kNCHW, in_layout),
+                                        data->dtype));
+  return true;
+}
+
+Array<Tensor> ResizeCompute(const Attrs& attrs,
+                            const Array<Tensor>& inputs,
+                            const Type& out_type,
+                            const Target& target) {
+  const auto* param = attrs.as<ResizeAttrs>();
+  CHECK(param != nullptr);
+  CHECK(param->layout == "NCHW" || param->layout == "NHWC");
+  const auto* out_ttype = out_type.as<TensorTypeNode>();
+  CHECK(out_ttype != nullptr);
+  Array<IndexExpr> oshape;
+  if (param->layout == "NCHW") {
+    oshape.push_back(out_ttype->shape[2]);
+    oshape.push_back(out_ttype->shape[3]);
+  } else if (param->layout == "NHWC") {
+    oshape.push_back(out_ttype->shape[1]);
+    oshape.push_back(out_ttype->shape[2]);
+  }
+  return Array<Tensor>{ topi::image::resize(inputs[0],
+                                            oshape,
+                                            param->layout,
+                                            param->align_corners,
+                                            param->method) };
+}
+
+// Positional relay function to create image operator
+// used by frontend FFI.
+Expr MakeResize(Expr data,
+                Array<IndexExpr> size,
+                std::string layout,
+                std::string method,
+                bool align_corners) {
+  auto attrs = make_node<ResizeAttrs>();
+  attrs->size = std::move(size);
+  attrs->layout = std::move(layout);
+  attrs->method = std::move(method);
+  attrs->align_corners = align_corners;
+  static const Op& op = Op::Get("image.resize");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+
+TVM_REGISTER_API("relay.op.image._make.resize")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 5>(MakeResize, args, rv);
+  });
+
+
+RELAY_REGISTER_OP("image.resize")
+.describe(R"code(Perform resize to input array with nearest neighbour or bilinear interpolation.
+
+- **data**: data is 4D array of shape
+            (batch_size, channels, in_height, in_width) for NCHW
+            (batch_size, in_height, in_width, channels) for NHWC
+
+- **out**: Output is 4D array of shape
+           for layout NCHW
+           (batch_size, channels, size[0], size[1])
+
+           for layout NHWC
+           (batch_size, size[0], size[1], channels)
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.ResizeAttrs")
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(5)
+.add_type_rel("Resize", ResizeRel)
+.set_attr<FTVMCompute>("FTVMCompute", ResizeCompute)
+.set_attr<TOpPattern>("TOpPattern", kInjective);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/layout.cc b/src/relay/op/layout.cc
new file mode 100644
index 000000000000..98fea55aa4c1
--- /dev/null
+++ b/src/relay/op/layout.cc
@@ -0,0 +1,80 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file src/relay/op/layout.cc
+ * \brief Layout expression.
+ */
+
+#include "layout.h"
+
+namespace tvm {
+namespace relay {
+
+TVM_REGISTER_NODE_TYPE(LayoutNode);
+
+std::vector<IndexExpr> ConvertLayout(
+    std::vector<IndexExpr> src,
+    const Layout& src_layout,
+    const Layout& dst_layout) {
+  CHECK_EQ(src_layout.ndim(), src.size());
+  if (src_layout == dst_layout) {
+    return src;
+  } else if (!src_layout.defined()) {
+    LOG(FATAL) << "cannot convert undefined layout to " << dst_layout;
+  } else if (!dst_layout.defined()) {
+    LOG(FATAL) << "cannot convert " << src_layout << " to undefined layout";
+  }
+
+  CHECK(src_layout.Convertible(dst_layout))
+    << "cannot convert from "
+    << src_layout << " to " << dst_layout;
+
+  std::vector<IndexExpr> dst(dst_layout.ndim());
+  for (size_t i = 0; i < src_layout.ndim(); ++i) {
+    Layout::LayoutDim src_dim = src_layout[i];
+    if (Layout::IsSuperdim(src_dim)) {
+      int dst_major_pos = dst_layout.Indexof(Layout::ToSuperdim(src_dim));
+      int dst_minor_pos = dst_layout.Indexof(Layout::ToSubdim(src_dim));
+      int src_minor_pos = src_layout.Indexof(Layout::ToSubdim(src_dim));
+      int src_factor = src_layout.Subsizeof(src_dim);
+      int dst_factor = dst_layout.Subsizeof(src_dim);
+      IndexExpr src_dim_size = src[i];
+
+      if (src_minor_pos >= 0) {
+        CHECK(is_const_int(src[src_minor_pos], src_factor))
+          << "src shape " << Array<IndexExpr>(src)
+          << " does not agree with layout "
+          << src_layout;
+        src_dim_size *= src_factor;
+      }
+      dst[dst_major_pos] = src_dim_size;
+      if (dst_minor_pos >= 0) {
+        CHECK_GT(dst_factor, 0);
+        if (const int64_t* const_src_dim_size = as_const_int(src_dim_size)) {
+          CHECK_LE(dst_factor, const_src_dim_size[0])
+            << "Converting " << Array<IndexExpr>(src)
+            << " from " << src_layout
+            << " to " << dst_layout
+            << ": cannot split dimension size of "
+            << src_dim_size << " by " << dst_factor;
+        }
+        dst[dst_major_pos] /= dst_factor;
+        dst[dst_minor_pos] = dst_factor;
+      }
+    }
+  }
+  return dst;
+}
+
+std::vector<IndexExpr> ConvertLayout(
+    const Array<IndexExpr>& src,
+    const Layout& src_layout,
+    const Layout& dst_layout) {
+  std::vector<IndexExpr> ret(src.size());
+  for (size_t i = 0; i < src.size(); ++i) {
+    ret[i] = src[i];
+  }
+  return ConvertLayout(ret, src_layout, dst_layout);
+}
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/layout.h b/src/relay/op/layout.h
new file mode 100644
index 000000000000..90c920bf3aa1
--- /dev/null
+++ b/src/relay/op/layout.h
@@ -0,0 +1,443 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file relay/op/layout.h
+ * \brief Layout expression.
+ *
+ *  This file is adapted from its nnvm counterpart and will keep involving
+ *  to the new layout system
+ *
+ *  The layout is composed of upper cases, lower cases and numbers,
+ *  where upper case indicates a (super-)dimension and
+ *  the corresponding lower case with factor size indicates the split (sub-)dimension.
+ *  For example, NCHW16c can describe a 5-D tensor of
+ *  [batch_size, channel, height, width, channel_block].
+ *  Here sub-dimension channel_block=16 is the split of super-dimension C (channel).
+ */
+#ifndef TVM_RELAY_OP_LAYOUT_H_
+#define TVM_RELAY_OP_LAYOUT_H_
+
+#include <tvm/base.h>
+#include <tvm/expr.h>
+#include <tvm/relay/base.h>
+
+#include <string>
+#include <sstream>
+#include <vector>
+#include <utility>
+#include <algorithm>
+
+namespace tvm {
+namespace relay {
+
+class LayoutNode : public Node {
+ public:
+  std::string name;
+  Array<Integer> superdim_pos;
+  Array<Integer> subdim_pos;
+  Array<Integer> subdim_size;
+  Array<Integer> layout_simplified;
+
+  void VisitAttrs(AttrVisitor* v) final {
+    v->Visit("name", &name);
+    v->Visit("superdim_pos", &superdim_pos);
+    v->Visit("subdim_pos", &subdim_pos);
+    v->Visit("subdim_size", &subdim_size);
+    v->Visit("layout_simplified", &layout_simplified);
+  }
+
+  static constexpr const char* _type_key = "Layout";
+  TVM_DECLARE_NODE_TYPE_INFO(LayoutNode, Node);
+};
+
+class Layout : public NodeRef {
+ public:
+  using LayoutDim = char;
+  static constexpr uint32_t kUniqueDim = 26;
+
+  explicit Layout(NodePtr<Node> n) : NodeRef(n) {}
+
+  /*! \brief default constructor */
+  Layout() : Layout("__undef__") {} // NOLINT(*)
+
+  /*! \brief construct from a string */
+  Layout(const char* str) : Layout(std::string(str)) {} // NOLINT(*)
+
+  /*!
+   * \brief construct from a string.
+   * \param layout input in layout convention:
+   *        upper case indicates a dimension and
+   *        the corresponding lower case with factor size
+   *        indicates the split dimension.
+   *        return undefined layout if "__undef__" is passed.
+   */
+  Layout(const std::string& layout) { // NOLINT(*)
+    if (layout.length() != 0) {
+      Parse(layout);
+    } else {
+      Parse("__undef__");
+    }
+  }
+
+  /*!
+   * \brief access the internal node container
+   * \return the pointer to the internal node container
+   */
+  const LayoutNode* operator->() const {
+    return static_cast<const LayoutNode*>(node_.get());
+  }
+
+  /*!
+   * \brief access the internal node container
+   * \return the pointer to the internal node container
+   */
+  LayoutNode* operator->() {
+    return static_cast<LayoutNode*>(node_.get());
+  }
+
+  /*!
+   * \brief Check whether a given dimension is a super-dimension.
+   * \param dim input dimension
+   * \return Whether a given dimension is a super-dimension.
+   */
+  static bool IsSuperdim(LayoutDim dim) {
+    return dim >= 'A' && dim <= 'Z';
+  }
+
+  /*!
+   * \brief Check whether a given dimension is a sub-dimension.
+   * \param dim input dimension
+   * \return Whether a given dimension is a sub-dimension.
+   */
+  static bool IsSubdim(LayoutDim dim) {
+    return dim >= 'a' && dim <= 'z';
+  }
+
+  /*!
+   * \brief Convert a given dimension to super-dimension.
+   * \param dim input dimension
+   * \return The converted description.
+   */
+  static LayoutDim ToSuperdim(LayoutDim dim) {
+    if (IsSubdim(dim)) {
+      return dim - 'a' + 'A';
+    }
+    return dim;
+  }
+
+  /*!
+   * \brief Convert a given dimension to sub-dimension.
+   * \param dim input dimension
+   * \return The converted description.
+   */
+  static LayoutDim ToSubdim(LayoutDim dim) {
+    if (IsSuperdim(dim)) {
+      return dim - 'A' + 'a';
+    }
+    return dim;
+  }
+
+  /*!
+ * \brief Return an undefined layout.
+ * \return a (global) undefined layout.
+ */
+  static const Layout& Undef() {
+    static Layout undef;
+    return undef;
+  }
+
+  /*!
+   * \brief Two layouts are convertible only if
+   *        they have same set of super-dimensions.
+   *        e.g., NCHW, NCHW16c, NHWC are convertible between each other,
+   *        but NCHW, CHW, OIHW are not.
+   * \param dst the target layout
+   * \return Whether can be converted to dst layout.
+   */
+  bool Convertible(const Layout &dst) const {
+    const LayoutNode *n = operator->();
+    if (!this->defined() || !dst.defined()) return false;
+    for (size_t i = 0; i < kUniqueDim; ++i) {
+      if ((n->superdim_pos[i]->value >= 0 && dst->superdim_pos[i]->value < 0) ||
+          (n->superdim_pos[i]->value < 0 && dst->superdim_pos[i]->value >= 0)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /*!
+   * \brief Returns a sublayout which is the portion of the object
+   *        that starts at dimension \p pos and spans \p len dimensions
+   *        (or until the end of the layout, whichever comes first).
+   * \param pos The start position.
+   * \param len The length of the sub-layout.
+   * \return A newly constructed Layout object.
+   */
+  Layout Sublayout(size_t pos, size_t len) const {
+    const Array<Integer>& layout_simplified = operator->()->layout_simplified;
+    if (pos > ndim()) return Layout::Undef();
+    if (pos + len > ndim()) len = ndim() - pos;
+    if (len == 0) return Layout::Undef();
+    std::ostringstream new_layout;
+    for (size_t i = pos; i < pos + len; ++i) {
+      if (IsSubdim(layout_simplified[i]->value)) {
+        auto block_size = this->Subsizeof(layout_simplified[i]->value);
+        CHECK_GT(block_size, 0);
+        new_layout << block_size;
+      }
+      new_layout << static_cast<char>(layout_simplified[i]->value);
+    }
+    return Layout(new_layout.str());
+  }
+
+  /*! \return A newly constructed reversed Layout object. */
+  Layout Reverse() const {
+    const Array<Integer>& layout_simplified = operator->()->layout_simplified;
+    if (!this->defined()) return Layout::Undef();
+    std::ostringstream new_layout;
+    for (int64_t i = this->ndim() - 1; i >= 0; --i) {
+      if (IsSubdim(layout_simplified[i]->value)) {
+        auto block_size = this->Subsizeof(layout_simplified[i]->value);
+        CHECK_GT(block_size, 0);
+        new_layout << block_size;
+      }
+      new_layout << layout_simplified[i]->value;
+    }
+    return Layout(new_layout.str());
+  }
+
+  /*!
+   * \brief Split \p dim by \p size and put the sub-dimension to position \p target_pos.
+   * \param dim The source dimension to be split. It must be a super-dimension.
+   * \param target_pos The target position of the newly split sub-dimension.
+   * \param size size of the sub-dimension.
+   * \return A newly constructed Layout object.
+   */
+  Layout Split(LayoutDim dim, size_t target_pos, uint32_t size) const {
+    const std::string &name = operator->()->name;
+    CHECK(target_pos <= this->ndim()) << "Invalid split position "
+                                      << target_pos << " for layout " << name;
+    CHECK(IsSuperdim(dim)) << "Cannot split a sub-dimension " << dim;
+    CHECK(this->Contains(dim)) << "Axis " << dim << " does not exist in " << name;
+    CHECK(!this->Contains(ToSubdim(dim))) << "Dimension " << dim
+                                           << " has already been split in "
+                                           << name;
+    CHECK(size > 0) << "Invalid split size " << size;
+    std::ostringstream new_layout;
+    for (size_t i = 0; i <= this->ndim(); ++i) {
+      if (i == target_pos) {
+        new_layout << size << Layout::ToSubdim(dim);
+      }
+      if (i == this->ndim()) break;
+      new_layout << this->at(i);
+    }
+    Layout x(new_layout.str());
+    return x;
+  }
+
+
+  /*! \return number of dimensions */
+  size_t ndim() const {
+    return operator->()->layout_simplified.size();
+  }
+
+  /*! \return number of super dimensions */
+  size_t ndim_super() const {
+    size_t ct = 0;
+    for (auto x : operator->()->layout_simplified) {
+      if (IsSuperdim(x))
+        ct++;
+    }
+    return ct;
+  }
+
+  /*!
+   * \brief The description of the \p i-th dimension.
+   *        If it is a sub-dimension, the size will be returned as well,
+   *        e.g., 16c. Otherwise a single character is returned, e.g., C.
+   * \param i The position
+   * \return the description of the dimension.
+   */
+  std::string at(size_t i) const {
+    const Array<Integer>& layout_simplified = operator->()->layout_simplified;
+    CHECK_LT(i, this->ndim()) << "position " << i
+                              << " exceeds ndim=" << this->ndim();
+    std::ostringstream repr;
+    if (IsSubdim(layout_simplified[i]->value)) {
+      auto factor = Subsizeof(layout_simplified[i]->value);
+      CHECK_GT(factor, 0);
+      repr << factor;
+    }
+    repr << static_cast<char>(layout_simplified[i]->value);
+    return repr.str();
+  }
+
+  /*!
+   * \brief return the index of the input dimension.
+   *        If it is not found in the layout or the layout is undefined,
+   *        return -1.
+   * \param dim the input dimension.
+   * \return the index or -1 if not found.
+   */
+  int32_t Indexof(LayoutDim dim) const {
+    if (!this->defined()) return -1;
+    else if (IsSuperdim(dim)) return operator->()->superdim_pos[dim - 'A']->value;
+    else if (IsSubdim(dim)) return operator->()->subdim_pos[dim - 'a']->value;
+    return -1;
+  }
+
+  /*!
+   * \param dim the input super-dimension or sub-dimension.
+   * \return the size of the sub-dimension of \p dim (if \p dim is a super-dimension),
+   *         or the size of \p dim itself (if \p dim is a sub-dimension).
+   *         Return -1 if \p dim is not in the layout or the layout is undefined.
+   */
+  int64_t Subsizeof(LayoutDim dim) const {
+    CHECK(IsSuperdim(dim) || IsSubdim(dim)) << "Invalid dim " << dim;
+    if (!this->defined() || !this->Contains(ToSubdim(dim))) {
+      return -1;
+    }
+    int idx = ToSubdim(dim) - 'a';
+    return operator->()->subdim_size[idx]->value;
+  }
+
+  /*!
+   * \brief Whether the layout contains a dimension.
+   * \param dim dimension to be checked.
+   * \return Whether the layout contains the dimension.
+   */
+  bool Contains(LayoutDim dim) const {
+    if (IsSuperdim(dim)) {
+      return operator->()->superdim_pos[dim-'A']->value >= 0;
+    } else if (IsSubdim(dim)) {
+      return operator->()->subdim_pos[dim-'a']->value >= 0;
+    }
+    return false;
+  }
+
+  LayoutDim operator[](size_t i) const {
+    return operator->()->layout_simplified[i];
+  }
+
+  /*! \return whether the layout is defined */
+  bool defined() const {
+    return operator->()->name != "__undef__";
+  }
+  /*! \return the string description of the layout */
+  const std::string& name() const {
+    return operator->()->name;
+  }
+
+  /*!
+   * \brief Whether the two layouts are equal.
+   * \param rhs Another layout.
+   * \return whether the two layouts are equal.
+   */
+  bool Equals(const Layout &rhs) const {
+    return operator->()->name == rhs->name;
+  }
+
+  /*!
+ * \brief allow output string of layout to ostream
+ * \param os the output stream
+ * \param l the layout
+ * \return the ostream
+ */
+  friend std::ostream& operator<<(std::ostream& os, const Layout& l) {
+    os << l.name();
+    return os;
+  }
+
+  using ContainerType = LayoutNode;
+
+ private:
+  void Parse(const std::string &layout) {
+    node_ = make_node<LayoutNode>();
+
+    std::vector<uint32_t> superdim_pos(kUniqueDim, -1);
+    std::vector<uint32_t> subdim_pos(kUniqueDim, -1);
+    std::vector<uint32_t> subdim_size(kUniqueDim, -1);
+    std::vector<char> layout_simplified;
+
+    if (layout != "__undef__") {  // parse layout string
+      int32_t factor = 0;
+      uint32_t curr = 0;
+      for (size_t i = 0; i < layout.size(); ++i) {
+        const LayoutDim c = layout.at(i);
+        if (IsSuperdim(c)) {
+          int pos = c - 'A';
+          CHECK_EQ(factor, 0) << "Invalid layout " << layout
+                              << ": invalid factor size " << factor
+                              << " before dimension " << c;
+          CHECK_EQ(superdim_pos[pos], -1) << "Invalid layout " << layout
+                                          << ": duplicate dimension " << c;
+          superdim_pos[pos] = curr++;
+          layout_simplified.push_back(c);
+        } else if (IsSubdim(c)) {
+          int pos = c - 'a';
+          CHECK_GT(factor, 0) << "Invalid layout " << layout << ": invalid factor size "
+                              << factor << " for dimension " << c;
+          CHECK_EQ(subdim_pos[pos], -1) << "Invalid layout " << layout
+                                        << ": duplicate dimension " << c;
+          CHECK_EQ(subdim_size[pos], -1) << "Invalid layout " << layout
+                                         << ": duplicate dimension " << c;
+          subdim_pos[pos] = curr++;
+          subdim_size[pos] = factor;
+          layout_simplified.push_back(c);
+          factor = 0;
+        } else if (c >= '0' && c <= '9') {
+          CHECK(factor >= 0) << "Invalid layout " << layout << ": _ is adjacent to a number.";
+          factor = factor * 10 + c - '0';
+        } else {
+          LOG(FATAL) << "Invalid layout " << layout;
+        }
+      }
+      CHECK(!layout_simplified.empty()) << "Invalid layout " << layout;
+      for (LayoutDim dim : layout_simplified) {
+        CHECK(IsSuperdim(dim) || superdim_pos[dim-'a'] >= 0)
+          << "Invalid layout " << layout << ": missing axis "
+          << static_cast<char>(dim - 'a' + 'A');
+      }
+    }
+
+    LayoutNode *node = operator->();
+    node->name = layout;
+
+    for (uint32_t i = 0; i < kUniqueDim; ++i) {
+      node->superdim_pos.push_back(superdim_pos[i]);
+      node->subdim_pos.push_back(subdim_pos[i]);
+      node->subdim_size.push_back(subdim_size[i]);
+    }
+    for (LayoutDim dim : layout_simplified) {
+      node->layout_simplified.push_back(dim);
+    }
+  }
+};
+
+/*!
+ * \brief Convert shape in src_layout to shape in dst_layout
+ * \param src original shape
+ * \param src_layout layout of original shape
+ * \param dst_layout target layout
+ * \return shape in target layout
+ */
+std::vector<IndexExpr> ConvertLayout(
+    std::vector<IndexExpr> src,
+    const Layout& src_layout,
+    const Layout& dst_layout);
+
+/*!
+ * \brief Convert shape in src_layout to shape in dst_layout
+ * \param src original shape
+ * \param src_layout layout of original shape
+ * \param dst_layout target layout
+ * \return shape in target layout
+ */
+std::vector<IndexExpr> ConvertLayout(
+    const Array<IndexExpr>& src,
+    const Layout& src_layout,
+    const Layout& dst_layout);
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_OP_LAYOUT_H_
diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc
new file mode 100644
index 000000000000..170b6b6d13c5
--- /dev/null
+++ b/src/relay/op/nn/convolution.cc
@@ -0,0 +1,338 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file convolution.cc
+ * \brief Convolution operators
+ */
+#include <tvm/relay/op.h>
+#include <tvm/relay/attrs/nn.h>
+#include <vector>
+
+#include "../../pass/alter_op_layout.h"
+#include "../layout.h"
+
+namespace tvm {
+namespace relay {
+
+// relay.nn.conv2d
+TVM_REGISTER_NODE_TYPE(Conv2DAttrs);
+
+bool Conv2DRel(const Array<Type>& types,
+               int num_inputs,
+               const Attrs& attrs,
+               const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 3);
+  const auto* data = types[0].as<TensorTypeNode>();
+  const auto* weight = types[1].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+  static const Layout kNCHW("NCHW");
+  static const Layout kOIHW("OIHW");
+
+  const Conv2DAttrs* param = attrs.as<Conv2DAttrs>();
+  CHECK(param != nullptr);
+  const Layout in_layout(param->data_layout);
+  const Layout kernel_layout(param->weight_layout);
+  CHECK(in_layout.Convertible(kNCHW))
+    << "Conv only support input layouts that are convertible from NCHW."
+    << " But got " << in_layout;
+  CHECK(kernel_layout.Convertible(kOIHW))
+    << "Conv only support kernel layouts that are convertible from OIHW."
+    << " But got "<< kernel_layout;
+
+  Layout out_layout(param->out_layout);
+  if (!out_layout.defined()) out_layout = in_layout;
+  CHECK(out_layout.Convertible(kNCHW))
+      << "Conv only support output layouts that are convertible from NCHW."
+      << " But got " << out_layout;
+
+  std::vector<IndexExpr> dshape_nchw = ConvertLayout(
+      data->shape, in_layout, kNCHW);
+
+  IndexExpr channels, dilated_ksize_y, dilated_ksize_x;
+  // infer weight if the kernel_size and channels are defined
+  if (param->kernel_size.defined() && param->channels.defined()) {
+    CHECK_EQ(param->kernel_size.size(), 2);
+    CHECK_EQ(param->dilation.size(), 2);
+    std::vector<IndexExpr> wshape(
+       {param->channels / param->groups,
+         dshape_nchw[1] / param->groups,
+         param->kernel_size[0],
+         param->kernel_size[1]});
+    wshape = ConvertLayout(wshape, kOIHW, kernel_layout);
+    wshape[kernel_layout.Indexof('O')] *= param->groups;
+    channels = param->channels;
+    dilated_ksize_y = 1 + (param->kernel_size[0] - 1) * param->dilation[0];
+    dilated_ksize_x = 1 + (param->kernel_size[1] - 1) * param->dilation[1];
+    // assign result to reporter
+    reporter->Assign(types[1], TensorTypeNode::make(wshape, data->dtype));
+  } else {
+    // use weight to infer the conv shape.
+    if (weight == nullptr) return false;
+    auto wshape = ConvertLayout(weight->shape, kernel_layout, kOIHW);
+    if (param->kernel_size.defined()) {
+      CHECK_EQ(param->kernel_size.size(), 2);
+      // check the size
+      CHECK(reporter->AssertEQ(param->kernel_size[0], wshape[2]) &&
+            reporter->AssertEQ(param->kernel_size[1], wshape[3]))
+          << "Conv2D: shape of weight is inconsistent with kernel_size, "
+          << " kernel_size=" << param->kernel_size
+          << " wshape=" << Array<IndexExpr>(wshape);
+    }
+    if (param->channels.defined()) {
+      CHECK(reporter->AssertEQ(param->channels, wshape[0]))
+          << "Conv2D: shape of weight is inconsistent with channels, "
+          << " channels=" << param->channels
+          << " wshape=" << Array<IndexExpr>(wshape);
+    }
+    CHECK(reporter->AssertEQ(dshape_nchw[1] / param->groups, wshape[1]));
+    channels = wshape[0];
+    dilated_ksize_y = 1 + (wshape[2] - 1) * param->dilation[0];
+    dilated_ksize_x = 1 + (wshape[3] - 1) * param->dilation[1];
+  }
+  // dilation
+  std::vector<IndexExpr> oshape({dshape_nchw[0], channels, 0, 0});
+
+  oshape[2] = (dshape_nchw[2] + param->padding[0] * 2 - dilated_ksize_y) / param->strides[0] + 1;
+  oshape[3] = (dshape_nchw[3] + param->padding[1] * 2 - dilated_ksize_x) / param->strides[1] + 1;
+  DataType out_dtype = param->out_dtype;
+  if (out_dtype.bits() == 0) {
+    out_dtype = data->dtype;
+  }
+  oshape = ConvertLayout(oshape, kNCHW, out_layout);
+  // assign output type
+  reporter->Assign(types[2], TensorTypeNode::make(oshape, out_dtype));
+  return true;
+}
+
+template<typename T>
+Array<Array<Layout> > Conv2DInferCorrectLayout(
+    const Attrs& attrs,
+    const Array<Layout>& new_in_layouts,
+    const Array<Layout>& old_in_layouts,
+    const Array<Array<IndexExpr>> &old_in_shapes) {
+  const T* params = attrs.as<T>();
+  Layout out_layout(params->out_layout);
+
+  // We always make other operators to fit the layouts of convolution layers
+  // So this inference ignores all inputs
+  return Array<Array<Layout> >{{params->data_layout, params->weight_layout},
+                               {out_layout.defined() ? out_layout : params->data_layout}};
+}
+
+// Positional relay function to create conv2d operator
+// used by frontend FFI.
+Expr MakeConv2D(Expr data,
+                Expr weight,
+                Array<IndexExpr> strides,
+                Array<IndexExpr> padding,
+                Array<IndexExpr> dilation,
+                int groups,
+                IndexExpr channels,
+                Array<IndexExpr> kernel_size,
+                std::string data_layout,
+                std::string weight_layout,
+                std::string out_layout,
+                DataType out_dtype) {
+  auto attrs = make_node<Conv2DAttrs>();
+  attrs->strides = std::move(strides);
+  attrs->padding = std::move(padding);
+  attrs->dilation = std::move(dilation);
+  attrs->groups = groups;
+  attrs->channels = channels;
+  attrs->kernel_size = kernel_size;
+  attrs->data_layout = std::move(data_layout);
+  attrs->weight_layout = std::move(weight_layout);
+  attrs->out_layout = std::move(out_layout);
+  attrs->out_dtype = std::move(out_dtype);
+  static const Op& op = Op::Get("nn.conv2d");
+  return CallNode::make(op, {data, weight}, Attrs(attrs), {});
+}
+
+
+TVM_REGISTER_API("relay.op.nn._make.conv2d")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 12>(MakeConv2D, args, rv);
+  });
+
+
+RELAY_REGISTER_OP("nn.conv2d")
+.describe(R"code(2D convolution layer (e.g. spatial convolution over images).
+
+This layer creates a convolution kernel that is convolved
+with the layer input to produce a tensor of outputs.
+
+- **data**: This depends on the `layout` parameter. Input is 4D array of shape
+            (batch_size, in_channels, height, width) if `layout` is `NCHW`.
+- **weight**: (channels, in_channels, kernel_size[0], kernel_size[1])
+- **out**:  This depends on the `layout` parameter. Output is 4D array of shape
+            (batch_size, channels, out_height, out_width) if `layout` is `NCHW`.
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.Conv2DAttrs")
+.set_num_inputs(2)
+.add_argument("data", "Tensor", "The input tensor.")
+.add_argument("weight", "Tensor", "The weight tensor.")
+.set_support_level(2)
+.add_type_rel("Conv2D", Conv2DRel)
+.set_attr<FInferCorrectLayout>("FInferCorrectLayout", Conv2DInferCorrectLayout<Conv2DAttrs>);
+
+
+// relay.nn.conv2d_transpose
+TVM_REGISTER_NODE_TYPE(Conv2DTransposeAttrs);
+
+bool Conv2DTransposeRel(const Array<Type>& types,
+                        int num_inputs,
+                        const Attrs& attrs,
+                        const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 3);
+  const auto* data = types[0].as<TensorTypeNode>();
+  const auto* weight = types[1].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+
+  static const Layout kNCHW("NCHW");
+  static const Layout kOIHW("OIHW");
+
+  const Conv2DTransposeAttrs* param = attrs.as<Conv2DTransposeAttrs>();
+  CHECK(param != nullptr);
+  const Layout in_layout(param->data_layout);
+  const Layout kernel_layout(param->weight_layout);
+  CHECK(in_layout.Convertible(kNCHW))
+    << "Conv only support input layouts that are convertible from NCHW."
+    << " But got " << in_layout;
+  CHECK(kernel_layout.Convertible(kOIHW))
+    << "Conv only support kernel layouts that are convertible from OIHW."
+    << " But got "<< kernel_layout;
+
+  Layout out_layout(param->out_layout);
+  if (!out_layout.defined()) out_layout = in_layout;
+  CHECK(out_layout.Convertible(kNCHW))
+    << "Conv only support output layouts that are convertible from NCHW."
+    << " But got " << out_layout;
+
+  IndexExpr channels, dilated_ksize_y, dilated_ksize_x;
+
+  auto dshape_nchw = ConvertLayout(data->shape, in_layout, kNCHW);
+
+  // infer weight if the kernel_size and channels are defined
+  if (param->kernel_size.defined() && param->channels.defined()) {
+    CHECK_EQ(param->kernel_size.size(), 2);
+    CHECK_EQ(param->dilation.size(), 2);
+
+    std::vector<IndexExpr> wshape({dshape_nchw[1],
+                                   param->channels / param->groups,
+                                   param->kernel_size[0],
+                                   param->kernel_size[1]});
+
+    wshape = ConvertLayout(wshape, kOIHW, kernel_layout);
+    dilated_ksize_y = 1 + (param->kernel_size[0] - 1) * param->dilation[0];
+    dilated_ksize_x = 1 + (param->kernel_size[1] - 1) * param->dilation[1];
+    channels = param->channels;
+
+    // assign result to reporter
+    reporter->Assign(types[1], TensorTypeNode::make(wshape, data->dtype));
+  } else {
+    // use weight to infer the conv shape.
+    if (weight == nullptr) return false;
+    auto wshape = ConvertLayout(weight->shape, kernel_layout, kOIHW);
+    if (param->kernel_size.defined()) {
+      CHECK_EQ(param->kernel_size.size(), 2);
+      // check the size
+      CHECK(reporter->AssertEQ(param->kernel_size[0], wshape[2]) &&
+            reporter->AssertEQ(param->kernel_size[1], wshape[3]))
+          << "Conv2D: shape of weight is inconsistent with kernel_size, "
+          << " kernel_size=" << param->kernel_size
+          << " wshape=" << Array<IndexExpr>(wshape);
+    }
+    if (param->channels.defined()) {
+      CHECK(reporter->AssertEQ(param->channels, wshape[1]))
+          << "Conv2D: shape of weight is inconsistent with channels, "
+          << " channels=" << param->channels
+          << " wshape=" << Array<IndexExpr>(wshape);
+    }
+    CHECK(reporter->AssertEQ(dshape_nchw[1] / param->groups, wshape[0]));
+    channels = wshape[1];
+    dilated_ksize_y = 1 + (wshape[2] - 1) * param->dilation[0];
+    dilated_ksize_x = 1 + (wshape[3] - 1) * param->dilation[1];
+  }
+  // dilation
+  std::vector<IndexExpr> oshape({dshape_nchw[0], channels, 0, 0});
+  oshape[2] = (param->strides[0] * (dshape_nchw[2] - 1) + dilated_ksize_y -
+               2 * param->padding[0] + param->output_padding[0]);
+  oshape[3] = (param->strides[1] * (dshape_nchw[3] - 1) + dilated_ksize_x -
+               2 * param->padding[1] + param->output_padding[1]);
+
+  DataType out_dtype = param->out_dtype;
+  if (out_dtype.bits() == 0) {
+    out_dtype = data->dtype;
+  }
+  oshape = ConvertLayout(oshape, kNCHW, out_layout);
+  reporter->Assign(types[2], TensorTypeNode::make(oshape, out_dtype));
+  return true;
+}
+
+
+Expr MakeConv2DTranspose(Expr data,
+                         Expr weight,
+                         Array<IndexExpr> strides,
+                         Array<IndexExpr> padding,
+                         Array<IndexExpr> dilation,
+                         int groups,
+                         IndexExpr channels,
+                         Array<IndexExpr> kernel_size,
+                         std::string data_layout,
+                         std::string weight_layout,
+                         Array<IndexExpr> output_padding,
+                         DataType out_dtype) {
+  auto attrs = make_node<Conv2DTransposeAttrs>();
+  attrs->channels = channels;
+  attrs->kernel_size = kernel_size;
+  attrs->strides = std::move(strides);
+  attrs->padding = std::move(padding);
+  attrs->output_padding = std::move(output_padding);
+  attrs->dilation = std::move(dilation);
+  attrs->groups = groups;
+  attrs->data_layout = std::move(data_layout);
+  attrs->weight_layout = std::move(weight_layout);
+  attrs->out_dtype = std::move(out_dtype);
+  static const Op& op = Op::Get("nn.conv2d_transpose");
+  return CallNode::make(op, {data, weight}, Attrs(attrs), {});
+}
+
+
+TVM_REGISTER_API("relay.op.nn._make.conv2d_transpose")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 12>(MakeConv2DTranspose, args, rv);
+  });
+
+RELAY_REGISTER_OP("nn.conv2d_transpose")
+.describe(R"code(Transposed 2D convolution layer (sometimes called Deconvolution).
+
+The need for transposed convolutions generally arises
+from the desire to use a transformation going in the opposite direction
+of a normal convolution, i.e., from something that has the shape of the
+output of some convolution to something that has the shape of its input
+while maintaining a connectivity pattern that is compatible with
+said convolution.
+
+- **data**: This depends on the `layout` parameter. Input is 4D array of shape
+            (batch_size, in_channels, height, width) if `layout` is `NCHW`.
+- **weight**: (in_channels, channels, kernel_size[0], kernel_size[1])
+- **bias**: (channels,)
+- **out**:  This depends on the `layout` parameter. Output is 4D array of shape
+v            (batch_size, channels, out_height, out_width) if `layout` is `NCHW`.
+
+            out_height and out_width are calculated as::
+                out_height = (height-1)*strides[0]-2*padding[0]+kernel_size[0]+output_padding[0]
+                out_width = (width-1)*strides[1]-2*padding[1]+kernel_size[1]+output_padding[1]
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.Conv2DTransposeAttrs")
+.set_num_inputs(2)
+.add_argument("data", "Tensor", "The input tensor.")
+.add_argument("weight", "Tensor", "The weight tensor.")
+.set_support_level(2)
+.set_attr<FInferCorrectLayout>("FInferCorrectLayout",
+                               Conv2DInferCorrectLayout<Conv2DTransposeAttrs>)
+.add_type_rel("Conv2DTranspose", Conv2DTransposeRel);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
new file mode 100644
index 000000000000..7ed43d0df019
--- /dev/null
+++ b/src/relay/op/nn/nn.cc
@@ -0,0 +1,658 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file nn.cc
+ * \brief Property def of nn operators.
+ */
+
+#include <tvm/relay/op.h>
+#include <tvm/relay/attrs/nn.h>
+#include <tvm/relay/attrs/image.h>
+#include <topi/nn.h>
+#include <topi/nn/softmax.h>
+#include <topi/nn/flatten.h>
+#include <vector>
+#include "../type_relations.h"
+#include "../../pass/alter_op_layout.h"
+#include "../op_common.h"
+#include "../layout.h"
+
+namespace tvm {
+namespace relay {
+
+// relay.nn.bias_add
+TVM_REGISTER_NODE_TYPE(BiasAddAttrs);
+
+bool BiasAddRel(const Array<Type>& types,
+                int num_inputs,
+                const Attrs& attrs,
+                const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 3);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+
+  const BiasAddAttrs* param = attrs.as<BiasAddAttrs>();
+  CHECK(param != nullptr);
+  int axis = param->axis;
+  if (axis < 0) {
+    axis = data->shape.size() + axis;
+  }
+  CHECK_LE(axis, static_cast<int>(data->shape.size()))
+      << "axis " << param->axis << " is out of range";
+
+  // assign output type
+  reporter->Assign(types[1], TensorTypeNode::make(
+      {data->shape[axis]}, data->dtype));
+  reporter->Assign(types[2], types[0]);
+  return true;
+}
+
+
+// Positional relay function to create dense operator used by frontend FFI.
+Expr MakeBiasAdd(Expr data,
+                 Expr bias,
+                 int axis) {
+  auto attrs = make_node<BiasAddAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("nn.bias_add");
+  return CallNode::make(op, {data, bias}, Attrs(attrs), {});
+}
+
+
+TVM_REGISTER_API("relay.op.nn._make.bias_add")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 3>(MakeBiasAdd, args, rv);
+  });
+
+
+RELAY_REGISTER_OP("nn.bias_add")
+.describe(R"code(Add bias to an axis of the input.
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.BiasAddAttrs")
+.set_num_inputs(2)
+.add_argument("data", "nD Tensor", "Input data.")
+.add_argument("bias", "1D Tensor", "Bias.")
+.set_support_level(1)
+.add_type_rel("BiasAdd", BiasAddRel);
+
+
+// relay.nn.dense
+TVM_REGISTER_NODE_TYPE(DenseAttrs);
+
+
+bool DenseRel(const Array<Type>& types,
+              int num_inputs,
+              const Attrs& attrs,
+              const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 3);
+  const auto* data = types[0].as<TensorTypeNode>();
+  const auto* weight = types[1].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+
+  const DenseAttrs* param = attrs.as<DenseAttrs>();
+  CHECK(param != nullptr);
+
+  CHECK(static_cast<int>(data->shape.size()) != 0);
+
+  Array<tvm::Expr> oshape = data->shape;
+  if (param->units.defined()) {
+    Array<tvm::Expr> dshape = data->shape;
+    // validate the weight shape is proper if defined
+    // Assign weight type
+    Array<IndexExpr> wshape({param->units, dshape[dshape.size() - 1]});
+    reporter->Assign(types[1], TensorTypeNode::make(wshape, data->dtype));
+    oshape.Set((oshape.size() - 1), param->units);
+  } else {
+    if (weight == nullptr) return false;
+    Array<tvm::Expr> wshape = weight->shape;
+    oshape.Set((oshape.size() - 1), wshape[0]);
+  }
+
+  // assign output type
+  reporter->Assign(types[2], TensorTypeNode::make(oshape, data->dtype));
+  return true;
+}
+
+
+// Positional relay function to create dense operator used by frontend FFI.
+Expr MakeDense(Expr data,
+               Expr weight,
+               IndexExpr units) {
+  auto attrs = make_node<DenseAttrs>();
+  attrs->units = units;
+  static const Op& op = Op::Get("nn.dense");
+  return CallNode::make(op, {data, weight}, Attrs(attrs), {});
+}
+
+
+TVM_REGISTER_API("relay.op.nn._make.dense")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 3>(MakeDense, args, rv);
+  });
+
+
+RELAY_REGISTER_OP("nn.dense")
+.describe(R"code(Applies a linear transformation: :math:`Y = XW^T`.
+
+- **data**: `(x1, x2, ..., xn, input_dim)`
+- **weight**: `(units, input_dim)`
+- **out**: `(x1, x2, ..., xn, units)`.
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.DenseAttrs")
+.set_num_inputs(2)
+.add_argument("data", "nD Tensor", "Input data.")
+.add_argument("weight", "2D Tensor", "Weight matrix.")
+.set_support_level(1)
+.add_type_rel("Dense", DenseRel);
+
+// relay.leaky_relu
+TVM_REGISTER_NODE_TYPE(LeakyReluAttrs);
+
+// Positional relay function to create leaky relu operator used by frontend FFI.
+Expr MakeLeakyRelu(Expr data,
+                   double alpha) {
+  auto attrs = make_node<LeakyReluAttrs>();
+  attrs->alpha = alpha;
+  static const Op& op = Op::Get("nn.leaky_relu");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+
+TVM_REGISTER_API("relay.op.nn._make.leaky_relu")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeLeakyRelu, args, rv);
+  });
+
+
+RELAY_REGISTER_OP("nn.leaky_relu")
+.describe(R"code(Leaky version of a Rectified Linear Unit.
+
+`y = x > 0 ? x : alpha * x`
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.LeakyReluAttrs")
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "Input data.")
+.set_support_level(3)
+.add_type_rel("Identity", IdentityRel)
+.set_attr<FInferCorrectLayout>("FInferCorrectLayout", ElemwiseArbitraryLayout)
+.set_attr<FTVMCompute>(
+  "FTVMCompute", [](const Attrs& attrs,
+                    const Array<Tensor>& inputs,
+                    const Type& out_type,
+                    const Target& target) {
+    const auto* param = attrs.as<LeakyReluAttrs>();
+    return Array<Tensor>{ topi::leaky_relu(inputs[0], param->alpha) };
+});
+
+
+// relay.prelu
+TVM_REGISTER_NODE_TYPE(PReluAttrs);
+
+bool PReluRel(const Array<Type>& types,
+              int num_inputs,
+              const Attrs& attrs,
+              const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 3);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+
+  const PReluAttrs* param = attrs.as<PReluAttrs>();
+  CHECK(param != nullptr);
+
+  CHECK(param->axis < static_cast<int>(data->shape.size()))
+    << "Wrong axis ("  << param->axis << ")value.";
+
+  // assign alpha type
+  Array<IndexExpr> alpha_shape({data->shape[param->axis]});
+  reporter->Assign(types[1], TensorTypeNode::make(alpha_shape, data->dtype));
+
+  // assign output type
+  reporter->Assign(types[2], TensorTypeNode::make(data->shape, data->dtype));
+  return true;
+}
+
+// Positional relay function to create prelu operator used by frontend FFI.
+Expr MakePRelu(Expr data,
+               Expr alpha,
+               int axis) {
+  auto attrs = make_node<PReluAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("nn.prelu");
+  return CallNode::make(op, {data, alpha}, Attrs(attrs), {});
+}
+
+
+TVM_REGISTER_API("relay.op.nn._make.prelu")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 3>(MakePRelu, args, rv);
+  });
+
+
+RELAY_REGISTER_OP("nn.prelu")
+.describe(R"code(Parametric version of a Rectified Linear Unit.
+It accepts two arguments: an input ``x`` and a channelwise slope ``alpha``
+and computes the output as :math:`PReLU(x) y = x > 0 ? x : alpha * x`,
+where :math:`*` is an channelwise multiplication for each sample in the batch.
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.PReluAttrs")
+.set_num_inputs(2)
+.add_argument("data", "Tensor", "Input data.")
+.add_argument("alpha", "Tensor", "Input channelwise alpha.")
+.set_support_level(3)
+.add_type_rel("PRelu", PReluRel)
+.set_attr<FInferCorrectLayout>("FInferCorrectLayout", ElemwiseArbitraryLayout)
+.set_attr<FTVMCompute>(
+  "FTVMCompute", [](const Attrs& attrs,
+                    const Array<Tensor>& inputs,
+                    const Type& out_type,
+                    const Target& target) {
+    const auto* param = attrs.as<PReluAttrs>();
+    return Array<Tensor>{ topi::prelu(inputs[0], inputs[1], param->axis)};
+});
+
+
+// relay.softmax
+TVM_REGISTER_NODE_TYPE(SoftmaxAttrs);
+
+TVM_REGISTER_API("relay.op.nn._make.softmax")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+  auto make_func = [](Expr data, int axis) {
+    auto attrs = make_node<SoftmaxAttrs>();
+    attrs->axis = axis;
+    static const Op& op = Op::Get("nn.softmax");
+    return CallNode::make(op, {data}, Attrs(attrs), {});
+  };
+
+  runtime::detail::unpack_call<Expr, 2>(make_func, args, rv);
+});
+
+RELAY_REGISTER_OP("nn.softmax")
+    .describe(R"code(Softmax layer.
+
+.. math:: \text{softmax}(x)_i = \frac{exp(x_i)}{\sum_j exp(x_j)}
+
+.. note::
+    This operator can be optimized away for inference.
+
+- **data**: The input data
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.SoftmaxAttrs")
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(1)
+.add_type_rel("Identity", IdentityRel)
+.set_attr<FTVMCompute>("FTVMCompute", [](const Attrs& attrs,
+                                         const Array<Tensor>& inputs,
+                                         const Type& out_type,
+                                         const Target& target) {
+  const auto* param = attrs.as<SoftmaxAttrs>();
+  CHECK(param != nullptr);
+  return Array<Tensor>{ topi::nn::softmax(inputs[0], param->axis) };
+});
+
+
+// relay.nn.log_softmax
+TVM_REGISTER_API("relay.op.nn._make.log_softmax")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+  auto make_func = [](Expr data, int axis) {
+    auto attrs = make_node<SoftmaxAttrs>();
+    attrs->axis = axis;
+    static const Op& op = Op::Get("nn.log_softmax");
+    return CallNode::make(op, {data}, Attrs(attrs), {});
+  };
+
+  runtime::detail::unpack_call<Expr, 2>(make_func, args, rv);
+});
+
+RELAY_REGISTER_OP("nn.log_softmax")
+    .describe(R"code(Computes log softmax.
+
+.. math:: \text{log_softmax}(x)_i = \log \frac{exp(x_i)}{\sum_j exp(x_j)}
+
+.. note::
+    This operator can be optimized away for inference.
+
+- **data**: The input data
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.SoftmaxAttrs")
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(1)
+.add_type_rel("Identity", IdentityRel)
+.set_attr<FTVMCompute>("FTVMCompute", [](const Attrs& attrs,
+                                         const Array<Tensor>& inputs,
+                                         const Type& out_type,
+                                         const Target& target) {
+  const auto* param = attrs.as<SoftmaxAttrs>();
+  CHECK(param != nullptr);
+  CHECK(param->axis == -1 || param->axis == static_cast<int32_t>(inputs[0].ndim()) - 1)
+      << "log_softmax currently only works on last dimension";
+  return Array<Tensor>{ topi::nn::log_softmax(inputs[0]) };
+});
+
+
+// relay.nn.batch_flatten
+bool BatchFlattenRel(const Array<Type>& types,
+                     int num_inputs,
+                     const Attrs& attrs,
+                     const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+  if (data->shape.size() == 0) return false;
+
+  auto target_dim = make_const(Int(32), 1);
+
+  for (uint32_t i = 1; i < data->shape.size(); ++i) {
+    target_dim = target_dim * data->shape[i];
+  }
+
+  std::vector<IndexExpr> oshape({data->shape[0], target_dim});
+
+  // assign output type
+  reporter->Assign(types[1], TensorTypeNode::make(oshape, data->dtype));
+  return true;
+}
+
+Expr MakeBatchFlatten(Expr data) {
+  static const Op& op = Op::Get("nn.batch_flatten");
+  return CallNode::make(op, {data}, Attrs(), {});
+}
+
+
+TVM_REGISTER_API("relay.op.nn._make.batch_flatten")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 1>(MakeBatchFlatten, args, rv);
+  });
+
+
+RELAY_REGISTER_OP("nn.batch_flatten")
+.describe(R"code(Flattens the input into a 2-D array.
+
+For an input array with shape ``(d1, d2, ..., dk)``, `batch_flatten` operation reshapes
+the input array into an output array of shape ``(d1, d2*...*dk)``.
+
+Example::
+
+    x = [[
+        [1,2,3],
+        [4,5,6],
+        [7,8,9]
+    ],
+    [   [1,2,3],
+        [4,5,6],
+        [7,8,9]
+    ]],
+
+    batch_flatten(x) = [[ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9.],
+       [ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9.]]
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(2)
+.add_type_rel("BatchFlatten", BatchFlattenRel)
+.set_attr<FTVMCompute>(
+  "FTVMCompute", [](const Attrs& attrs,
+                    const Array<Tensor>& inputs,
+                    const Type& out_type,
+                    const Target& target) {
+    return Array<Tensor>{ topi::nn::flatten(inputs[0]) };
+});
+
+
+// relu
+TVM_REGISTER_API("relay.op.nn._make.relu")
+.set_body_typed<Expr(Expr)>([](Expr data) {
+    static const Op& op = Op::Get("nn.relu");
+    return CallNode::make(op, {data}, Attrs(), {});
+  });
+
+RELAY_REGISTER_OP("nn.relu")
+.describe(R"code(Returns the relu input array, computed element-wise.
+
+.. math::
+   max(x, 0)
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(1)
+.add_type_rel("Identity", IdentityRel)
+.set_attr<FInferCorrectLayout>("FInferCorrectLayout", ElemwiseArbitraryLayout)
+.set_attr<FTVMCompute>("FTVMCompute", [](const Attrs& attrs,
+                                         const Array<Tensor>& inputs,
+                                         const Type& out_type,
+                                         const Target& target) {
+  return Array<Tensor>{ topi::relu(inputs[0], 0.0f) };
+});
+
+
+// Positional relay function to create LRN operator used by frontend FFI.
+TVM_REGISTER_NODE_TYPE(LRNAttrs);
+
+Expr MakeLRN(Expr data,
+             int size,
+             int axis,
+             double alpha,
+             double beta,
+             double bias) {
+  auto attrs = make_node<LRNAttrs>();
+  attrs->size = size;
+  attrs->axis = axis;
+  attrs->alpha = alpha;
+  attrs->beta = beta;
+  attrs->bias = bias;
+  static const Op& op = Op::Get("nn.lrn");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op.nn._make.lrn")
+  .set_body([](const TVMArgs& args, TVMRetValue* rv) {
+      runtime::detail::unpack_call<Expr, 6>(MakeLRN, args, rv);
+  });
+
+RELAY_REGISTER_OP("nn.lrn")
+.describe(R"code(LRN layer.
+
+Normalize the input in a local region across or within feature maps.
+Each input value is divided by (1 + (\alpha/n) \sum_i x_i^2)^\beta,
+where n is the size of each local region, and the sum is taken over the region
+centered at that value (zero padding is added where necessary).
+
+.. math::
+
+    data / (bias + (alpha * sum_data ^2 /size))^beta
+
+- **data**: The input tensor.
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.LRNAttrs")
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(2)
+.set_attr<FInferCorrectLayout>("FInferCorrectLayout", ElemwiseArbitraryLayout)
+.add_type_rel("Identity", IdentityRel);
+
+
+// Positional relay function to create L2Normalize operator used by frontend FFI.
+TVM_REGISTER_NODE_TYPE(L2NormalizeAttrs);
+
+Expr MakeL2Normalize(Expr data,
+                     double eps,
+                     Array<Integer> axis) {
+  auto attrs = make_node<L2NormalizeAttrs>();
+  attrs->eps = eps;
+  attrs->axis = std::move(axis);
+  static const Op& op = Op::Get("nn.l2_normalize");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op.nn._make.l2_normalize")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 3>(MakeL2Normalize, args, rv);
+  });
+
+RELAY_REGISTER_OP("nn.l2_normalize")
+.describe(R"code(L2 Normalization layer.
+
+Normalizes along dimension axis using an L2 norm
+
+.. math::
+    output = x / sqrt(max(sum(x^2), epsilon))
+
+- **data**: The input tensor.
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.L2NormalizeAttrs")
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(2)
+.set_attr<FInferCorrectLayout>("FInferCorrectLayout", ElemwiseArbitraryLayout)
+.add_type_rel("Identity", IdentityRel);
+
+// Dropout
+TVM_REGISTER_NODE_TYPE(DropoutAttrs);
+
+bool DropoutRel(const Array<Type>& types,
+                int num_inputs,
+                const Attrs& attrs,
+                const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+
+  // dropout returns the original tensor with dropout applied
+  // and a mask tensor (1.0 where element not dropped, 0.0 where dropped)
+  auto ret_type = TensorTypeNode::make(data->shape, data->dtype);
+  reporter->Assign(types[1], TupleTypeNode::make(Array<Type>({ret_type, ret_type})));
+  return true;
+}
+
+Expr MakeDropout(Expr data, double rate) {
+  auto attrs = make_node<DropoutAttrs>();
+  attrs->rate = rate;
+  static const Op& op = Op::Get("nn.dropout");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op.nn._make.dropout")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeDropout, args, rv);
+  });
+
+RELAY_REGISTER_OP("nn.dropout")
+.describe(R"code(Applies the dropout operation to the input array.
+
+During training, each element of the input is set to zero with probability ``p``.
+The whole array is rescaled by ``1/(1-p)`` to keep the expected sum of the input unchanged.
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.DropoutAttrs")
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "Input to which dropout will be applied.")
+.set_support_level(1)
+.set_attr<FInferCorrectLayout>("FInferCorrectLayout", ElemwiseArbitraryLayout)
+.add_type_rel("Dropout", DropoutRel);
+
+// batch_norm
+TVM_REGISTER_NODE_TYPE(BatchNormAttrs);
+
+bool BatchNormRel(const Array<Type>& types,
+                  int num_inputs,
+                  const Attrs& attrs,
+                  const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 6);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+
+  const BatchNormAttrs* param = attrs.as<BatchNormAttrs>();
+
+  // axis of -1 means use the last dimension
+  CHECK(param->axis >= -1 && param->axis < (int)data->shape.size());
+  int axis = (param->axis != -1) ? param->axis : data->shape.size() - 1;
+  auto axis_size = data->shape[axis];
+
+  // if we are using beta and gamma, they need to be of shape (dim,)
+  reporter->Assign(types[1], TensorTypeNode::make({axis_size}, data->dtype));
+  reporter->Assign(types[2], TensorTypeNode::make({axis_size}, data->dtype));
+  reporter->Assign(types[3], TensorTypeNode::make({axis_size}, data->dtype));
+  reporter->Assign(types[4], TensorTypeNode::make({axis_size}, data->dtype));
+
+  // output is a tuple of the normed data (same shape as input), new running mean,
+  // and new running average (the latter two are both vectors of length dim)
+  std::vector<Type> fields;
+  auto vec_ty = TensorTypeNode::make(Array<IndexExpr>({data->shape[axis]}),
+                                     data->dtype);
+  fields.push_back(TensorTypeNode::make(data->shape, data->dtype));
+  fields.push_back(vec_ty);
+  fields.push_back(vec_ty);
+  reporter->Assign(types[5], TupleTypeNode::make(Array<Type>(fields)));
+  return true;
+}
+
+Expr MakeBatchNorm(Expr data, Expr gamma, Expr beta, Expr moving_mean, Expr moving_var,
+                   int axis, double epsilon, bool center, bool scale) {
+  auto attrs = make_node<BatchNormAttrs>();
+  attrs->axis = axis;
+  attrs->epsilon = epsilon;
+  attrs->center = center;
+  attrs->scale = scale;
+  static const Op& op = Op::Get("nn.batch_norm");
+  return CallNode::make(op, {data, gamma, beta, moving_mean, moving_var}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op.nn._make.batch_norm")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 9>(MakeBatchNorm, args, rv);
+  });
+
+RELAY_REGISTER_OP("nn.batch_norm")
+.describe(R"code(Batch normalization layer (Ioffe and Szegedy, 2014).
+Normalizes the input at each batch, i.e. applies a transformation
+that maintains the mean activation close to 0 and the activation
+standard deviation close to 1.
+
+.. math::
+
+  data\_mean[i] = mean(data[:,i,:,...]) \\
+  data\_var[i] = var(data[:,i,:,...])
+
+Then compute the normalized output, which has the same shape as input, as following:
+
+.. math::
+
+  out[:,i,:,...] = \frac{data[:,i,:,...] - data\_mean[i]}{\sqrt{data\_var[i]+\epsilon}} \
+* gamma[i] + beta[i]
+
+Both *mean* and *var* returns a scalar by treating the input as a vector.
+
+Assume the input has size *k* on axis 1, then both ``gamma`` and ``beta`` have shape *(k,)*.
+
+Besides the inputs and the outputs, this operator accepts two auxiliary
+states, ``moving_mean`` and ``moving_var``, which are *k*-length
+vectors. They are global statistics for the whole dataset, which are updated
+by::
+
+  moving_mean = moving_mean * momentum + data_mean * (1 - momentum)
+  moving_var = moving_var * momentum + data_var * (1 - momentum)
+
+The parameter ``axis`` specifies which axis of the input shape denotes
+the 'channel' (separately normalized groups).  The default is 1.  Specifying -1 sets the channel
+axis to be the last item in the input shape.
+
+.. note::
+    This operator can be optimized away for inference.
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.BatchNormAttrs")
+.set_num_inputs(5)
+.add_argument("data", "Tensor", "Input to which batch_norm will be applied.")
+.add_argument("gamma", "Tensor", "The gamma scale factor.")
+.add_argument("beta", "Tensor", "The beta offset factor.")
+.add_argument("moving_mean", "Tensor", "Running mean of input.")
+.add_argument("moving_var", "Tensor", "Running variance of input.")
+.set_support_level(1)
+.add_type_rel("BatchNorm", BatchNormRel);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/nn/pad.cc b/src/relay/op/nn/pad.cc
new file mode 100644
index 000000000000..dc99f05f4d2d
--- /dev/null
+++ b/src/relay/op/nn/pad.cc
@@ -0,0 +1,116 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file pad.cc
+ * \brief Implementation of operator pad
+ */
+#include <tvm/ir_operator.h>
+#include <tvm/relay/op.h>
+#include <tvm/relay/attrs/nn.h>
+#include <topi/nn.h>
+#include <vector>
+#include "../layout.h"
+#include "../op_common.h"
+
+namespace tvm {
+namespace relay {
+
+// relay.nn.pad
+TVM_REGISTER_NODE_TYPE(PadAttrs);
+
+bool PadRel(const Array<Type>& types,
+            int num_inputs,
+            const Attrs& attrs,
+            const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+
+  const PadAttrs* param = attrs.as<PadAttrs>();
+  CHECK(param != nullptr);
+
+  // check that pad widths match lengths
+  CHECK(data->shape.size() == param->pad_width.size())
+    << "There should be as many pad width pairs as shape dimensions "
+    << "but the shape has " << data->shape.size() << " dimensions "
+    << "and there are " << param->pad_width.size() << " pad width pairs.";
+
+  // each pad width element should be a pair of positive integers
+  std::vector<IndexExpr> oshape;
+  for (size_t i = 0; i < param->pad_width.size(); i++) {
+    CHECK(param->pad_width[i].size() == 2)
+      << "Each pad width element should be a pair but at index " << i
+      << " there are " << param->pad_width[i].size() << " elements.";
+
+    auto width1 = as_const_int(param->pad_width[i][0]);
+    auto width2 = as_const_int(param->pad_width[i][1]);
+    CHECK(width1 != nullptr);
+    CHECK(width2 != nullptr);
+
+    CHECK(*width1 >= 0)
+      << "Param width elements should be positive but first pad width at "
+      << "index " << i << " is " << *width1 << ".";
+    CHECK(*width2 >= 0)
+      << "Param width elements should be positive but first pad width at "
+      << "index " << i << " is " << *width2 << ".";
+
+    auto padding = make_const(data->shape[i].type(), *width1 + *width2);
+    oshape.push_back(data->shape[i] + padding);
+  }
+
+  reporter->Assign(types[1], TensorTypeNode::make(Array<IndexExpr>(oshape),
+                                                  data->dtype));
+  return true;
+}
+
+Array<Tensor> PadCompute(const Attrs& attrs,
+                         const Array<Tensor>& inputs,
+                         const Type& out_type,
+                         const Target& target) {
+  const auto* param = attrs.as<PadAttrs>();
+  CHECK(param != nullptr);
+
+  auto pad_width = param->pad_width;
+  CHECK(pad_width.size() == inputs[0].ndim() &&
+    pad_width[0].size() == 2)
+    << "Illegal pad_width";
+  Array<IndexExpr> pad_before;
+  for (size_t i = 0; i < pad_width.size(); ++i) {
+    pad_before.push_back(pad_width[i][0]);
+  }
+  Array<IndexExpr> pad_after;
+  for (size_t i = 0; i < pad_width.size(); ++i) {
+    pad_after.push_back(pad_width[i][1]);
+  }
+  const auto* out_ttype = out_type.as<TensorTypeNode>();
+  return Array<Tensor>{ topi::pad(inputs[0], pad_before, pad_after,
+                                  tvm::make_const(out_ttype->dtype, param->pad_value)) };
+}
+
+// Handler to create a call to the padding op used by front-end FFI
+Expr MakePad(Expr data, Array<Array<IndexExpr> > pad_width, double pad_value) {
+  auto attrs = make_node<PadAttrs>();
+  attrs->pad_value = pad_value;
+  attrs->pad_width = std::move(pad_width);
+  static const Op& op = Op::Get("nn.pad");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op.nn._make.pad")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 3>(MakePad, args, rv);
+  });
+
+RELAY_REGISTER_OP("nn.pad")
+.describe(R"code(Pad for n-D tensor.
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.PadAttrs")
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(2)
+.add_type_rel("Pad", PadRel)
+.set_attr<TOpPattern>("TOpPattern", kInjective)
+.set_attr<FTVMCompute>("FTVMCompute", PadCompute);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/nn/pooling.cc b/src/relay/op/nn/pooling.cc
new file mode 100644
index 000000000000..6cf37668cab5
--- /dev/null
+++ b/src/relay/op/nn/pooling.cc
@@ -0,0 +1,376 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file pooling.cc
+ * \brief Pooling operators
+ */
+#include <tvm/relay/op.h>
+#include <tvm/relay/op_attr_types.h>
+#include <tvm/relay/attrs/nn.h>
+#include <topi/nn/pooling.h>
+#include <vector>
+#include "../layout.h"
+#include "../../pass/alter_op_layout.h"
+
+namespace tvm {
+namespace relay {
+
+// relay.nn.max_pool2d & relay.nn.avg_pool2d
+TVM_REGISTER_NODE_TYPE(MaxPool2DAttrs);
+TVM_REGISTER_NODE_TYPE(AvgPool2DAttrs);
+
+template <typename T>
+Array<Array<Layout> > Pool2DInferCorrectLayout(
+    const Attrs& attrs,
+    const Array<Layout>& new_in_layouts,
+    const Array<Layout>& old_in_layouts,
+    const Array<Array<IndexExpr>> &old_in_shapes) {
+  // NOTE: Discard "const" qualifier here.
+  T *params = const_cast<T*>(attrs.as<T>());
+
+  if (new_in_layouts.defined()) {
+    CHECK_EQ(new_in_layouts.size(), 1);
+
+    Layout raw_layout(params->layout);
+    Layout input = new_in_layouts[0];
+    if (input.Indexof('W') == raw_layout.Indexof('W') &&
+        input.Indexof('H') == raw_layout.Indexof('H') &&
+        !input.Contains('w') && !input.Contains('h')) {
+      params->layout = input.name();  // modify self to follow the input layout
+    }
+  }
+
+  return Array<Array<Layout> >{{params->layout}, {params->layout}};
+}
+
+template <typename AttrType>
+bool Pool2DRel(const Array<Type>& types,
+               int num_inputs,
+               const Attrs& attrs,
+               const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+
+  CHECK(data != nullptr);
+  const auto dshape = data->shape;
+  CHECK_NE(dshape.size(), 0);
+  CHECK_GE(dshape.size(), 2U)
+      << "Pool2D only support input >= 2-D: input must have height and width";
+  const auto param = attrs.as<AttrType>();
+  CHECK(param != nullptr);
+
+  Layout layout(param->layout);
+  CHECK(layout.Contains('H') && layout.Contains('W') &&
+        !layout.Contains('h') && !layout.Contains('w'))
+    << "Invalid layout " << layout
+    << ". Pool2D layout must have H and W, which cannot be split";
+
+  const auto hidx = layout.Indexof('H');
+  const auto widx = layout.Indexof('W');
+
+  IndexExpr pad_h, pad_w;
+  if (param->padding.size() == 1) {
+    pad_h = param->padding[0] * 2;
+    pad_w = param->padding[0] * 2;
+  } else if (param->padding.size() == 2) {
+    // (top, left)
+    pad_h = param->padding[0] * 2;
+    pad_w = param->padding[1] * 2;
+  } else if (param->padding.size() == 4) {
+    // (top, left, bottom, right)
+    pad_h = param->padding[0] + param->padding[2];
+    pad_w = param->padding[1] + param->padding[3];
+  } else {
+    return false;
+  }
+
+  std::vector<IndexExpr> oshape({dshape[0], dshape[1], dshape[2], dshape[3]});
+  if (param->ceil_mode) {
+    oshape[hidx] = ((dshape[hidx] + pad_h - param->pool_size[0] +
+                    param->strides[0] - 1) / param->strides[0]) + 1;
+    oshape[widx] = ((dshape[widx] + pad_w - param->pool_size[1] +
+                    param->strides[1] - 1) / param->strides[1]) + 1;
+  } else {
+    oshape[hidx] = ((dshape[hidx] + pad_h - param->pool_size[0]) / param->strides[0]) + 1;
+    oshape[widx] = ((dshape[widx] + pad_w - param->pool_size[1]) / param->strides[1]) + 1;
+  }
+
+  // assign output type
+  reporter->Assign(types[1], TensorTypeNode::make(oshape, data->dtype));
+  return true;
+}
+
+// MaxPool2D
+Expr MakeMaxPool2D(Expr data,
+                   Array<IndexExpr> pool_size,
+                   Array<IndexExpr> strides,
+                   Array<IndexExpr> padding,
+                   std::string layout,
+                   bool ceil_mode) {
+  auto attrs = make_node<MaxPool2DAttrs>();
+  attrs->pool_size = std::move(pool_size);
+  attrs->strides = std::move(strides);
+  attrs->padding = std::move(padding);
+  attrs->layout = std::move(layout);
+  attrs->ceil_mode = ceil_mode;
+  static const Op& op = Op::Get("nn.max_pool2d");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+template<typename AttrType, topi::nn::PoolType mode>
+Array<Tensor> Pool2DCompute(const Attrs& attrs,
+                            const Array<Tensor>& inputs,
+                            const Type& out_type,
+                            const Target& target) {
+  const auto* param = attrs.as<AttrType>();
+  CHECK(param != nullptr);
+  auto pool_size = param->pool_size;
+  auto strides = param->strides;
+  auto padding = param->padding;
+  auto ceil_mode = param->ceil_mode;
+  Layout layout(param->layout);
+  CHECK(layout.Convertible(Layout("NCHW")))
+      << "max_pool2d currently only supports layouts that are convertible from NCHW";
+  CHECK_EQ(layout.Indexof('h'), -1) << "max_pool2d does not support input split on height";
+  CHECK_EQ(layout.Indexof('w'), -1) << "max_pool2d does not support input split on width";
+
+  CHECK(inputs[0].ndim() == 4U || inputs[0].ndim() == 5U)
+      << "Pool2D only support 4-D input (e.g., NCHW)"
+      << " or 5-D input (last dimension is a split of channel)";
+
+  if (param->padding.size() == 1) {
+    padding.push_back(padding[0]);
+    padding.push_back(padding[0]);
+    padding.push_back(padding[0]);
+  } else if (param->padding.size() == 2) {
+    padding.push_back(padding[0]);
+    padding.push_back(padding[1]);
+  }
+  if (mode == topi::nn::kAvgPool) {
+    bool count_include_pad = reinterpret_cast<const AvgPool2DAttrs*>(param)->count_include_pad;
+    return Array<Tensor>{
+      topi::nn::pool(inputs[0], pool_size, strides, padding,
+                     mode, ceil_mode, layout.name(), count_include_pad)};
+  } else {
+    return Array<Tensor>{
+      topi::nn::pool(inputs[0], pool_size, strides, padding,
+                     mode, ceil_mode, layout.name())};
+  }
+}
+
+TVM_REGISTER_API("relay.op.nn._make.max_pool2d")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 6>(MakeMaxPool2D, args, rv);
+  });
+
+
+RELAY_REGISTER_OP("nn.max_pool2d")
+.describe(R"code(Max pooling operation for two dimensional data.
+
+- **data**: This depends on the `layout` parameter. Input is 4D array of shape
+            (batch_size, channels, height, width) if `layout` is `NCHW`.
+- **out**: This depends on the `layout` parameter. Output is 4D array of shape
+           (batch_size, channels, out_height, out_width)  if `layout` is `NCHW`.
+           out_height and out_width are calculated as::
+
+               out_height = floor((height+padding[0]+padding[2]-pool_size[0])/strides[0])+1
+               out_width = floor((width+padding[1]+padding[3]-pool_size[1])/strides[1])+1
+
+           where padding will be an expanded array based on number of values passed as::
+               one int : all sides same padding used.
+               two int : bottom, right use same as top and left.
+               four int: padding width in the order of (top, left, bottom, right).
+
+           When `ceil_mode` is `True`, ceil will be used instead of floor in this
+           equation.
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.MaxPool2DAttrs")
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(2)
+.add_type_rel("MaxPool2D", Pool2DRel<MaxPool2DAttrs>)
+.set_attr<FInferCorrectLayout>("FInferCorrectLayout", Pool2DInferCorrectLayout<MaxPool2DAttrs>)
+.set_attr<FTVMCompute>("FTVMCompute", Pool2DCompute<MaxPool2DAttrs, topi::nn::kMaxPool>);
+
+
+// AvgPool2D
+Expr MakeAvgPool2D(Expr data,
+                   Array<IndexExpr> pool_size,
+                   Array<IndexExpr> strides,
+                   Array<IndexExpr> padding,
+                   std::string layout,
+                   bool ceil_mode,
+                   bool count_include_pad) {
+  auto attrs = make_node<AvgPool2DAttrs>();
+  attrs->pool_size = std::move(pool_size);
+  attrs->strides = std::move(strides);
+  attrs->padding = std::move(padding);
+  attrs->layout = std::move(layout);
+  attrs->ceil_mode = ceil_mode;
+  attrs->count_include_pad = count_include_pad;
+  static const Op& op = Op::Get("nn.avg_pool2d");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+
+TVM_REGISTER_API("relay.op.nn._make.avg_pool2d")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 7>(MakeAvgPool2D, args, rv);
+  });
+
+
+RELAY_REGISTER_OP("nn.avg_pool2d")
+.describe(R"code(
+Average pooling operation for one dimensional data.
+
+- **data**: This depends on the `layout` parameter. Input is 4D array of shape
+            (batch_size, channels, height, width) if `layout` is `NCHW`.
+- **out**: This depends on the `layout` parameter. Output is 4D array of shape
+           (batch_size, channels, out_height, out_width)  if `layout` is `NCHW`.
+           out_height and out_width are calculated as::
+
+               out_height = floor((height+padding[0]+padding[2]-pool_size[0])/strides[0])+1
+               out_width = floor((width+padding[1]+padding[3]-pool_size[1])/strides[1])+1
+
+           where padding will be an expanded array based on number of values passed as::
+               one int : all sides same padding used.
+               two int : bottom, right use same as top and left.
+               four int: padding width in the order of (top, left, bottom, right).
+
+           When `ceil_mode` is `True`, ceil will be used instead of floor in this
+           equation.
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.AvgPool2DAttrs")
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(2)
+.add_type_rel("AvgPool2D", Pool2DRel<AvgPool2DAttrs>)
+.set_attr<FInferCorrectLayout>("FInferCorrectLayout", Pool2DInferCorrectLayout<AvgPool2DAttrs>)
+.set_attr<FTVMCompute>("FTVMCompute", Pool2DCompute<AvgPool2DAttrs, topi::nn::kAvgPool>);
+
+// relay.nn.global_pool_2d & relay.nn.max_pool_2d
+TVM_REGISTER_NODE_TYPE(GlobalPool2DAttrs);
+
+bool GlobalPool2DRel(const Array<Type>& types,
+                     int num_inputs,
+                     const Attrs& attrs,
+                     const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) { return false; }
+  const auto dshape = data->shape;
+  CHECK_NE(dshape.size(), 0);
+  CHECK_GE(dshape.size(), 2U)
+      << "Pool2D only support input >= 2-D: input must have height and width";
+  const auto param = attrs.as<GlobalPool2DAttrs>();
+  CHECK(param != nullptr);
+
+  Layout layout(param->layout);
+  CHECK(layout.Contains('H') && layout.Contains('W') &&
+        !layout.Contains('h') && !layout.Contains('w'))
+    << "Invalid layout " << layout
+    << ". Pool2D layout must have H and W, which cannot be split";
+
+  const auto hidx = layout.Indexof('H');
+  const auto widx = layout.Indexof('W');
+  Array<IndexExpr> oshape(dshape);
+  oshape.Set(hidx, 1);
+  oshape.Set(widx, 1);
+
+  // assign output type
+  reporter->Assign(types[1], TensorTypeNode::make(oshape, data->dtype));
+  return true;
+}
+
+
+template<topi::nn::PoolType mode>
+Array<Tensor> GlobalPool2DCompute(const Attrs& attrs,
+                                  const Array<Tensor>& inputs,
+                                  const Type& out_type,
+                                  const Target& target) {
+  const auto* param = attrs.as<GlobalPool2DAttrs>();
+  CHECK(param != nullptr);
+  Layout layout(param->layout);
+  CHECK(layout.Convertible(Layout("NCHW")))
+    << "global_avg_pool2d currently only supports layouts that are convertible from NCHW";
+  CHECK_EQ(layout.Indexof('h'), -1)
+    << "global_avg_pool2d does not support input split on height";
+  CHECK_EQ(layout.Indexof('w'), -1)
+    << "global_avg_pool2d does not support input split on width";
+
+  CHECK(inputs[0].ndim() == 4U || inputs[0].ndim() == 5U)
+    << "Pool2D only support 4-D input (e.g., NCHW)"
+    << " or 5-D input (last dimension is a split of channel)";
+  return Array<Tensor>{
+    topi::nn::global_pool(inputs[0], mode, layout.name()) };
+}
+
+Expr MakeGlobalAvgPool2D(Expr data,
+                         std::string layout) {
+  auto attrs = make_node<GlobalPool2DAttrs>();
+  attrs->layout = std::move(layout);
+  static const Op& op = Op::Get("nn.global_avg_pool2d");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+
+TVM_REGISTER_API("relay.op.nn._make.global_avg_pool2d")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeGlobalAvgPool2D, args, rv);
+  });
+
+// GlobalAvgPool
+RELAY_REGISTER_OP("nn.global_avg_pool2d")
+.describe(R"code(Global average pooling operation for 2D data.
+
+- **data**: This depends on the `layout` parameter. Input is 4D array of shape
+            (batch_size, channels, height, width) if `layout` is `NCHW`.
+- **out**: This depends on the `layout` parameter. Output is 4D array of shape
+           (batch_size, channels, 1, 1)  if `layout` is `NCHW`.
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.GlobalPool2DAttrs")
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(2)
+.add_type_rel("GlobalAvgPool2D", GlobalPool2DRel)
+.set_attr<FInferCorrectLayout>("FInferCorrectLayout",
+                               Pool2DInferCorrectLayout<GlobalPool2DAttrs>)
+.set_attr<FTVMCompute>("FTVMCompute", GlobalPool2DCompute<topi::nn::kAvgPool>);
+
+// GlobalMaxPool
+Expr MakeGlobalMaxPool2D(Expr data,
+                         std::string layout) {
+  auto attrs = make_node<GlobalPool2DAttrs>();
+  attrs->layout = std::move(layout);
+  static const Op& op = Op::Get("nn.global_max_pool2d");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op.nn._make.global_max_pool2d")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeGlobalMaxPool2D, args, rv);
+  });
+
+
+RELAY_REGISTER_OP("nn.global_max_pool2d")
+.describe(R"code(Global max pooling operation for 2D data.
+
+- **data**: This depends on the `layout` parameter. Input is 4D array of shape
+            (batch_size, channels, height, width) if `layout` is `NCHW`.
+- **out**: This depends on the `layout` parameter. Output is 4D array of shape
+           (batch_size, channels, 1, 1)  if `layout` is `NCHW`.
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.GlobalPool2DAttrs")
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(2)
+.add_type_rel("GlobalMaxPool2D", GlobalPool2DRel)
+.set_attr<FInferCorrectLayout>("FInferCorrectLayout",
+                               Pool2DInferCorrectLayout<GlobalPool2DAttrs>)
+.set_attr<FTVMCompute>("FTVMCompute", GlobalPool2DCompute<topi::nn::kMaxPool>);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/nn/upsampling.cc b/src/relay/op/nn/upsampling.cc
new file mode 100644
index 000000000000..d386437ae15b
--- /dev/null
+++ b/src/relay/op/nn/upsampling.cc
@@ -0,0 +1,125 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file upsampling.cc
+ * \brief upsampling operator
+ */
+#include <tvm/relay/op.h>
+#include <tvm/relay/attrs/nn.h>
+#include <tvm/relay/op_attr_types.h>
+#include <tvm/build_module.h>
+#include <topi/elemwise.h>
+#include <topi/nn/upsampling.h>
+#include <vector>
+#include "../op_common.h"
+#include "../layout.h"
+
+namespace tvm {
+namespace relay {
+
+TVM_REGISTER_NODE_TYPE(UpSamplingAttrs);
+
+bool UpSamplingRel(const Array<Type>& types,
+                   int num_inputs,
+                   const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+
+  static const Layout kNCHW("NCHW");
+
+  const UpSamplingAttrs* param = attrs.as<UpSamplingAttrs>();
+  CHECK(param != nullptr);
+  const Layout in_layout(param->layout);
+  CHECK(in_layout.Convertible(kNCHW))
+    << "UpSampling only support input layouts that are convertible from NCHW."
+    << " But got " << in_layout;
+
+  auto oshape = ConvertLayout(data->shape, in_layout, kNCHW);
+
+  oshape[2] = oshape[2] * param->scale;
+  oshape[3] = oshape[3] * param->scale;
+
+  // assign output type
+  reporter->Assign(types[1],
+                   TensorTypeNode::make(ConvertLayout(oshape, kNCHW, in_layout),
+                                        data->dtype));
+  return true;
+}
+
+
+// Positional relay function to create upsampling operator
+// used by frontend FFI.
+Expr MakeUpSampling(Expr data,
+                    int scale,
+                    std::string layout,
+                    std::string method) {
+  auto attrs = make_node<UpSamplingAttrs>();
+  attrs->layout = std::move(layout);
+  attrs->method = std::move(method);
+  attrs->scale = scale;
+  static const Op& op = Op::Get("nn.upsampling");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+
+TVM_REGISTER_API("relay.op.nn._make.upsampling")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 4>(MakeUpSampling, args, rv);
+  });
+
+
+RELAY_REGISTER_OP("nn.upsampling")
+.describe(R"code(Perform upsampling on input array with nearest neighbour or bilinear interpolation.
+
+- **data**: data is 4D array of shape
+            (batch_size, channels, in_height, in_width) for NCHW
+            (batch_size, in_height, in_width, channels) for NHWC
+
+- **out**: Output is 4D array of shape
+           for layout NCHW
+           (batch_size, channels, in_height*scale, in_width*scale)
+
+           for layout NHWC
+           (batch_size, in_height*scale, in_width*scale, channels)
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.UpSamplingAttrs")
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(2)
+.add_type_rel("UpSampling", UpSamplingRel)
+.set_attr<TOpPattern>("TOpPattern", kInjective)
+.set_attr<FTVMCompute>(
+  "FTVMCompute", [](const Attrs& attrs,
+                    const Array<Tensor>& inputs,
+                    const Type& out_type,
+                    const Target& target) {
+    const auto* uattrs = attrs.as<UpSamplingAttrs>();
+    CHECK(uattrs != nullptr);
+    auto out_tt = out_type.as<TensorTypeNode>();
+    CHECK(out_tt) << "expected a tensor type: " << out_type;
+    CHECK(uattrs->layout == "NCHW" || uattrs->layout == "NHWC")
+      << "unknown layout: " << uattrs->layout;
+
+    Array<HalideIR::Expr> oshape;
+    if (uattrs->layout == "NCHW") {
+      oshape.push_back(out_tt->shape[2]);
+      oshape.push_back(out_tt->shape[3]);
+    } else if (uattrs->layout == "NHWC") {
+      oshape.push_back(out_tt->shape[1]);
+      oshape.push_back(out_tt->shape[2]);
+    }
+
+    return Array<Tensor>{
+      topi::nn::upsampling(
+        inputs[0],
+        oshape,
+        uattrs->layout,
+        uattrs->method)
+    };
+});
+
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/op_common.h b/src/relay/op/op_common.h
new file mode 100644
index 000000000000..36cd04931903
--- /dev/null
+++ b/src/relay/op/op_common.h
@@ -0,0 +1,101 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file op_common.h
+ * \brief A set of utilities and common functionality
+ * for relay ops.
+ */
+#ifndef TVM_RELAY_OP_OP_COMMON_H_
+#define TVM_RELAY_OP_OP_COMMON_H_
+
+#include <tvm/relay/expr.h>
+#include <tvm/relay/op.h>
+#include <tvm/relay/op_attr_types.h>
+#include <vector>
+#include "../pass/alter_op_layout.h"
+
+namespace tvm {
+namespace relay {
+
+template<typename T>
+inline std::vector<T> AsVector(const Array<T> &array) {
+    std::vector<T> result;
+    result.reserve(array.size());
+    for (const T& ele : array) {
+        result.push_back(ele);
+    }
+    return result;
+}
+
+/*! Quick helper macro
+ * - Expose a positional make function to construct the node.
+ * - Register op to the registry.
+ *
+ * We make the decision to always only expose positional argument.
+ * We will do rewrapping in the frontend to support language
+ * sugars such as keyword arguments and default value.
+
+ * \param OpName the name of registry.
+ */
+#define RELAY_REGISTER_UNARY_OP(OpName)                     \
+  TVM_REGISTER_API("relay.op._make." OpName)                \
+    .set_body_typed<Expr(Expr)>([](Expr data) {             \
+        static const Op& op = Op::Get(OpName);              \
+        return CallNode::make(op, {data}, Attrs(), {});     \
+      });                                                   \
+  RELAY_REGISTER_OP(OpName)                                 \
+    .set_num_inputs(1)                                      \
+    .add_argument("data", "Tensor", "The input tensor.")    \
+    .add_type_rel("Identity", IdentityRel)                  \
+    .set_attr<TOpPattern>("TOpPattern", kElemWise)          \
+    .set_attr<TOpIsStateful>("TOpIsStateful", false)        \
+    .set_attr<FInferCorrectLayout>("FInferCorrectLayout",   \
+                                   ElemwiseArbitraryLayout) \
+
+
+/*! Quick helper macro
+ * - Expose a positional make function to construct the node.
+ * - Register op to the registry.
+ *
+ * We make the decision to always only expose positional argument.
+ * We will do rewrapping in the frontend to support language
+ * sugars such as keyword arguments and default value.
+ *
+ * \param OpName the name of registry.
+ */
+#define RELAY_REGISTER_BINARY_OP(OpName)                          \
+  TVM_REGISTER_API("relay.op._make." OpName)                      \
+    .set_body_typed<Expr(Expr, Expr)>([](Expr lhs, Expr rhs) {    \
+        static const Op& op = Op::Get(OpName);                    \
+        return CallNode::make(op, {lhs, rhs}, Attrs(), {});       \
+      });                                                         \
+  RELAY_REGISTER_OP(OpName)                                       \
+    .set_num_inputs(2)                                            \
+    .add_argument("lhs", "Tensor", "The left hand side tensor.")  \
+    .add_argument("rhs", "Tensor", "The right hand side tensor.") \
+    .add_type_rel("Broadcast", BroadcastRel)                      \
+    .set_attr<TOpPattern>("TOpPattern", kBroadcast)               \
+    .set_attr<TOpIsStateful>("TOpIsStateful", false)              \
+    .set_attr<FInferCorrectLayout>("FInferCorrectLayout",         \
+                                   BinaryBroadcastLayout)
+
+// Comparisons
+#define RELAY_REGISTER_CMP_OP(OpName)                             \
+  TVM_REGISTER_API("relay.op._make." OpName)                      \
+  .set_body_typed<Expr(Expr, Expr)>([](Expr lhs, Expr rhs) {      \
+    static const Op& op = Op::Get(OpName);                        \
+    return CallNode::make(op, {lhs, rhs}, Attrs(), {});           \
+  });                                                             \
+  RELAY_REGISTER_OP(OpName)                                       \
+    .set_num_inputs(2)                                            \
+    .add_argument("lhs", "Tensor", "The left hand side tensor.")  \
+    .add_argument("rhs", "Tensor", "The right hand side tensor.") \
+    .add_type_rel("BroadcastComp", BroadcastCompRel)              \
+    .set_attr<TOpPattern>("TOpPattern", kBroadcast)               \
+    .set_attr<TOpIsStateful>("TOpIsStateful", false)              \
+    .set_attr<FInferCorrectLayout>("FInferCorrectLayout",         \
+                                   BinaryBroadcastLayout)
+
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_OP_OP_COMMON_H_
diff --git a/src/relay/op/tensor/binary.cc b/src/relay/op/tensor/binary.cc
new file mode 100644
index 000000000000..da9b1af87578
--- /dev/null
+++ b/src/relay/op/tensor/binary.cc
@@ -0,0 +1,122 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file binary.cc
+ * \brief binary broadcast operators.
+ */
+#include <tvm/relay/expr.h>
+#include <tvm/relay/op.h>
+#include <topi/broadcast.h>
+#include "../type_relations.h"
+#include "../op_common.h"
+
+namespace tvm {
+namespace relay {
+
+#define RELAY_BINARY_COMPUTE(FTOPI)                        \
+  [] (const Attrs& attrs,                                  \
+      const Array<Tensor>& inputs,                         \
+      const Type& out_type,                                \
+      const Target& target) -> Array<Tensor> {             \
+    CHECK_EQ(inputs.size(), 2U);                           \
+    return {FTOPI(inputs[0], inputs[1])};                  \
+  }                                                        \
+
+
+// Addition
+RELAY_REGISTER_BINARY_OP("add")
+.describe("Elementwise add with with broadcasting")
+.set_support_level(1)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::add));
+
+// Subtraction
+RELAY_REGISTER_BINARY_OP("subtract")
+.describe("Elementwise substract with broadcasting")
+.set_support_level(1)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::subtract));
+
+// Right shift
+RELAY_REGISTER_BINARY_OP("right_shift")
+.describe("Elementwise right shift with broadcasting")
+.set_support_level(4)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::right_shift));
+
+
+RELAY_REGISTER_BINARY_OP("left_shift")
+.describe("Elementwise left shift with broadcasting")
+.set_support_level(4)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::left_shift));
+
+
+RELAY_REGISTER_BINARY_OP("maximum")
+.describe("Elementwise maximum of two tensors with broadcasting")
+.set_support_level(4)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::maximum));
+
+
+RELAY_REGISTER_BINARY_OP("minimum")
+.describe("Elementwise minimum of two tensors with broadcasting")
+.set_support_level(4)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::minimum));
+
+
+RELAY_REGISTER_BINARY_OP("divide")
+.describe("Elementwise divide with broadcasting")
+.set_support_level(1)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::divide));
+
+
+RELAY_REGISTER_BINARY_OP("multiply")
+.describe("Elementwise multiply with broadcasting")
+.set_support_level(1)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::multiply));
+
+
+RELAY_REGISTER_BINARY_OP("power")
+.describe("Elementwise power with broadcasting")
+.set_support_level(4)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::power));
+
+
+RELAY_REGISTER_BINARY_OP("mod")
+.describe("Elementwise mod with broadcasting")
+.set_support_level(1)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::mod));
+
+
+RELAY_REGISTER_CMP_OP("equal")
+.describe("Elementwise equal compare with broadcasting")
+.set_support_level(4)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::equal));
+
+
+RELAY_REGISTER_CMP_OP("not_equal")
+.describe("Elementwise not equal with broadcasting")
+.set_support_level(4)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::not_equal));
+
+
+RELAY_REGISTER_CMP_OP("less")
+.describe("Elementwise less than with broadcasting")
+.set_support_level(4)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::less));
+
+
+RELAY_REGISTER_CMP_OP("less_equal")
+.describe("Elementwise less than or equal compare with broadcasting")
+.set_support_level(4)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::less_equal));
+
+
+RELAY_REGISTER_CMP_OP("greater")
+.describe("Elementwise greater than compare with broadcasting")
+.set_support_level(4)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::greater));
+
+
+RELAY_REGISTER_CMP_OP("greater_equal")
+.describe("Elementwise greater than or equal compare with broadcasting")
+.set_support_level(4)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_BINARY_COMPUTE(topi::greater_equal));
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/tensor/reduce.cc b/src/relay/op/tensor/reduce.cc
new file mode 100644
index 000000000000..95c26c3ab7e4
--- /dev/null
+++ b/src/relay/op/tensor/reduce.cc
@@ -0,0 +1,445 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file reduce.cc
+ * \brief Reduction operators.
+ */
+#include <tvm/relay/expr.h>
+#include <tvm/relay/op.h>
+#include <topi/elemwise.h>
+#include <topi/reduction.h>
+#include <numeric>
+#include <limits>
+#include "../op_common.h"
+#include "../type_relations.h"
+
+namespace tvm {
+namespace relay {
+
+/*! \brief Attributes for Reduce operators */
+struct ReduceAttrs : public tvm::AttrsNode<ReduceAttrs> {
+  Array<Integer> axis;
+  bool keepdims;
+  bool exclude;
+
+  TVM_DECLARE_ATTRS(ReduceAttrs, "relay.attrs.ReduceAttrs") {
+    TVM_ATTR_FIELD(axis).set_default(NullValue<Array<Integer>>())
+        .describe(R"code(The axis or axes along which to perform the reduction.
+
+      The default, `axis=()`, will compute over all elements into a
+      scalar array with shape `(1,)`.
+
+      If `axis` is int, a reduction is performed on a particular axis.
+
+      If `axis` is a tuple of ints, a reduction is performed on all the axes
+      specified in the tuple.
+
+      If `exclude` is true, reduction will be performed on the axes that are
+      NOT in axis instead.)code");
+
+    TVM_ATTR_FIELD(keepdims).set_default(false)
+      .describe("If this is set to `True`, the reduced axes are left "
+                "in the result as dimension with size one.");
+    TVM_ATTR_FIELD(exclude).set_default(false)
+      .describe("Whether to perform reduction on axis that are NOT in axis instead.");
+  }
+};
+
+/*!
+* \brief GetReduceAxes, get the new axis from indim and other arguments
+* \param indim Number of dimensions of input data.
+* \param axis The input axis vector.
+* \param exclude Whether 'axis' input given is the excluded axis.
+* \return r_axes The new reduced axes of the output.
+*/
+inline std::vector<int64_t> GetReduceAxes(const uint32_t indim,
+                                          const Array<Integer>& inaxis,
+                                          bool exclude) {
+  if (!inaxis.defined()) {
+    std::vector<int64_t> r_axes(indim);
+    std::iota(r_axes.begin(), r_axes.end(), 0);
+    return r_axes;
+  }
+
+  std::vector<int64_t> in_axes;
+  for (auto i : inaxis) {
+    int64_t axis = i->value;
+    if (axis < 0) {
+      axis = axis + indim;
+    }
+
+    // Check out of bounds error
+    CHECK(axis >= 0)
+      << "Axis out of bounds in reduce operator.";
+    CHECK(axis < indim)
+      << "Axis out of bounds in reduce operator.";
+    in_axes.push_back(axis);
+  }
+
+  CHECK(in_axes[in_axes.size() - 1] < indim)
+    << "Reduction axis " << in_axes[in_axes.size() - 1]
+    << " exceeds input dimensions " << indim;
+
+  std::sort(in_axes.begin(), in_axes.end());
+
+  if (!exclude) {
+    return in_axes;
+  }
+
+  auto r_size = indim - in_axes.size();
+  std::vector<int64_t> r_axes(r_size);
+  for (uint32_t i = 0, j = 0, k = 0; i < indim; ++i) {
+    if (j < in_axes.size() && in_axes[j] == i) {
+        ++j;
+        continue;
+    }
+    r_axes[k++] = i;
+  }
+  return r_axes;
+}
+
+
+// Get axis under exclude condition.
+Array<Integer> GetExcludeAxes(size_t indim,
+                              const Array<Integer>& inaxis) {
+  std::vector<bool> axis_flag(indim, true);
+  for (auto i : inaxis) {
+    int64_t axis = i->value;
+    if (axis < 0) {
+      axis = axis + static_cast<int64_t>(indim);
+    }
+    // Check out of bounds error
+    CHECK_GE(axis, 0)
+      << "Axis out of bounds in reduce operator.";
+    CHECK_LT(axis, static_cast<int64_t>(indim))
+      << "Axis out of bounds in reduce operator.";
+    axis_flag[axis] = false;
+  }
+
+  Array<Integer> r_axes;
+
+  for (size_t i = 0; i < axis_flag.size(); ++i) {
+    if (axis_flag[i]) {
+      r_axes.push_back(static_cast<int>(i));
+    }
+  }
+  return r_axes;
+}
+
+
+template<typename F>
+Array<Tensor> ReduceCompute(const Attrs& attrs,
+                            const Array<Tensor>& inputs,
+                            const Type& out_type,
+                            const Target& target,
+                            F f) {
+  const ReduceAttrs* param = attrs.as<ReduceAttrs>();
+  CHECK(param != nullptr);
+  auto axes = param->axis;
+  if (param->exclude) {
+    axes = GetExcludeAxes(inputs[0]->shape.size(), param->axis);
+  }
+  if (axes.size() == 0) {
+    return { topi::identity(inputs[0]) };
+  }
+  return { f(inputs[0], axes, param->keepdims, false) };
+}
+
+/*!
+* \brief ReduceShapeImpl get the outshape for the reduction operator
+* \param in_shape Shape of input data.
+* \param param ReduceAttrs details.
+* \param reporter The reporter to report solution to.
+* \return oshape Output shape inferred.
+*/
+inline std::vector<IndexExpr> ReduceShapeImpl(const std::vector<IndexExpr> &in_shape,
+                                              const ReduceAttrs* param,
+                                              const TypeReporter& reporter) {
+  uint32_t indim = in_shape.size();
+  auto r_axes = GetReduceAxes(indim, param->axis, param->exclude);
+  if (!r_axes.size()) {
+    return in_shape;
+  }
+
+  auto max_shape = make_const(Int(64), 1);
+  for (int64_t axis : r_axes) {
+    max_shape *= in_shape[axis];
+  }
+  CHECK(reporter->Assert(max_shape < make_const(Int(64), std::numeric_limits<int32_t>::max())))
+    << "The maximum possible index of reduced shape cannot be more than int32 max.";
+
+  if (param->keepdims) {
+    std::vector<IndexExpr> oshape(in_shape);
+    for (unsigned i = 0, j = 0; i < indim; ++i) {
+      if (j >= r_axes.size() || !(r_axes[j] == i)) {
+        continue;
+      }
+      oshape[i] = 1;
+      ++j;
+    }
+    return oshape;
+  } else {
+    auto osize = indim - r_axes.size();
+    std::vector<IndexExpr> oshape(osize);
+    for (unsigned i = 0, j = 0, k = 0; i < indim; ++i) {
+      if (j < r_axes.size() && (r_axes[j] == i)) {
+        ++j;
+        continue;
+      }
+      oshape[k++] = in_shape[i];
+    }
+    return oshape;
+  }
+}
+
+/*!
+* \brief ArgReduceRel Output type and shape relation evaluation function.
+* \param num_inputs Number of input types in the args.
+* \param attrs The additional attributes of the operator.
+* \param reporter The reporter to report solution to.
+* \return false if This relation cannot be resolved. true if this relation has been resolved.
+*/
+bool ArgReduceRel(const Array<Type>& types,
+               int num_inputs,
+               const Attrs& attrs,
+               const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+  CHECK(static_cast<int>(data->shape.size()) != 0);
+  std::vector<IndexExpr>&& in_shape = AsVector(data->shape);
+
+  const ReduceAttrs* param = attrs.as<ReduceAttrs>();
+  CHECK(param != nullptr);
+
+  // assign output type and shape
+  auto oshape = ReduceShapeImpl(in_shape, param, reporter);
+  reporter->Assign(types[1], TensorTypeNode::make(oshape, Int(32)));
+  return true;
+}
+
+/*!
+* \brief ReduceRel Output type and shape relation evaluation function.
+* \param num_inputs Number of input types in the args.
+* \param attrs The additional attributes of the operator.
+* \param reporter The reporter to report solution to.
+* \return false if This relation cannot be resolved. true if this relation has been resolved.
+*/
+bool ReduceRel(const Array<Type>& types,
+               int num_inputs,
+               const Attrs& attrs,
+               const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+  CHECK(static_cast<int>(data->shape.size()) != 0);
+  std::vector<IndexExpr>&& in_shape = AsVector(data->shape);
+
+  const ReduceAttrs* param = attrs.as<ReduceAttrs>();
+  CHECK(param != nullptr);
+
+  // assign output type and shape
+  auto oshape = ReduceShapeImpl(in_shape, param, reporter);
+  reporter->Assign(types[1], TensorTypeNode::make(oshape, data->dtype));
+  return true;
+}
+
+#define RELAY_REGISTER_REDUCE_OP(OpName)                           \
+  TVM_REGISTER_API("relay.op._make." OpName)                       \
+  .set_body([](const TVMArgs& args, TVMRetValue* rv) {             \
+    auto make_func = [](Expr data,                                 \
+                        Array<Integer> axis,                       \
+                        bool keepdims,                             \
+                        bool exclude) {                            \
+      auto attrs = make_node<ReduceAttrs>();                       \
+      attrs->axis = std::move(axis);                               \
+      attrs->keepdims = keepdims;                                  \
+      attrs->exclude = exclude;                                    \
+      static const Op& op = Op::Get(OpName);                       \
+      return CallNode::make(op, {data}, Attrs(attrs), {});         \
+    };                                                             \
+    runtime::detail::unpack_call<Expr, 4>(make_func, args, rv);    \
+    });                                                            \
+  RELAY_REGISTER_OP(OpName)                                        \
+  .set_num_inputs(1)                                               \
+  .add_argument("data", "Tensor", "The input tensor.")
+
+
+Array<Tensor> ArgMaxCompute(const Attrs& attrs,
+                            const Array<Tensor>& inputs,
+                            const Type& out_type,
+                            const Target& target) {
+  return ReduceCompute(attrs, inputs, out_type, target, topi::argmax);
+}
+
+
+RELAY_REGISTER_REDUCE_OP("argmax")
+.describe(R"code(Creates an operation that finds the indices of the maximum
+values over a given axis.
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.ReduceAttrs")
+.set_support_level(4)
+.add_type_rel("ArgReduce", ArgReduceRel)
+.set_attr<FTVMCompute>("FTVMCompute", ArgMaxCompute)
+.set_attr<TOpPattern>("TOpPattern", kCommReduce);
+
+
+Array<Tensor> ArgMinCompute(const Attrs& attrs,
+                            const Array<Tensor>& inputs,
+                            const Type& out_type,
+                            const Target& target) {
+  return ReduceCompute(attrs, inputs, out_type, target, topi::argmin);
+}
+
+RELAY_REGISTER_REDUCE_OP("argmin")
+.describe(R"code(Creates an operation that finds the indices of the minimum
+values over a given axis.
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.ReduceAttrs")
+.set_support_level(4)
+.add_type_rel("ArgReduce", ArgReduceRel)
+.set_attr<FTVMCompute>("FTVMCompute", ArgMinCompute)
+.set_attr<TOpPattern>("TOpPattern", kCommReduce);
+
+Array<Tensor> SumCompute(const Attrs& attrs,
+                         const Array<Tensor>& inputs,
+                         const Type& out_type,
+                         const Target& target) {
+  return ReduceCompute(attrs, inputs, out_type, target, topi::sum);
+}
+
+
+RELAY_REGISTER_REDUCE_OP("sum")
+.describe(R"code(Computes the sum of array elements over given axes.
+
+Example::
+
+  data = [[[1,2],[2,3],[1,3]],
+          [[1,4],[4,3],[5,2]],
+          [[7,1],[7,2],[7,3]]]
+
+  sum(data, axis=1)
+  [[  4.   8.]
+   [ 10.   9.]
+   [ 21.   6.]]
+
+  sum(data, axis=[1,2])
+  [ 12.  19.  27.]
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.ReduceAttrs")
+.set_support_level(4)
+.add_type_rel("Reduce", ReduceRel)
+.set_attr<FTVMCompute>("FTVMCompute", SumCompute)
+.set_attr<TOpPattern>("TOpPattern", kCommReduce);
+
+
+Array<Tensor> MaxCompute(const Attrs& attrs,
+                         const Array<Tensor>& inputs,
+                         const Type& out_type,
+                         const Target& target) {
+  return ReduceCompute(attrs, inputs, out_type, target, topi::max);
+}
+
+RELAY_REGISTER_REDUCE_OP("max")
+.describe(R"code(Computes the max of array elements over given axes.
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.ReduceAttrs")
+.set_support_level(4)
+.add_type_rel("Reduce", ReduceRel)
+.set_attr<FTVMCompute>("FTVMCompute", MaxCompute)
+.set_attr<TOpPattern>("TOpPattern", kCommReduce);
+
+
+Array<Tensor> MinCompute(const Attrs& attrs,
+                         const Array<Tensor>& inputs,
+                         const Type& out_type,
+                         const Target& target) {
+  return ReduceCompute(attrs, inputs, out_type, target, topi::min);
+}
+
+
+RELAY_REGISTER_REDUCE_OP("min")
+.describe(R"code(Computes the min of array elements over given axes.
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.ReduceAttrs")
+.set_support_level(4)
+.add_type_rel("Reduce", ReduceRel)
+.set_attr<FTVMCompute>("FTVMCompute", MinCompute)
+.set_attr<TOpPattern>("TOpPattern", kCommReduce);
+
+
+Array<Tensor> ProdCompute(const Attrs& attrs,
+                          const Array<Tensor>& inputs,
+                          const Type& out_type,
+                          const Target& target) {
+  return ReduceCompute(attrs, inputs, out_type, target, topi::prod);
+}
+
+RELAY_REGISTER_REDUCE_OP("prod")
+.describe(R"code(Computes the products of array elements over given axes.
+
+Example::
+
+  data = [[[1,2],[2,3],[1,3]],
+          [[1,4],[4,3],[5,2]],
+          [[7,1],[7,2],[7,3]]]
+
+  mean(data, axis=1)
+  [35562240]
+
+  mean(data, axis=[1,2])
+  [ 36  480  2058]
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.ReduceAttrs")
+.set_support_level(4)
+.add_type_rel("Reduce", ReduceRel)
+.set_attr<FTVMCompute>("FTVMCompute", ProdCompute)
+.set_attr<TOpPattern>("TOpPattern", kCommReduce);
+
+
+Array<Tensor> MeanCompute(const Attrs& attrs,
+                          const Array<Tensor>& inputs,
+                          const Type& out_type,
+                          const Target& target) {
+  IndexExpr count = make_const(inputs[0]->dtype, 1);
+  const ReduceAttrs* param = attrs.as<ReduceAttrs>();
+  CHECK(param != nullptr);
+  auto axes = param->axis;
+  for (int64_t i : GetReduceAxes(inputs[0]->shape.size(),
+                                 param->axis,
+                                 param->exclude)) {
+    count *= inputs[0]->shape[i];
+  }
+  auto res = ReduceCompute(attrs, inputs, out_type, target, topi::sum);
+  return {topi::divide(res[0], count)};
+}
+
+
+RELAY_REGISTER_REDUCE_OP("mean")
+.describe(R"code(Computes the mean of array elements over given axes.
+
+Example::
+
+  data = [[[1,2],[2,3],[1,3]],
+          [[1,4],[4,3],[5,2]],
+          [[7,1],[7,2],[7,3]]]
+
+  mean(data)
+  [3.22]
+
+  mean(data, axis=[1,2])
+  [ 2.  3.16666667  4.5]
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.ReduceAttrs")
+.set_support_level(4)
+.add_type_rel("Reduce", ReduceRel)
+.set_attr<FTVMCompute>("FTVMCompute", MeanCompute)
+.set_attr<TOpPattern>("TOpPattern", kCommReduce);
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
new file mode 100644
index 000000000000..704324533185
--- /dev/null
+++ b/src/relay/op/tensor/transform.cc
@@ -0,0 +1,1703 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file transform.cc
+ * \brief Transform operators.
+ */
+#include <tvm/relay/op.h>
+#include <tvm/relay/attrs/transform.h>
+#include <tvm/ir_operator.h>
+#include <tvm/ir.h>
+#include <topi/transform.h>
+#include <topi/elemwise.h>
+#include <topi/broadcast.h>
+#include <topi/reduction.h>
+#include <topi/nn.h>
+#include <vector>
+#include "../op_common.h"
+#include "../../../arithmetic/compute_expr.h"
+#include "../../pass/alter_op_layout.h"
+#include "../layout.h"
+
+namespace tvm {
+namespace relay {
+using ir::IntImm;
+
+// relay.cast
+TVM_REGISTER_NODE_TYPE(CastAttrs);
+
+bool CastRel(const Array<Type>& types,
+             int num_inputs,
+             const Attrs& attrs,
+             const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) {
+    CHECK(types[0].as<IncompleteTypeNode>())
+        << "cast: expect input type to be TensorType but get "
+        << types[0];
+    return false;
+  }
+  const auto* param = attrs.as<CastAttrs>();
+  reporter->Assign(types[1], TensorTypeNode::make(
+      data->shape, param->dtype));
+  return true;
+}
+
+Array<Tensor> CastCompute(const Attrs& attrs,
+                          const Array<Tensor>& inputs,
+                          const Type& out_type,
+                          const Target& target) {
+  const CastAttrs *param = attrs.as<CastAttrs>();
+  CHECK(param != nullptr);
+  DataType dtype = param->dtype;
+  return { topi::cast(inputs[0], dtype) };
+}
+
+Expr MakeCast(Expr data,
+              DataType dtype) {
+  auto attrs = make_node<CastAttrs>();
+  attrs->dtype = dtype;
+  static const Op& op = Op::Get("cast");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay._make.cast")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeCast, args, rv);
+});
+
+RELAY_REGISTER_OP("cast")
+.describe(R"code(Cast the data into a new data type.
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.set_attrs_type_key("relay.attrs.CastAttrs")
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(3)
+.add_type_rel("Cast", CastRel)
+.set_attr<FTVMCompute>("FTVMCompute", CastCompute)
+.set_attr<TOpPattern>("TOpPattern", kElemWise);
+
+// relay.expand_dims
+TVM_REGISTER_NODE_TYPE(ExpandDimsAttrs);
+
+bool ExpandDimsRel(const Array<Type>& types,
+                   int num_inputs,
+                   const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // `types` contains: [data, result]
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) {
+    CHECK(types[0].as<IncompleteTypeNode>())
+        << "expand_dims: expect input type to be TensorType but get "
+        << types[0];
+    return false;
+  }
+  const auto* param = attrs.as<ExpandDimsAttrs>();
+  const int ndim = static_cast<int>(data->shape.size());
+  const int axis = param->axis;
+  const int num_newaxis = param->num_newaxis;
+  CHECK(num_newaxis >= 0)
+    << "expand_dims only accepts `num_newaxis >= 0`"
+    << ", but got num_newaxis = " << num_newaxis;
+  CHECK(-ndim - 1 <= axis && axis <= ndim)
+    << "expand_dims only accepts `axis` in [-data.ndim - 1, data.ndim]"
+    << ", but got axis = " << axis
+    << ", and data.ndim = " << ndim;
+  const int pivot = axis < 0 ? ndim + axis + 1 : axis;
+  std::vector<IndexExpr> oshape;
+  oshape.reserve(ndim + num_newaxis);
+  for (int i = 0; i < pivot; ++i) {
+    oshape.emplace_back(data->shape[i]);
+  }
+  for (int i = 0; i < num_newaxis; ++i) {
+    oshape.emplace_back(1);
+  }
+  for (int i = pivot; i < ndim; ++i) {
+    oshape.emplace_back(data->shape[i]);
+  }
+  reporter->Assign(types[1], TensorTypeNode::make(oshape, data->dtype));
+  return true;
+}
+
+Array<Tensor> ExpandDimsCompute(const Attrs& attrs,
+                                const Array<Tensor>& inputs,
+                                const Type& out_type,
+                                const Target& target) {
+  const ExpandDimsAttrs *param = attrs.as<ExpandDimsAttrs>();
+  CHECK(param != nullptr);
+  return { topi::expand_dims(inputs[0], param->axis, param->num_newaxis) };
+}
+
+Expr MakeExpandDims(Expr data,
+                    int axis,
+                    int num_newaxis) {
+  auto attrs = make_node<ExpandDimsAttrs>();
+  attrs->axis = axis;
+  attrs->num_newaxis = num_newaxis;
+  static const Op& op = Op::Get("expand_dims");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op._make.expand_dims")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 3>(MakeExpandDims, args, rv);
+});
+
+RELAY_REGISTER_OP("expand_dims")
+.describe(R"code(Insert `num_newaxis` axises at the position given by `axis`
+
+- **data**: The input data to the operator.
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.set_attrs_type_key("relay.attrs.ExpandDimsAttrs")
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(1)
+.add_type_rel("ExpandDims", ExpandDimsRel)
+.set_attr<FTVMCompute>("FTVMCompute", ExpandDimsCompute)
+.set_attr<TOpPattern>("TOpPattern", kBroadcast);
+
+// relay.concatenate
+TVM_REGISTER_NODE_TYPE(ConcatenateAttrs);
+
+bool ConcatenateRel(const Array<Type>& types,
+                    int num_inputs,
+                    const Attrs& attrs,
+                    const TypeReporter& reporter) {
+  // types: [data, result]
+  CHECK_EQ(types.size(), 2);
+  const auto* tensor_tuple = types[0].as<TupleTypeNode>();
+  if (tensor_tuple == nullptr) {
+    CHECK(types[0].as<IncompleteTypeNode>())
+        << "cast: expect input type to be TupleType but get "
+        << types[0];
+    return false;
+  }
+  const auto* param = attrs.as<ConcatenateAttrs>();
+  const auto& first = Downcast<TensorType>(tensor_tuple->fields[0]);
+  // Sanity check: ndim and dtype.
+  const int ndim = static_cast<int>(first->shape.size());
+  const DataType dtype = first->dtype;
+  for (const Type& ele : tensor_tuple->fields) {
+    const auto& e = Downcast<TensorType>(ele);
+    int e_ndim = static_cast<int>(e->shape.size());
+    const DataType& e_dtype = e->dtype;
+    CHECK_EQ(e_ndim, ndim) << "relay.concatenate requires all tensors have the same ndim";
+    CHECK_EQ(e_dtype, dtype) << "relay.concatenate requires all tensors have the same dtype";
+  }
+  // Sanity check: axis
+  int axis = param->axis;
+  CHECK(-ndim <= axis && axis < ndim)
+    << "concatenate only accepts `axis` in [-ndim, ndim)"
+    << ", but got axis = " << axis
+    << ", and ndim = " << ndim;
+  axis = axis < 0 ? ndim + axis : axis;
+  // Calculate shape
+  std::vector<IndexExpr>&& oshape = AsVector(first->shape);
+  IndexExpr &concat_dim = oshape[axis];
+  for (int i = 1; i < static_cast<int>(tensor_tuple->fields.size()); ++i) {
+    const auto& e = Downcast<TensorType>(tensor_tuple->fields[i]);
+    concat_dim += e->shape[axis];
+  }
+  reporter->Assign(types[1], TensorTypeNode::make(oshape, dtype));
+  return true;
+}
+
+Array<Array<Layout>> ConcatenateLayout(
+    const Attrs& attrs,
+    const Array<Layout>& new_in_layouts,
+    const Array<Layout>& old_in_layouts,
+    const Array<Array<IndexExpr>> &old_in_shapes) {
+  const ConcatenateAttrs* param = attrs.as<ConcatenateAttrs>();
+
+  size_t axis = param->axis < 0 ? param->axis + old_in_shapes[0].size() :
+                static_cast<size_t>(param->axis);
+
+  Layout ret;
+  if (new_in_layouts.defined()) {  // this function is called after some operators are alternated.
+    Layout::LayoutDim concate_dim = old_in_layouts[0][axis];
+    for (size_t i = 0; i < new_in_layouts.size(); ++i) {
+      if (new_in_layouts[i].ndim() > axis &&
+          new_in_layouts[i][axis] == concate_dim) {
+        ret = new_in_layouts[i];
+        break;
+      }
+    }
+  } else {  // this function is called on the original correct relay ir
+    for (size_t i = 0; i < old_in_layouts.size(); ++i) {
+      if (old_in_layouts[i].defined()) {
+        ret = old_in_layouts[i];
+        break;
+      }
+    }
+
+    if (ret.ndim() <= axis || Layout::IsSubdim(ret[axis])) {
+      return Array<Array<Layout> > {{Layout::Undef()}, {Layout::Undef()}};
+    }
+  }
+
+  return Array<Array<Layout> > {Array<Layout>(old_in_layouts.size(), ret), {ret}};
+}
+
+Expr MakeConcatenate(Expr data,
+                     int axis) {
+  auto attrs = make_node<ConcatenateAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("concatenate");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op._make.concatenate")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeConcatenate, args, rv);
+});
+
+RELAY_REGISTER_OP("concatenate")
+.describe(R"code(Concatenate the input tensors along the given axis.
+
+- **data** : A list of tensors.
+
+- **axis** : The axis along which the tensors are concatenated.
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.ConcatenateAttrs")
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input list of tensors.")
+.set_support_level(1)
+.add_type_rel("Concatenate", ConcatenateRel)
+.set_attr<FInferCorrectLayout>("FInferCorrectLayout", ConcatenateLayout);
+
+/* relay.transpose */
+TVM_REGISTER_NODE_TYPE(TransposeAttrs);
+
+bool TransposeRel(const Array<Type>& types,
+                  int num_inputs,
+                  const Attrs& attrs,
+                  const TypeReporter& reporter) {
+  // types: [data, result]
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) {
+    CHECK(types[0].as<IncompleteTypeNode>())
+        << "transpose: expect input type to be TensorType but get "
+        << types[0];
+    return false;
+  }
+  const auto* param = attrs.as<TransposeAttrs>();
+  const int ndim = data->shape.size();
+  const Array<Integer>& axes = param->axes;
+  // check dimension match
+  CHECK(!axes.defined() || static_cast<int>(axes.size()) == ndim)
+    << "Dimension mismatch: axes has " << axes.size() << " elements"
+    << ", but data.ndim = " << ndim;
+  // construct int_axes
+  std::vector<int> int_axes;
+  int_axes.reserve(ndim);
+  // used not defined to check if it is None.
+  if (!axes.defined()) {
+    for (int i = ndim - 1; i >= 0; --i) {
+      int_axes.push_back(i);
+    }
+  } else {
+    std::vector<int> axis_used(ndim, 0);
+    for (const Integer& e : axes) {
+      int64_t axis = e;
+      // sanity check for axis and ndim
+      CHECK(-ndim <= axis && axis < ndim)
+        << "transpose only allows each `axis` in `axes` in range [-data.ndim, data.ndim)"
+        << ", but got axis = " << axis
+        << ", and data.ndim = " << ndim;
+      axis = axis < 0 ? axis + ndim : axis;
+      // sanity check for duplication
+      CHECK(!axis_used[axis]) << "Duplicate axes in transpose: " << axis;
+      axis_used[axis] = 1;
+      int_axes.push_back(static_cast<int>(axis));
+    }
+  }
+  std::vector<IndexExpr> oshape;
+  oshape.reserve(ndim);
+  for (int axis : int_axes) {
+    oshape.push_back(data->shape[axis]);
+  }
+  reporter->Assign(types[1], TensorTypeNode::make(oshape, data->dtype));
+  return true;
+}
+
+Array<Tensor> TransposeCompute(const Attrs& attrs,
+                               const Array<Tensor>& inputs,
+                               const Type& out_type,
+                               const Target& target) {
+  const auto* param = attrs.as<TransposeAttrs>();
+  CHECK(param != nullptr);
+  return Array<Tensor>{ topi::transpose(inputs[0], param->axes) };
+}
+
+Expr MakeTranspose(Expr data,
+                   Array<Integer> axes) {
+  auto attrs = make_node<TransposeAttrs>();
+  attrs->axes = std::move(axes);
+  static const Op& op = Op::Get("transpose");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op._make.transpose")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeTranspose, args, rv);
+});
+
+RELAY_REGISTER_OP("transpose")
+.describe(R"code(Permutes the dimensions of an array.
+
+- **data**: The input data to the operator.
+
+- **axes**: The target axes order, reverse order if not specified.
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.set_attrs_type_key("relay.attrs.TransposeAttrs")
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(3)
+.add_type_rel("Transpose", TransposeRel)
+.set_attr<FTVMCompute>("FTVMCompute", TransposeCompute)
+.set_attr<TOpPattern>("TOpPattern", kInjective);
+
+/* relay.reshape */
+TVM_REGISTER_NODE_TYPE(ReshapeAttrs);
+
+bool ReshapeRel(const Array<Type>& types,
+                int num_inputs,
+                const Attrs& attrs,
+                const TypeReporter& reporter) {
+  // types: [data, result]
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) {
+    CHECK(types[0].as<IncompleteTypeNode>())
+        << "reshape: expect input type to be TensorType but get "
+        << types[0];
+    return false;
+  }
+
+  const auto* param = attrs.as<ReshapeAttrs>();
+  Array<IndexExpr> oshape;
+  size_t src_idx = 0;
+  int infer_idx = -1;
+
+  for (size_t i = 0; i < param->newshape.size(); ++i) {
+    int svalue = param->newshape[i]->value;
+    // special flag handling for shape inference.
+    if (svalue > 0) {
+      oshape.push_back(param->newshape[i]);
+      ++src_idx;
+    } else if (svalue == 0) {
+      // keep same
+      CHECK_LT(src_idx, data->shape.size());
+      oshape.push_back(data->shape[src_idx++]);
+    } else if (svalue == -1) {
+      // inference based on rest
+      CHECK_LT(infer_idx, 0)
+          << "One and only one dim can be inferred";
+      infer_idx = i;
+      oshape.push_back(1);
+      ++src_idx;
+    } else if (svalue == -2) {
+      // copy all remaining dims from source
+      while (src_idx < data->shape.size()) {
+        oshape.push_back(data->shape[src_idx++]);
+      }
+    } else if (svalue == -3) {
+      // merge two dims from source
+      CHECK_LT(src_idx + 1, data->shape.size());
+      IndexExpr d1 = data->shape[src_idx++];
+      IndexExpr d2 = data->shape[src_idx++];
+      oshape.push_back(d1 * d2);
+    } else if (svalue == -4) {
+      // split the source dim s into two dims
+      // read the left dim and then the right dim (either can be -1)
+      CHECK_LT(i + 2, param->newshape.size());
+      CHECK_LT(src_idx, data->shape.size());
+      IndexExpr d0 = data->shape[src_idx++];
+      Integer d1 = param->newshape[++i];
+      Integer d2 = param->newshape[++i];
+      if (d1->value == -1) {
+        CHECK(d2->value != -1)
+            << "Split dims cannot both be -1.";
+        oshape.push_back(d0 / d2);
+        oshape.push_back(d2);
+      } else {
+        CHECK_EQ(d2->value, -1);
+        oshape.push_back(d1);
+        oshape.push_back(d0 / d1);
+      }
+    }
+  }
+
+  if (infer_idx >= 0) {
+    IndexExpr new_size = arith::ComputeReduce<tvm::ir::Mul>(oshape, 1);
+    IndexExpr old_size = arith::ComputeReduce<tvm::ir::Mul>(data->shape, 1);
+    oshape.Set(infer_idx, old_size / new_size);
+  }
+  reporter->Assign(types[1], TensorTypeNode::make(oshape, data->dtype));
+  return true;
+}
+
+Array<Tensor> ReshapeCompute(const Attrs& attrs,
+                             const Array<Tensor>& inputs,
+                             const Type& out_type,
+                             const Target& target) {
+  const auto* out_ttype = out_type.as<TensorTypeNode>();
+  CHECK(out_ttype != nullptr);
+  return { topi::reshape(inputs[0], out_ttype->shape) };
+}
+
+Expr MakeReshape(Expr data,
+                 Array<Integer> newshape) {
+  auto attrs = make_node<ReshapeAttrs>();
+  attrs->newshape = std::move(newshape);
+  static const Op& op = Op::Get("reshape");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op._make.reshape")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeReshape, args, rv);
+});
+
+RELAY_REGISTER_OP("reshape")
+.describe(R"code(Reshapes the input array.
+
+Example::
+
+To give user more convenience in without doing manual shape inference,
+some dimensions of the shape can take special values from the set {0, -1, -2, -3, -4}.
+The significance of each is explained below:
+
+- ``0``  copy this dimension from the input to the output shape.
+
+Example::
+
+- data.shape = (2,3,4), newshape = (4,0,2), result.shape = (4,3,2)
+- data.shape = (2,3,4), newshape = (2,0,0), result.shape = (2,3,4)
+
+- ``-1`` infers the dimension of the output shape by using the remainder of the input dimensions
+keeping the size of the new array same as that of the input array.
+At most one dimension of shape can be -1.
+
+Example::
+
+- data.shape = (2,3,4), newshape = (6,1,-1), result.shape = (6,1,4)
+- data.shape = (2,3,4), newshape = (3,-1,8), result.shape = (3,1,8)
+- data.shape = (2,3,4), newshape = (-1,), result.shape = (24,)
+
+- ``-2`` copy all/remainder of the input dimensions to the output shape.
+
+Example::
+
+- data.shape = (2,3,4), newshape = (-2,), result.shape = (2,3,4)
+- data.shape = (2,3,4), newshape = (2,-2), result.shape = (2,3,4)
+- data.shape = (2,3,4), newshape = (-2,1,1), result.shape = (2,3,4,1,1)
+
+- ``-3`` use the product of two consecutive dimensions of the input shape as the output dimension.
+
+Example::
+
+- data.shape = (2,3,4), newshape = (-3,4), result.shape = (6,4)
+- data.shape = (2,3,4,5), newshape = (-3,-3), result.shape = (6,20)
+- data.shape = (2,3,4), newshape = (0,-3), result.shape = (2,12)
+- data.shape = (2,3,4), newshape = (-3,-2), result.shape = (6,4)
+
+- ``-4`` split one dimension of the input into two dimensions passed subsequent to -4 in shape (can contain -1).
+
+Example::
+
+- data.shape = (2,3,4), newshape = (-4,1,2,-2), result.shape =(1,2,3,4)
+- data.shape = (2,3,4), newshape = (2,-4,-1,3,-2), result.shape = (2,1,3,4)
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.set_attrs_type_key("relay.attrs.ReshapeAttrs")
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(3)
+.add_type_rel("Reshape", ReshapeRel)
+.set_attr<FTVMCompute>("FTVMCompute", ReshapeCompute)
+.set_attr<TOpPattern>("TOpPattern", kInjective);
+
+
+/*!
+* \brief ReshapeLikeRel User defined type constraint function.
+* \param num_inputs Number of input types in the args.
+* \param attrs The additional attributes of the operator.
+* \param reporter The reporter to report solution to.
+* \return False if the relation has not been resolved, it might be resolved later.
+*  True if this relation has been resolved.
+*/
+bool ReshapeLikeRel(const Array<Type>& types,
+                    int num_inputs,
+                    const Attrs& attrs,
+                    const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 3);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) {
+    return false;
+  }
+  const auto* reshape_like = types[1].as<TensorTypeNode>();
+  if (reshape_like == nullptr) {
+    return false;
+  }
+  CHECK(reporter->AssertEQ(data->Size(), reshape_like->Size()))
+    << "Reshape inputs size should be compatible.";
+  reporter->Assign(types[2], TensorTypeNode::make(reshape_like->shape, data->dtype));
+  return true;
+}
+
+
+Expr MakeReshapeLike(Expr data,
+                     Expr shape_like) {
+  static const Op& op = Op::Get("reshape_like");
+  return CallNode::make(op, {data, shape_like}, Attrs(), {});
+}
+
+
+TVM_REGISTER_API("relay.op._make.reshape_like")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeReshapeLike, args, rv);
+});
+
+
+RELAY_REGISTER_OP("reshape_like")
+.describe(R"code(Reshapes the input array by the size of another array.
+For an input array with shape ``(d1, d2, ..., dk)``, `reshape_like` operation reshapes
+the input array into an output array with the same shape as the second input array.
+.. note::
+    Sizes for both array should be compatible.
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(2)
+.add_argument("data", "Tensor", "The input tensor.")
+.add_argument("shape_like", "Tensor", "Shape tensor.")
+.set_support_level(3)
+.add_type_rel("ReshapeLike", ReshapeLikeRel)
+.set_attr<FTVMCompute>("FTVMCompute", ReshapeCompute)
+.set_attr<TOpPattern>("TOpPattern", kInjective);
+
+
+// Take
+TVM_REGISTER_NODE_TYPE(TakeAttrs);
+
+bool TakeRel(const Array<Type>& types,
+             int num_inputs,
+             const Attrs& attrs,
+             const TypeReporter& reporter) {
+  // `types` contains: [data, indices, result]
+  CHECK_EQ(types.size(), 3);
+  const auto* data = types[0].as<TensorTypeNode>();
+  CHECK(data != nullptr);
+  const auto* indices = types[1].as<TensorTypeNode>();
+  CHECK(indices != nullptr);
+  const auto param = attrs.as<TakeAttrs>();
+  CHECK(param != nullptr);
+
+  if (!param->axis.defined()) {
+    std::vector<IndexExpr>&& oshape = AsVector(indices->shape);
+    reporter->Assign(types[2], TensorTypeNode::make(oshape, data->dtype));
+    return true;
+  }
+
+  std::vector<IndexExpr> oshape;
+  const auto ndim_data = static_cast<int>(data->shape.size());
+  const auto ndim_indices = static_cast<int>(indices->shape.size());
+  int axis = static_cast<int>(param->axis->value);
+  if (axis < 0) axis += ndim_data;
+  CHECK_LE(axis, ndim_data)
+    << "axis should be with in data shape"
+    << ", but got = " << axis;
+
+  oshape.reserve(ndim_data - 1 + ndim_indices);
+  for (int i = 0; i < axis; ++i) {
+    oshape.emplace_back(data->shape[i]);
+  }
+  for (int i = 0; i < ndim_indices; ++i) {
+    oshape.emplace_back(indices->shape[i]);
+  }
+  for (int i = axis+1; i < ndim_data; ++i) {
+    oshape.emplace_back(data->shape[i]);
+  }
+
+  reporter->Assign(types[2], TensorTypeNode::make(oshape, data->dtype));
+  return true;
+}
+
+Array<Tensor> TakeCompute(const Attrs& attrs,
+                          const Array<Tensor>& inputs,
+                          const Type& out_type,
+                          const Target& target) {
+  const auto* param = attrs.as<TakeAttrs>();
+  CHECK(param != nullptr);
+  if (!param->axis.defined()) {
+    return Array<Tensor>{ topi::take(inputs[0], inputs[1]) };
+  } else {
+    return Array<Tensor>{ topi::take(inputs[0], inputs[1], param->axis) };
+  }
+}
+
+Expr MakeTake(Expr data,
+              Expr indices,
+              Integer axis) {
+  auto attrs = make_node<TakeAttrs>();
+  attrs->axis = std::move(axis);
+  static const Op& op = Op::Get("take");
+  return CallNode::make(op, {data, indices}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op._make.take")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 3>(MakeTake, args, rv);
+});
+
+RELAY_REGISTER_OP("take")
+.describe(R"code(Take elements from an array along an axis.
+
+When axis is not None, this function does the same thing as 'fancy' indexing
+(indexing arrays using arrays); however, it can be easier to use if you need
+elements along a given axis.
+
+**Note** that when axis is none the flattened input array is used.
+
+Examples::
+
+  a = [[ 1, 2],
+       [ 3, 4]]
+  indices = [3, 0, 2]
+  take(a, indices) = [ 4, 1, 3]
+
+  a = [[ 1., 2.],
+       [ 3., 4.]]
+  indices = [1, 0]
+  take(a, indices, axis=1) = [[ 2., 1.],
+                              [ 4., 3.]]
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.TakeAttrs")
+.set_num_inputs(2)
+.add_argument("data", "Tensor", "The input tensor.")
+.add_argument("indices", "Tensor", "The indices tensor.")
+.set_support_level(2)
+.add_type_rel("Take", TakeRel)
+.set_attr<FTVMCompute>("FTVMCompute", TakeCompute)
+.set_attr<TOpPattern>("TOpPattern", kInjective);
+
+
+// Init ops
+TVM_REGISTER_NODE_TYPE(InitOpAttrs);
+
+bool FullRel(const Array<Type>& types,
+             int num_inputs,
+             const Attrs& attrs,
+             const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const InitOpAttrs* param = attrs.as<InitOpAttrs>();
+  const auto* fill_value = types[0].as<TensorTypeNode>();
+  if (fill_value == nullptr) {
+    return false;
+  }
+
+  DataType out_dtype = param->dtype;
+  if (out_dtype.bits() == 0) {
+    out_dtype = fill_value->dtype;
+  }
+
+  CHECK_EQ(fill_value->shape.size(), 0)
+    << "Fill value should be a scalar but has dimension "
+    << fill_value->shape.size() << ".";
+
+  reporter->Assign(types[1], TensorTypeNode::make(param->shape, out_dtype));
+  return true;
+}
+
+Array<Tensor> FullCompute(const Attrs& attrs,
+                          const Array<Tensor>& inputs,
+                          const Type& out_type,
+                          const Target& target) {
+  const auto* out_ttype = out_type.as<TensorTypeNode>();
+  return { topi::full(out_ttype->shape, out_ttype->dtype, inputs[0]()) };
+}
+
+Expr MakeFull(Expr fill_value,
+              Array<IndexExpr> shape,
+              DataType dtype) {
+  auto attrs = make_node<InitOpAttrs>();
+  attrs->shape = std::move(shape);
+  attrs->dtype = std::move(dtype);
+  static const Op& op = Op::Get("full");
+  return CallNode::make(op, {fill_value}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op._make.full")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 3>(MakeFull, args, rv);
+});
+
+RELAY_REGISTER_OP("full")
+.describe(R"code(Fill array with scalar value.
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.InitOpAttrs")
+.set_num_inputs(1)
+.add_argument("fill_value", "double", "The value to fill.")
+.set_support_level(3)
+.add_type_rel("Full", FullRel)
+.set_attr<FTVMCompute>("FTVMCompute", FullCompute)
+.set_attr<TOpPattern>("TOpPattern", kElemWise);
+
+bool InitOpRel(const Array<Type>& types,
+               int num_inputs,
+               const Attrs& attrs,
+               const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 1);
+  const InitOpAttrs* param = attrs.as<InitOpAttrs>();
+
+  reporter->Assign(types[0], TensorTypeNode::make(param->shape, param->dtype));
+  return true;
+}
+
+Expr MakeZeros(Array<IndexExpr> shape,
+               DataType dtype) {
+  auto attrs = make_node<InitOpAttrs>();
+  attrs->shape = std::move(shape);
+  attrs->dtype = std::move(dtype);
+  static const Op& op = Op::Get("zeros");
+  return CallNode::make(op, {}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op._make.zeros")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeZeros, args, rv);
+  });
+
+RELAY_REGISTER_OP("zeros")
+.describe(R"code(Fill array with zeros.
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.InitOpAttrs")
+.set_num_inputs(0)
+.set_support_level(3)
+.add_type_rel("InitOp", InitOpRel);
+
+Expr MakeOnes(Array<IndexExpr> shape,
+              DataType dtype) {
+  auto attrs = make_node<InitOpAttrs>();
+  attrs->shape = std::move(shape);
+  attrs->dtype = std::move(dtype);
+  static const Op& op = Op::Get("ones");
+  return CallNode::make(op, {}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op._make.ones")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeOnes, args, rv);
+  });
+
+RELAY_REGISTER_OP("ones")
+.describe(R"code(Fill array with ones.
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.InitOpAttrs")
+.set_num_inputs(0)
+.set_support_level(3)
+.add_type_rel("InitOp", InitOpRel);
+
+bool FullLikeRel(const Array<Type>& types,
+                 int num_inputs,
+                 const Attrs& attrs,
+                 const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 3);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) {
+    return false;
+  }
+  const auto* fill_value = types[1].as<TensorTypeNode>();
+  if (fill_value == nullptr) {
+    return false;
+  }
+
+  CHECK_EQ(fill_value->shape.size(), 0)
+    << "The fill value should be a scalar but here it has dimension "
+    << fill_value->shape.size() << ".";
+
+  reporter->Assign(types[2], TensorTypeNode::make(data->shape, data->dtype));
+  return true;
+}
+
+Array<Tensor> FullLikeCompute(const Attrs& attrs,
+                              const Array<Tensor>& inputs,
+                              const Type& out_type,
+                              const Target& target) {
+  return { topi::full_like(inputs[0], inputs[1]()) };
+}
+
+Expr MakeFullLike(Expr data,
+                  Expr fill_value) {
+  static const Op& op = Op::Get("full_like");
+  return CallNode::make(op, {data, fill_value}, Attrs(), {});
+}
+
+TVM_REGISTER_API("relay.op._make.full_like")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeFullLike, args, rv);
+  });
+
+RELAY_REGISTER_OP("full_like")
+.describe(R"code(Return an scalar value array with the same shape
+and type as the input array.
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(2)
+.add_argument("data", "Tensor", "The input tensor.")
+.add_argument("fill_value", "double", "Scalar value to fill.")
+.set_support_level(3)
+.add_type_rel("FullLike", FullLikeRel)
+.set_attr<FTVMCompute>("FTVMCompute", FullLikeCompute)
+.set_attr<TOpPattern>("TOpPattern", kElemWise);
+
+// where operator
+bool WhereRel(const Array<Type>& types,
+              int num_inputs,
+              const Attrs& attrs,
+              const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 4U);
+  const auto* condition = types[0].as<TensorTypeNode>();
+  const auto* x = types[1].as<TensorTypeNode>();
+  const auto* y = types[2].as<TensorTypeNode>();
+  CHECK(condition != nullptr && x != nullptr && y != nullptr);
+
+  const auto& cond_shape = condition->shape;
+  const auto& x_shape = x->shape;
+  const auto& y_shape = y->shape;
+  CHECK(x_shape.size() == y_shape.size()) << "x and y must have the same size";
+
+  if (cond_shape.size() != x_shape.size()) {
+    CHECK_EQ(cond_shape.size(), 1)
+        << "Shape of condition " << condition->shape
+        << " must be either equal to x or has dimension of 1.";
+  }
+  for (size_t i = 0; i < x_shape.size(); i++) {
+    CHECK(reporter->AssertEQ(x_shape[i], y_shape[i]))
+        << "x and y must have the same shape: " << x_shape << " vs " << y_shape;
+
+    CHECK(reporter->AssertEQ(cond_shape[i], x_shape[i]))
+        << "Shape of condition " << condition->shape
+        << " must be either equal to x or has dimension of 1.";
+  }
+  reporter->Assign(types[3], TensorTypeNode::make(x_shape, x->dtype));
+  return true;
+}
+
+// Positional relay function to create where operator.
+Expr MakeWhere(const Expr& condition, const Expr& x, const Expr& y) {
+  static const Op& op = Op::Get("where");
+  return CallNode::make(op, {condition, x, y});
+}
+
+Array<Tensor> WhereCompute(const Attrs& attrs,
+                           const Array<Tensor>& inputs,
+                           const Type& out_type,
+                           const Target& target) {
+  return { topi::where(inputs[0], inputs[1], inputs[2]) };
+}
+
+TVM_REGISTER_API("relay.op._make.where")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+  runtime::detail::unpack_call<Expr, 3>(MakeWhere, args, rv);
+});
+
+RELAY_REGISTER_OP("where")
+.describe(R"code(
+Return the elements, either from x or y, depending on the condition.
+
+Given three ndarrays, condition, x, and y, return an ndarray with the elements
+from x or y, depending on the elements from condition are true or false.
+x and y must have the same shape. If condition has the same shape as x,
+each element in the output array is from x if the corresponding element
+in the condition is true, and from y if false.
+
+If condition does not have the same shape as x, it must be a 1D array whose
+size is the same as x’s first dimension size. Each row of the output array
+is from x’s row if the corresponding element from condition is true, and
+from y’s row if false.
+
+Note that all non-zero values are interpreted as True in condition.
+
+Examples::
+
+  x = [[1, 2], [3, 4]]
+  y = [[5, 6], [7, 8]]
+  cond = [[0, 1], [-1, 0]]
+  where(cond, x, y) = [[5, 2], [3, 8]]
+
+
+  cond = [1, 0]
+  where(cond, x, y) = [[1, 2], [7, 8]]
+
+)code" TVM_ADD_FILELINE)
+.add_argument("condition", "Tensor", "Condition array")
+.add_argument("x", "Tensor", "First array to be selected")
+.add_argument("y", "Tensor", "Second array to be selected")
+.set_num_inputs(3)
+.set_support_level(4)
+.add_type_rel("Where", WhereRel)
+.set_attr<FTVMCompute>("FTVMCompute", WhereCompute)
+.set_attr<TOpPattern>("TOpPattern", kBroadcast);
+
+
+// Squeeze
+TVM_REGISTER_NODE_TYPE(SqueezeAttrs);
+
+Expr MakeSqueeze(Expr data,
+                 Array<Integer> axis) {
+  auto attrs = make_node<SqueezeAttrs>();
+  attrs->axis = std::move(axis);
+  static const Op& op = Op::Get("squeeze");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op._make.squeeze")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeSqueeze, args, rv);
+  });
+
+
+bool SqueezeRel(const Array<Type>& types,
+                int num_inputs,
+                const Attrs& attrs,
+                const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) {
+    return false;
+  }
+  const auto* param = attrs.as<SqueezeAttrs>();
+  CHECK(param != nullptr);
+  std::vector<IndexExpr> result_shape;
+  // if axes is None, squeeze all axes of dimension 1
+  if (!param->axis.defined()) {
+    for (const auto& e : data->shape) {
+      const int64_t* axis_ptr = as_const_int(e);
+      CHECK(axis_ptr != nullptr) << "the axes attribute must be concrete";
+      if (*axis_ptr != 1) {
+        result_shape.push_back(e);
+      }
+    }
+  } else {
+    // pair up original shape with a boolean which control whether it will be in the final shape.
+    std::vector<std::pair<IndexExpr, bool> > original_shape;
+    for (const auto& e : data->shape) {
+      original_shape.push_back(std::pair<IndexExpr, bool>(e, true));
+    }
+    for (const auto& e : param->axis) {
+      int64_t axis_val = e->value;
+      if (axis_val < 0) {
+        axis_val += static_cast<int64_t>(original_shape.size());
+      }
+      CHECK_GE(axis_val, 0);
+      CHECK_LT(axis_val, original_shape.size());
+      original_shape.at(axis_val).second = false;
+    }
+    for (const auto p : original_shape) {
+      if (p.second) {
+        result_shape.push_back(p.first);
+      } else {
+        const int64_t* axis_ptr = as_const_int(p.first);
+        CHECK(axis_ptr != nullptr) << "cannot get concrete shape of input tensor";
+        CHECK_EQ(*axis_ptr, 1) << "cannot squeeze axis with dimension not equal to 1";
+      }
+    }
+  }
+  reporter->Assign(types[1], TensorTypeNode::make(result_shape, data->dtype));
+  return true;
+}
+
+Array<Tensor> SqueezeCompute(const Attrs& attrs,
+                             const Array<Tensor>& inputs,
+                             const Type& out_type,
+                             const Target& target) {
+  const SqueezeAttrs *param = attrs.as<SqueezeAttrs>();
+  CHECK(param != nullptr);
+  return { topi::squeeze(inputs[0], param->axis) };
+}
+
+
+RELAY_REGISTER_OP("squeeze")
+.describe(R"code(Squeeze the input tensor at the dimensions given by axes
+
+- **data**: The input data to the operator.
+
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.set_attrs_type_key("relay.attrs.SqueezeAttrs")
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(3)
+.add_type_rel("Squeeze", SqueezeRel)
+.set_attr<FTVMCompute>("FTVMCompute", SqueezeCompute)
+.set_attr<TOpPattern>("TOpPattern", kInjective);
+
+
+// Have no idea how to assert the constraint.
+// CollapseSumLike: <A, B> -> B where BroadCast(A, B) = A
+bool CollapseSumLikeRel(const Array<Type>& types,
+                        int num_inputs,
+                        const Attrs& attrs,
+                        const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 3);
+  reporter->Assign(types[2], types[1]);
+  return true;
+}
+
+Expr MakeCollapseSumLike(Expr data,
+                         Expr collapse_type) {
+  static const Op& op = Op::Get("collapse_sum_like");
+  return CallNode::make(op, {data, collapse_type}, Attrs(), {});
+}
+
+Array<Tensor> CollapseSumLikeCompute(const Attrs& attrs,
+                                     const Array<Tensor>& inputs,
+                                     const Type& out_type,
+                                     const Target& target) {
+  const auto* out_ttype = out_type.as<TensorTypeNode>();
+  CHECK(out_ttype != nullptr);
+  return { topi::collapse_sum(inputs[0], out_ttype->shape) };
+}
+
+TVM_REGISTER_API("relay.op._make.collapse_sum_like")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeCollapseSumLike, args, rv);
+  });
+
+RELAY_REGISTER_OP("collapse_sum_like")
+.describe(R"code(Collapse the first input to match the shape of the second input.
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(2)
+.add_argument("data", "Tensor", "The input tensor.")
+.add_argument("collapse_type", "Tensor", "Provide the type to collapse to.")
+.set_support_level(10)
+.add_type_rel("CollapseSumLike", CollapseSumLikeRel)
+.set_attr<FTVMCompute>("FTVMCompute", CollapseSumLikeCompute)
+.set_attr<TOpPattern>("TOpPattern", kCommReduce);
+
+// BroadCastTo: <A, B> -> B where BroadCast(A, B) = B
+bool BroadCastToRel(const Array<Type>& types,
+                    int num_inputs,
+                    const Attrs& attrs,
+                    const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  auto ioattrs = attrs.as<InitOpAttrs>();
+  CHECK(ioattrs);
+  auto intt = types[0].as<TensorTypeNode>();
+  if (intt == nullptr) { return false; }
+  auto type = TensorTypeNode::make(ioattrs->shape, intt->dtype);
+  reporter->Assign(types[1], type);
+  return true;
+}
+
+Expr MakeBroadCastTo(Expr data, Array<IndexExpr> shape) {
+  static const Op& op = Op::Get("broadcast_to");
+  auto attrs = make_node<InitOpAttrs>();
+  attrs->shape = std::move(shape);
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+Array<Tensor> BroadCastToCompute(const Attrs& attrs,
+                                 const Array<Tensor>& inputs,
+                                 const Type& out_type,
+                                 const Target& target) {
+  auto ioattrs = attrs.as<InitOpAttrs>();
+  CHECK(ioattrs != nullptr);
+  return { topi::broadcast_to(inputs[0], ioattrs->shape) };
+}
+
+TVM_REGISTER_API("relay.op._make.broadcast_to")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeBroadCastTo, args, rv);
+  });
+
+RELAY_REGISTER_OP("broadcast_to")
+.describe(R"code(Broadcast the first input to match the shape argument.
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(4)
+.add_type_rel("BroadCastTo", BroadCastToRel)
+.set_attr<FTVMCompute>("FTVMCompute", BroadCastToCompute)
+.set_attr<TOpPattern>("TOpPattern", kBroadcast);
+
+// BroadCastToLike: <A, B> -> B where BroadCast(A, B) = B
+bool BroadCastToLikeRel(const Array<Type>& types,
+                        int num_inputs,
+                        const Attrs& attrs,
+                        const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 3);
+  reporter->Assign(types[2], types[1]);
+  return true;
+}
+
+Expr MakeBroadCastToLike(Expr data,
+                         Expr broadcast_type) {
+  static const Op& op = Op::Get("broadcast_to_like");
+  return CallNode::make(op, {data, broadcast_type}, Attrs(), {});
+}
+
+Array<Tensor> BroadCastToLikeCompute(const Attrs& attrs,
+                                     const Array<Tensor>& inputs,
+                                     const Type& out_type,
+                                     const Target& target) {
+  const auto* out_ttype = out_type.as<TensorTypeNode>();
+  CHECK(out_ttype != nullptr);
+  return { topi::broadcast_to(inputs[0], out_ttype->shape) };
+}
+
+TVM_REGISTER_API("relay.op._make.broadcast_to_like")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 2>(MakeBroadCastToLike, args, rv);
+  });
+
+RELAY_REGISTER_OP("broadcast_to_like")
+.describe(R"code(Broadcast the first input to match the shape of the second input.
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(2)
+.add_argument("data", "Tensor", "The input tensor.")
+.add_argument("broadcast_type", "Tensor", "Provide the type to broadcast to.")
+.set_support_level(10)
+.add_type_rel("BroadCastToLike", BroadCastToLikeRel)
+.set_attr<FTVMCompute>("FTVMCompute", BroadCastToLikeCompute)
+.set_attr<TOpPattern>("TOpPattern", kBroadcast);
+
+
+// strided_slice
+TVM_REGISTER_NODE_TYPE(StridedSliceAttrs);
+bool StridedSliceRel(const Array<Type>& types,
+                     int num_inputs,
+                     const Attrs& attrs,
+                     const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+
+  const StridedSliceAttrs *param = attrs.as<StridedSliceAttrs>();
+  CHECK(param != nullptr);
+
+  auto dshape = data->shape;
+  auto num_axis = dshape.size();
+
+  std::vector<int64_t> stride_vec;
+  for (Integer i : param->strides) {
+    CHECK(i.defined());
+    stride_vec.push_back(i->value);
+  }
+  for (size_t i = stride_vec.size(); i < num_axis; ++i) {
+    stride_vec.push_back(1);
+  }
+  const int64_t max_range = std::numeric_limits<int64_t>::max();
+
+  std::vector<int64_t> begin_vec;
+  for (size_t i = 0; i < param->begin.size(); ++i) {
+    if (!param->begin[i].defined()) {
+      // value=None
+      begin_vec.push_back(stride_vec[i] > 0 ? 0 : max_range);
+    } else {
+      begin_vec.push_back(param->begin[i]->value);
+    }
+  }
+  for (size_t i = begin_vec.size(); i < num_axis; ++i) {
+    begin_vec.push_back(stride_vec[i] > 0 ? 0 : max_range);
+  }
+
+  std::vector<int64_t> end_vec;
+  for (size_t i = 0; i < param->end.size(); ++i) {
+    // allow end to be None
+    if (!param->end[i].defined()) {
+      end_vec.push_back(stride_vec[i] < 0 ? 0 : max_range);
+    } else {
+      end_vec.push_back(param->end[i]->value);
+    }
+  }
+  for (size_t i = end_vec.size(); i < num_axis; ++i) {
+    end_vec.push_back(stride_vec[i] < 0 ? 0 : max_range);
+  }
+
+  std::vector<IndexExpr> oshape(dshape.size());
+  for (size_t i = 0; i < num_axis; ++i) {
+    int64_t stride_v = stride_vec[i];
+    int64_t begin_v = begin_vec[i];
+    int64_t end_v = end_vec[i];
+
+    if ((stride_v == 1 &&
+         begin_v == 0 &&
+         end_v == max_range) ||
+        (stride_v == -1 &&
+         begin_v == max_range &&
+         end_v == 0)) {
+      // Quick path, do not slice this dimension.
+      oshape[i] = dshape[i];
+      continue;
+    }
+    // Normal path, require the shape to be concrete integer.
+    // Require concrete integer as symbolic inference of min/max
+    // can get complicated and not very helpful.
+    const int64_t* p_dim_size = as_const_int(dshape[i]);
+    CHECK(p_dim_size)
+        << "strided_slice requires sliced dimension to be concrete int";
+    int64_t dim_size = p_dim_size[0];
+    begin_v = (begin_v < 0) ? dim_size + begin_v : begin_v;
+    end_v = (end_v < 0) ? dim_size + end_v : end_v;
+
+    int64_t slice_range, step;
+    if (stride_v < 0) {
+      if (end_v < -1) end_v = -1;
+      CHECK_LT(end_v, begin_v)
+          << "strided_slice get empty slice at axis " << i;
+      begin_v = std::min(dim_size - 1, begin_v);
+      slice_range = begin_v - end_v;
+      step = -stride_v;
+    } else {
+      if (begin_v < 0) begin_v = 0;
+      CHECK_GE(stride_v, 0);
+      CHECK_LT(begin_v, end_v)
+          << "strided_slice get empty slice at axis " << i;
+      end_v = std::min(dim_size, end_v);
+      slice_range = end_v - begin_v;
+      step = stride_v;
+    }
+    oshape[i] = make_const(dshape[i].type(), (slice_range + step - 1) / step);
+  }
+  reporter->Assign(types[1], TensorTypeNode::make(oshape, data->dtype));
+  return true;
+}
+
+
+// Positional relay function to create StridedSlice operator used by frontend FFI.
+Expr MakeStridedSlice(Expr data,
+                      Array<Integer> begin,
+                      Array<Integer> end,
+                      Array<Integer> strides) {
+  auto attrs = make_node<StridedSliceAttrs>();
+  attrs->begin = std::move(begin);
+  attrs->end = std::move(end);
+  attrs->strides = std::move(strides);
+  static const Op& op = Op::Get("strided_slice");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+Array<Tensor> StridedSliceCompute(const Attrs& attrs,
+                                  const Array<Tensor>& inputs,
+                                  const Type& out_type,
+                                  const Target& target) {
+  const StridedSliceAttrs *param = attrs.as<StridedSliceAttrs>();
+  CHECK(param != nullptr);
+  return Array<Tensor>{
+    topi::strided_slice(inputs[0], param->begin, param->end, param->strides)
+  };
+}
+
+
+TVM_REGISTER_API("relay.op._make.strided_slice")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 4>(MakeStridedSlice, args, rv);
+  });
+
+
+RELAY_REGISTER_OP("strided_slice")
+    .describe(R"code(Strided slice of an array.
+
+Examples::
+
+  x = [[  1.,   4.,   7.,  10.],
+       [  2.,   5.,   8.,  11.],
+       [  3.,   6.,   9.,  12.]]
+
+  strided_slice(x, begin=[0, 1], end=[2, 4], stride=[1, 1]) = [[ 4.,  7.,  10.],
+                                                               [ 5.,  8.,  11.]]
+
+  x = [[[ 1.,  2.],
+        [ 3.,  4.]],
+
+       [[ 5.,  6.],
+        [ 7.,  8.]]]
+
+  strided_slice(x, begin=[0, 0], end=[2, 2]) = [[[ 1.,  2.],
+                                                 [ 3.,  4.]],
+
+                                                [[ 5.,  6.],
+                                                 [ 7.,  8.]]]
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(4)
+.set_attrs_type_key("relay.attrs.StridedSliceAttrs")
+.add_type_rel("StridedSlice", StridedSliceRel)
+.set_attr<FTVMCompute>("FTVMCompute", StridedSliceCompute)
+.set_attr<TOpPattern>("TOpPattern", kInjective);
+
+
+// relay.split
+TVM_REGISTER_NODE_TYPE(SplitAttrs);
+
+bool SplitRel(const Array<Type>& types,
+              int num_inputs,
+              const Attrs& attrs,
+              const TypeReporter& reporter) {
+  // `types` contains: [data, result]
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) return false;
+  CHECK_NE(data->shape.size(), 0) << "Input shape cannot be empty";
+  const auto param = attrs.as<SplitAttrs>();
+  CHECK(param != nullptr);
+  auto axis = param->axis;
+  if (axis < 0) {
+    axis += data->shape.size();
+  }
+  CHECK_LT(axis, data->shape.size())
+    << "axis should be within the input dimension range.";
+  CHECK_GE(axis, 0)
+    << "axis should be within the input dimension range.";
+
+  if (const IntImm* sections = param->indices_or_sections.as<IntImm>()) {
+    CHECK(reporter->Assert(data->shape[axis] %
+                           sections->value == make_zero(Int(64))))
+        << "indices_or_sections need to be able to divide input.shape[axis]";
+    std::vector<Type> fields;
+    for (int i = 0; i < sections->value; ++i) {
+        std::vector<IndexExpr>&& oshape = AsVector(data->shape);
+        oshape[axis] /= int32_t(sections->value);
+        auto vec_type = TensorTypeNode::make(oshape, data->dtype);
+        fields.push_back(vec_type);
+    }
+    reporter->Assign(types[1], TupleTypeNode::make(Array<Type>(fields)));
+  } else {
+    auto indices = param->indices_or_sections.as<ArrayNode>()->data;
+    auto begin = IndexExpr(make_zero(Int(32)));
+    std::vector<Type> fields;
+    for (unsigned int i = 0; i < indices.size(); ++i) {
+      CHECK(reporter->Assert(IndexExpr(indices[i]) > begin))
+          << "indices_or_sections need to be a sorted ascending list";
+      std::vector<IndexExpr>&& oshape = AsVector(data->shape);
+      oshape[axis] = IndexExpr(indices[i]) - begin;
+      begin = IndexExpr(indices[i]);
+      auto vec_type = TensorTypeNode::make(oshape, data->dtype);
+      fields.push_back(vec_type);
+    }
+    CHECK(reporter->Assert(begin < data->shape[axis]))
+        << "The sum of sections must match the input.shape[axis]";
+    std::vector<IndexExpr>&& oshape = AsVector(data->shape);
+    oshape[axis] = data->shape[axis] - begin;
+    auto vec_type = TensorTypeNode::make(oshape, data->dtype);
+    fields.push_back(vec_type);
+    reporter->Assign(types[1], TupleTypeNode::make(Array<Type>(fields)));
+  }
+  return true;
+}
+
+Array<Tensor> SplitCompute(const Attrs& attrs,
+                           const Array<Tensor>& inputs,
+                           const Type& out_type,
+                           const Target& target) {
+  const auto param = attrs.as<SplitAttrs>();
+  CHECK(param != nullptr);
+
+  if (const IntImm* sections = param->indices_or_sections.as<IntImm>()) {
+    int64_t num_sections = sections->value;
+    return Array<Tensor>{
+      topi::split_sections(inputs[0], num_sections, param->axis) };
+  } else {
+    auto indices = Downcast<Array<Integer> >(param->indices_or_sections);
+    return Array<Tensor>{ topi::split(inputs[0], indices, param->axis) };
+  }
+}
+
+Expr MakeSplit(Expr data,
+               NodeRef indices_or_sections,
+               int axis) {
+  auto attrs = make_node<SplitAttrs>();
+  attrs->axis = axis;
+  attrs->indices_or_sections = std::move(indices_or_sections);
+  static const Op& op = Op::Get("split");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op._make.split")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    if (args.type_codes[1] == kDLInt) {
+      *rv = MakeSplit(args[0], make_const(Int(64), int64_t(args[1])), args[2]);
+    } else {
+      *rv = MakeSplit(args[0], args[1], args[2]);
+    }
+});
+
+RELAY_REGISTER_OP("split")
+.describe(R"code(Splits an array along a particular axis into multiple sub-arrays.
+
+Indices or sections to split into. Accepts an int or a tuple
+If indices_or_sections is an integer, the input will be divided equally
+along given axis. If such a split is not possible, an error is raised.
+
+If indices_or_sections is a tuple of sorted integers,
+the entries indicate where along axis the array is split.
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.SplitAttrs")
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(3)
+.add_type_rel("Split", SplitRel)
+.set_attr<FTVMCompute>("FTVMCompute", SplitCompute)
+.set_attr<TOpPattern>("TOpPattern", kInjective);
+
+
+// relay.slice_like
+TVM_REGISTER_NODE_TYPE(SliceLikeAttrs);
+
+/*!
+* \brief SliceLikeRel User defined type constraint function.
+* \param num_inputs Number of input types in the args.
+* \param attrs The additional attributes of the operator.
+* \param reporter The reporter to report solution to.
+* \return False if the relation has not been resolved, it might be resolved later.
+*  True if this relation has been resolved.
+*/
+bool SliceLikeRel(const Array<Type>& types,
+                  int num_inputs,
+                  const Attrs& attrs,
+                  const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 3);
+  const auto* data = types[0].as<TensorTypeNode>();
+  if (data == nullptr) {
+    return false;
+  }
+
+  const auto* target = types[1].as<TensorTypeNode>();
+  if (target == nullptr) {
+    return false;
+  }
+
+  const auto param = attrs.as<SliceLikeAttrs>();
+  CHECK(param != nullptr);
+
+  const Array<IndexExpr> dshape = data->shape;
+  const Array<IndexExpr> target_shape = target->shape;
+  std::vector<IndexExpr>&& oshape = AsVector(dshape);
+
+  if (!param->axes.defined()) {
+    for (size_t i = 0; i < dshape.size(); ++i) {
+      if (i < target_shape.size()) {
+        oshape[i] = target_shape[i];
+        CHECK(reporter->Assert(oshape[i] <= dshape[i]))
+          << "End index of axis " << i << " exceeds input shape: "
+          << oshape[i] << " vs " << dshape[i];
+      }
+    }
+  } else {
+    CHECK(param->axes.size() != 0) << "Axes cannot be empty.";
+    for (Integer val : param->axes) {
+      int axis = val->value;
+      if (axis < 0) {
+        axis += dshape.size();
+      }
+      CHECK(axis < static_cast<int>(target_shape.size()))
+        << "Axis " << axis << " exceeds dimension "
+        << target_shape.size() << " of target_shape.";
+      oshape[axis] = target_shape[axis];
+      CHECK(reporter->Assert(oshape[axis] <= dshape[axis]))
+        << "End index of axis " << axis << " exceeds input shape: "
+        << oshape[axis] << " vs " << dshape[axis];
+    }
+  }
+
+  reporter->Assign(types[2], TensorTypeNode::make(oshape, data->dtype));
+  return true;
+}
+
+
+Expr MakeSliceLike(Expr data,
+                   Expr shape_like,
+                   Array<Integer> axes) {
+  auto attrs = make_node<SliceLikeAttrs>();
+  attrs->axes = std::move(axes);
+  static const Op& op = Op::Get("slice_like");
+  return CallNode::make(op, {data, shape_like}, Attrs(attrs), {});
+}
+
+// Adapter function to make int array.
+Array<Integer> GetIntArray(Array<IndexExpr> arr) {
+  for (size_t i = 0; i < arr.size(); ++i) {
+    CHECK(!arr[i].defined() || arr[i].as<IntImm>())
+        << "Expect an int array";
+  }
+  return Array<Integer>(arr.node_);
+}
+
+Array<Tensor> SliceLikeCompute(const Attrs& attrs,
+                               const Array<Tensor>& inputs,
+                               const Type& out_type,
+                               const Target& target) {
+  const auto* param = attrs.as<SliceLikeAttrs>();
+  CHECK(param != nullptr);
+  Array<IndexExpr> src_shape = inputs[0]->shape;
+  Array<IndexExpr> target_shape = inputs[1]->shape;
+  Array<IndexExpr> begin_idx, end_idx, strides;
+  for (size_t i = 0; i < src_shape.size(); ++i) {
+    begin_idx.push_back(0);
+    strides.push_back(1);
+  }
+  end_idx = Array<IndexExpr>(src_shape);
+  if (!param->axes.defined()) {
+    for (size_t i = 0; i < src_shape.size(); ++i) {
+      if (i < target_shape.size()) {
+        end_idx.Set(i, target_shape[i]);
+        CHECK_LE(topi::GetConstInt(end_idx[i]),
+                 topi::GetConstInt(src_shape[i]))
+          << "End index of axis " << i << " exceeds input shape: "
+          << topi::GetConstInt(end_idx[i]) << " vs "
+          << topi::GetConstInt(src_shape[i]);
+      }
+    }
+  } else {
+    for (int axis : param->axes) {
+      if (axis < 0) {
+        axis = static_cast<int>(src_shape.size()) + axis;
+      }
+      end_idx.Set(axis, target_shape[axis]);
+      CHECK_LE(topi::GetConstInt(end_idx[axis]),
+               topi::GetConstInt(src_shape[axis]))
+        << "End index of axis " << axis << " exceeds input shape: "
+        << topi::GetConstInt(end_idx[axis]) << " vs "
+        << topi::GetConstInt(src_shape[axis]);
+    }
+  }
+  return Array<Tensor>{
+    topi::strided_slice(inputs[0],
+                        GetIntArray(begin_idx),
+                        GetIntArray(end_idx),
+                        GetIntArray(strides))
+  };
+}
+
+
+TVM_REGISTER_API("relay.op._make.slice_like")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+    runtime::detail::unpack_call<Expr, 3>(MakeSliceLike, args, rv);
+});
+
+
+RELAY_REGISTER_OP("slice_like")
+.describe(R"code(Slice the first input respect to the second input.
+)code" TVM_ADD_FILELINE)
+  .set_attrs_type_key("relay.attrs.SlicelikeAttrs")
+.set_num_inputs(2)
+.add_argument("data", "Tensor", "The input tensor.")
+.add_argument("shape_like", "Tensor", "Shape tensor.")
+.set_support_level(10)
+.add_type_rel("SliceLike", SliceLikeRel)
+.set_attr<FTVMCompute>("FTVMCompute", SliceLikeCompute)
+.set_attr<TOpPattern>("TOpPattern", kInjective);
+
+
+// relay.layout_transform
+Array<Tensor> LayoutTransformCompute(const Attrs& attrs,
+                                     const Array<Tensor>& inputs,
+                                     const Type& out_type,
+                                     const Target& target) {
+  const LayoutTransformAttrs *param = attrs.as<LayoutTransformAttrs>();
+  CHECK(param != nullptr);
+
+  Layout src_layout(param->src_layout);
+  Layout dst_layout(param->dst_layout);
+
+  if (src_layout.Equals(dst_layout)) {
+    return Array<Tensor>{ inputs[0] };
+  }
+
+  CHECK(src_layout.defined() && dst_layout.defined())
+    << "cannot convert from/to undefined layout";
+  CHECK(src_layout.Convertible(dst_layout))
+    << "cannot convert from " << param->src_layout << " to " << param->dst_layout;
+
+  const auto& out_shape = ConvertLayout(inputs[0]->shape, src_layout, dst_layout);
+  return Array<Tensor> {
+      topi::layout_transform(inputs[0], out_shape, [&](const Array<tvm::Var>& dst_indices) {
+        std::vector<tvm::Expr> dst_to_src_indices;
+        for (size_t i = 0; i < src_layout.ndim(); ++i) {
+          Layout::LayoutDim src_axis = src_layout[i];
+          int dst_major_pos = dst_layout.Indexof(Layout::ToSuperdim(src_axis));
+          int dst_minor_pos = dst_layout.Indexof(Layout::ToSubdim(src_axis));
+          int32_t src_factor = static_cast<int32_t>(src_layout.Subsizeof(src_axis));
+          int32_t dst_factor = static_cast<int32_t>(dst_layout.Subsizeof(src_axis));
+
+          tvm::Expr src_index(dst_indices[dst_major_pos]);
+          if (dst_minor_pos >= 0) {
+            CHECK_GT(dst_factor, 0);
+            src_index = src_index * dst_factor + dst_indices[dst_minor_pos];
+          }
+          if (Layout::IsSuperdim(src_axis) && src_factor > 0) {
+            src_index = src_index / src_factor;
+          } else if (Layout::IsSubdim(src_axis) && src_factor > 0) {
+            src_index = src_index % src_factor;
+          }
+          dst_to_src_indices.push_back(src_index);
+        }
+        return Array<tvm::Expr>(dst_to_src_indices);
+      })
+  };
+}
+
+bool LayoutTransformRel(const Array<Type>& types,
+                        int num_inputs,
+                        const Attrs& attrs,
+                        const TypeReporter& reporter) {
+  const auto* data = types[0].as<TensorTypeNode>();
+  CHECK(data != nullptr);
+  const LayoutTransformAttrs* params = attrs.as<LayoutTransformAttrs>();
+
+  Layout src_layout(params->src_layout);
+  Layout dst_layout(params->dst_layout);
+
+  CHECK(src_layout.defined() && dst_layout.defined())
+    << "cannot convert from/to undefined layout";
+  CHECK(src_layout.Convertible(dst_layout))
+    << "cannot convert from " << params->src_layout << " to " << params->dst_layout;
+
+  const auto& out_shape = ConvertLayout(data->shape, src_layout, dst_layout);
+  reporter->Assign(types[1], TensorTypeNode::make(out_shape, data->dtype));
+  return true;
+}
+
+Expr MakeLayoutTransform(Expr data,
+                         std::string src_layout,
+                         std::string dst_layout) {
+  auto attrs = make_node<LayoutTransformAttrs>();
+  attrs->src_layout = std::move(src_layout);
+  attrs->dst_layout = std::move(dst_layout);
+  static const Op& op = Op::Get("layout_transform");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op._make.layout_transform")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+  runtime::detail::unpack_call<Expr, 3>(MakeLayoutTransform, args, rv);
+});
+
+RELAY_REGISTER_OP("layout_transform")
+.describe(R"code(Transform the input data layout.
+
+For transforming from NCHW to N16cHWC, the `__layout_transform__` operator reshapes
+the input array by output[n, c, h, w, C] = data[n, C*16+c, h, w]
+
+)code" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.LayoutTransformAttrs")
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.add_type_rel("layout_transform", LayoutTransformRel)
+.set_support_level(5)
+.set_attr<FTVMCompute>("FTVMCompute", LayoutTransformCompute);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/tensor/unary.cc b/src/relay/op/tensor/unary.cc
new file mode 100644
index 000000000000..b83fdacda1ee
--- /dev/null
+++ b/src/relay/op/tensor/unary.cc
@@ -0,0 +1,180 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file unary.cc
+ * \brief Unary operators.
+ */
+#include <tvm/relay/expr.h>
+#include <tvm/relay/op.h>
+#include <tvm/relay/attrs/transform.h>
+#include <topi/elemwise.h>
+#include "../type_relations.h"
+#include "../op_common.h"
+
+namespace tvm {
+namespace relay {
+
+#define RELAY_UNARY_COMPUTE(FTOPI)                      \
+  [] (const Attrs& attrs,                               \
+      const Array<Tensor>& inputs,                      \
+      const Type& out_type,                             \
+      const Target& target) -> Array<Tensor> {          \
+    return {FTOPI(inputs[0])};                          \
+  }                                                     \
+
+
+RELAY_REGISTER_UNARY_OP("log")
+.describe(R"code(Returns the log input array, computed element-wise.
+
+.. math::
+   log(x)
+
+)code" TVM_ADD_FILELINE)
+.set_support_level(1)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::log));
+
+
+RELAY_REGISTER_UNARY_OP("exp")
+.describe(R"code(Returns the exp input array, computed element-wise.
+
+.. math::
+   \exp(x)
+
+)code" TVM_ADD_FILELINE)
+.set_support_level(1)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::exp));
+
+RELAY_REGISTER_UNARY_OP("sqrt")
+.describe(R"code(Returns the rsqrt input array, computed element-wise.
+
+.. math::
+   sqrt(x)
+
+)code" TVM_ADD_FILELINE)
+.set_support_level(1)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::sqrt));
+
+
+RELAY_REGISTER_UNARY_OP("zeros_like")
+.describe(R"code(Returns an array of zeros, with same type and shape as the input.
+)code" TVM_ADD_FILELINE)
+.set_support_level(4);
+
+RELAY_REGISTER_UNARY_OP("ones_like")
+.describe(R"code(Returns an array of ones, with same type and shape as the input.
+)code" TVM_ADD_FILELINE)
+.set_support_level(4);
+
+RELAY_REGISTER_UNARY_OP("sigmoid")
+.describe(R"code(Returns the sigmoid input array, computed element-wise.
+
+.. math::
+   sigmoid(x)
+
+)code" TVM_ADD_FILELINE)
+.set_support_level(1)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::sigmoid));
+
+
+RELAY_REGISTER_UNARY_OP("copy")
+.describe(R"code(Copy a tensor.
+)code" TVM_ADD_FILELINE)
+.set_support_level(3)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::identity));
+
+// relay.clip
+TVM_REGISTER_NODE_TYPE(ClipAttrs);
+
+TVM_REGISTER_API("relay.op._make.clip")
+.set_body_typed<Expr(Expr, double, double)>([](Expr a, double a_min, double a_max) {
+    auto attrs = make_node<ClipAttrs>();
+    attrs->a_min = a_min;
+    attrs->a_max = a_max;
+    static const Op& op = Op::Get("clip");
+  return CallNode::make(op, {a}, Attrs(attrs), {});
+});
+
+RELAY_REGISTER_OP("clip")
+.describe(R"code(Clip tensor values.
+This function takes a tensor, a minimum value `a_min`, and a maximum value `a_max`, and returns a clipped tensor where all values below `a_min` are set to `a_min` and all values above `a_max` are set to `a_max`. `a_min` and `a_max` are cast to the tensor's dtype.
+)code" TVM_ADD_FILELINE)
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.add_type_rel("Identity", IdentityRel)
+.set_attr<TOpPattern>("TOpPattern", kElemWise)
+.set_attr<TOpIsStateful>("TOpIsStateful", false)
+.set_attr<FInferCorrectLayout>("FInferCorrectLayout", ElemwiseArbitraryLayout)
+.set_support_level(3);
+
+RELAY_REGISTER_UNARY_OP("floor")
+.describe(R"code(Returns the floor of input array, computed element-wise.
+)code" TVM_ADD_FILELINE)
+.set_support_level(3)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::floor));
+
+
+RELAY_REGISTER_UNARY_OP("ceil")
+.describe(R"code(Returns the ceil of input array, computed element-wise.
+
+.. math::
+   ceil(x)
+
+)code" TVM_ADD_FILELINE)
+.set_support_level(3)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::ceil));
+
+
+RELAY_REGISTER_UNARY_OP("trunc")
+.describe(R"code(Returns the trunc of input array, computed element-wise.
+
+.. math::
+   trunc(x)
+
+)code" TVM_ADD_FILELINE)
+.set_support_level(3)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::trunc));
+
+RELAY_REGISTER_UNARY_OP("round")
+.describe(R"code(Returns the round of input array, computed element-wise.
+
+.. math::
+   round(x)
+
+)code" TVM_ADD_FILELINE)
+.set_support_level(3)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::round));
+
+
+RELAY_REGISTER_UNARY_OP("abs")
+.describe(R"code(Returns the abs of input array, computed element-wise.
+
+.. math::
+   abs(x)
+
+)code" TVM_ADD_FILELINE)
+.set_support_level(3)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::abs));
+
+
+RELAY_REGISTER_UNARY_OP("tanh")
+.describe(R"code(Returns the tanh of input array, computed element-wise.
+
+.. math::
+   Y = sinh(X) / cosh(X)
+
+)code" TVM_ADD_FILELINE)
+.set_support_level(1)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::tanh));
+
+
+RELAY_REGISTER_UNARY_OP("negative")
+.describe(R"code(Returns the numeric negative of input array, computed element-wise.
+
+.. math::
+   -(x)
+
+)code" TVM_ADD_FILELINE)
+.set_support_level(3)
+.set_attr<FTVMCompute>("FTVMCompute", RELAY_UNARY_COMPUTE(topi::negative));
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/type_relations.cc b/src/relay/op/type_relations.cc
new file mode 100644
index 000000000000..467c0fcde860
--- /dev/null
+++ b/src/relay/op/type_relations.cc
@@ -0,0 +1,120 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file type_relations.cc
+ * \brief A set of utilities and common functionality
+ * for type relations.
+ */
+#include <tvm/relay/expr.h>
+#include <tvm/relay/logging.h>
+#include <tvm/relay/op.h>
+#include <tvm/ir_pass.h>
+#include <numeric>
+#include "./type_relations.h"
+
+namespace tvm {
+namespace relay {
+
+TensorType ToTensorType(const Type& t) {
+  if (auto tt_node = t.as<TensorTypeNode>()) {
+    return GetRef<TensorType>(tt_node);
+  } else {
+    return TensorType(nullptr);
+  }
+}
+
+bool IdentityRel(const Array<Type>& types,
+                 int num_inputs,
+                 const Attrs& attrs,
+                 const TypeReporter& reporter) {
+  for (size_t i = 1; i < types.size(); ++i) {
+    reporter->Assign(types[i], types[0]);
+  }
+  return true;
+}
+
+bool EqualCheck(const IndexExpr& lhs,
+                const IndexExpr& rhs) {
+  IndexExpr diff = lhs - rhs;
+  if (const int64_t* pdiff = as_const_int(diff)) {
+    return pdiff[0] == 0;
+  }
+  // symbolic
+  diff = tvm::ir::CanonicalSimplify(diff);
+  if (const int64_t* pdiff = as_const_int(diff)) {
+    return pdiff[0] == 0;
+  }
+  return false;
+}
+
+bool EqualConstInt(const IndexExpr& lhs, int64_t value) {
+  if (const int64_t* pvalue = as_const_int(lhs)) {
+    return pvalue[0] == value;
+  }
+  return false;
+}
+
+Type ConcreteBroadcast(const TensorType& t1,
+                       const TensorType& t2,
+                       DataType output_dtype) {
+  std::vector<IndexExpr> oshape;
+  size_t ndim1 = t1->shape.size();
+  size_t ndim2 = t2->shape.size();
+  size_t i = 1;
+  for (; i <= std::min(ndim1, ndim2); ++i) {
+    IndexExpr s1 = t1->shape[ndim1 - i];
+    IndexExpr s2 = t2->shape[ndim2 - i];
+    if (EqualCheck(s1, s2)) {
+      oshape.push_back(s1);
+    } else if (EqualConstInt(s1, 1)) {
+      oshape.push_back(s2);
+    } else if (EqualConstInt(s2, 1)) {
+      oshape.push_back(s1);
+    } else {
+      LOG(FATAL) << "Incompatible broadcast type " << t1 << " and " << t2;
+    }
+  }
+  size_t max_ndim = std::max(ndim1, ndim2);
+  auto& rshape = (ndim1 > ndim2) ? t1->shape : t2->shape;
+  for (; i <= max_ndim; ++i) {
+    oshape.push_back(rshape[max_ndim - i]);
+  }
+  return TensorTypeNode::make(Array<IndexExpr>(
+      oshape.rbegin(), oshape.rend()), output_dtype);
+}
+
+bool BroadcastRel(const Array<Type>& types,
+                  int num_inputs,
+                  const Attrs& attrs,
+                  const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 3);
+  RELAY_LOG(INFO) << "In1: " << types[0] << "In2: " << types[1]
+                  << "Out: " << types[2] << std::endl;
+  if (auto t0 = ToTensorType(types[0])) {
+    if (auto t1 = ToTensorType(types[1])) {
+      CHECK_EQ(t0->dtype, t1->dtype);
+      reporter->Assign(types[2], ConcreteBroadcast(t0, t1, t0->dtype));
+      return true;
+    }
+  }
+  return false;
+}
+
+bool BroadcastCompRel(const Array<Type>& types,
+                      int num_inputs,
+                      const Attrs& attrs,
+                      const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 3);
+  RELAY_LOG(INFO) << "In1: " << types[0] << "In2: " << types[1]
+                  << "Out: " << types[2] << std::endl;
+  if (auto t0 = ToTensorType(types[0])) {
+    if (auto t1 = ToTensorType(types[1])) {
+      CHECK_EQ(t0->dtype, t1->dtype);
+      reporter->Assign(types[2], ConcreteBroadcast(t0, t1, ::tvm::Bool()));
+      return true;
+    }
+  }
+  return false;
+}
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/type_relations.h b/src/relay/op/type_relations.h
new file mode 100644
index 000000000000..534e917a0b6c
--- /dev/null
+++ b/src/relay/op/type_relations.h
@@ -0,0 +1,67 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/relay/op/type_relations.h
+ * \brief A set of utilities and common functionality
+ * for type relations.
+ */
+#ifndef TVM_RELAY_OP_TYPE_RELATIONS_H_
+#define TVM_RELAY_OP_TYPE_RELATIONS_H_
+
+#include <tvm/relay/error.h>
+#include <tvm/relay/type.h>
+#include <string>
+
+namespace tvm {
+namespace relay {
+/*!
+ * \brief The identity type relation, all the types are equal.
+ *
+ * \param types The input and output types to the relation.
+ * \param num_inputs The number of input arguments.
+ * \param attrs The attributes
+ * \param reporter The reporter.
+ * \return true whether relation has been resolved.
+ */
+bool IdentityRel(const Array<Type>& types,
+                 int num_inputs,
+                 const Attrs& attrs,
+                 const TypeReporter& reporter);
+
+/*!
+ * \brief The broadcast type relation, implements the broadcasting
+ * rule over the two input types producing the broadcasted type.
+ *
+ * \param types The input and output types to the relation.
+ * \param num_inputs The number of input arguments.
+ * \param attrs The attributes
+ * \param reporter The reporter.
+ * \return true whether relation has been resolved.
+ */
+bool BroadcastRel(const Array<Type>& types,
+                  int num_inputs,
+                  const Attrs& attrs,
+                  const TypeReporter& reporter);
+
+/*!
+ * \brief The broadcast type relation, implements the broadcasting
+ *  rule over the two input types producing the broadcasted type.
+ *
+ * This differs from BroadcastRel in the return dtype,
+ * it instead returns bool(uint8), for use in comparsion operators
+ * such as equal, not_equal, lt, and so on.
+ *
+ * \param types The input and output types to the relation.
+ * \param num_inputs The number of input arguments.
+ * \param attrs The attributes
+ * \param reporter The reporter.
+ * \return true whether relation has been resolved.
+ */
+bool BroadcastCompRel(const Array<Type>& types,
+                      int num_inputs,
+                      const Attrs& attrs,
+                      const TypeReporter& reporter);
+
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_OP_TYPE_RELATIONS_H_
diff --git a/src/relay/op/vision/multibox_op.cc b/src/relay/op/vision/multibox_op.cc
new file mode 100644
index 000000000000..55db8862e849
--- /dev/null
+++ b/src/relay/op/vision/multibox_op.cc
@@ -0,0 +1,145 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file multibox_op.cc
+ * \brief Multibox related operators
+ */
+#include <tvm/relay/op.h>
+#include <tvm/relay/attrs/vision.h>
+
+namespace tvm {
+namespace relay {
+
+TVM_REGISTER_NODE_TYPE(MultiBoxPriorAttrs);
+
+bool MultiboxPriorRel(const Array<Type>& types,
+                      int num_inputs,
+                      const Attrs& attrs,
+                      const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  const auto* data = types[0].as<TensorTypeNode>();
+  const MultiBoxPriorAttrs* param = attrs.as<MultiBoxPriorAttrs>();
+  const auto& dshape = data->shape;
+  CHECK_EQ(dshape.size(), 4) << "Input data should be 4D: "
+      "[batch, channel, height, width]";
+  IndexExpr in_height = dshape[2];
+  IndexExpr in_width = dshape[3];
+  int num_sizes = static_cast<int>(param->sizes.size());
+  int num_ratios = static_cast<int>(param->ratios.size());
+
+  // since input sizes are same in each batch, we could share MultiBoxPrior
+  std::vector<IndexExpr> oshape(
+    {1, in_height * in_width * (num_sizes + num_ratios - 1), 4});
+
+  // assign output type
+  reporter->Assign(types[1], TensorTypeNode::make(oshape, data->dtype));
+  return true;
+}
+
+
+Expr MakeMultiBoxPrior(Expr data,
+                       Array<IndexExpr> sizes,
+                       Array<IndexExpr> ratios,
+                       Array<IndexExpr> steps,
+                       Array<IndexExpr> offsets,
+                       bool clip) {
+  auto attrs = make_node<MultiBoxPriorAttrs>();
+  attrs->sizes = std::move(sizes);
+  attrs->ratios = std::move(ratios);
+  attrs->steps = std::move(steps);
+  attrs->offsets = std::move(offsets);
+  attrs->clip = clip;
+  static const Op& op = Op::Get("vision.multibox_prior");
+  return CallNode::make(op, {data}, Attrs(attrs), {});
+}
+
+
+TVM_REGISTER_API("relay.op.vision._make.multibox_prior")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+  runtime::detail::unpack_call<Expr, 6>(MakeMultiBoxPrior, args, rv);
+});
+
+
+RELAY_REGISTER_OP("vision.multibox_prior")
+.describe(R"doc("Generate prior(anchor) boxes from data, sizes and ratios."
+)doc" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.MultiBoxPriorAttrs")
+.set_num_inputs(1)
+.add_argument("data", "Tensor", "The input tensor.")
+.set_support_level(5)
+.add_type_rel("MultiBoxPrior", MultiboxPriorRel);
+
+TVM_REGISTER_NODE_TYPE(MultiBoxTransformLocAttrs);
+
+bool MultiBoxTransformLocRel(const Array<Type>& types, int num_inputs,
+                             const Attrs& attrs, const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 4);
+
+  const auto* cls_prob = types[0].as<TensorTypeNode>();
+  const auto* loc_pred = types[1].as<TensorTypeNode>();
+  const auto* anchor = types[2].as<TensorTypeNode>();
+  CHECK(cls_prob != nullptr && loc_pred != nullptr && anchor != nullptr);
+
+  const auto& cls_shape = cls_prob->shape;
+  const auto& loc_shape = loc_pred->shape;
+  const auto& anchor_shape = anchor->shape;
+
+  CHECK_EQ(cls_shape.size(), 3U)
+      << "The dimension of class probability should be 3, but received "
+      << cls_shape.size();
+  CHECK_EQ(loc_shape.size(), 2U)
+      << "The dimension of location prediction should be 2, but received "
+      << loc_shape.size();
+  CHECK_EQ(anchor_shape.size(), 3U)
+      << "The dimension of anchor should be 3, but received "
+      << anchor_shape.size();
+
+  CHECK(reporter->AssertEQ(cls_shape[2], anchor_shape[1]))
+      << "Number of anchors mismatch found";
+  CHECK(reporter->AssertEQ(cls_shape[2] * 4, loc_shape[1]))
+      << "# anchors mismatch with # loc.";
+  CHECK(reporter->Assert(anchor_shape[1] > 0)) << "Number of anchors must > 0.";
+  CHECK(reporter->AssertEQ(anchor_shape[2], 4));
+
+  std::vector<IndexExpr> oshape0({cls_shape[0], anchor_shape[1], 6});
+  std::vector<IndexExpr> oshape1({cls_shape[0]});
+  std::vector<Type> fields;
+  fields.push_back(TensorTypeNode::make(oshape0, cls_prob->dtype));
+  fields.push_back(TensorTypeNode::make(oshape1, Int(32)));
+
+  // assign output type
+  reporter->Assign(types[3], TupleTypeNode::make(Array<Type>(fields)));
+  return true;
+}
+
+Expr MakeMultiBoxTransformLoc(Expr cls_prob,
+                              Expr loc_pred,
+                              Expr anchor,
+                              bool clip,
+                              double threshold,
+                              Array<IndexExpr> variances) {
+  auto attrs = make_node<MultiBoxTransformLocAttrs>();
+  attrs->clip = std::move(clip);
+  attrs->threshold = std::move(threshold);
+  attrs->variances = std::move(variances);
+  static const Op& op = Op::Get("vision.multibox_transform_loc");
+  return CallNode::make(op, {cls_prob, loc_pred, anchor}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_API("relay.op.vision._make.multibox_transform_loc")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+  runtime::detail::unpack_call<Expr, 6>(MakeMultiBoxTransformLoc, args, rv);
+});
+
+RELAY_REGISTER_OP("vision.multibox_transform_loc")
+.describe(R"doc("Location transformation for multibox detection."
+)doc" TVM_ADD_FILELINE)
+.set_attrs_type_key("relay.attrs.MultiBoxTransformLocAttrs")
+.set_num_inputs(3)
+.add_argument("cls_prob", "Tensor", "Class probabilities.")
+.add_argument("loc_pred", "Tensor", "Location regression predictions.")
+.add_argument("anchor", "Tensor", "Multibox prior anchor boxes")
+.add_type_rel("MultiBoxTransformLoc", MultiBoxTransformLocRel)
+.set_support_level(5);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/vision/nms.cc b/src/relay/op/vision/nms.cc
new file mode 100644
index 000000000000..3e3f73bc6cb4
--- /dev/null
+++ b/src/relay/op/vision/nms.cc
@@ -0,0 +1,62 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file nms.cc
+ * \brief Non-maximum suppression operators
+ */
+#include <tvm/relay/op.h>
+#include <tvm/relay/attrs/vision.h>
+
+namespace tvm {
+namespace relay {
+
+TVM_REGISTER_NODE_TYPE(NMSAttrs);
+
+bool NMSRel(const Array<Type>& types,
+            int num_inputs,
+            const Attrs& attrs,
+            const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 3);
+  const auto* data = types[0].as<TensorTypeNode>();
+  const auto* valid_count = types[1].as<TensorTypeNode>();
+  const auto& dshape = data->shape;
+  const auto& vshape = valid_count->shape;
+  CHECK_EQ(dshape.size(), 3) << "Input data should be 3-D.";
+  CHECK_EQ(vshape.size(), 1) << "Input valid count should be 1-D.";
+
+  // assign output type
+  reporter->Assign(types[2], TensorTypeNode::make(dshape, data->dtype));
+  return true;
+}
+
+
+Expr MakeNMS(Expr data,
+             Expr valid_count,
+             double overlap_threshold,
+             bool force_suppress,
+             int topk) {
+  auto attrs = make_node<NMSAttrs>();
+  attrs->overlap_threshold = overlap_threshold;
+  attrs->force_suppress = force_suppress;
+  attrs->topk = topk;
+  static const Op& op = Op::Get("vision.nms");
+  return CallNode::make(op, {data, valid_count}, Attrs(attrs), {});
+}
+
+
+TVM_REGISTER_API("relay.op.vision._make.nms")
+.set_body([](const TVMArgs& args, TVMRetValue* rv) {
+  runtime::detail::unpack_call<Expr, 5>(MakeNMS, args, rv);
+});
+
+
+RELAY_REGISTER_OP("vision.nms")
+.describe(R"doc("Non-maximum suppression."
+)doc" TVM_ADD_FILELINE)
+.set_num_inputs(2)
+.add_argument("data", "Tensor", "Input data.")
+.add_argument("valid_count", "Tensor", "Number of valid anchor boxes.")
+.set_support_level(5)
+.add_type_rel("NMS", NMSRel);
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/alter_op_layout.cc b/src/relay/pass/alter_op_layout.cc
new file mode 100644
index 000000000000..5c4475259086
--- /dev/null
+++ b/src/relay/pass/alter_op_layout.cc
@@ -0,0 +1,312 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file alter_op_layout.cc
+ * \brief Alternate the layouts of operators or replace primitive operators with
+          other expressions. This pass can be used for computing convolution in
+          custom layouts or other general weight pre-transformation.
+ */
+#include <tvm/relay/pass.h>
+#include <tvm/relay/op_attr_types.h>
+#include <tvm/relay/attrs/transform.h>
+#include <tvm/tvm.h>
+#include <tuple>
+#include <vector>
+#include <functional>
+#include <string>
+
+#include "alter_op_layout.h"
+
+namespace tvm {
+namespace relay {
+
+namespace alter_op_layout {
+
+// Make a transform CallNode
+Expr TransformLayout(Expr raw, Layout src_layout, Layout dst_layout) {
+  if (src_layout.Equals(dst_layout)) { return raw; }
+  CHECK(src_layout.defined() && dst_layout.defined())
+    << "Cannot insert layout transform because there are undefined layouts";
+  CHECK(src_layout.Convertible(dst_layout))
+    << "Cannot insert layout transform because there are inconvertible layouts: "
+    << src_layout << " v.s. " << dst_layout;
+  static auto &transform_op = Op::Get("layout_transform");
+  NodePtr<LayoutTransformAttrs> attrs = make_node<LayoutTransformAttrs>();
+  attrs->src_layout = src_layout.name();
+  attrs->dst_layout = dst_layout.name();
+  Call transform = CallNode::make(transform_op, {raw}, Attrs{attrs});
+  return transform;
+}
+
+// Memorize layout transform so we can reuse internal transformed nodes
+class TransformMemorizerNode : public Node {
+ public:
+  // map from (Expr, src_layout, dst_layout) to transformed Expr
+  using TransformKey = std::tuple<const Node*, std::string, std::string>;
+  struct key_hash : public std::unary_function<TransformKey , std::size_t> {
+    std::size_t operator()(const TransformKey& k) const {
+      return dmlc::HashCombine<std::string>(dmlc::HashCombine<std::string>(
+              std::hash<const Node*>()(std::get<0>(k)), std::get<1>(k)), (std::get<2>(k)));
+    }
+  };
+
+  std::unordered_map<TransformKey, Expr, key_hash> memo;
+  static constexpr const char *_type_key = "relay.alter_op_layout.TransformMemorizerNode";
+  TVM_DECLARE_NODE_TYPE_INFO(TransformMemorizerNode, Node);
+};
+
+class TransformMemorizer : public NodeRef {
+ public:
+  TransformMemorizer() {}
+  explicit TransformMemorizer(NodePtr<Node> n) : NodeRef(n) {}
+
+  TransformMemorizerNode* operator->() {
+    return static_cast<TransformMemorizerNode*>(node_.get());
+  }
+
+  // Transform layout with memorizer
+  Expr Transform(Expr raw, const Layout& src_layout, const Layout& dst_layout) {
+    if (src_layout.Equals(dst_layout)) { return raw; }
+
+    std::tuple<const Node*, std::string, std::string> key =
+        std::make_tuple<>(raw.get(), src_layout.name(), dst_layout.name());
+    auto& memo = operator->()->memo;
+
+    auto iter = memo.find(key);
+    if (iter != memo.end()) {
+      return iter->second;
+    } else {
+      Expr transform = TransformLayout(raw, src_layout, dst_layout);
+      memo[key] = transform;
+      return transform;
+    }
+  }
+
+  using ContainerType = TransformMemorizerNode;
+};
+
+
+// TempExprNode during layout transform
+// Instance of this expr will be Realized to normal expr ultimately
+class LayoutAlternatedExprNode : public TempExprNode {
+ public:
+  Expr value;
+  Layout old_layout;
+  Layout new_layout;
+  TransformMemorizer memorizer;
+
+  Expr Realize() const final {
+    // NOTE: use a copy to discard the "const" qualifier
+    TransformMemorizer tmp_memorizer = memorizer;
+    // fallback to old layout
+    return tmp_memorizer.Transform(value, new_layout, old_layout);
+  }
+
+  void VisitAttrs(AttrVisitor *v) final {
+    v->Visit("value", &value);
+    v->Visit("old_layout", &old_layout);
+    v->Visit("new_layout", &new_layout);
+  }
+
+  static constexpr const char *_type_key = "relay.alter_op_layout.LayoutAlternatedExprNode";
+  TVM_DECLARE_NODE_TYPE_INFO(LayoutAlternatedExprNode, TempExprNode);
+};
+
+RELAY_DEFINE_NODE_REF(LayoutAlternatedExpr, LayoutAlternatedExprNode, TempExpr);
+
+// Call registered FInferCorrectLayout of an op.
+// Parameters are the same as the parameters for FInferCorrectLayout
+// Returns inferred_input_layout, inferred_output_layout, success
+std::tuple<Array<Layout>, Array<Layout>, bool> CallInfer(
+    const Call& call,
+    const Array<Layout>& new_in_layouts,
+    const Array<Layout>& old_in_layouts,
+    const Array<Array<IndexExpr> > &old_in_shapes) {
+  static auto finfer_layout = Op::GetAttr<FInferCorrectLayout>("FInferCorrectLayout");
+
+  Op op = Downcast<Op>(call->op);
+  if (finfer_layout.count(op)) {
+    Array<Array<Layout> > inferred_layouts;
+    inferred_layouts = finfer_layout[op](call->attrs, new_in_layouts,
+                                         old_in_layouts, old_in_shapes);
+    CHECK_EQ(inferred_layouts.size(), 2)
+      << "FInferCorrectLayout should return an array with size of 2";
+    for (auto x : inferred_layouts) {
+      for (auto y : x) {
+        if (!y.defined()) {  // inference fails
+          return std::make_tuple<>(Array<Layout>(nullptr), Array<Layout>(nullptr), false);
+        }
+      }
+    }
+    return std::make_tuple<>(inferred_layouts[0], inferred_layouts[1], true);
+  } else {
+    return std::make_tuple<>(Array<Layout>(nullptr), Array<Layout>(nullptr), false);
+  }
+}
+
+// Call registered FTVMAlterOpLayout of an op
+// Returns the altered expression
+Call CallAlter(const Call& ref_call,
+               const std::vector<Expr>& new_args) {
+  static auto falter_layout = Op::GetAttr<FTVMAlterOpLayout>("FTVMAlterOpLayout");
+  Op op = Downcast<Op>(ref_call->op);
+
+  Expr new_e;
+  bool modified = false;
+  if (falter_layout.count(op)) {
+    tvm::Array<tvm::Tensor> tinfos;
+    for (auto expr : ref_call->args) {
+      auto ttype = expr->type_as<TensorTypeNode>();
+      tinfos.push_back(tvm::placeholder(ttype->shape, ttype->dtype));
+    }
+    Expr altered_value = falter_layout[op](ref_call->attrs, new_args, tinfos);
+    if (altered_value.defined()) {
+      new_e = altered_value;
+      modified = true;
+    }
+  }
+  if (!modified) {
+    new_e = CallNode::make(ref_call->op, new_args,
+                           ref_call->attrs, ref_call->type_args);
+  }
+
+  const CallNode *new_call = new_e.as<CallNode>();
+  CHECK(new_call) << "Can only replace the original operator with another call node";
+  return GetRef<Call>(new_call);
+}
+
+Expr AlterOpLayoutRewrite(const Call &ref_call,
+                          const Array<Expr> &new_args,
+                          const NodeRef& ctx) {
+  std::vector<LayoutAlternatedExpr> inputs;
+  std::vector<Expr> normal_new_args;
+  Array<Array<IndexExpr> > input_shapes;
+
+  // NOTE: discard the "const" qualifier
+  TransformMemorizer memorizer = Downcast<TransformMemorizer>(ctx);
+
+  // fill incomplete state and expand tuple
+  for (auto new_arg : new_args) {
+    auto push_back_one_arg = [&](Expr arg) {
+      // We always expect LayoutAlternatedExpr.
+      // This is used to convert the normal Expr to LayoutAlternatedExpr.
+      if (const LayoutAlternatedExprNode *inp = arg.as<LayoutAlternatedExprNode>()) {
+        inputs.push_back(GetRef<LayoutAlternatedExpr>(inp));
+        normal_new_args.push_back(inp->value);
+      } else {
+        auto inode = make_node<LayoutAlternatedExprNode>();
+        inode->value = arg;
+        inode->memorizer = memorizer;
+        inputs.push_back(LayoutAlternatedExpr(inode));
+        normal_new_args.push_back(arg);
+      }
+    };
+
+    if (new_arg->is_type<TupleNode>()) {
+      Tuple tuple_new_arg = Downcast<Tuple>(new_arg);
+      for (auto x : tuple_new_arg->fields) {
+        push_back_one_arg(x);
+      }
+    } else {
+      push_back_one_arg(new_arg);
+    }
+  }
+
+  // old_in, new_in = state[inputs]
+  Array<Layout> old_in, old_out, new_in, new_out, new_in2;
+  for (auto inp : inputs) {
+    old_in.push_back(inp->old_layout);
+    new_in.push_back(inp->new_layout);
+  }
+
+  for (auto arg : ref_call->args) {
+    if (arg->is_type<TupleNode>()) {  // expand tuple
+      Tuple tuple_arg = Downcast<Tuple>(arg);
+      for (auto x : tuple_arg->fields) {
+        input_shapes.push_back(x->type_as<TensorTypeNode>()->shape);
+      }
+    } else {
+      input_shapes.push_back(arg->type_as<TensorTypeNode>()->shape);
+    }
+  }
+
+  // old_in, old_out = op.infer(old_in)
+  bool success = false;
+  std::tie(old_in, old_out, success) = CallInfer(ref_call,
+                                                 Array<Layout>(nullptr),
+                                                 old_in, input_shapes);
+  if (!success) { return Expr(nullptr); }
+  CHECK_EQ(old_in.size(), new_in.size());
+
+  // if new_in == 'undef':  new_in = old_in
+  for (size_t i = 0; i < new_in.size(); ++i) {
+    if (!new_in[i].defined()) {
+      new_in.Set(i, old_in[i]);
+    }
+  }
+
+  // new_op = alter(op)
+  Call new_call = CallAlter(ref_call, normal_new_args);
+
+  // new_in2, new_out = op.infer(new_in)
+  if (new_call->op->is_type<OpNode>()) {
+    success = false;
+    std::tie(new_in2, new_out, success) = CallInfer(new_call, new_in, old_in, input_shapes);
+    if (!success) { return Expr(nullptr); }
+  } else {
+    return Expr(nullptr);
+  }
+
+  CHECK_EQ(new_out.size(), old_out.size())
+    << "The number of output nodes should keep the same during alter_op_layout";
+  CHECK_EQ(new_in.size(), new_in2.size())
+    << "The number of input nodes should keep the same during alter_op_layout";
+
+  // if (new_in != new_in2): insert transform (new_in -> new_in2)
+  Array<Expr> transformed_args;
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    transformed_args.push_back(memorizer.Transform(new_call->args[i], new_in[i], new_in2[i]));
+  }
+
+  // state[node] = (old_out, new_out)
+  CHECK(ref_call->checked_type_.defined())
+    << "Call infer_type pass before alter_op_layout pass";
+
+  if (ref_call->checked_type()->is_type<TupleTypeNode>()) {
+    Expr tuple_output = CallNode::make(new_call->op, transformed_args,
+                                       new_call->attrs, new_call->type_args);
+    Array<Expr> fields;
+    for (size_t i = 0; i < new_out.size(); ++i) {
+      auto rnode = make_node<LayoutAlternatedExprNode>();
+      rnode->value = TupleGetItemNode::make(tuple_output, i);
+      rnode->old_layout = old_out[i];
+      rnode->new_layout = new_out[i];
+      rnode->memorizer = memorizer;
+      fields.push_back(Expr(rnode));
+    }
+    return TupleNode::make(fields);
+  } else {
+    auto rnode = make_node<LayoutAlternatedExprNode>();
+    CHECK_EQ(new_out.size(), 1);
+    rnode->value = CallNode::make(new_call->op, transformed_args,
+                                  new_call->attrs, new_call->type_args);
+    rnode->old_layout = old_out[0];
+    rnode->new_layout = new_out[0];
+    rnode->memorizer = memorizer;
+    return Expr(rnode);
+  }
+}
+
+TVM_REGISTER_API("relay._ir_pass.AlterOpLayout")
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+  TransformMemorizer transformMemorizer(make_node<TransformMemorizerNode>());
+  auto fcontext = [&](const Call& call) -> NodeRef{
+    return transformMemorizer;
+  };
+
+  *ret = ForwardRewrite(args[0], AlterOpLayoutRewrite, fcontext);
+});
+
+}  // namespace alter_op_layout
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/alter_op_layout.h b/src/relay/pass/alter_op_layout.h
new file mode 100644
index 000000000000..fcb7b379a0ec
--- /dev/null
+++ b/src/relay/pass/alter_op_layout.h
@@ -0,0 +1,119 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file alter_op_layout.h
+ * \brief Alternate the layouts of operators or replace primitive operators with
+          other expressions. This pass can be used for computing convolution in
+          custom layouts or other general weight pre-transformation.
+ */
+
+#ifndef TVM_RELAY_PASS_ALTER_OP_LAYOUT_H_
+#define TVM_RELAY_PASS_ALTER_OP_LAYOUT_H_
+
+#include <tvm/relay/expr.h>
+
+#include "../op/layout.h"
+
+namespace tvm {
+namespace relay {
+
+/*!
+ * \brief Infer & correct function of node layout. See \p Layout for layout convention
+ * \param attrs The attribute of the node.
+ * \param new_in_layouts The layouts of input arguments after alter_op_layout.
+ *                       This can be undefined, which means we call this function before alternating
+ *                       any operators.
+ * \param old_in_layouts The layouts of input arguments before alter_op_layout.
+ * \param old_in_shapes The shapes of old input arguments.
+ * \return infered_layout An array of two elements that are inferred input layouts and
+ *                        inferred output layouts.
+ */
+using FInferCorrectLayout = runtime::TypedPackedFunc<
+    Array<Array<Layout>>(const Attrs& attrs,
+                         const Array<Layout>& new_in_layouts,
+                         const Array<Layout>& old_in_layouts,
+                         const Array<Array<IndexExpr>> &old_in_shapes)>;
+
+/*! \brief take arbitrary input layout and copy to output */
+inline Array<Array<Layout> > ElemwiseArbitraryLayout(const Attrs& attrs,
+                                                     const Array<Layout>& new_in_layouts,
+                                                     const Array<Layout>& old_in_layouts,
+                                                     const Array<Array<IndexExpr>> &old_in_shapes) {
+  Layout ret;
+
+  if (new_in_layouts.defined()) {
+    CHECK_GE(new_in_layouts.size(), 1);
+    ret = new_in_layouts[0];
+  } else {
+    for (size_t i = 0; i < old_in_layouts.size(); ++i) {
+      if (old_in_layouts[i].defined()) {
+        ret = old_in_layouts[i];
+        break;
+      }
+    }
+  }
+
+  return Array<Array<Layout> >{Array<Layout>(old_in_layouts.size(), ret), {ret}};
+}
+
+/*! \brief Infer layout for binary broadcast operators */
+inline Array<Array<Layout> > BinaryBroadcastLayout(const Attrs& attrs,
+                                                   const Array<Layout>& new_in_layouts,
+                                                   const Array<Layout>& old_in_layouts,
+                                                   const Array<Array<IndexExpr>> &old_in_shapes) {
+  Array<Layout> layouts;
+
+  if (new_in_layouts.defined()) {
+    layouts.assign(new_in_layouts.begin(), new_in_layouts.end());
+  } else {
+    layouts.assign(old_in_layouts.begin(), old_in_layouts.end());
+  }
+
+  if (!layouts[0].defined() && !layouts[1].defined()) {
+    // both undefined, infer fails
+    return Array<Array<Layout> > {{Layout::Undef()}, {Layout::Undef()}};
+  } else if (!layouts[0].defined() || !layouts[1].defined()) {
+    // only one is defined, use shape information to help infer
+    int defined_idx = layouts[0].defined() ? 0 : 1;
+    int undef_idx = 1 - defined_idx;
+
+    if (old_in_shapes[defined_idx].size() >= old_in_shapes[undef_idx].size()) {
+      layouts.Set(undef_idx,
+                  layouts[defined_idx].Sublayout(
+                      old_in_shapes[defined_idx].size() - old_in_shapes[undef_idx].size(),
+                      old_in_shapes[undef_idx].size()));
+      return Array<Array<Layout> > {layouts, {layouts[defined_idx]}};
+    } else {
+      // only know the tensor with smaller dimensions,
+      // so we cannot infer the final broadcasted output.
+      // fails in this case.
+      return Array<Array<Layout> > {{Layout::Undef()}, {Layout::Undef()}};
+    }
+  } else {
+    // try to broadcast the tensors to the larger dimension
+    int large_idx = layouts[0].ndim_super() >= layouts[1].ndim_super() ? 0 : 1;
+    int small_idx = 1 - large_idx;
+    Layout ret = layouts[large_idx];
+
+    // extract common part
+    size_t i = layouts[large_idx].ndim();
+    for (; i != 0; --i) {
+      auto dim = layouts[large_idx][i-1];
+      if (!layouts[small_idx].Contains(Layout::ToSuperdim(dim))) {
+        break;
+      }
+    }
+
+    Layout common_part = layouts[large_idx].Sublayout(i, layouts[large_idx].ndim() - i);
+    if (!layouts[small_idx].Convertible(common_part)) {  // fail
+      return Array<Array<Layout> > {{Layout::Undef()}, {Layout::Undef()}};
+    }
+
+    layouts.Set(small_idx, common_part);
+    return Array<Array<Layout> > {layouts, {ret}};
+  }
+}
+
+}  //  namespace relay
+}  //  namespace tvm
+
+#endif  // TVM_RELAY_PASS_ALTER_OP_LAYOUT_H_
diff --git a/src/relay/pass/canonicalize_ops.cc b/src/relay/pass/canonicalize_ops.cc
new file mode 100644
index 000000000000..4482dc3954ab
--- /dev/null
+++ b/src/relay/pass/canonicalize_ops.cc
@@ -0,0 +1,46 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file canonicalize_ops.cc
+ * \brief Canonicalize special operators to basic operators.
+    This can simplify latter analysis. (e.g. Expand bias_add to expand_dims and broadcast_add.)
+ */
+#include <tvm/relay/pass.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/attrs/nn.h>
+#include "pattern_util.h"
+
+namespace tvm {
+namespace relay {
+
+class BiasAddSimplifier : public ExprMutator {
+ public:
+  Expr VisitExpr_(const CallNode* n) {
+    static const Op& bias_add = Op::Get("nn.bias_add");
+    auto new_n = ExprMutator::VisitExpr_(n);
+    if (n->op.same_as(bias_add)) {
+      Call call = Downcast<Call>(new_n);
+      CHECK_EQ(call->args.size(), 2);
+      const BiasAddAttrs* param = call->attrs.as<BiasAddAttrs>();
+
+      auto ttype = n->args[0]->type_as<TensorTypeNode>();
+      size_t n_dim = ttype->shape.size();
+      Expr expanded_bias = ExpandBiasToMatchAxis(call->args[1], n_dim, {param->axis});
+      Expr ret = Add(call->args[0], expanded_bias);
+      ret->checked_type_ = n->checked_type_;
+      return ret;
+    }
+    return new_n;
+  }
+};
+
+Expr CanonicalizeOps(const Expr& e) {
+  return BiasAddSimplifier().Mutate(e);
+}
+
+TVM_REGISTER_API("relay._ir_pass.canonicalize_ops")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+*ret = CanonicalizeOps(args[0]);
+});
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/combine_parallel_conv2d.cc b/src/relay/pass/combine_parallel_conv2d.cc
new file mode 100644
index 000000000000..e346aea518e9
--- /dev/null
+++ b/src/relay/pass/combine_parallel_conv2d.cc
@@ -0,0 +1,333 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ *
+ * \file combine_parallel_conv2d.cc
+ * \brief Combine parallel 2d convolutions into a single convolution.
+ *
+ * This pass replaces convolutions that share the same input node and the same
+ * arguments (except that the number of output channels can be different) with a
+ * single convolution. The weight of the new 2d convolution is the concatenation
+ * of the original weights. Elemwise and broadcast ops following conv2d are also
+ * combined if possible.
+ *
+ * This prevents launching multiple kernels in networks with multiple
+ * convolution branches, such as Inception block.
+ */
+
+#include <tvm/relay/pass.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/attrs/nn.h>
+#include <tvm/relay/attrs/transform.h>
+#include <tvm/relay/op_attr_types.h>
+#include <unordered_map>
+#include <unordered_set>
+#include "./expr_subst.h"
+#include "./pattern_util.h"
+
+
+namespace tvm {
+namespace relay {
+
+using Branch = std::vector<const CallNode*>;
+using Group = std::vector<Branch>;
+
+/*
+  Find parallel branches starting with conv2d as shown below and then group branches by kernel
+  shape and attributes of conv2d. Conv2d can be followed by zero or more elemwise or broadcast ops.
+  Intermediate nodes have exactly one successor. It is possible that branches meet at a point,
+  which should be handled in ParallelConv2DCombiner.
+
+         data
+        /    \
+    conv2d   conv2d
+      |        |
+      op       op
+      |        |
+*/
+class BranchGroupFinder : private ExprVisitor {
+ public:
+  std::vector<Group> Find(const Expr& expr) {
+    static const Op& conv2d = Op::Get("nn.conv2d");
+
+    this->VisitExpr(expr);
+
+    std::vector<Group> groups;
+    for (const auto& root : conv_roots_) {
+      const auto& children = children_map_.at(root);
+      size_t ngroups = groups.size();
+      for (const CallNode* child : children) {
+        if (!child->op.same_as(conv2d)) continue;
+
+        auto&& branch = CreateBranch(child);
+        // add the branch to a group, or create a new group
+        auto it = std::find_if(groups.begin() + ngroups, groups.end(), [&](const Group& group) {
+          CHECK(!group.empty() && !group[0].empty());
+          return IsCompatibleConv2D(child, group[0][0]);
+        });
+        if (it != groups.end()) {
+          it->push_back(branch);
+        } else {
+          groups.emplace_back();
+          // each group has at least one branch
+          groups.back().push_back(branch);
+        }
+      }
+    }
+    return groups;
+  }
+
+ private:
+  std::unordered_set<Expr, NodeHash, NodeEqual> conv_roots_;
+  std::unordered_map<Expr, std::vector<const CallNode*>, NodeHash, NodeEqual> children_map_;
+
+  // Two 2d convolutions can be combined if they have the same attributes or
+  // only have different output channels.
+  bool IsCompatibleConv2D(const CallNode* a, const CallNode* b) {
+    AttrsEqual eq;
+    static const Layout kOIHW("OIHW");
+    const auto* attrs_a = a->attrs.as<Conv2DAttrs>();
+    const auto* attrs_b = b->attrs.as<Conv2DAttrs>();
+    CHECK(attrs_a);
+    CHECK(attrs_b);
+    const auto* tweight_a = a->args[1]->type_as<TensorTypeNode>();
+    const auto* tweight_b = b->args[1]->type_as<TensorTypeNode>();
+    const auto shape_a = ConvertLayout(tweight_a->shape, attrs_a->weight_layout, kOIHW);
+    const auto shape_b = ConvertLayout(tweight_b->shape, attrs_b->weight_layout, kOIHW);
+
+    return eq(attrs_a->strides, attrs_b->strides) && eq(attrs_a->padding, attrs_b->padding) &&
+           eq(attrs_a->dilation, attrs_b->dilation) && eq(attrs_a->groups, attrs_b->groups) &&
+           eq(attrs_a->data_layout, attrs_b->data_layout) &&
+           eq(attrs_a->weight_layout, attrs_b->weight_layout) &&
+           eq(attrs_a->out_dtype, attrs_b->out_dtype) &&
+           eq(attrs_a->out_layout, attrs_b->out_layout) && eq(shape_a[2], shape_b[2]) &&
+           eq(shape_a[3], shape_b[3]);
+  }
+
+  // Create a branch starting from conv2d.
+  Branch CreateBranch(const CallNode* conv) {
+    static auto fpattern = Op::GetAttr<TOpPattern>("TOpPattern");
+    // each branch has at least one element, the first element is always conv2d
+    Branch branch{conv};
+    auto it = children_map_.find(GetRef<Expr>(branch.back()));
+    while (it != children_map_.end() && it->second.size() == 1) {
+      const CallNode* call = it->second[0];
+      auto pattern = fpattern[Downcast<Op>(call->op)];
+      if (pattern <= kBroadcast) {
+        branch.push_back(call);
+        it = children_map_.find(GetRef<Expr>(branch.back()));
+      } else {
+        break;
+      }
+    }
+    return branch;
+  }
+
+  void VisitExpr_(const CallNode* n) final {
+    static const Op& conv2d = Op::Get("nn.conv2d");
+    ExprVisitor::VisitExpr_(n);
+    if (n->op.same_as(conv2d) && n->attrs.as<Conv2DAttrs>()->groups == 1) {
+      conv_roots_.insert(n->args[0]);
+      children_map_[n->args[0]].push_back(n);
+    } else {
+      for (size_t i = 0; i < n->args.size(); i++) {
+        children_map_[n->args[i]].push_back(n);
+      }
+    }
+  }
+};
+
+class ParallelConv2DCombiner {
+ public:
+  Expr Combine(const Expr& expr) {
+    auto groups = BranchGroupFinder().Find(expr);
+    for (const Group& group : groups) {
+      if (group.size() < 2) continue;
+      CombineBranches(group);
+    }
+    return ExprSubst(expr, std::move(subst_map_));
+  }
+
+ private:
+  std::unordered_map<Expr, Expr, NodeHash, NodeEqual> subst_map_;
+
+  std::tuple<Expr, IndexExpr> TransformWeight(const Group& branches) {
+    int64_t num_filters = 0;  // number of filters of the transformed weight
+    Array<Expr> weights;
+    for (const auto& branch : branches) {
+      auto conv2d = branch[0];
+      weights.push_back(conv2d->args[1]);
+      auto channels = GetConv2DSuperChannelsDim(conv2d);
+      num_filters += channels;
+    }
+    auto index = branches[0][0]->attrs.as<Conv2DAttrs>()->weight_layout.find('O');
+    CHECK_NE(index, std::string::npos);
+    return std::make_tuple(MakeConcatenate(TupleNode::make(weights), index),
+                           MakeConstScalar(Int(32), num_filters));
+  }
+
+  Call MakeCombinedConv2D(const Group& branches) {
+    static const Op& conv2d = Op::Get("nn.conv2d");
+    Expr data = branches[0][0]->args[0];
+    Expr new_weight;
+    IndexExpr new_channels;
+    std::tie(new_weight, new_channels) = TransformWeight(branches);
+
+    const CallNode* group_root = branches[0][0];
+    const auto* attrs = group_root->attrs.as<Conv2DAttrs>();
+    CHECK(attrs);
+    const auto new_attrs = make_node<Conv2DAttrs>();
+    new_attrs->strides = attrs->strides;
+    new_attrs->padding = attrs->padding;
+    new_attrs->dilation = attrs->dilation;
+    new_attrs->groups = attrs->groups;
+    new_attrs->kernel_size = attrs->kernel_size;
+    new_attrs->data_layout = attrs->data_layout;
+    new_attrs->weight_layout = attrs->weight_layout;
+    new_attrs->out_layout = attrs->out_layout;
+    new_attrs->out_dtype = attrs->out_dtype;
+    new_attrs->channels = new_channels;
+
+    return CallNode::make(conv2d, {data, new_weight}, Attrs{new_attrs}, {});
+  }
+
+  bool IsArgCompatible(const CallNode* a, const CallNode* b, size_t index, size_t channel_pos) {
+    AttrsEqual eq;
+    auto ta = a->args[index]->type_as<TensorTypeNode>();
+    auto tb = b->args[index]->type_as<TensorTypeNode>();
+    auto toutput_a = a->type_as<TensorTypeNode>();
+    auto toutput_b = b->type_as<TensorTypeNode>();
+
+    if (!eq(ta->dtype, tb->dtype) || ta->shape.size() != tb->shape.size())
+      return false;
+
+    // Position of the 'C' dimension in the argument
+    size_t arg_channel_pos = channel_pos - toutput_a->shape.size() + ta->shape.size();
+
+    // Channel super-dimension shoule be present and not broadcasted
+    if ((arg_channel_pos > channel_pos) ||  // size_t overflow
+        !eq(ta->shape[arg_channel_pos], toutput_a->shape[channel_pos]) ||
+        !eq(tb->shape[arg_channel_pos], toutput_b->shape[channel_pos]))
+      return false;
+
+    for (size_t i = 0; i < ta->shape.size(); i++) {
+      if (i == arg_channel_pos) continue;
+      if (!eq(ta->shape[i], tb->shape[i]))
+        return false;
+    }
+    return true;
+  }
+
+  // Check if ops in depth-th level can be combined
+  bool CheckLevel(const Group& branches, size_t depth, size_t channel_pos, size_t parent_index) {
+    const CallNode* call = branches[0][depth];
+    AttrsEqual attrs_equal;
+    // check if all branches in current depth can be combined
+    for (auto it = branches.begin() + 1; it != branches.end(); it++) {
+      const Branch& branch = *it;
+      if (!branch[depth]->op.same_as(call->op) ||
+          !attrs_equal(branch[depth]->attrs, call->attrs) ||
+          branch[depth]->args.size() != call->args.size()) {
+        return false;
+      }
+
+      if (branch[depth]->args[parent_index].get() != branch[depth - 1])
+        return false;
+
+      // Check args
+      for (size_t i = 0; i < call->args.size(); i++) {
+        if (i == parent_index) continue;
+
+        if (!IsArgCompatible(call, branch[depth], i, channel_pos) ||
+            !attrs_equal(call->attrs, branch[depth]->attrs)) {
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
+  // Combine args and make the combined CallNode
+  Call MakeCombinedCall(const Expr& data, const Group& branches, size_t depth, size_t channel_pos,
+                        size_t parent_index) {
+    Array<Expr> new_args;
+    const CallNode* call = branches[0][depth];
+    size_t ndim = call->type_as<TensorTypeNode>()->shape.size();
+
+    for (size_t i = 0; i < call->args.size(); i++) {
+      if (i == parent_index) {
+        new_args.push_back(data);
+        continue;
+      }
+      size_t arg_ndim = call->args[i]->type_as<TensorTypeNode>()->shape.size();
+      size_t arg_channel_pos = channel_pos - ndim + arg_ndim;
+      Array<Expr> tuple;
+      for (const auto& branch : branches) {
+        tuple.push_back(branch[depth]->args[i]);
+      }
+      auto concat = MakeConcatenate(TupleNode::make(tuple), arg_channel_pos);
+      new_args.push_back(std::move(concat));
+    }
+    return CallNode::make(call->op, new_args, call->attrs, {});
+  }
+
+  // Replace output of each branch with slices of the combined output
+  void UpdateGroupOutput(const Expr& data, const Group& branches, size_t depth,
+                         size_t channel_pos) {
+    int64_t index = 0;
+    for (const auto& branch : branches) {
+      const CallNode* conv2d = branch[0];
+      int64_t channels = GetConv2DSuperChannelsDim(conv2d);
+      Array<Integer> begin;
+      Array<Integer> end;
+      for (size_t i = 0; i < channel_pos; i++) {
+        begin.push_back(0);
+        end.push_back(NullValue<Integer>());
+      }
+      begin.push_back(index);
+      index += channels;
+      end.push_back(index);
+      auto slice = MakeStridedSlice(data, std::move(begin), std::move(end), Array<Integer>{});
+      subst_map_[GetRef<Expr>(branch[depth])] = slice;
+    }
+  }
+
+  // Combine branches in a group. Conv2d in different branches in the same group are safe to
+  // combine. Subsequent ops may or may not be combined. We start from conv2d and try to
+  // combine ops from all branches in the same depth.
+  void CombineBranches(const Group& branches) {
+    Call combined = MakeCombinedConv2D(branches);
+    auto conv_param = combined->attrs.as<Conv2DAttrs>();
+    const std::string& layout =
+        conv_param->out_layout == "" ? conv_param->data_layout : conv_param->out_layout;
+    size_t channel_pos = layout.find('C');
+    CHECK_NE(channel_pos, std::string::npos);
+    auto it = std::min_element(branches.begin(), branches.end(),
+                               [](const Branch& branch_a,
+                                  const Branch& branch_b) {
+                                    return branch_a.size() < branch_b.size();
+                                  });
+    size_t depth = it->size();
+    size_t i;
+    // starting from 1 to skip the conv2d
+    for (i = 1; i < depth; i++) {
+      size_t parent_index;
+      for (parent_index = 0; parent_index < branches[0][i]->args.size(); parent_index++) {
+        if (branches[0][i]->args[parent_index].get() == branches[0][i - 1]) break;
+      }
+      CHECK_NE(parent_index, branches[0][i]->args.size());
+      if (!CheckLevel(branches, i, channel_pos, parent_index)) break;
+      combined = MakeCombinedCall(combined, branches, i, channel_pos, parent_index);
+    }
+    UpdateGroupOutput(combined, branches, i - 1, channel_pos);
+  }
+};
+
+Expr CombineParallelConv2D(const Expr& expr) { return ParallelConv2DCombiner().Combine(expr); }
+
+TVM_REGISTER_API("relay._ir_pass.CombineParallelConv2D")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+  *ret = CombineParallelConv2D(args[0]);
+});
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/dead_code.cc b/src/relay/pass/dead_code.cc
new file mode 100644
index 000000000000..0d2677e11c67
--- /dev/null
+++ b/src/relay/pass/dead_code.cc
@@ -0,0 +1,118 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ *
+ * \file dead_code.cc
+ *
+ * \brief Remove code that does not effect the program result.
+ *
+ * The algorithm is implemented by two visitor:
+ * CalcDep turn an expr into a dependency graph of expr,
+ * GenLet turn the dependency graph into a let list, taking only the used value.
+ */
+#include <tvm/relay/pass.h>
+#include <tvm/relay/expr_functor.h>
+#include "let_list.h"
+
+namespace tvm {
+namespace relay {
+
+bool IsBoolLit(const Expr& e, bool b) {
+  if (const ConstantNode* c = e.as<ConstantNode>()) {
+    if (c->is_scalar()) {
+      auto dt = c->tensor_type()->dtype;
+      if (dt == Bool()) {
+        return *reinterpret_cast<const uint8_t*>(c->data->data) == b;
+      } else if (dt == UInt(8)) {
+        return *reinterpret_cast<const uint8_t*>(c->data->data) == b;
+      } else if (dt == UInt(16)) {
+        return *reinterpret_cast<const uint16_t*>(c->data->data) == b;
+      } else if (dt == UInt(32)) {
+        return *reinterpret_cast<const uint32_t*>(c->data->data) == b;
+      } else if (dt == UInt(64)) {
+        return *reinterpret_cast<const uint64_t*>(c->data->data) == b;
+      } else if (dt == Int(8)) {
+        return *reinterpret_cast<const int8_t*>(c->data->data) == b;
+      } else if (dt == Int(16)) {
+        return *reinterpret_cast<const int16_t*>(c->data->data) == b;
+      } else if (dt == Int(32)) {
+        return *reinterpret_cast<const int32_t*>(c->data->data) == b;
+      } else if (dt == Int(64)) {
+        return *reinterpret_cast<const int64_t*>(c->data->data) == b;
+      }
+    }
+  }
+  return false;
+}
+
+// calculate the dependency graph from expression
+class CalcDep : private ExprMutator {
+ public:
+  static Expr Eliminate(const Expr& e) {
+    CalcDep cd;
+    auto res = cd(e);
+    GenLet gl(cd.var_map_);
+    gl(res);
+    return gl.lets_.Get(res);
+  }
+
+ private:
+  using VarMap = std::unordered_map<Var, Expr, NodeHash, NodeEqual>;
+  VarMap var_map_;
+
+  Expr VisitExpr_(const IfNode* i) final {
+    auto cond = VisitExpr(i->cond);
+    if (IsBoolLit(cond, true)) {
+      return Eliminate(i->true_branch);
+    } else if (IsBoolLit(cond, false)) {
+      return Eliminate(i->false_branch);
+    } else {
+      return IfNode::make(cond, Eliminate(i->true_branch), Eliminate(i->false_branch));
+    }
+  }
+
+  Expr VisitExpr_(const LetNode* l) final {
+    var_map_[l->var] = Eliminate(l->value);
+    return VisitExpr(l->body);
+  }
+
+  Expr VisitExpr_(const FunctionNode* f) final {
+    return FunctionNode::make(f->params,
+                              Eliminate(f->body),
+                              f->ret_type,
+                              f->type_params);
+  }
+
+  // generate the let list from dependency graph
+  class GenLet : private ExprVisitor {
+   private:
+    LetList lets_;
+    VarMap var_map_;
+    explicit GenLet(const VarMap& var_map) : var_map_(var_map) { }
+    friend CalcDep;
+
+    void VisitExpr_(const VarNode* vnode) final {
+      Var v = GetRef<Var>(vnode);
+      auto it = var_map_.find(v);
+      if (it != var_map_.end()) {
+        Expr expr = it->second;
+        var_map_.erase(it);
+        // erase before visit to handle letrec
+        VisitExpr(expr);
+        // visit before push back so the dependency of dependency is before the dependency
+        lets_.Push(v, expr);
+      }
+    }
+  };
+};
+
+Expr DeadCodeElimination(const Expr& e) {
+  return CalcDep::Eliminate(e);
+}
+
+TVM_REGISTER_API("relay._ir_pass.dead_code_elimination")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = DeadCodeElimination(args[0]);
+  });
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/expr_subst.cc b/src/relay/pass/expr_subst.cc
new file mode 100644
index 000000000000..67dc0d2f7049
--- /dev/null
+++ b/src/relay/pass/expr_subst.cc
@@ -0,0 +1,35 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file expr_subst.h
+ * \brief Utility functions for substituting expressions.
+ */
+
+#include <tvm/relay/expr_functor.h>
+#include "./expr_subst.h"
+
+namespace tvm {
+namespace relay {
+
+class ExprSubstituter : public ExprMutator {
+ public:
+  explicit ExprSubstituter(std::unordered_map<Expr, Expr, NodeHash, NodeEqual> subst_map)
+      : subst_map_(subst_map) {}
+
+  Expr VisitExpr(const Expr& expr) final {
+    auto it = subst_map_.find(expr);
+    if (it != subst_map_.end()) {
+      return ExprMutator::VisitExpr((*it).second);
+    }
+    return ExprMutator::VisitExpr(expr);
+  }
+
+ private:
+  tvm::Map<Expr, Expr> subst_map_;
+};
+
+Expr ExprSubst(const Expr& expr, std::unordered_map<Expr, Expr, NodeHash, NodeEqual> subst_map) {
+  return ExprSubstituter(std::move(subst_map)).Mutate(expr);
+}
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/expr_subst.h b/src/relay/pass/expr_subst.h
new file mode 100644
index 000000000000..67892b3a0af7
--- /dev/null
+++ b/src/relay/pass/expr_subst.h
@@ -0,0 +1,18 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file expr_subst.h
+ * \brief Utility functions for substituting expressions.
+ */
+#ifndef TVM_RELAY_PASS_EXPR_SUBST_H_
+#define TVM_RELAY_PASS_EXPR_SUBST_H_
+#include <tvm/relay/expr.h>
+#include <unordered_map>
+
+namespace tvm {
+namespace relay {
+
+Expr ExprSubst(const Expr& expr, std::unordered_map<Expr, Expr, NodeHash, NodeEqual> subst_map);
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_PASS_EXPR_SUBST_H_
diff --git a/src/relay/pass/fold_constant.cc b/src/relay/pass/fold_constant.cc
new file mode 100644
index 000000000000..60994cdd6ca9
--- /dev/null
+++ b/src/relay/pass/fold_constant.cc
@@ -0,0 +1,157 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file constant_folding.cc
+ */
+#include <tvm/relay/pass.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/op_attr_types.h>
+#include <tvm/relay/interpreter.h>
+
+namespace tvm {
+namespace relay {
+
+using FInterpreter = runtime::TypedPackedFunc<Value(Expr)>;
+
+
+class ConstantChecker : private ExprVisitor {
+ public:
+  // Check whether an expression is constant. The results are memorized.
+  bool Check(const Expr& expr) {
+    if (expr.as<ConstantNode>()) {
+      return true;
+    }
+    const auto it = memo_.find(expr);
+    if (it != memo_.end())
+      return it->second;
+    VisitExpr(expr);
+    return memo_[expr];  // return memorized result or the default value false
+  }
+
+ private:
+  std::unordered_map<Expr, bool, NodeHash, NodeEqual> memo_;
+
+  void VisitExpr_(const TupleNode* n) final {
+    bool result = true;
+    for (const auto& field : n->fields) {
+      if (!Check(field)) {
+        result = false;
+        break;
+      }
+    }
+    memo_[GetRef<Tuple>(n)] = result;
+  }
+};
+
+
+// TODO(tvm-team) consider combine dead-code with constant folder.
+// or make a more powerful partial evaluator.
+class ConstantFolder : public ExprMutator {
+ public:
+  explicit ConstantFolder(FInterpreter executor)
+      : executor_(executor) {
+  }
+
+  Expr VisitExpr_(const LetNode* op) final {
+    Expr value = this->Mutate(op->value);
+    if (value.as<ConstantNode>()) {
+      memo_[op->var] = value;
+      return this->Mutate(op->body);
+    } else {
+      Var var = Downcast<Var>(this->Mutate(op->var));
+      Expr body = this->Mutate(op->body);
+      if (var.same_as(op->var) &&
+          value.same_as(op->value) &&
+          body.same_as(op->body)) {
+        return GetRef<Expr>(op);
+      } else {
+        return LetNode::make(var, value, body);
+      }
+    }
+  }
+
+  Expr VisitExpr_(const CallNode* call) final {
+    static auto op_stateful = Op::GetAttr<TOpIsStateful>("TOpIsStateful");
+    Expr res = ExprMutator::VisitExpr_(call);
+    call = res.as<CallNode>();
+    // We don't constant fold function with zero arguments.
+    // This is a heuristic that is useful.
+    // For example it is harmful to fold ones(shape=(4, 5)).
+    if (call->args.size() == 0) return res;
+    const OpNode* op = call->op.as<OpNode>();
+    if (op == nullptr) return res;
+    // skip stateful ops.
+    if (op_stateful.get(GetRef<Op>(op), false)) return res;
+    bool all_const_args = true;
+    for (Expr arg : call->args) {
+      if (!checker_.Check(arg)) {
+        all_const_args = false;
+      }
+    }
+    if (all_const_args) {
+      return ConstEvaluate(res);
+    } else {
+      return res;
+    }
+  }
+
+  Expr VisitExpr_(const TupleGetItemNode* op) final {
+    Expr res = ExprMutator::VisitExpr_(op);
+    op = res.as<TupleGetItemNode>();
+    if (const auto* tuple = op->tuple.as<TupleNode>()) {
+      return tuple->fields[op->index];
+    } else {
+      return res;
+    }
+  }
+
+ private:
+  // Internal interepreter.
+  FInterpreter executor_;
+  // Internal constant checker
+  ConstantChecker checker_;
+
+  // Convert value to expression.
+  Expr ValueToExpr(Value value) {
+    if (const auto* val = value.as<TensorValueNode>()) {
+      return ConstantNode::make(val->data);
+    } else if (const auto* val = value.as<TupleValueNode>()) {
+      Array<Expr> fields;
+      for (Value field : val->fields) {
+        fields.push_back(ValueToExpr(field));
+      }
+      return TupleNode::make(fields);
+    } else {
+      LOG(FATAL) << "Cannot handle " << value->type_key();
+      return Expr();
+    }
+  }
+  // Constant evaluate a expression.
+  Expr ConstEvaluate(Expr expr) {
+    expr = InferType(expr, Module(nullptr));
+    expr = FuseOps(expr, 0);
+    expr = InferType(expr, Module(nullptr));
+    return ValueToExpr(executor_(expr));
+  }
+};
+
+
+Expr FoldConstant(const Expr& expr) {
+  DLContext ctx;
+  ctx.device_type = kDLCPU;
+  ctx.device_id = 0;
+  Target target = Target::create("llvm");
+  // use a fresh build context
+  // in case we are already in a build context.
+  BuildConfigContext fresh_build_ctx(build_config());
+
+  return ConstantFolder(CreateInterpreter(
+      Module(nullptr), ctx, target)).Mutate(expr);
+}
+
+TVM_REGISTER_API("relay._ir_pass.FoldConstant")
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    *ret = FoldConstant(args[0]);
+});
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/fold_scale_axis.cc b/src/relay/pass/fold_scale_axis.cc
new file mode 100644
index 000000000000..760a226a2fac
--- /dev/null
+++ b/src/relay/pass/fold_scale_axis.cc
@@ -0,0 +1,885 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ *
+ * \file fold_scale_axis.cc
+ *
+ * \brief Fold axis scaling into weights of
+ *  conv/dense operators.
+ */
+#include <tvm/relay/pass.h>
+#include <tvm/relay/attrs/nn.h>
+#include <tvm/relay/expr_functor.h>
+#include "pattern_util.h"
+#include "pass_util.h"
+#include "../op/layout.h"
+
+
+namespace tvm {
+namespace relay {
+/*!
+ * \brief namespace of fold scale axis
+ *
+ * Use namespace to reduce potential naming conflict.
+ */
+namespace fold_scale_axis {
+
+using runtime::TypedPackedFunc;
+
+
+// FoldScaleAxis algorithm:
+//
+// The general idea is to transform Expr to tuple of
+// (value, axes, scale), where the final result satisfies:
+//
+// result = value
+// for i, k in enumerate(axes):
+//    k-th dimension of result *= i-th dimension of scale
+//
+// Then we can propagate this signal along and fold the scale if necessary.
+// However, it is possible that certain scale may never be consumed
+// if there is no dense/conv2d that follows multiplication.
+//
+// In order to make sure all the scale we sent out can be consumed eventually,
+// we run a backward "preparation phase", which propagates the demand
+// of the potential axes scaling back to its input.
+//
+// Forward folding process is done in two steps:
+// - Prepare phase: backward propagation of demand.
+// - Transform phase: forward transformation,
+//
+// Similarly, backward folding process is done in two steps:
+// - Prepare phase: forward propagation of demand.
+// - Transform phase: transformation by push down the axes scale signal to inputs.
+//
+
+/*!
+ * \brief sorted array axis, can also be nullptr.
+ *
+ *  nullptr means no scaling request can be done.
+ */
+using AxesSet = Array<Integer>;
+
+/*!
+ * \brief Merge two axis set together by taking
+ *  intersection.
+ *
+ * \note The axes in a AxesSet should be sorted.
+ *
+ * \param lhs The left axis.
+ * \param rhs The right axis.
+ * \return The result of the inersection.
+ */
+AxesSet Intersect(const AxesSet& lhs, const AxesSet& rhs) {
+  if (!lhs.defined()) return lhs;
+  if (!rhs.defined()) return rhs;
+  // This code relies on axes in a AxesSet to be sorted.
+  AxesSet ret;
+  size_t i = 0, j = 0;
+  while (i < lhs.size() && j < rhs.size()) {
+    if (lhs[i]->value < rhs[j]->value) {
+      ++i;
+    } else if (lhs[i]->value > rhs[j]->value) {
+      ++j;
+    } else {
+      ret.push_back(lhs[i]);
+      ++i; ++j;
+    }
+  }
+  return ret;
+}
+
+/*!
+ * \brief Preparation function for pass scale forward.
+ * \param call The call node.
+ * \param out_scale_axes Possible scaling on axes of the output.
+ * \return The result scaling on axes of the input.
+ */
+using FForwardPrep = runtime::TypedPackedFunc<
+  Array<AxesSet> (const Call& call, const AxesSet& out_scale_axes)>;
+
+/*! \brief Axis scale tuple.  */
+class ScaledExprNode : public TempExprNode {
+ public:
+  /*! \brief The value */
+  Expr value;
+  /*! \brief The axes to scale, can be nullptr(means no-scaling) */
+  AxesSet axes = NullValue<AxesSet>();
+  /*! \brief The scaling factor */
+  Expr scale = NullValue<Expr>();
+
+  Expr Realize() const final {
+    CHECK(!axes.defined())
+        << "outstanding scale";
+    return value;
+  }
+
+  void VisitAttrs(AttrVisitor* v) final {
+    v->Visit("value", &value);
+    v->Visit("axes", &axes);
+    v->Visit("scale", &scale);
+  }
+
+  static constexpr const char* _type_key = "relay.fold_scale_axis.ScaledExpr";
+  TVM_DECLARE_NODE_TYPE_INFO(ScaledExprNode, TempExprNode);
+};
+
+using FForwardRewrite = TypedPackedFunc<
+  Expr(const Call& ref_call,
+       const Array<Expr>& new_args,
+       const AxesSet& expeced_out_axes)>;
+
+//----------------------------------------------
+// Generic Visitors for FScaleAxisForward
+//----------------------------------------------
+class ForwardPrep : private ExprVisitor {
+ public:
+  std::unordered_map<const Node*, AxesSet>
+  Prepare(const Expr& body) {
+    this->Update(body, NullValue<AxesSet>());
+    this->VisitExpr(body);
+    // flist is added in the Post-DFS order
+    // which is a special case of topological order.
+    // We reversely traverse the list to invoke the lazy functions.
+    // This act like a backprop of valid scale axis messages
+    for (auto it = flist_.rbegin(); it != flist_.rend(); ++it) {
+      (*it)();
+    }
+    // return the created message;
+    return std::move(message_);
+  }
+
+ private:
+  // The invoke list
+  std::vector<std::function<void()> > flist_;
+  // The message on each node.
+  std::unordered_map<const Node*, AxesSet> message_;
+  // Update the message stored at node.
+  void Update(const Expr& node, const AxesSet& axes) {
+    // We run intersection of messages:
+    //
+    // %y = multiply(%x, %scale)
+    // %z1 = conv2d(%y, %w)
+    // %z2 = exp(%y)
+    //
+    // Consider the above code example,
+    // because %z2 will propagate null to %y,
+    // the AxesSet on %y is also null,
+    // and the forward folding won't be triggered.
+    const Node* key = node.get();
+    if (message_.count(key)) {
+      message_[key] = Intersect(message_[key], axes);
+    } else {
+      message_[key] = axes;
+    }
+  }
+  // Visitor pattern override.
+  void VisitExpr_(const LetNode* call) {
+    LOG(FATAL) << "FoldScaleAxis only accept dataflow-form";
+  }
+
+  void VisitExpr_(const FunctionNode* op) {
+    ExprVisitor::VisitExpr_(op);
+    auto flazy = [this, op] {
+      this->Update(op->body, NullValue<AxesSet>());
+    };
+    flist_.push_back(flazy);
+  }
+
+  void VisitExpr_(const CallNode* call) {
+    ExprVisitor::VisitExpr_(call);
+    // function to be lazily invoked
+    auto flazy = [this, call]() {
+      static const auto& fprep =
+        Op::GetAttr<FForwardPrep>("FScaleAxisForwardPrep");
+      // find the message send to this node.
+      auto it = message_.find(call);
+      AxesSet out_axes;
+      if (it != message_.end()) {
+        out_axes = it->second;
+      } else {
+        out_axes = NullValue<AxesSet>();
+      }
+      // pass the message back to all the children it references.
+      auto f = fprep.get(call->op, nullptr);
+      if (f != nullptr) {
+        Array<AxesSet> in_axes = f(GetRef<Call>(call), out_axes);
+        CHECK_EQ(in_axes.size(), call->args.size());
+        for (size_t i = 0; i < call->args.size(); ++i) {
+          this->Update(call->args[i], in_axes[i]);
+        }
+      } else {
+        for (size_t i = 0; i < call->args.size(); ++i) {
+          this->Update(call->args[i], NullValue<AxesSet>());
+        }
+      }
+    };
+    flist_.push_back(flazy);
+  }
+
+  void VisitExpr_(const TupleNode* op) {
+    ExprVisitor::VisitExpr_(op);
+    // do not support pass scale through tuple for now.
+    auto flazy = [this, op]() {
+      for (const Expr& field : op->fields) {
+        this->Update(field, NullValue<AxesSet>());
+      }
+    };
+    flist_.push_back(flazy);
+  }
+
+  void VisitExpr_(const IfNode* op) {
+    ExprVisitor::VisitExpr_(op);
+    // do pass through condition
+    // by assigning NullValue<AxesSet>
+    // it means fuse signal cannot pass
+    // through into these subexpressions.
+    auto flazy = [this, op]() {
+      this->Update(op->cond, NullValue<AxesSet>());
+      this->Update(op->true_branch, NullValue<AxesSet>());
+      this->Update(op->false_branch, NullValue<AxesSet>());
+    };
+    flist_.push_back(flazy);
+  }
+};
+
+//----------------------------------------------
+// Per operator defs for FScaleAxisForward
+//----------------------------------------------
+
+// Intermediate operators
+Array<AxesSet> ReluForwardPrep(const Call& call, AxesSet out) {
+  return {out};
+}
+
+Expr ReluForwardRewrite(const Call& ref_call,
+                        const Array<Expr>& new_args,
+                        const AxesSet& expected_axes) {
+  const auto* input = new_args[0].as<ScaledExprNode>();
+  if (input == nullptr) return Expr(nullptr);
+  // return transformed conv2d
+  auto rnode = make_node<ScaledExprNode>();
+  rnode->value = CallNode::make(
+      ref_call->op, {input->value}, ref_call->attrs, ref_call->type_args);
+  rnode->scale = input->scale;
+  rnode->axes = input->axes;
+  return Expr(rnode);
+}
+
+RELAY_REGISTER_OP("nn.relu")
+.set_attr<FForwardPrep>("FScaleAxisForwardPrep", ReluForwardPrep);
+
+RELAY_REGISTER_OP("nn.relu")
+.set_attr<FForwardRewrite>("FScaleAxisForwardRewrite", ReluForwardRewrite);
+
+RELAY_REGISTER_OP("nn.leaky_relu")
+.set_attr<FForwardPrep>("FScaleAxisForwardPrep", ReluForwardPrep);
+
+RELAY_REGISTER_OP("nn.leaky_relu")
+.set_attr<FForwardRewrite>("FScaleAxisForwardRewrite", ReluForwardRewrite);
+
+// AddSub
+Array<AxesSet> AddSubForwardPrep(const Call& call, AxesSet out_axes) {
+  const auto* tlhs = call->args[0]->type_as<TensorTypeNode>();
+  const auto* trhs = call->args[1]->type_as<TensorTypeNode>();
+
+  auto none = NullValue<AxesSet>();
+  if (MatchBroadcastToLeftAxes(tlhs, trhs, out_axes)) {
+    return {out_axes, none};
+  } else if (MatchBroadcastToLeftAxes(trhs, tlhs, out_axes)) {
+    return {none, out_axes};
+  } else {
+    return {none, none};
+  }
+}
+
+Expr AddSubForwardRewrite(const Call& ref_call,
+                          const Array<Expr>& new_args,
+                          const AxesSet& expected_out_axes) {
+  const auto* slhs = new_args[0].as<ScaledExprNode>();
+  const auto* srhs = new_args[1].as<ScaledExprNode>();
+  if (!slhs && !srhs) return Expr();
+  const auto* tlhs = ref_call->args[0]->type_as<TensorTypeNode>();
+  const auto* trhs = ref_call->args[1]->type_as<TensorTypeNode>();
+  auto rnode = make_node<ScaledExprNode>();
+
+  if (slhs != nullptr) {
+    CHECK(srhs == nullptr);
+    CHECK(MatchBroadcastToLeftAxes(tlhs, trhs, slhs->axes));
+    Expr scale = ExpandBiasToMatchAxis(
+        slhs->scale, tlhs->shape.size(), slhs->axes);
+    Expr rhs = Divide(new_args[1], scale);
+    rnode->value = CallNode::make(ref_call->op, {slhs->value, rhs},
+                                  ref_call->attrs, ref_call->type_args);
+    rnode->scale = slhs->scale;
+    rnode->axes = slhs->axes;
+  } else {
+    CHECK(slhs != nullptr);
+    CHECK(MatchBroadcastToLeftAxes(trhs, tlhs, srhs->axes));
+    Expr scale = ExpandBiasToMatchAxis(
+        srhs->scale, trhs->shape.size(), srhs->axes);
+    Expr lhs = Divide(new_args[0], scale);
+    rnode->value = CallNode::make(ref_call->op, {lhs, srhs->value},
+                                  ref_call->attrs, ref_call->type_args);
+    rnode->scale = srhs->scale;
+    rnode->axes = srhs->axes;
+  }
+  return Expr(rnode);
+}
+
+RELAY_REGISTER_OP("add")
+.set_attr<FForwardPrep>("FScaleAxisForwardPrep", AddSubForwardPrep);
+
+RELAY_REGISTER_OP("add")
+.set_attr<FForwardRewrite>("FScaleAxisForwardRewrite", AddSubForwardRewrite);
+
+RELAY_REGISTER_OP("subtract")
+.set_attr<FForwardPrep>("FScaleAxisForwardPrep", AddSubForwardPrep);
+
+RELAY_REGISTER_OP("subtract")
+.set_attr<FForwardRewrite>("FScaleAxisForwardRewrite", AddSubForwardRewrite);
+
+// Producer operators
+// Multiply produces the scale-axis pair.
+Expr MultiplyForwardRewrite(const Call& ref_call,
+                            const Array<Expr>& new_args,
+                            const AxesSet& expected_out_axes) {
+  if (!expected_out_axes.defined()) return Expr();
+  if (expected_out_axes.size() == 0) return Expr();
+  // TODO(tvm-team) allow same axes accumulation
+  // not as important because it is less common in nn.
+  const auto* slhs = new_args[0].as<ScaledExprNode>();
+  const auto* srhs = new_args[1].as<ScaledExprNode>();
+  CHECK(!slhs && !srhs);
+
+  const auto* tlhs = ref_call->args[0]->type_as<TensorTypeNode>();
+  const auto* trhs = ref_call->args[1]->type_as<TensorTypeNode>();
+  Expr lhs = new_args[0];
+  Expr rhs = new_args[1];
+  auto rnode = make_node<ScaledExprNode>();
+  if (MatchBroadcastToLeftAxes(tlhs, trhs, expected_out_axes, &rhs) &&
+      IsAllPositiveConstant(rhs)) {
+    rnode->value = lhs;
+    rnode->scale = rhs;
+    rnode->axes = expected_out_axes;
+    return Expr(rnode);
+  } else if (MatchBroadcastToLeftAxes(trhs, tlhs, expected_out_axes, &lhs) &&
+             IsAllPositiveConstant(lhs)) {
+    rnode->value = rhs;
+    rnode->scale = lhs;
+    rnode->axes = expected_out_axes;
+    return Expr(rnode);
+  } else {
+    return Expr();
+  }
+}
+
+RELAY_REGISTER_OP("multiply")
+.set_attr<FForwardRewrite>("FScaleAxisForwardRewrite", MultiplyForwardRewrite);
+
+// Consumer operators
+// Conv2D send out requirement of axis folding.
+Array<AxesSet> Conv2DForwardPrep(const Call& call, AxesSet out) {
+  // TODO(tvm-team) support general data layout
+  // by transforming weight
+  const auto* param = call->attrs.as<Conv2DAttrs>();
+  CHECK(param != nullptr);
+  Layout data_layout(param->data_layout);
+  Layout weight_layout(param->weight_layout);
+  int c_big_axis = data_layout.Indexof('C');
+  int c_small_axis = data_layout.Indexof('c');
+
+  CHECK_GE(c_big_axis, 0);
+  AxesSet data_axes = NullValue<AxesSet>();
+  // For now, we only support simple pattern (no folded weight/data)
+  // More general layout can be supported under the current framework.
+  // By using a unified layout transformation.
+  // We only need to change the Prep and Mutate function.
+  //
+  // only handle depthwise or full conv2d.
+  // TODO(tvm-team) handle grouped conv by reshape + bcast
+  bool is_depthwise_conv2d = IsDepthwiseConv2D(call, param, weight_layout);
+  if (weight_layout.Indexof('i') < 0 &&
+      c_small_axis < 0 &&
+      (param->groups == 1 || is_depthwise_conv2d)) {
+    data_axes = {c_big_axis};
+  }
+  return {data_axes, NullValue<AxesSet>()};
+}
+
+// Conv2D consumes the scale axis during transformation.
+Expr Conv2DForwardRewrite(const Call& ref_call,
+                          const Array<Expr>& new_args,
+                          const AxesSet& expected_axes) {
+  // if data do not have scale, normal transform path.
+  const auto* sdata = new_args[0].as<ScaledExprNode>();
+  const auto* sweight = new_args[1].as<ScaledExprNode>();
+  if (sdata == nullptr) return Expr();
+  if (sweight != nullptr) return Expr();
+  const auto* param = ref_call->attrs.as<Conv2DAttrs>();
+  CHECK(param != nullptr);
+  Layout data_layout(param->data_layout);
+  Layout weight_layout(param->weight_layout);
+  int c_big_axis = data_layout.Indexof('C');
+  CHECK_GE(c_big_axis, 0);
+  // For now, we only support simple pattern (no folded weight/data)
+  // TODO(tvm-team) support general data layout
+  CHECK_EQ(weight_layout.Indexof('i'), -1);
+  CHECK(sdata->axes.size() == 1 &&
+        c_big_axis == sdata->axes[0]->value);
+  int big_oc_axis = weight_layout.Indexof('O');
+  int big_ic_axis = weight_layout.Indexof('I');
+
+  // Check it must be depthwise or full conv2d.
+  bool is_depthwise_conv2d = IsDepthwiseConv2D(ref_call, param, weight_layout);
+  CHECK(param->groups == 1 || is_depthwise_conv2d);
+
+  Expr weight = new_args[1];
+
+  // match the ic_axis
+  if (is_depthwise_conv2d) {
+    Expr scale = ExpandBiasToMatchAxis(
+        sdata->scale, weight_layout.ndim(), {big_oc_axis});
+    weight = Multiply(weight, scale);
+  } else {
+    Expr scale = ExpandBiasToMatchAxis(
+        sdata->scale, weight_layout.ndim(), {big_ic_axis});
+    weight = Multiply(weight, scale);
+  }
+  // return transformed conv2d
+  return CallNode::make(
+      ref_call->op, {sdata->value, weight}, ref_call->attrs, ref_call->type_args);
+}
+
+RELAY_REGISTER_OP("nn.conv2d")
+.set_attr<FForwardPrep>("FScaleAxisForwardPrep", Conv2DForwardPrep);
+
+RELAY_REGISTER_OP("nn.conv2d")
+.set_attr<FForwardRewrite>("FScaleAxisForwardRewrite", Conv2DForwardRewrite);
+
+
+Expr ForwardFoldScaleAxis(Expr data) {
+  auto expected_scale_axes =
+      ForwardPrep().Prepare(data);
+  auto fcontext = [&](const Call& call) -> NodeRef{
+    auto it = expected_scale_axes.find(call.get());
+    if (it != expected_scale_axes.end()) {
+      return it->second;
+    } else {
+      return NodeRef(nullptr);
+    }
+  };
+  return ForwardRewrite(
+      data, "FScaleAxisForwardRewrite", fcontext);
+}
+
+// Expose the FoldScaleAxisFoward
+TVM_REGISTER_API("relay._ir_pass.forward_fold_scale_axis")
+.set_body_typed<Expr(Expr)>(ForwardFoldScaleAxis);
+
+//----------------------------------------
+// Implement backward transformations.
+//----------------------------------------
+class BackwardTransformer;
+
+/*!
+ * \brief Preparation function for for pass scale backward.
+ * \param call The call node.
+ * \param in_scale_axes Allowed input scaling.
+ * \return The result scaling on axes of the input.
+ */
+using FBackwardPrep = TypedPackedFunc<
+  AxesSet(const Call& call, const Array<AxesSet>& in_scale_axes)>;
+
+using FBackwardTransform = TypedPackedFunc<
+  Expr(const Call& call,
+       const AxesSet& axes,
+       const Expr& scale,
+       const BackwardTransformer& transformer)>;
+
+//----------------------------------------------
+// Generic Visitors for FScaleAxisBackward
+//----------------------------------------------
+
+class BackwardPrep : private ExprVisitor {
+ public:
+  // The message on each node.
+  std::unordered_map<const Node*, AxesSet>
+  Prepare(const Expr& body) {
+    ref_counter_ = GetExprRefCount(body);
+    this->VisitExpr(body);
+    return std::move(message_);
+  }
+
+ private:
+  // The message on each node.
+  std::unordered_map<const Node*, AxesSet> message_;
+  // reference counter of an internal expr
+  std::unordered_map<const Node*, size_t> ref_counter_;
+  // Visit the expression.
+  void VisitExpr_(const CallNode* call) {
+    ExprVisitor::VisitExpr_(call);
+    static const auto& fprep =
+        Op::GetAttr<FBackwardPrep>("FScaleAxisBackwardPrep");
+    auto f = fprep.get(call->op, nullptr);
+    if (f == nullptr) return;
+    auto rit = ref_counter_.find(call);
+    CHECK(rit != ref_counter_.end());
+    // We only allow propagation of scale backward
+    // if the expression is only referred by a single parent.
+    if (rit->second != 1) return;
+    Array<AxesSet> in_axes;
+    for (Expr arg : call->args) {
+      auto it = message_.find(arg.get());
+      if (it != message_.end()) {
+        in_axes.push_back(it->second);
+      } else {
+        in_axes.push_back(NullValue<AxesSet>());
+      }
+    }
+    AxesSet out_axes = f(GetRef<Call>(call), in_axes);
+    if (out_axes.defined()) {
+      message_[call] = out_axes;
+    }
+  }
+};
+
+class BackwardTransformerNode :
+      public Node,
+      private ExprMutator {
+ public:
+  // Run forward transform.
+  Expr Fold(Expr expr) {
+    expected_scale_axes_ = BackwardPrep().Prepare(expr);
+    return this->Mutate(expr);
+  }
+  /*!
+   * \brief Transform the expr to consider the scaling.
+   *
+   * \param expr The input expression.
+   * \param axes The axes to scale.
+   * \param scale The scale applied to the axes.
+   * \return The result of transformation.
+   */
+  Expr Transform(const Expr& expr, AxesSet axes, Expr scale) {
+    // NOTE: the result of Transform is memoized.
+    if (const CallNode* call_node = expr.as<CallNode>()) {
+      return Transform(call_node, axes, scale);
+    } else {
+      CHECK(!axes.defined()) << "outstanding scale";
+      return ExprMutator::VisitExpr(expr);
+    }
+  }
+  /*!
+   * \brief Normal way of mutating call node.
+   * \param call_node The call node to be mutated.
+   * \return the result of the call Mutation.
+   */
+  Expr NormalCallTransform(const CallNode* call_node) {
+    const Call call = GetRef<Call>(call_node);
+    const auto it = memo_.find(call);
+    if (it != memo_.end()) {
+      return it->second;
+    }
+    Expr new_expr = ExprMutator::VisitExpr_(call_node);
+    memo_[call] = new_expr;
+    return new_expr;
+  }
+  /*!
+   * \brief Get the expected axes on expr.
+   * \param expr The expresison.
+   * \return The expected axes.
+   */
+  AxesSet GetExpectedAxes(const Expr& expr) const {
+    auto it = expected_scale_axes_.find(expr.get());
+    if (it != expected_scale_axes_.end()) return it->second;
+    return NullValue<AxesSet>();
+  }
+
+  // solver is not serializable.
+  void VisitAttrs(tvm::AttrVisitor* v) final {}
+
+  static constexpr const char* _type_key = "relay.fold_scale_axis.FBackwardTransformer";
+  TVM_DECLARE_NODE_TYPE_INFO(BackwardTransformerNode, Node);
+
+ private:
+  // Valid axes on each node.
+  std::unordered_map<const Node*, AxesSet> expected_scale_axes_;
+  // Override mutation of call.
+  Expr VisitExpr_(const CallNode* call_node) final {
+    return Transform(call_node, NullValue<AxesSet>(), NullValue<Expr>());
+  }
+  // Transform of CallNode.
+  Expr Transform(const CallNode* call_node, AxesSet axes, Expr scale);
+};
+
+class BackwardTransformer : public NodeRef {
+ public:
+  BackwardTransformer() {}
+  explicit BackwardTransformer(
+      ::tvm::NodePtr<::tvm::Node> n) : NodeRef(n) {
+  }
+  BackwardTransformerNode* operator->() const {
+    return static_cast<BackwardTransformerNode*>(node_.get());
+  }
+  using ContainerType = BackwardTransformerNode;
+};
+
+Expr BackwardTransformerNode::Transform(
+    const CallNode* call_node, AxesSet axes, Expr scale) {
+  static const auto& ftransform =
+      Op::GetAttr<FBackwardTransform>("FScaleAxisBackwardTransform");
+  auto f = ftransform.get(call_node->op, nullptr);
+  if (f != nullptr) {
+    const Call call = GetRef<Call>(call_node);
+    const auto it = memo_.find(call);
+    if (it != memo_.end()) {
+      return it->second;
+    }
+    Expr new_expr = f(GetRef<Call>(call_node),
+                      axes,
+                      scale,
+                      GetRef<BackwardTransformer>(this));
+    memo_[call] = new_expr;
+    return new_expr;
+  } else {
+    CHECK(!axes.defined()) << "outstanding scale";
+    return NormalCallTransform(call_node);
+  }
+}
+
+
+//----------------------------------------------
+// Per operator defs for FScaleAxisForward
+//----------------------------------------------
+
+// Intermediate operators
+AxesSet ReluBackwardPrep(const Call& call, const Array<AxesSet>& in_axes) {
+  return in_axes[0];
+}
+
+Expr ReluBackwardTransform(const Call& call,
+                           const AxesSet& axes,
+                           const Expr& scale,
+                           const BackwardTransformer& transformer) {
+  if (!axes.defined()) {
+    return transformer->NormalCallTransform(call.operator->());
+  }
+  Expr input = transformer->Transform(
+      call->args[0], axes, scale);
+  return CallNode::make(call->op, {input}, call->attrs, call->type_args);
+}
+
+RELAY_REGISTER_OP("nn.relu")
+.set_attr<FBackwardPrep>("FScaleAxisBackwardPrep", ReluBackwardPrep);
+
+RELAY_REGISTER_OP("nn.relu")
+.set_attr<FBackwardTransform>("FScaleAxisBackwardTransform", ReluBackwardTransform);
+
+RELAY_REGISTER_OP("nn.leaky_relu")
+.set_attr<FBackwardPrep>("FScaleAxisBackwardPrep", ReluBackwardPrep);
+
+RELAY_REGISTER_OP("nn.leaky_relu")
+.set_attr<FBackwardTransform>("FScaleAxisBackwardTransform", ReluBackwardTransform);
+
+// AddSub
+AxesSet AddSubBackwardPrep(const Call& call, const Array<AxesSet>& in_axes) {
+  const auto* tlhs = call->args[0]->type_as<TensorTypeNode>();
+  const auto* trhs = call->args[1]->type_as<TensorTypeNode>();
+  AttrsEqual equal;
+  if (in_axes[0].defined() &&
+      MatchBroadcastToLeftAxes(tlhs, trhs, in_axes[0])) {
+    return in_axes[0];
+  } else if (in_axes[1].defined() &&
+             MatchBroadcastToLeftAxes(trhs, tlhs, in_axes[1])) {
+    return in_axes[1];
+  } else if (in_axes[0].defined() &&
+             in_axes[1].defined() &&
+             equal(in_axes[0], in_axes[1]) &&
+             equal(tlhs->shape, trhs->shape)) {
+    // add of two elements.
+    return in_axes[0];
+  } else {
+    auto res = NullValue<AxesSet>();
+    CHECK(!res.defined());
+    return res;
+  }
+}
+
+Expr AddSubBackwardTransform(const Call& call,
+                             const AxesSet& axes,
+                             const Expr& scale,
+                             const BackwardTransformer& transformer) {
+  const auto* tlhs = call->args[0]->type_as<TensorTypeNode>();
+  const auto* trhs = call->args[1]->type_as<TensorTypeNode>();
+  if (!axes.defined()) {
+    return transformer->NormalCallTransform(call.operator->());
+  }
+  AxesSet lhs_axes = transformer->GetExpectedAxes(call->args[0]);
+  AxesSet rhs_axes = transformer->GetExpectedAxes(call->args[1]);
+  AttrsEqual equal;
+
+  if (lhs_axes.defined() && rhs_axes.defined()) {
+    CHECK(equal(lhs_axes, rhs_axes));
+    CHECK(equal(axes, lhs_axes));
+    Expr lhs = transformer->Transform(call->args[0], axes, scale);
+    Expr rhs = transformer->Transform(call->args[1], axes, scale);
+    return CallNode::make(call->op, {lhs, rhs}, call->attrs, call->type_args);
+  } else if (lhs_axes.defined()) {
+    CHECK(equal(axes, lhs_axes));
+    Expr lhs = transformer->Transform(call->args[0], axes, scale);
+    Expr rhs = transformer->Transform(
+        call->args[1], NullValue<AxesSet>(), NullValue<Expr>());
+    Expr rhs_scale = ExpandBiasToMatchAxis(
+        scale, tlhs->shape.size(), axes);
+    rhs = Multiply(rhs, rhs_scale);
+    return CallNode::make(call->op, {lhs, rhs}, call->attrs, call->type_args);
+  } else if (rhs_axes.defined()) {
+    CHECK(equal(axes, rhs_axes));
+    Expr lhs = transformer->Transform(
+        call->args[0], NullValue<AxesSet>(), NullValue<Expr>());
+    Expr rhs = transformer->Transform(call->args[1], axes, scale);
+    Expr lhs_scale = ExpandBiasToMatchAxis(
+        scale, trhs->shape.size(), axes);
+    lhs = Multiply(lhs, lhs_scale);
+    return CallNode::make(call->op, {lhs, rhs}, call->attrs, call->type_args);
+  } else {
+    LOG(FATAL) << "outstanding scale";
+    return Expr();
+  }
+}
+
+RELAY_REGISTER_OP("add")
+.set_attr<FBackwardPrep>("FScaleAxisBackwardPrep", AddSubBackwardPrep);
+
+RELAY_REGISTER_OP("add")
+.set_attr<FBackwardTransform>("FScaleAxisBackwardTransform", AddSubBackwardTransform);
+
+RELAY_REGISTER_OP("subtract")
+.set_attr<FBackwardPrep>("FScaleAxisBackwardPrep", AddSubBackwardPrep);
+
+RELAY_REGISTER_OP("subtract")
+.set_attr<FBackwardTransform>("FScaleAxisBackwardTransform", AddSubBackwardTransform);
+
+// Producer operators
+// Multiply produces the scale-axis pair.
+Expr MultiplyBackwardTransform(const Call& call,
+                               const AxesSet& axes,
+                               const Expr& scale,
+                               const BackwardTransformer& transformer) {
+  CHECK(!axes.defined()) << "outstanding scale";
+  const auto* tlhs = call->args[0]->type_as<TensorTypeNode>();
+  const auto* trhs = call->args[1]->type_as<TensorTypeNode>();
+  AxesSet lhs_axes = transformer->GetExpectedAxes(call->args[0]);
+  AxesSet rhs_axes = transformer->GetExpectedAxes(call->args[1]);
+  if (lhs_axes.defined() && lhs_axes.size() != 0) {
+    // NOTE we won't recursively call mutating on scale part.
+    // since there  won't be scale chance within scale part.
+    Expr rhs = call->args[1];
+    // Only propagate positive scaling.
+    if (MatchBroadcastToLeftAxes(tlhs, trhs, lhs_axes, &rhs) &&
+        IsAllPositiveConstant(rhs)) {
+      return transformer->Transform(call->args[0], lhs_axes, rhs);
+    }
+  } else if (rhs_axes.defined() && rhs_axes.size() != 0) {
+    // Only propagate positive scaling.
+    Expr lhs = call->args[0];
+    if (MatchBroadcastToLeftAxes(trhs, tlhs, rhs_axes, &lhs) &&
+        IsAllPositiveConstant(lhs)) {
+      return transformer->Transform(call->args[1], rhs_axes, lhs);
+    }
+  }
+  return transformer->NormalCallTransform(call.operator->());
+}
+
+RELAY_REGISTER_OP("multiply")
+.set_attr<FBackwardTransform>("FScaleAxisBackwardTransform", MultiplyBackwardTransform);
+
+// Consumer operators
+// Conv2D send out requirement of axis folding.
+AxesSet Conv2DBackwardPrep(const Call& call, const Array<AxesSet>& in_axes) {
+  const auto* param = call->attrs.as<Conv2DAttrs>();
+  CHECK(param != nullptr);
+  Layout out_layout(param->out_layout);
+  if (!out_layout.defined()) {
+    out_layout = Layout(param->data_layout);
+  }
+  Layout weight_layout(param->weight_layout);
+  int c_big_axis = out_layout.Indexof('C');
+  int c_small_axis = out_layout.Indexof('c');
+
+  CHECK_GE(c_big_axis, 0);
+  // For now, we only support simple pattern (no folded weight/data)
+  // More general layout can be supported under the current framework.
+  // By using a unified layout transformation.
+  // We only need to change the Prep and Mutate function.
+  //
+  // only handle depthwise or full conv2d.
+  // TODO(tvm-team) handle grouped conv by reshape + bcast
+  bool is_depthwise_conv2d = IsDepthwiseConv2D(call, param, weight_layout);
+  if (weight_layout.Indexof('o') < 0 &&
+      weight_layout.Indexof('i') < 0 &&
+      c_small_axis < 0 &&
+      (param->groups == 1 || is_depthwise_conv2d)) {
+    return {c_big_axis};
+  } else {
+    return NullValue<AxesSet>();
+  }
+}
+
+// Conv2D consumes the scale axis during transformation.
+Expr Conv2DBackwardTransform(const Call& call,
+                             const AxesSet& axes,
+                             const Expr& scale,
+                             const BackwardTransformer& transformer) {
+  if (!axes.defined()) {
+    return transformer->NormalCallTransform(call.operator->());
+  }
+  const auto* param = call->attrs.as<Conv2DAttrs>();
+  CHECK(param != nullptr);
+  Layout out_layout(param->out_layout);
+  if (!out_layout.defined()) {
+    out_layout = Layout(param->data_layout);
+  }
+  Layout weight_layout(param->weight_layout);
+  int c_big_axis = out_layout.Indexof('C');
+  CHECK_GE(c_big_axis, 0);
+  // For now, we only support simple pattern (no folded weight/data)
+  // TODO(tvm-team) support general data layout
+  CHECK_EQ(weight_layout.Indexof('o'), -1);
+  CHECK_EQ(weight_layout.Indexof('i'), -1);
+  CHECK(axes.size() == 1 &&
+        c_big_axis == axes[0]->value);
+
+  int big_oc_axis = weight_layout.Indexof('O');
+  // Check it must be depthwise or full conv2d.
+  bool is_depthwise_conv2d = IsDepthwiseConv2D(call, param, weight_layout);
+  CHECK(param->groups == 1 || is_depthwise_conv2d);
+
+  Expr data = transformer->Transform(
+      call->args[0], NullValue<AxesSet>(), NullValue<Expr>());
+  Expr weight = transformer->Transform(
+      call->args[1], NullValue<AxesSet>(), NullValue<Expr>());
+  // scale on input for deptwise.
+  Expr wscale = ExpandBiasToMatchAxis(
+      scale, weight_layout.ndim(), {big_oc_axis});
+  weight = Multiply(weight, wscale);
+  return CallNode::make(
+      call->op, {data, weight}, call->attrs, call->type_args);
+}
+
+RELAY_REGISTER_OP("nn.conv2d")
+.set_attr<FBackwardPrep>("FScaleAxisBackwardPrep", Conv2DBackwardPrep);
+
+RELAY_REGISTER_OP("nn.conv2d")
+.set_attr<FBackwardTransform>("FScaleAxisBackwardTransform", Conv2DBackwardTransform);
+
+Expr BackwardFoldScaleAxis(Expr data) {
+  return make_node<BackwardTransformerNode>()->Fold(data);
+}
+
+TVM_REGISTER_API("relay._ir_pass.backward_fold_scale_axis")
+.set_body_typed<Expr(Expr)>(BackwardFoldScaleAxis);
+
+}  // namespace fold_scale_axis
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/forward_rewrite.cc b/src/relay/pass/forward_rewrite.cc
new file mode 100644
index 000000000000..4f33d4a053b7
--- /dev/null
+++ b/src/relay/pass/forward_rewrite.cc
@@ -0,0 +1,192 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ *
+ * \file forward_rewrite.cc
+ * \brief Apply rewriting rules in a forward fashion.
+ */
+#include <tvm/relay/pass.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/op_attr_types.h>
+#include "pass_util.h"
+
+namespace tvm {
+namespace relay {
+
+// Realizer class that realizes the expression
+// Note that we can take benefit of its internal memo
+// so that calling realize repeatively won't hurt perf.
+class TempRealizer : private ExprMutator {
+ public:
+  Expr Realize(Expr expr) {
+    return VisitExpr(expr);
+  }
+
+ private:
+  Expr VisitExpr(const Expr& expr) final {
+    auto it = memo_.find(expr);
+    if (it != memo_.end()) {
+      return it->second;
+    } else {
+      Expr res;
+      if (const auto* temp = expr.as_derived<TempExprNode>()) {
+        res = temp->Realize();
+
+      } else {
+        res = ExprFunctor::VisitExpr(expr);
+      }
+      memo_[res] = res;
+      return res;
+    }
+  }
+};
+
+class ForwardRewriter : private ExprMutator {
+ public:
+  ForwardRewriter(const OpMap<FForwardRewrite>* rewrite_map,
+                  std::function<NodeRef(const Call&)> fcontext,
+                  std::function<Expr(const Expr&)> fmulti_ref_trigger)
+      : rewrite_map_(rewrite_map),
+        fcontext_(fcontext),
+        fmulti_ref_trigger_(fmulti_ref_trigger) {}
+
+  ForwardRewriter(const FForwardRewrite* rewrite_func,
+                  std::function<NodeRef(const Call&)> fcontext,
+                  std::function<Expr(const Expr&)> fmulti_ref_trigger)
+      : rewrite_func_(rewrite_func),
+        fcontext_(fcontext),
+        fmulti_ref_trigger_(fmulti_ref_trigger) {}
+
+
+  // Transform expression.
+  Expr Rewrite(Expr expr) {
+    if (fmulti_ref_trigger_ != nullptr) {
+      ref_counter_ = GetExprRefCount(expr);
+    }
+    return this->VisitExpr(expr);
+  }
+
+ private:
+  // The rewrite rule.
+  const OpMap<FForwardRewrite>* rewrite_map_{nullptr};
+  const FForwardRewrite* rewrite_func_{nullptr};
+  // The context.const
+  std::function<NodeRef(const Call&)> fcontext_{nullptr};
+  // The multiple reference trigger
+  std::function<Expr(const Expr&)> fmulti_ref_trigger_{nullptr};
+  // Internal ref counter
+  std::unordered_map<const Node*, size_t> ref_counter_;
+  // internal realizer
+  TempRealizer realizer_;
+
+  Expr VisitExpr(const Expr& expr) final {
+    // by default always realize.
+    return realizer_.Realize(ExprMutator::VisitExpr(expr));
+  }
+
+  // Visit and allow non-realized version.
+  Expr GetTempExpr(const Expr& expr)  {
+    if (fmulti_ref_trigger_ != nullptr) {
+      Expr ret = ExprMutator::VisitExpr(expr);
+      auto it = ref_counter_.find(expr.get());
+      CHECK(it != ref_counter_.end());
+      if (it->second > 1) {
+        ret = fmulti_ref_trigger_(ret);
+      }
+      return ret;
+    } else {
+      return ExprMutator::VisitExpr(expr);
+    }
+  }
+
+  // Automatic fold TupleGetItem.
+  Expr VisitExpr_(const TupleGetItemNode* op) final {
+    Expr tuple = this->GetTempExpr(op->tuple);
+    if (const auto* ptuple = tuple.as<TupleNode>()) {
+      return ptuple->fields[op->index];
+    } else {
+      if (tuple.same_as(op->tuple)) {
+        return GetRef<Expr>(op);
+      } else {
+        return TupleGetItemNode::make(tuple, op->index);
+      }
+    }
+  }
+
+  Expr VisitExpr_(const TupleNode* op) final {
+    tvm::Array<Expr> fields;
+    bool all_fields_unchanged = true;
+    for (auto field : op->fields) {
+      auto new_field = this->GetTempExpr(field);
+      fields.push_back(new_field);
+      all_fields_unchanged &= new_field.same_as(field);
+    }
+
+    if (all_fields_unchanged) {
+      return GetRef<Expr>(op);
+    } else {
+      return TupleNode::make(fields);
+    }
+  }
+
+  Expr VisitExpr_(const CallNode* call_node) final {
+    const Call& ref_call = GetRef<Call>(call_node);
+    PackedFunc frewrite;
+    if (rewrite_func_) {
+      frewrite = *rewrite_func_;
+    } else {
+      CHECK(rewrite_map_);
+      frewrite = rewrite_map_->get(call_node->op, nullptr);
+    }
+
+    auto new_op = this->Mutate(call_node->op);
+    bool unchanged = call_node->op.same_as(new_op);
+
+    Array<Expr> call_args;
+    for (auto arg : call_node->args) {
+      Expr new_arg = this->GetTempExpr(arg);
+      if (frewrite == nullptr) {
+        new_arg = realizer_.Realize(new_arg);
+      }
+      unchanged &= new_arg.same_as(arg);
+      call_args.push_back(new_arg);
+    }
+    // try to rewrite.
+    if (frewrite != nullptr) {
+      Expr res = frewrite(
+          ref_call, call_args,
+          fcontext_ != nullptr ? fcontext_(ref_call) : NodeRef(nullptr));
+      if (res.defined()) return res;
+      // abort, use old rule
+      for (size_t i = 0; i < call_args.size(); ++i) {
+        Expr arg = call_args[i];
+        Expr new_arg = realizer_.Realize(arg);
+        if (!arg.same_as(new_arg)) {
+          call_args.Set(i, new_arg);
+          unchanged = false;
+        }
+      }
+    }
+    if (unchanged) return ref_call;
+    return CallNode::make(
+        new_op, call_args, call_node->attrs, call_node->type_args);
+  }
+};
+
+Expr ForwardRewrite(const Expr& expr,
+                    const std::string& rewrite_map_name,
+                    std::function<NodeRef(const Call&)> fcontext,
+                    std::function<Expr(const Expr&)> fmulti_ref_trigger) {
+  auto rewrite_map = Op::GetAttr<FForwardRewrite>(rewrite_map_name);
+  return ForwardRewriter(&rewrite_map, fcontext, fmulti_ref_trigger).Rewrite(expr);
+}
+
+Expr ForwardRewrite(const Expr& expr,
+                    const FForwardRewrite& rewrite_func,
+                    std::function<NodeRef(const Call&)> fcontext,
+                    std::function<Expr(const Expr&)> fmulti_ref_trigger) {
+  return ForwardRewriter(&rewrite_func, fcontext, fmulti_ref_trigger).Rewrite(expr);
+}
+
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc
new file mode 100644
index 000000000000..b2b35c51a1ca
--- /dev/null
+++ b/src/relay/pass/fuse_ops.cc
@@ -0,0 +1,806 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ *
+ * \file src/tvm/relay/pass/fuse_ops.cc
+ *
+ * \brief This is a backend-aware optimization pass.
+ *   Fuse necessary ops into a single one.
+ */
+#include <tvm/ir_operator.h>
+#include <tvm/relay/pass.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/op_attr_types.h>
+#include "../../common/arena.h"
+
+
+namespace tvm {
+namespace relay {
+
+/*
+  Note on Fusing algorithm:
+
+  The main challenge of genenral fusor is to handle possible diamond shape branches,
+  in the following graph, conv2d can be fused to elemwise add.
+
+            conv2d
+            /  |  \
+           /   |   \
+         op    op   op
+          \    |    /
+           \   |   /
+          elemwise add
+               |
+
+  However, at the point of conv2d we do not necessarily know that all its future path
+  will merge at the elemwise add. The new fusor algorithm applies post-dominator analysis.
+  The immediate post-dominator of a node defined by the closest node where all the future path goes into.
+  In the above case, the elemwise add is the post-dominator of conv2d. The general algorithm is as follows:
+
+  - Construct a DAG of dataflow graph for dominator analysis
+  - Construct a post-dominator tree which gives immediate post dominator of each node.
+  - Run fusion algorithm with the given post-dominator information.
+
+  Note that, because we run analysis on a DAG, we use a single pass post-dominator
+  tree construction algorithm via LCA, which is simpler than the full version that handles cycles.
+
+  The fusion algorithm traverses from each node and checks if it can be fused to its
+  immediate post dominator. It has to check the following things:
+
+  - CheckPath: check all the path between a node and its immediate post-dominator
+               satiesfies the fuse condition.
+  - Note that these intermediate node can already be fused with another nodes, the algorithm
+      will still run correctly.
+  - CommitFuse: mark all the nodes between source and post-dominator as the same group.
+  - We use an Union-Find data structure to manage the groups.
+*/
+using common::LinkNode;
+using common::LinkedList;
+
+/*!
+ * \brief Indexed data flow graph in forward direction.
+ *  This is a temporary data structure used for operator fusion analysis.
+ *
+ *  This data structure only captures the dataflow fragement and
+ *  could ignore blocks like let by simply ordering each dataflow block
+ *  and mark the output node as extern_ref;
+ */
+class IndexedForwardGraph {
+ public:
+  struct Node;
+  /*!
+   * The forward edge in the dataflow graph.
+   */
+  struct Edge {
+    /*! \brief The corresponding node */
+    Node* node{nullptr};
+    /*! \brief The respective pattern of this op */
+    OpPatternKind pattern{kOpaque};
+  };
+  /*! \brief A node in the graph. */
+  struct Node {
+    /*! \brief weak reference to the corresponding edge. */
+    const tvm::Node* ref{nullptr};
+    /*! \brief The index of the node in topological order. */
+    size_t index{0};
+    /*! \brief Whether this node is referenced by external source */
+    bool extern_ref{false};
+    /*! \brief The general pattern in the node */
+    OpPatternKind pattern{kOpaque};
+    /*! \brief The outputs of the node. */
+    LinkedList<Edge> outputs;
+  };
+  /*! \brief The node map that maps node to graph */
+  std::unordered_map<const tvm::Node*, Node*> node_map;
+  /*! \brief All the nodes in post DFS order */
+  std::vector<Node*> post_dfs_order;
+
+  /*! \brief Dump the graph into string. */
+  void DebugDump() {
+    std::ostringstream os;
+    for (size_t i = 0; i < post_dfs_order.size(); ++i) {
+      Node* node = post_dfs_order[i];
+      os << "node[" << i << "], "
+         << GetRef<NodeRef>(node->ref)
+         << " outputs=[";
+      for (auto* link = node->outputs.head; link != nullptr; link = link->next) {
+        os << link->value.node->index << ", ";
+      }
+      os << "]\n";
+    }
+    LOG(INFO) << os.str();
+  }
+  /*!
+   * \brief create a indexed forward graph.
+   * \param arena The arena used for data allocation.
+   * \param body The body of the expression to create a graph.
+   */
+  static IndexedForwardGraph Create(common::Arena* arena, const Expr& body);
+
+ private:
+  class Creator;
+};
+
+// Creator of post dominator tree of the dataflow
+class IndexedForwardGraph::Creator : private ExprVisitor {
+ public:
+  explicit Creator(common::Arena* arena)
+      : arena_(arena) {}
+
+  IndexedForwardGraph Prepare(const Expr& body) {
+    this->Update(body, nullptr, kOpaque);
+    this->VisitExpr(body);
+    return std::move(graph_);
+  }
+
+ private:
+  /*! \brief allocator of all the internal node object */
+  common::Arena* arena_;
+  // The output.
+  IndexedForwardGraph graph_;
+  // attribute equal comparator
+  AttrsEqual attr_equal_;
+  // Update the message stored at the node.
+  void Update(const Expr& node,
+              IndexedForwardGraph::Node* parent,
+              OpPatternKind pattern) {
+    const tvm::Node* key = node.get();
+    IndexedForwardGraph::Node* current;
+    auto it = graph_.node_map.find(key);
+    if (it != graph_.node_map.end()) {
+      current = it->second;
+    } else {
+      current = arena_->make<IndexedForwardGraph::Node>();
+      graph_.node_map[key] = current;
+    }
+    if (parent != nullptr) {
+      auto* link = arena_->make<LinkNode<IndexedForwardGraph::Edge> >();
+      link->value.node = parent;
+      link->value.pattern = pattern;
+      current->outputs.Push(link);
+    } else {
+      current->extern_ref = true;
+    }
+  }
+  void AddNode(const tvm::Node* key) {
+    auto it = graph_.node_map.find(key);
+    CHECK(it != graph_.node_map.end())
+        << "Cannot find node " << GetRef<NodeRef>(key);
+    IndexedForwardGraph::Node* node = it->second;
+    CHECK(node->ref == nullptr);
+    node->ref = key;
+    node->index = graph_.post_dfs_order.size();
+    graph_.post_dfs_order.push_back(node);
+  }
+
+  // Post order tree
+  void VisitExpr_(const FunctionNode* op) {
+    for (auto param : op->params) {
+      this->Update(param, nullptr, kOpaque);
+    }
+    this->Update(op->body, nullptr, kOpaque);
+    ExprVisitor::VisitExpr_(op);
+  }
+
+  void VisitExpr_(const ConstantNode* op) {
+    this->AddNode(op);
+    Node* node = graph_.node_map.at(op);
+    DataType dtype = TVMType2Type(op->data->dtype);
+    // This rule must be consistent with code generator.
+    bool is_simple_const = (
+        dtype == Int(32) ||
+        dtype == Int(64) ||
+        dtype == Float(32) ||
+        dtype == Float(64) ||
+        dtype == Bool());
+    if (op->is_scalar() && is_simple_const) {
+      node->pattern = kElemWise;
+    } else {
+      // for now, mark non-scalar constant
+      // as opaque, we will not choose to fuse it.
+      node->pattern = kOpaque;
+    }
+  }
+
+  void VisitExpr_(const CallNode* call) {
+    CHECK(graph_.node_map.count(call));
+    Node* node = graph_.node_map.at(call);
+    static auto fpattern =
+        Op::GetAttr<TOpPattern>("TOpPattern");
+    // setup pattern.
+    OpPatternKind op_pattern = kOpaque;
+    if (const OpNode* opnode = call->op.as<OpNode>()) {
+      op_pattern = static_cast<OpPatternKind>(fpattern[GetRef<Op>(opnode)]);
+    }
+    node->pattern = op_pattern;
+    const auto* rtype = call->checked_type().as<TensorTypeNode>();
+    // pass the message back to all the children it references.
+    for (size_t i = 0; i < call->args.size(); ++i) {
+      const auto* arg_type =
+          call->args[i]->checked_type().as<TensorTypeNode>();
+      // specifically check if result type
+      OpPatternKind edge_pattern = op_pattern;
+      if (edge_pattern == kBroadcast &&
+          arg_type != nullptr &&
+          rtype != nullptr &&
+          attr_equal_(rtype->shape, arg_type->shape)) {
+        edge_pattern = kElemWise;
+      }
+      this->Update(call->args[i], node, edge_pattern);
+    }
+    ExprVisitor::VisitExpr_(call);
+    this->AddNode(call);
+  }
+
+  void VisitExpr_(const TupleNode* op) {
+    CHECK(graph_.node_map.count(op));
+    Node* tuple_node = graph_.node_map.at(op);
+    tuple_node->pattern = kInjective;
+    for (const Expr& field : op->fields) {
+      this->Update(field, tuple_node, kInjective);
+    }
+    ExprVisitor::VisitExpr_(op);
+    this->AddNode(op);
+  }
+
+  void VisitExpr_(const TupleGetItemNode* op) {
+    CHECK(graph_.node_map.count(op));
+    Node* node = graph_.node_map.at(op);
+    this->Update(op->tuple, node, kOpaque);
+    ExprVisitor::VisitExpr_(op);
+    this->AddNode(op);
+  }
+
+  void VisitExpr_(const VarNode* op) {
+    this->AddNode(op);
+  }
+
+  void VisitExpr_(const LetNode* op) {
+    // do not fuse through let.
+    this->Update(op->var, nullptr, kOpaque);
+    this->Update(op->value, nullptr, kOpaque);
+    this->Update(op->body, nullptr, kOpaque);
+    ExprVisitor::VisitExpr_(op);
+    this->AddNode(op);
+  }
+
+  void VisitExpr_(const IfNode* op) {
+    // do not fuse through if.
+    this->Update(op->cond, nullptr, kOpaque);
+    this->Update(op->true_branch, nullptr, kOpaque);
+    this->Update(op->false_branch, nullptr, kOpaque);
+    ExprVisitor::VisitExpr_(op);
+    this->AddNode(op);
+  }
+};
+
+IndexedForwardGraph IndexedForwardGraph::Create(
+    common::Arena* arena, const Expr& body) {
+  return Creator(arena).Prepare(body);
+}
+
+/*!
+ * \brief Dominator tree that represent domination or
+ *  post domination relation of the node.
+ */
+class DominatorTree {
+ public:
+  /*!
+   * \brief A node in the dominator tree.
+   */
+  struct Node {
+    /*! \brief The node in the tree */
+    IndexedForwardGraph::Node* gnode{nullptr};
+    /*! \brief parent of the tree */
+    Node* parent{nullptr};
+    /*! \brief current depth*/
+    int depth{0};
+    /*! \brief aggregated pattern to parent */
+    OpPatternKind pattern{kOpaque};
+  };
+  // index -> node.
+  std::vector<Node*> nodes;
+  /*!
+   * \brief compute a post dominator relation for a given dataflow graph.
+   * \param arena The arena used for node allocation.
+   * \param graph The graph to be analyze.
+   * \return The dominator tree of the graph.
+   * \note This algorithm makes use of the fact that graph is DAG,
+   *       and runs a single pass algorithm via LCA.
+   */
+  static DominatorTree PostDom(common::Arena* arena,
+                               const IndexedForwardGraph& graph);
+
+ private:
+  // Combine pattern together.
+  static OpPatternKind CombinePattern(
+      OpPatternKind lhs, OpPatternKind rhs) {
+    if (lhs > rhs) return lhs;
+    return rhs;
+  }
+  /*!
+   * \brief Find the least common acenstor of the two nodes.
+   * \param lhs The left node.
+   * \param rhs The right node.
+   * \param edge_pattern
+   *        The combined edge pattern across all the parents.
+   * \return The least common ancestor of thw two.
+   */
+  static Node* LeastCommonAncestor(
+      Node* lhs,
+      Node* rhs,
+      OpPatternKind* edge_pattern) {
+    while (lhs != rhs) {
+      if (lhs == nullptr) return nullptr;
+      if (rhs == nullptr) return nullptr;
+      if (lhs->depth < rhs->depth) {
+        edge_pattern[0] = CombinePattern(
+            edge_pattern[0], rhs->pattern);
+        rhs = rhs->parent;
+      } else if (rhs->depth < lhs->depth) {
+        edge_pattern[0] = CombinePattern(
+            edge_pattern[0], lhs->pattern);
+        lhs = lhs->parent;
+      } else {
+        edge_pattern[0] = CombinePattern(
+            edge_pattern[0], lhs->pattern);
+        edge_pattern[0] = CombinePattern(
+            edge_pattern[0], rhs->pattern);
+        lhs = lhs->parent;
+        rhs = rhs->parent;
+      }
+    }
+    return lhs;
+  }
+};
+
+DominatorTree DominatorTree::PostDom(common::Arena* arena,
+                                     const IndexedForwardGraph& graph) {
+  DominatorTree tree;
+  tree.nodes.resize(graph.post_dfs_order.size(), nullptr);
+  // reverse topo order
+  for (size_t i = graph.post_dfs_order.size(); i != 0; --i) {
+    size_t index = i - 1;
+    Node* tnode = arena->make<Node>();
+    auto* gnode = graph.post_dfs_order[index];
+    tnode->gnode = gnode;
+    if (gnode->extern_ref) {
+      tnode->depth = 1;
+      tnode->parent = nullptr;
+      tnode->pattern = kOpaque;
+    } else {
+      // find the LCAs of all outputs.
+      OpPatternKind pattern = kElemWise;
+      Node* parent = nullptr;
+      for (auto link = gnode->outputs.head; link != nullptr; link= link->next) {
+        size_t oindex = link->value.node->index;
+        CHECK_LT(oindex, tree.nodes.size());
+        Node* onode = tree.nodes[oindex];
+        CHECK(onode != nullptr);
+        if (parent != nullptr) {
+          parent = LeastCommonAncestor(parent, onode, &pattern);
+        } else {
+          parent = onode;
+        }
+        pattern = CombinePattern(pattern, link->value.pattern);
+      }
+      tnode->depth = parent ? parent->depth + 1 : 1;
+      tnode->parent = parent;
+      tnode->pattern = pattern;
+    }
+    tree.nodes[index] = tnode;
+  }
+  return tree;
+}
+
+/*!
+ * \brief A partition of the graph marked by union find data structure.
+ */
+class GraphPartitioner {
+ public:
+  explicit GraphPartitioner(common::Arena* arena, int opt_level)
+      : arena_(arena), opt_level_(opt_level) {}
+  /*!
+   * \brief Group as a union find data structure.
+   */
+  struct Group {
+    /*! \brief The parent in the union find data structure. */
+    Group* parent{nullptr};
+    /*! \brief The pattern of the group */
+    OpPatternKind pattern;
+    /*! \brief reference to the root node. */
+    const tvm::Node* root_ref{nullptr};
+    /*!
+     * \brief Reference to the master node,
+     * this field is not nullptr only if pattern is kOutEWiseFusable.
+     */
+    const tvm::Node* master_ref{nullptr};
+    /*!
+     * \brief Find the group root, perform path compression
+     * \return The root type node.
+     */
+    Group* FindRoot() {
+      // fast path
+      if (this->parent == nullptr) return this;
+      // slow path with path compression.
+      Group* root = this;
+      while (root->parent != nullptr) {
+        root = root->parent;
+      }
+      for (Group* p = this; p != root;) {
+        Group* parent = p->parent;
+        p->parent = root;
+        p = parent;
+      }
+      return root;
+    }
+  };
+  /*!
+   * \brief Partition a graph.
+   * \return group assignments of each node.
+   */
+  std::vector<Group*> Partition(const IndexedForwardGraph& graph);
+
+ private:
+  /*! \brief The internal arena for temporary space. */
+  common::Arena* arena_;
+  /*! \brief optimization level for fuse operation. */
+  int opt_level_;
+  /*! \brief The internal groups. */
+  std::vector<Group*> groups_;
+  /*! \brief internal field used for deduplication */
+  std::unordered_set<IndexedForwardGraph::Node*> visited_;
+  // Internal implelementation of CheckPath
+  template<typename F>
+  bool CheckPath_(IndexedForwardGraph::Node* src,
+                  IndexedForwardGraph::Node* sink,
+                  F fcond) {
+    if (visited_.count(src)) return true;
+    visited_.insert(src);
+    Group* gnode =  groups_[src->index];
+    CHECK(gnode != nullptr);
+    gnode = gnode->FindRoot();
+    if (!fcond(gnode->pattern, src == sink)) return false;
+    if (src == sink) return true;
+    for (auto link = src->outputs.head; link != nullptr; link = link->next) {
+      if (!CheckPath_(link->value.node, sink, fcond)) return false;
+    }
+    return true;
+  }
+  /*!
+   * \brief Check all the node and edge pattern
+   *  between src and sink satisfies fcond.
+   *
+   * src is not checked.
+   *
+   * \param src The source node.
+   * \param sink The termination node.
+   * \param fcond The condition to be checked.
+   * \tparam F the condition function, with signature
+   * \note sink must be a post-dominator of src.
+   */
+  template<typename F>
+  bool CheckPath(IndexedForwardGraph::Node* src,
+                 IndexedForwardGraph::Node* sink,
+                 F fcond) {
+    CHECK(!src->extern_ref);
+    visited_.clear();
+    CHECK(src != sink);
+    for (auto link = src->outputs.head; link != nullptr; link = link->next) {
+      if (!CheckPath_(link->value.node, sink, fcond)) return false;
+    }
+    return true;
+  }
+  // Combine two patterns together.
+  static OpPatternKind CombinePattern(
+      OpPatternKind lhs, OpPatternKind rhs) {
+    if (lhs > kBroadcast && rhs > kBroadcast) {
+      LOG(FATAL) << "Cannot merge two complex group together";
+    }
+    if (lhs > rhs) return lhs;
+    return rhs;
+  }
+  /*!
+   * \brief Merge the child group to the parent.
+   * \param child The child group.
+   * \param parent The parent group.
+   */
+  void MergeFromTo(Group* child, Group* parent) {
+    child = child->FindRoot();
+    parent = parent->FindRoot();
+    if (child == parent) return;
+    child->parent = parent;
+    // update master ref and pattern
+    if (child->master_ref != nullptr) {
+      CHECK(parent->master_ref == nullptr);
+      parent->master_ref = child->master_ref;
+      parent->pattern = CombinePattern(
+          child->pattern, parent->pattern);
+    }
+  }
+  // Internal implelementation of CommitFuse
+  void CommitFuse_(IndexedForwardGraph::Node* src,
+                   IndexedForwardGraph::Node* sink,
+                   Group* target) {
+    if (src == sink) return;
+    if (visited_.count(src)) return;
+    visited_.insert(src);
+    Group* gnode = groups_[src->index];
+    CHECK(gnode != nullptr);
+    // merge the current group to the parent if possible.
+    MergeFromTo(gnode, target);
+    for (auto link = src->outputs.head; link != nullptr; link = link->next) {
+      CommitFuse_(link->value.node, sink, target);;
+    }
+  }
+  /*!
+   * \brief Commit fusion operation.
+   * \param src The source node.
+   * \param sink The termination node.
+   * \note sink must be a post-dominator of src.
+   */
+  void CommitFuse(IndexedForwardGraph::Node* src,
+                  IndexedForwardGraph::Node* sink) {
+    Group* target = groups_[sink->index];
+    visited_.clear();
+    CHECK(src != sink);
+    CommitFuse_(src, sink, target);
+  }
+
+  // Initialize the groups.
+  void InitGroups(const IndexedForwardGraph& graph) {
+    groups_.resize(graph.post_dfs_order.size());
+    for (size_t nid = 0; nid < groups_.size(); ++nid) {
+      const auto* graph_node = graph.post_dfs_order[nid];
+      auto* group_node = arena_->make<Group>();
+      group_node->pattern = graph_node->pattern;
+      group_node->root_ref = graph_node->ref;
+      // set master ref if necessary.
+      if (group_node->pattern == kOutEWiseFusable) {
+        group_node->master_ref = graph_node->ref;
+      }
+      groups_[nid] = group_node;
+    }
+  }
+
+  // execute the fusion algorithm.
+  void RunFuse(const IndexedForwardGraph& graph,
+               const DominatorTree& post_dom_tree,
+               int phase) {
+    for (size_t nid = 0; nid < groups_.size(); ++nid) {
+      // the group of current node has been specified already.
+      auto* graph_node = graph.post_dfs_order[nid];
+      auto* dom_node = post_dom_tree.nodes[nid];
+      Group* group_node = groups_[nid];
+      CHECK(group_node != nullptr);
+      // no actions for opaque nodes
+      if (group_node->pattern == kOpaque) continue;
+      // no actions needed if the current node have no dominator
+      if (dom_node->parent == nullptr) continue;
+      CHECK(!graph_node->extern_ref);
+      // Skip if current node is already fused to the parent.
+      size_t dom_parent_gindex = dom_node->parent->gnode->index;
+      if (groups_[dom_parent_gindex] != nullptr &&
+          group_node->FindRoot() == groups_[dom_parent_gindex]->FindRoot()) {
+        continue;
+      }
+      // Try to fuse current node to its post-dominator.
+      if (group_node->pattern == kOutEWiseFusable) {
+        if (phase != 0) continue;
+        // Path for OutEWiseFusable: conv2d
+        // Check if the dominator relation is elemwise.
+        if (dom_node->parent != nullptr && dom_node->pattern == kElemWise) {
+          CHECK(dom_node->parent->gnode != nullptr);
+          // The fuse can be executed if all the intermediate ops are still broadcast.
+          auto fcond = [](OpPatternKind kind, bool is_sink) {
+            return kind <= kBroadcast;
+          };
+          if (CheckPath(graph_node, dom_node->parent->gnode, fcond)) {
+            CommitFuse(graph_node, dom_node->parent->gnode);
+          }
+        }
+      } else if (group_node->pattern <= kBroadcast) {
+        // Pre-condition: can only be fused to parent which is injective or reduction.
+        if (dom_node->parent != nullptr &&
+            (dom_node->pattern <= kInjective ||
+             dom_node->pattern == kCommReduce)) {
+          // Check if all the intermediate ops are still broadcast.
+          // The final terminal node can already be fused to a OutEWiseFusable group.
+          auto fcond = [](OpPatternKind kind, bool is_sink) {
+            if (!is_sink) {
+              return kind <= kBroadcast;
+            } else {
+              return (kind <= kBroadcast ||
+                      kind == kCommReduce ||
+                      kind == kOutEWiseFusable);
+            }
+          };
+          if (CheckPath(graph_node, dom_node->parent->gnode, fcond)) {
+            CommitFuse(graph_node, dom_node->parent->gnode);
+          }
+        }
+      } else if (group_node->pattern == kInjective) {
+        // defer injective fusion to second phase.
+        // so conv2d always finishes fusing.
+        if (phase != 1) continue;
+        // Check if all path are injective.
+        auto fcond = [](OpPatternKind kind, bool is_sink) {
+          return kind <= kInjective;
+        };
+        if (CheckPath(graph_node, dom_node->parent->gnode, fcond)) {
+          CommitFuse(graph_node, dom_node->parent->gnode);
+        }
+      } else {
+        // do nothing.
+        CHECK(group_node->pattern == kCommReduce);
+      }
+    }
+  }
+};
+
+std::vector<GraphPartitioner::Group*>
+GraphPartitioner::Partition(const IndexedForwardGraph& graph) {
+  this->InitGroups(graph);
+  if (opt_level_ == 0) return std::move(groups_);
+  // get post dominator tree
+  auto post_dom_tree = DominatorTree::PostDom(arena_, graph);
+  // run fusion algorithm.
+  for (int phase = 0; phase < 2; ++phase) {
+    this->RunFuse(graph, post_dom_tree, phase);
+  }
+  return std::move(groups_);
+}
+
+class FuseMutator : private ExprMutator {
+ public:
+  // Run the transform
+  Expr Transform(const Expr& body, int fuse_opt_level) {
+    // setup the group map.
+    auto graph = IndexedForwardGraph::Create(&arena_, body);
+    auto groups = GraphPartitioner(&arena_, fuse_opt_level).Partition(
+        graph);
+    for (size_t nid = 0; nid < graph.post_dfs_order.size(); ++nid) {
+      CHECK(graph.post_dfs_order[nid]->ref != nullptr);
+      gmap_[graph.post_dfs_order[nid]->ref] = groups[nid];
+    }
+    // The following line can be used for debug.
+    // this->DebugDumpGroup(body);
+    return this->Mutate(body);
+  }
+
+
+ private:
+  /*! \brief Temporary information from each group. */
+  struct GroupInfo {
+   public:
+    // The parameters of the function.
+    Array<Var> params;
+    // The arguments to call the functions.
+    Array<Expr> arguments;
+    // Get a new parameter or allocate an old one
+    Var GetOrAllocParam(const Expr& expr, const Type& type) {
+      // run linear scan as most fused groups contain only a few inputs.
+      for (size_t i = 0; i < arguments.size(); ++i) {
+        if (expr.same_as(arguments[i])) return params[i];
+      }
+      // create a new parameter.
+      std::ostringstream os;
+      os << "p" << params.size();
+      auto var = VarNode::make(os.str(), type);
+      params.push_back(var);
+      arguments.push_back(expr);
+      return var;
+    }
+  };
+  /*! \brief Internal arena. */
+  common::Arena arena_;
+  /*! \brief The group assignment map. */
+  std::unordered_map<const Node*, GraphPartitioner::Group*> gmap_;
+  /* \brief Internal group information map. */
+  std::unordered_map<GraphPartitioner::Group*, GroupInfo> ginfo_;
+  // Skip primitive function.
+  Expr VisitExpr_(const FunctionNode* fn_node) {
+    if (fn_node->IsPrimitive()) {
+      return GetRef<Expr>(fn_node);
+    } else {
+      return ExprMutator::VisitExpr_(fn_node);
+    }
+  }
+  // Transform calls.
+  Expr VisitExpr_(const CallNode* call) {
+    if (call->op.as<OpNode>()) {
+      // If it is a primitive op call
+      // then we must have a group assignment for it already.
+      CHECK(gmap_.count(call));
+      auto* ret_group = gmap_.at(call)->FindRoot();
+      Array<Expr> new_args = GetNewArguments(call->args, ret_group);
+
+      auto new_call = CallNode::make(
+          call->op, new_args, call->attrs, call->type_args);
+
+      if (ret_group->root_ref == call) {
+        // This is the root of the group
+        // create the new call node.
+        return MakeNewFunction(ret_group, call->checked_type(), new_call);
+      } else {
+        // This is an intermediate node of a fused function
+        // simply return the new call.
+        return new_call;
+      }
+    } else {
+      return ExprMutator::VisitExpr_(call);
+    }
+  }
+
+  Expr VisitExpr_(const TupleNode* tuple) {
+    auto* ret_group = gmap_.at(tuple)->FindRoot();
+    Array<Expr> new_fields = GetNewArguments(tuple->fields, ret_group);
+    Tuple new_tuple = TupleNode::make(new_fields);
+    if (ret_group == gmap_.at(tuple)) {
+      bool isolated = true;
+      for (size_t i = 0; i < new_fields.size(); ++i) {
+        isolated &= (new_fields[i].same_as(ginfo_[ret_group].params[i]));
+      }
+      if (isolated) {
+        // Do not put a isolated tuple into a function
+        return ExprMutator::VisitExpr_(tuple);
+      }
+      // This tuple has been fused with other ops before it
+      return MakeNewFunction(ret_group, tuple->checked_type(), new_tuple);
+    }
+    // This tuple is an intermediate node in the group
+    return new_tuple;
+  }
+
+  Expr MakeNewFunction(GraphPartitioner::Group* group, Type ret_type, Expr body) {
+    const GroupInfo& ginfo = ginfo_[group];
+    auto func = FunctionNode::make(ginfo.params, body, ret_type, {});
+    func = FunctionSetAttr(func, "Primitive", tvm::Integer(1));
+    return CallNode::make(func, ginfo.arguments, Attrs());
+  }
+
+  Array<Expr> GetNewArguments(const tvm::Array<Expr>& args,
+                              GraphPartitioner::Group* current_group) {
+    Array<Expr> new_args;
+    for (auto arg : args) {
+      auto* arg_group = gmap_.at(arg.get())->FindRoot();
+      auto type = arg->checked_type();
+      Expr new_arg = this->Mutate(arg);
+      if (current_group != arg_group) {
+        Var param = ginfo_[current_group].GetOrAllocParam(new_arg, type);
+        new_args.push_back(param);
+      } else {
+        new_args.push_back(new_arg);
+      }
+    }
+    return new_args;
+  }
+
+  // Debug function, dump the group assignment in text.
+  void DebugDumpGroup(const Expr& body) {
+    std::string text = RelayPrint(body, false, [this](const Expr& expr) -> std::string {
+        auto it = gmap_.find(expr.get());
+        if (it == gmap_.end()) return "";
+        std::ostringstream os;
+        auto *group = it->second->FindRoot();
+        os << "group=" << group;
+        return os.str();
+      });
+    LOG(INFO) << "Dump of group info:\n" << text;
+  }
+};
+
+
+Expr FuseOps(const Expr& expr, int fuse_opt_level) {
+  // First we convert all chains of fusable ops into
+  // abstracted functions which we mark as primtive
+  // then we convert these primtive functions into
+  // new operators.
+  return FuseMutator().Transform(expr, fuse_opt_level);
+}
+
+TVM_REGISTER_API("relay._ir_pass.FuseOps")
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    *ret = FuseOps(args[0], args[1]);
+});
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/kind_check.cc b/src/relay/pass/kind_check.cc
new file mode 100644
index 000000000000..7253a600dabf
--- /dev/null
+++ b/src/relay/pass/kind_check.cc
@@ -0,0 +1,117 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ *
+ * \file kindchecker.cc
+ *
+ * \brief Check that types are well formed by applying "kinding rules".
+ *
+ * This pass ensures we do not do things that violate the design of the
+ * type system when writing down types.
+ *
+ * For example tensors are not allowed to contain functions in Relay.
+ *
+ * We check this by ensuring the `dtype` field of a Tensor always
+ * contains a data type such as `int`, `float`, `uint`.
+ */
+#include <tvm/relay/pass.h>
+#include "../ir/type_functor.h"
+
+namespace tvm {
+namespace relay {
+
+using namespace tvm::runtime;
+using Kind = TypeVarNode::Kind;
+
+struct KindChecker : TypeVisitor {
+  bool valid;
+
+  KindChecker() : valid(true) {}
+
+  // checks if t is an incomplete node of kind k or a type param of kind k
+  bool MatchKind(const Type& t, Kind k) {
+    if (const IncompleteTypeNode* tv = t.as<IncompleteTypeNode>()) {
+      return tv->kind == k;
+    }
+
+    if (const TypeVarNode* tp = t.as<TypeVarNode>()) {
+      return tp->kind == k;
+    }
+
+    return false;
+  }
+
+  bool IsTypeKind(const Type& t) {
+    if (MatchKind(t, Kind::kType)) {
+      return true;
+    }
+
+    return t.as_derived<BaseTensorTypeNode>() || t.as<TupleTypeNode>() || t.as<FuncTypeNode>();
+  }
+
+  void VisitType_(const TupleTypeNode* op) override {
+    // tuples should only contain normal types
+    for (const Type& t : op->fields) {
+      this->VisitType(t);
+      valid = valid && IsTypeKind(t);
+      if (!valid) {
+        return;
+      }
+    }
+  }
+
+  void VisitType_(const FuncTypeNode* op) override {
+    // Func types should only take normal types for arguments
+    // and only return a normal type. They should also have
+    // well-formed constraints
+    for (const Type& t : op->arg_types) {
+      this->VisitType(t);
+      valid = valid && IsTypeKind(t);
+      if (!valid) {
+        return;
+      }
+    }
+
+    for (const TypeConstraint& tc : op->type_constraints) {
+      this->VisitType(tc);
+      if (!valid) {
+        return;
+      }
+    }
+
+    this->VisitType(op->ret_type);
+    valid = valid && IsTypeKind(op->ret_type);
+  }
+
+  void VisitType_(const TypeRelationNode* op) override {
+    // arguments to type relation should be normal types
+    for (const Type& t : op->args) {
+      this->VisitType(t);
+      valid = valid && IsTypeKind(t);
+      if (!valid) {
+        return;
+      }
+    }
+  }
+
+  bool Check(const Type& t) {
+    this->VisitType(t);
+    return valid;
+  }
+};
+
+bool KindCheck(const Type& t, const Module& mod) {
+  KindChecker kc;
+  return kc.Check(t);
+}
+
+TVM_REGISTER_API("relay._ir_pass.check_kind")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    if (args.size() == 1) {
+      *ret = KindCheck(args[0], ModuleNode::make({}));
+    } else {
+      *ret = KindCheck(args[0], args[1]);
+    }
+  });
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/let_list.h b/src/relay/pass/let_list.h
new file mode 100644
index 000000000000..904ceab36c3d
--- /dev/null
+++ b/src/relay/pass/let_list.h
@@ -0,0 +1,115 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file let_list.h
+ * \brief LetList record let binding and insert let expression implicitly.
+ *  using it, one can treat AST as value instead of expression,
+ *  and pass them around freely without fear of AST explosion (or effect duplication).
+ *  for example, if one write 'b = a + a; c = b + b; d = c + c', the AST will contain 8 'a'.
+ *  if one instead write 'b = ll.Push(a + a); c = ll.Push(b + b); d = ll.Get(c + c);',
+ *  the AST will contain 2 'a', as b and c are now variables.
+ */
+#ifndef TVM_RELAY_PASS_LET_LIST_H_
+#define TVM_RELAY_PASS_LET_LIST_H_
+
+#include <tvm/relay/expr.h>
+#include <utility>
+#include <vector>
+#include <tuple>
+#include "tvm/relay/type.h"
+
+namespace tvm {
+namespace relay {
+
+/*! \brief LetList allow you to transform expression into variables, so you can copy them around.
+ *  one can insert into the LetList by calling Push, and wrap an expression with bindings with Get.
+ *  additionally, there is the 'With' function, which automatically call Get.
+ */
+class LetList {
+ public:
+  /*!
+   * \brief insert a binding.
+   *
+   * \param pv the var of the binding.
+   *
+   * \param expr the value of the binding.
+   *
+   * \return a Var that hold the inserted expr.
+   */
+  Var Push(Var pv, Expr expr) {
+    lets_.emplace_back(std::make_pair(pv, expr));
+    return pv;
+  }
+
+  /*!
+   * \brief insert a binding.
+   *
+   * \param ty the type of the binding.
+   *
+   * \param expr the value of the binding.
+   *
+   * \return a Var that hold the inserted expr.
+   */
+  Var Push(Type ty, Expr expr) {
+    return Push(VarNode::make("x", ty), expr);
+  }
+
+  /*!
+   * \brief insert a binding.
+   *
+   *  \param expr the value of the binding.
+   *
+   *  \return a Var that hold the inserted expr.
+   */
+  Var Push(Expr expr) {
+    return Push(IncompleteTypeNode::make(TypeVarNode::kType), expr);
+  }
+
+  /*!
+   * \brief wrap an expr around the LetList.
+   *
+   *  \param body the Expression to be wrapped around.
+   *
+   *  \return the wrapped expr.
+   */
+  Expr Get(const Expr& body) const {
+    Expr ret = body;
+    for (auto rit = lets_.rbegin(); rit != lets_.rend(); ++rit) {
+      ret = LetNode::make(std::get<0>(*rit), std::get<1>(*rit), ret);
+    }
+    return ret;
+  }
+
+  /*! \brief generate an LetList and wrap the result automatically.
+   *
+   *  \param f a function that generate the unwrapped Expr.
+   *
+   *  \code
+   *  // Example code that generate `16 * a` using 4 plus instead of 15 plus.
+   *  Expr mult_sixteen(const Var& a) {
+   *    Op plus = Op::Get("plus");
+   *    // Automatically call Get with LetList::With
+   *    return LetList::With([&](LetList* ll) {
+   *      // Turn a call to plus into a variable to avoid duplication of code
+   *      Var b = ll->Push(CallNode::make(plus, {a, a}));
+   *      Var c = ll->Push(CallNode::make(plus, {b, b}));
+   *      Var d = ll->Push(CallNode::make(plus, {c, c}));
+   *      return CallNode::make(plus, {d, d});
+   *    });
+   *  }
+   *  \endcode
+   *
+   *  \return the wrapped Expr.
+   */
+  template<typename F>
+  static Expr With(F&& f) {
+    LetList ll;
+    return ll.Get(f(&ll));
+  }
+
+ private:
+  std::vector<std::pair<Var, Expr> > lets_;
+};
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_PASS_LET_LIST_H_
diff --git a/src/relay/pass/pass_util.h b/src/relay/pass/pass_util.h
new file mode 100644
index 000000000000..ddd73901c452
--- /dev/null
+++ b/src/relay/pass/pass_util.h
@@ -0,0 +1,53 @@
+/*!
+ *  Copyright (c) 2018 by Contributors.
+ *
+ * \file tvm/relay/pass/pass_util.h
+ * \brief Utilities for writing
+ */
+#ifndef TVM_RELAY_PASS_PASS_UTIL_H_
+#define TVM_RELAY_PASS_PASS_UTIL_H_
+
+#include <tvm/relay/op.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/attrs/transform.h>
+
+namespace tvm {
+namespace relay {
+
+/*!
+ * \brief Get reference counter of each internal ExprNode in body.
+ * \param body The body expression.
+ * \return The reference count mapping.
+ */
+std::unordered_map<const Node*, size_t>
+GetExprRefCount(const Expr& body);
+
+
+/*!
+ * \brief Check if expr is positive constant.
+ * \param expr The expression to be checked.
+ * \return Whether all elements of expr is positive constant.
+ */
+bool IsAllPositiveConstant(const Expr& expr);
+
+
+/*!
+ * \brief Substitute var with subst.
+ * \param type The type to be substituted.
+ * \param tvar The type variable to be substituted.
+ * \param subst The target of substitution.
+ * \return The substituted result.
+ */
+Type TypeSubst(const Type& type, const TypeVar& tvar, const Type& subst);
+
+/*!
+ * \brief Substitute type vars in type.
+ * \param type The type to be substituted.
+ * \param subst_map The map of substitution.
+ * \return The substituted result.
+ */
+Type TypeSubst(const Type& type, const tvm::Map<TypeVar, Type>& subst_map);
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_PASS_PASS_UTIL_H_
diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h
new file mode 100644
index 000000000000..e6e8415bd620
--- /dev/null
+++ b/src/relay/pass/pattern_util.h
@@ -0,0 +1,195 @@
+/*!
+ *  Copyright (c) 2018 by Contributors.
+ *
+ * \file tvm/relay/pass/pattern_util.h
+ * \brief Header of internal operator functions
+ *  These can be used for writing passes.
+ */
+#ifndef TVM_RELAY_PASS_PATTERN_UTIL_H_
+#define TVM_RELAY_PASS_PATTERN_UTIL_H_
+
+#include <tvm/relay/op.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/attrs/transform.h>
+#include <string>
+#include "../op/layout.h"
+
+
+namespace tvm {
+namespace relay {
+
+/*!
+ * \brief Try to match lhs and rhs via broadcasting rule, such that:
+ *
+ * rhs matches the dimension of lhs specified by lhs_axes
+ * rhs's value equals 1 on rest of dimensions.
+ *
+ * \param tlhs The type of left operand (data)
+ * \param trhs The type right operand (bias)
+ * \param lhs_axes The axes on lhs to match.
+ * \param rhs_value A squeezed version of rhs which only contains matched dimension.
+ * \return Whether match is successful.
+ */
+inline bool MatchBroadcastToLeftAxes(const TensorTypeNode* tlhs,
+                                     const TensorTypeNode* trhs,
+                                     const Array<Integer>& lhs_axes,
+                                     Expr* rhs_value = nullptr) {
+  if (tlhs->shape.size() < trhs->shape.size()) return false;
+  AttrsEqual equal;
+  size_t base = tlhs->shape.size() - trhs->shape.size();
+  size_t j = 0;
+
+  NodePtr<SqueezeAttrs> squeeze_attrs;
+  if (rhs_value != nullptr) {
+    squeeze_attrs = make_node<SqueezeAttrs>();
+  }
+
+  for (size_t i = 0; i < tlhs->shape.size(); ++i) {
+    if (j < lhs_axes.size() && i == static_cast<size_t>(lhs_axes[j]->value)) {
+      if (i < base || !equal(tlhs->shape[i], trhs->shape[i - base])) {
+        return false;
+      }
+      ++j;
+    } else if (i >= base) {
+      if (!is_const_int(trhs->shape[i - base], 1)) {
+        return false;
+      }
+      if (rhs_value != nullptr) {
+        squeeze_attrs->axis.push_back(static_cast<int>(i - base));
+      }
+    }
+  }
+  if (rhs_value != nullptr && squeeze_attrs->axis.size() != 0) {
+    static const Op& squeeze_op = Op::Get("squeeze");
+    *rhs_value = CallNode::make(squeeze_op, {rhs_value[0]}, Attrs(squeeze_attrs), {});
+  }
+  return true;
+}
+
+/*!
+ * \brief Expand 1D Tensor to match axis.
+ *
+ * The result bias can be used to add or multiply to
+ * the target Tensor on the specified axis via broadcasting rule.
+ *
+ * \param bias The bias.
+ * \param target_ndim Target dimension.
+ * \param axes The axis on the output we want to match on.
+ */
+inline Expr ExpandBiasToMatchAxis(Expr bias,
+                                  int target_ndim,
+                                  const Array<Integer>& axes) {
+  static const Op& expand_dims = Op::Get("expand_dims");
+  for (size_t i = axes.size(); i != 0; --i) {
+    if (i == axes.size()) {
+      int64_t num_pad_axis = target_ndim - axes[i - 1]->value - 1;
+      if (num_pad_axis > 0) {
+        auto attrs = make_node<ExpandDimsAttrs>();
+        attrs->axis = i;
+        attrs->num_newaxis = static_cast<int>(num_pad_axis);
+        bias = CallNode::make(expand_dims, {bias}, Attrs(attrs), {});
+      }
+    } else {
+      int64_t diff = axes[i]->value - axes[i - 1]->value;
+      CHECK_GE(diff, 0L);
+      if (diff > 0) {
+        auto attrs = make_node<ExpandDimsAttrs>();
+        attrs->axis = i;
+        attrs->num_newaxis = static_cast<int>(diff);
+        bias = CallNode::make(expand_dims, {bias}, Attrs(attrs), {});
+      }
+    }
+  }
+  return bias;
+}
+
+/*!
+ * \brief Check if the call is depthwise conv2d.
+ *
+ * \param call The conv2d call.
+ * \param param The conv2d attributes.
+ * \return Whether it is depthwise_conv2d.
+ */
+inline bool IsDepthwiseConv2D(const Call& call,
+                              const Conv2DAttrs* param,
+                              const Layout& weight_layout) {
+  static const Layout kOIHW("OIHW");
+  auto wshape = ConvertLayout(
+      call->args[1]->type_as<TensorTypeNode>()->shape,
+      weight_layout, kOIHW);
+  return is_const_int(wshape[0], param->groups) &&
+      is_const_int(wshape[1], 1);
+}
+
+/*!
+ * \brief Get super-dimension of output channels of conv2d
+ * \param call The conv2d call.
+ * \return Super-dimension size of output channels of conv2d.
+ */
+inline int64_t GetConv2DSuperChannelsDim(const CallNode* call) {
+    auto param = call->attrs.as<Conv2DAttrs>();
+    auto tweight = call->args[1]->type_as<TensorTypeNode>();
+    auto index = param->weight_layout.find('O');
+    CHECK_NE(index, std::string::npos);
+    auto channels = as_const_int(tweight->shape[index]);
+    return *channels;
+}
+
+/*!
+ * \brief Create a Constant with a scalar
+ *
+ * \param dtype The data type.
+ * \param value The value of the scalar.
+ * \return A Constant.
+ */
+template<typename T>
+inline Constant MakeConstantScalar(DataType dtype, T value) {
+  CHECK_EQ(sizeof(T) * 8, dtype.bits()) << "data type mismatch";
+  runtime::NDArray arr = runtime::NDArray::Empty({}, Type2TVMType(dtype), {kDLCPU, 0});
+  *static_cast<T*>(arr->data) = value;
+  return ConstantNode::make(arr);
+}
+
+
+inline Expr Negative(Expr x) {
+  static const Op& op = Op::Get("negative");
+  return CallNode::make(op, {x}, Attrs(), {});
+}
+
+
+inline Expr Sqrt(Expr x) {
+  static const Op& op = Op::Get("sqrt");
+  return CallNode::make(op, {x}, Attrs(), {});
+}
+
+
+inline Expr Add(Expr lhs, Expr rhs) {
+  static const Op& op = Op::Get("add");
+  return CallNode::make(op, {lhs, rhs}, Attrs(), {});
+}
+
+
+inline Expr Multiply(Expr lhs, Expr rhs) {
+  static const Op& op = Op::Get("multiply");
+  return CallNode::make(op, {lhs, rhs}, Attrs(), {});
+}
+
+
+inline Expr Divide(Expr lhs, Expr rhs) {
+  static const Op& op = Op::Get("divide");
+  return CallNode::make(op, {lhs, rhs}, Attrs(), {});
+}
+
+
+inline Expr ReshapeLike(Expr lhs, Expr rhs) {
+  static const Op& op = Op::Get("reshape_like");
+  return CallNode::make(op, {lhs, rhs}, Attrs(), {});
+}
+
+Expr MakeConcatenate(Expr data, int axis);
+
+Expr MakeStridedSlice(Expr data, Array<Integer> begin, Array<Integer> end, Array<Integer> strides);
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_PASS_PATTERN_UTIL_H_
diff --git a/src/relay/pass/simplify_inference.cc b/src/relay/pass/simplify_inference.cc
new file mode 100644
index 000000000000..6acf4e65b1ac
--- /dev/null
+++ b/src/relay/pass/simplify_inference.cc
@@ -0,0 +1,92 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file simplify_inference.cc
+ */
+#include <tvm/relay/pass.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/attrs/nn.h>
+#include "./pattern_util.h"
+
+namespace tvm {
+namespace relay {
+
+Expr BatchNormToInferUnpack(const Attrs attrs,
+                            Expr data,
+                            Expr gamma,
+                            Expr beta,
+                            Expr moving_mean,
+                            Expr moving_var,
+                            Type tdata) {
+  const auto param = attrs.as<BatchNormAttrs>();
+  Expr epsilon = MakeConstantScalar(Float(32), static_cast<float>(param->epsilon));
+  Expr var_add_eps = Add(moving_var, epsilon);
+  Expr sqrt_var = Sqrt(var_add_eps);
+  Expr scale = Divide(MakeConstantScalar(Float(32), 1.0f), sqrt_var);
+
+  if (param->scale) {
+    scale = Multiply(scale, gamma);
+  }
+  Expr neg_mean = Negative(moving_mean);
+  Expr shift = Multiply(neg_mean, scale);
+  if (param->center) {
+    shift = Add(shift, beta);
+  }
+
+  int axis = param->axis;
+  auto ttype = tdata.as<TensorTypeNode>();
+  CHECK(ttype);
+  auto ndim = ttype->shape.size();
+  scale = ExpandBiasToMatchAxis(scale, ndim, {axis});
+  shift = ExpandBiasToMatchAxis(shift, ndim, {axis});
+
+  Expr out = Multiply(data, scale);
+  out = Add(out, shift);
+  return out;
+}
+
+class InferenceSimplifier : public ExprMutator {
+ public:
+  Expr VisitExpr_(const TupleGetItemNode* n) final {
+    static const Op& batch_norm = Op::Get("nn.batch_norm");
+    static const Op& dropout = Op::Get("nn.dropout");
+
+    Expr new_e = ExprMutator::VisitExpr_(n);
+    const auto* new_n = new_e.as<TupleGetItemNode>();
+    if (new_n->index != 0) {
+      return new_e;
+    }
+    if (const auto* call = new_n->tuple.as<CallNode>()) {
+      if (call->op.same_as(batch_norm)) {
+        return BatchNormToInferUnpack(call->attrs, call->args[0], call->args[1], call->args[2],
+                                      call->args[3], call->args[4], ty_map_.at(call->args[0]));
+      } else if (call->op.same_as(dropout)) {
+        return call->args[0];
+      }
+    }
+    return new_e;
+  }
+
+  Expr VisitExpr_(const CallNode* n) {
+    static const Op& batch_norm = Op::Get("nn.batch_norm");
+    auto new_n = ExprMutator::VisitExpr_(n);
+    if (n->op.same_as(batch_norm)) {
+      ty_map_[new_n.as<CallNode>()->args[0]] = n->args[0]->checked_type();
+    }
+    return new_n;
+  }
+
+ private:
+  std::unordered_map<Expr, Type, NodeHash, NodeEqual> ty_map_;
+};
+
+Expr SimplifyInference(const Expr& e) {
+  return InferenceSimplifier().Mutate(e);
+}
+
+TVM_REGISTER_API("relay._ir_pass.simplify_inference")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = SimplifyInference(args[0]);
+  });
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/type_infer.cc b/src/relay/pass/type_infer.cc
new file mode 100644
index 000000000000..13da159e99a8
--- /dev/null
+++ b/src/relay/pass/type_infer.cc
@@ -0,0 +1,559 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file type_infer.cc
+ * \brief Relay type inference and checking.
+ *
+ * This file implements one of the most important passes to the
+ * Relay IR. In order to do many transformations and generate the
+ * most efficient code we need to obtain type information for the
+ * IR.
+ *
+ * Like computation graphs the IR leaves most type information
+ * implicit and relies performing analysis of the program to
+ * generate this information.
+ *
+ * This pass given an expression `e` will infer a type `t` for
+ * the expression simultaneous checking the property `e : t`
+ * (i.e we can show e has type t).
+ *
+ * If we can not infer a type or there are conflicting typing
+ * constraints we will trigger an error.
+ */
+
+#include <tvm/relay/error.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/pass.h>
+#include "type_solver.h"
+#include "../ir/type_functor.h"
+
+namespace tvm {
+namespace relay {
+
+// Necessary deferred relation for TupleGetItem
+struct TupleGetItemAttrs : public tvm::AttrsNode<TupleGetItemAttrs> {
+  int index;
+
+  TVM_DECLARE_ATTRS(TupleGetItemAttrs, "relay.attrs.TupleGetItemAttrs") {
+    TVM_ATTR_FIELD(index);
+  }
+};
+
+bool TupleGetItemRel(const Array<Type>& types,
+                     int num_inputs,
+                     const Attrs& attrs,
+                     const TypeReporter& reporter) {
+  CHECK_EQ(types.size(), 2);
+  if (types[0].as<IncompleteTypeNode>()) return false;
+  const auto* data = types[0].as<TupleTypeNode>();
+  CHECK(data != nullptr)
+      << "TupleGetItem expect input type to be TupleType "
+      << " get " << types[0] << " instead";
+  const auto* param = attrs.as<TupleGetItemAttrs>();
+  CHECK(param != nullptr);
+  CHECK_GE(param->index, 0);
+  CHECK_LT(param->index,  data->fields.size());
+  reporter->Assign(types[1], data->fields[param->index]);
+  return true;
+}
+
+bool MakeTupleRel(const Array<Type>& types,
+                  int num_inputs,
+                  const Attrs& attrs,
+                  const TypeReporter& reporter) {
+  CHECK_EQ(static_cast<size_t>(num_inputs + 1), types.size());
+  for (int i = 0; i < num_inputs; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) return false;
+  }
+  Array<Type> fields;
+  for (int i = 0; i < num_inputs; ++i) {
+    fields.push_back(types[i]);
+  }
+  reporter->Assign(types[num_inputs], TupleTypeNode::make(fields));
+  return true;
+}
+
+TVM_REGISTER_NODE_TYPE(TupleGetItemAttrs);
+TVM_REGISTER_API("tvm.relay.type_relation.TupleGetItem")
+.set_body_typed<bool(const Array<Type>&, int, const Attrs&, const TypeReporter&)>(
+    TupleGetItemRel);
+
+TVM_REGISTER_API("tvm.relay.type_relation.MakeTuple")
+.set_body_typed<bool(const Array<Type>&, int, const Attrs&, const TypeReporter&)>(
+    MakeTupleRel);
+
+struct ResolvedTypeInfo {
+  explicit ResolvedTypeInfo(Type checked_type, Array<Type> type_args)
+      : checked_type(checked_type), type_args(type_args) {}
+  ResolvedTypeInfo() {}
+
+  Type checked_type;
+  // Only allocated when the expression is a call.
+
+  Array<Type> type_args = Array<Type>(NodePtr<Node>(nullptr));
+};
+
+//
+// The inference algorithm can roughly be devided into three stages:
+// - Populate the constraints by visiting the expression (TypeInferencer.GetType)
+//   - solver.AddConstraint and solver.Unify are called to populate the necessary constraints
+// - Solve the constraints (solver_.Solve)
+// - Recreate expression with the resolved checked_type (Resolver.VisitExpr)
+//
+class TypeInferencer : private ExprFunctor<Type(const Expr&)> {
+ public:
+  // constructors
+  TypeInferencer() {
+  }
+  explicit TypeInferencer(Module mod)
+      : mod_(mod) {
+  }
+
+  // inference the type of expr.
+  Expr Infer(Expr expr);
+
+ private:
+  // type resolver that maps back to type
+  class Resolver;
+  // internal environment
+  Module mod_;
+  // map from expression to checked type
+  // type inferencer will populate it up
+  std::unordered_map<Expr, ResolvedTypeInfo, NodeHash, NodeEqual> type_map_;
+
+  // The solver used by the inferencer.
+  TypeSolver solver_;
+  // relation function
+  TypeRelationFn tuple_getitem_rel_;
+  TypeRelationFn make_tuple_rel_;
+  // Unify two types
+  Type Unify(const Type& t1, const Type& t2, const Span& span) {
+    // TODO(tqchen, jroesch): propagate span to solver
+    try {
+      return solver_.Unify(t1, t2);
+    } catch (const dmlc::Error &e) {
+      LOG(FATAL)
+          << "Error unifying `"
+          << t1
+          << "` and `"
+          << t2
+          << "`: " << e.what();
+      return Type();
+    }
+  }
+  // Lazily get type for expr
+  // will call visit to deduce it if it is not in the type_map_
+  Type GetType(const Expr &expr) {
+    auto it = type_map_.find(expr);
+    if (it != type_map_.end() && it->second.checked_type.defined()) {
+      return it->second.checked_type;
+    }
+    Type ret = this->VisitExpr(expr);
+    ResolvedTypeInfo& rti = type_map_[expr];
+    rti.checked_type = ret;
+    return ret;
+  }
+
+  // Visitor logics
+  Type VisitExpr_(const VarNode* op) final {
+    if (op->type_annotation.defined()) {
+      return op->type_annotation;
+    } else {
+      return IncompleteTypeNode::make(TypeVarNode::kType);
+    }
+  }
+
+  Type VisitExpr_(const GlobalVarNode* op) final {
+    GlobalVar var = GetRef<GlobalVar>(op);
+    CHECK(mod_.defined())
+        << "Cannot do type inference without a global variable";
+    Expr e = mod_->Lookup(var);
+    return e->checked_type();
+  }
+
+  Type VisitExpr_(const ConstantNode* op) final {
+    return op->tensor_type();
+  }
+
+  Type VisitExpr_(const TupleNode* op) final {
+    if (!make_tuple_rel_.defined())  {
+      make_tuple_rel_ = TypeRelationFn(
+          EnvFunc::Get("tvm.relay.type_relation.MakeTuple").node_);
+    }
+    Array<Type> types;
+    for (Expr field : op->fields) {
+      types.push_back(GetType(field));
+    }
+    Type rtype = IncompleteTypeNode::make(TypeVarNode::Kind::kType);
+    types.push_back(rtype);
+    solver_.AddConstraint(TypeRelationNode::make(
+        make_tuple_rel_, types, op->fields.size(), Attrs()));
+    return rtype;
+  }
+
+  Type VisitExpr_(const TupleGetItemNode* op) final {
+    if (!tuple_getitem_rel_.defined())  {
+      tuple_getitem_rel_ = TypeRelationFn(
+          EnvFunc::Get("tvm.relay.type_relation.TupleGetItem").node_);
+    }
+    Type tuple_type = GetType(op->tuple);
+    Type rtype = IncompleteTypeNode::make(TypeVarNode::Kind::kType);
+    auto attrs = make_node<TupleGetItemAttrs>();
+    attrs->index = op->index;
+    solver_.AddConstraint(TypeRelationNode::make(
+        tuple_getitem_rel_, {tuple_type, rtype}, 1, Attrs(attrs)));
+    return rtype;
+  }
+
+  Type VisitExpr_(const OpNode* op) final {
+    return op->op_type;
+  }
+
+  Type VisitExpr_(const LetNode* op) final {
+    Type vtype = GetType(op->value);
+    if (op->var->type_annotation.defined()) {
+      vtype = Unify(vtype, op->var->type_annotation, op->span);
+    }
+    CHECK(!type_map_.count(op->var));
+    // NOTE: no scoping is necessary because var are unique in program
+    type_map_[op->var].checked_type = vtype;
+    return GetType(op->body);
+  }
+
+  Type VisitExpr_(const IfNode* op) final {
+    // Ensure the type of the guard is of Tensor[Bool, ()],
+    // that is a rank-0 boolean tensor.
+    Type cond_type = this->GetType(op->cond);
+    this->Unify(cond_type,
+                TensorTypeNode::Scalar(tvm::Bool()),
+                op->cond->span);
+    Type checked_true = this->GetType(op->true_branch);
+    Type checked_false = this->GetType(op->false_branch);
+    return this->Unify(checked_true, checked_false, op->span);
+  }
+
+  // Handle special case basic primitive operator,
+  // if successful return the return type
+  Type PrimitiveCall(const FuncTypeNode* op,
+                     Array<Type> arg_types,
+                     const Attrs& attrs) {
+    if (op->type_params.size() != arg_types.size() + 1) return Type();
+    if (op->type_constraints.size() != 1) return Type();
+    const TypeRelationNode* rel = op->type_constraints[0].as<TypeRelationNode>();
+    if (rel == nullptr) return Type();
+    // validate if the type parameter matches up
+    for (size_t i = 0; i < op->type_params.size(); ++i) {
+      if (!op->type_params[i].same_as(rel->args[i])) return Type();
+    }
+    Type rtype = IncompleteTypeNode::make(TypeVarNode::Kind::kType);
+    arg_types.push_back(rtype);
+    // we can do simple replacement here
+    solver_.AddConstraint(TypeRelationNode::make(
+        rel->func, arg_types, arg_types.size() - 1, attrs));
+    return rtype;
+  }
+
+  // instantiate the function type with fresh
+  FuncType Instantiate(const FuncTypeNode* fn_ty, Array<Type>* ty_args) {
+    tvm::Map<TypeVar, Type> subst_map;
+
+    // Build a subsitituion map up from the function type and type arguments.
+    // Eventually allow the type vars to be passed in.
+    for (auto ty_param : fn_ty->type_params) {
+      IncompleteType fresh = IncompleteTypeNode::make(ty_param->kind);
+      subst_map.Set(ty_param, fresh);
+      ty_args->push_back(fresh);
+    }
+
+    Type ret_type = fn_ty->ret_type;
+
+    // If the function type is incomplete, place a new IncompleteType
+    // This relax the fn_ty to inputs -> Any
+    // The type checking can still pass when there are additional constraints on the type
+    // This is a temporary work around to check recursive functions whose
+    // return type is not yet known.
+    if (!ret_type.defined()) {
+      ret_type = IncompleteTypeNode::make(TypeVarNode::Kind::kType);
+    }
+
+    Type inst_ty = FuncTypeNode::make(fn_ty->arg_types,
+                                      ret_type, {},
+                                      fn_ty->type_constraints);
+    inst_ty = Bind(inst_ty, subst_map);
+    return Downcast<FuncType>(inst_ty);
+  }
+
+  void AddTypeArgs(const Expr& expr, Array<Type> type_args) {
+    auto type_info = type_map_.find(expr);
+    if (type_info == type_map_.end()) {
+      type_map_.insert({expr, ResolvedTypeInfo(Type(), type_args)});
+    } else {
+      CHECK(!type_info->second.type_args.defined());
+      type_info->second.type_args = type_args;
+    }
+  }
+
+  // Handle general call node.
+  Type GeneralCall(const CallNode* call, Array<Type> arg_types) {
+    Type ftype = GetType(call->op);
+    auto* fn_ty_node = ftype.as<FuncTypeNode>();
+
+    CHECK(fn_ty_node != nullptr)
+        << "only expressions with function types can be called, found "
+        << ftype << " at " << call->span;
+
+    Array<Type> type_args;
+    FuncType fn_ty = Instantiate(fn_ty_node, &type_args);
+
+    AddTypeArgs(GetRef<Call>(call), type_args);
+
+    size_t type_arity = fn_ty->arg_types.size();
+    size_t number_of_args = arg_types.size();
+
+    if (type_arity != number_of_args) {
+      if (type_arity < number_of_args) {
+        LOG(FATAL) << "the function is provided too many arguments " << call->span;
+      } else {
+        LOG(FATAL) << "the function is provided too few arguments" << call->span;
+      }
+    }
+
+    for (size_t i = 0; i < fn_ty->arg_types.size(); i++) {
+      this->Unify(fn_ty->arg_types[i], arg_types[i], call->args[i]->span);
+    }
+
+    for (auto cs : fn_ty->type_constraints) {
+      if (auto tr = cs.as<TypeRelationNode>()) {
+        solver_.AddConstraint(
+          TypeRelationNode::make(tr->func, tr->args, tr->num_inputs, call->attrs));
+      } else {
+        solver_.AddConstraint(cs);
+      }
+    }
+
+    return fn_ty->ret_type;
+  }
+
+  Type VisitExpr_(const CallNode* call) final {
+    Array<Type> arg_types;
+    for (Expr arg : call->args) {
+      arg_types.push_back(GetType(arg));
+    }
+
+    if (const OpNode* opnode = call->op.as<OpNode>()) {
+      Type rtype = PrimitiveCall(opnode->op_type.as<FuncTypeNode>(),
+                                 arg_types,
+                                 call->attrs);
+      if (rtype.defined()) {
+        AddTypeArgs(GetRef<Call>(call), arg_types);
+        return rtype;
+      }
+    }
+
+    return GeneralCall(call, arg_types);
+  }
+
+  Type VisitExpr_(const FunctionNode* f) final {
+    for (auto param : f->params) {
+      GetType(param);
+    }
+    Type rtype = GetType(f->body);
+    // Run solver using the currently known information
+    solver_.Solve();
+    // Trying to resolve
+    Array<Type> arg_types;
+    for (size_t i = 0; i < f->params.size(); ++i) {
+      Type atype = solver_.Resolve(GetType(f->params[i]));
+      CHECK(atype.as<IncompleteTypeNode>() == nullptr)
+          << "Cannot resolve type of " << i
+          << "-th parameter of function at" << f->span;
+      arg_types.push_back(atype);
+    }
+    rtype = solver_.Resolve(rtype);
+    CHECK(rtype.as<IncompleteTypeNode>() == nullptr)
+        << "Cannot resolve return type of function at" << f->span;
+    // do not support constraint lifting for now.
+    return FuncTypeNode::make(arg_types, rtype, f->type_params, {});
+  }
+};
+
+class TypeInferencer::Resolver : public ExprMutator {
+ public:
+  Resolver(const std::unordered_map<Expr, ResolvedTypeInfo, NodeHash, NodeEqual>& tmap,
+           TypeSolver* solver)
+      : tmap_(tmap), solver_(solver) {
+  }
+
+  Expr VisitExpr_(const VarNode* op) final {
+    return AttachCheckedType(op);
+  }
+
+  Expr VisitExpr_(const ConstantNode* op) final {
+    return AttachCheckedType(op);
+  }
+
+  Expr VisitExpr_(const GlobalVarNode* op) final {
+    return GetRef<GlobalVar>(op);
+  }
+
+  Expr VisitExpr_(const OpNode* op) final {
+    return ExprMutator::VisitExpr_(op);
+  }
+
+  Expr VisitExpr_(const TupleNode* op) final {
+    return AttachCheckedType(op);
+  }
+
+  Expr VisitExpr_(const TupleGetItemNode* op) final {
+    return AttachCheckedType(op);
+  }
+
+  Expr VisitExpr_(const FunctionNode* op) final {
+    return AttachCheckedType(op);
+  }
+
+  Expr VisitExpr_(const CallNode* op) final {
+    return AttachCheckedType(op);
+  }
+
+  Expr VisitExpr_(const LetNode* op) final {
+    return AttachCheckedType(op);
+  }
+
+  Expr VisitExpr_(const IfNode* op) final {
+    return AttachCheckedType(op);
+  }
+
+  // attach checked type to the mutated node.
+  template<typename T>
+  Expr AttachCheckedType(const T* op) {
+    auto it = tmap_.find(GetRef<Expr>(op));
+    CHECK(it != tmap_.end());
+    Type checked_type = solver_->Resolve(it->second.checked_type);
+    CHECK(checked_type.as<IncompleteTypeNode>() == nullptr)
+        << "Cannot resolve type of " << GetRef<Expr>(op)
+        << " at " << op->span;
+
+    Expr new_e = ExprMutator::VisitExpr_(op);
+    // new_call and new_var's code is only going to be valid for VarNode/CallNode.
+    // Compiler optimization will likely fold these away for other nodes.
+    CallNode* new_call =(
+        std::is_base_of<CallNode, T>::value ?
+        static_cast<CallNode*>(new_e.node_.get()) : nullptr);
+    VarNode* new_var =(
+        std::is_base_of<VarNode, T>::value ?
+        static_cast<VarNode*>(new_e.node_.get()) : nullptr);
+    FunctionNode* new_fn =(
+        std::is_base_of<FunctionNode, T>::value ?
+        static_cast<FunctionNode*>(new_e.node_.get()) : nullptr);
+
+    // check if we need update the new_e
+    bool need_update_type = !checked_type.same_as(new_e->checked_type_);
+    bool need_update_call = (
+        std::is_base_of<CallNode, T>::value &&
+        it->second.type_args.defined() &&
+        !it->second.type_args.same_as(new_call->type_args));
+    bool need_update_var = (
+        std::is_base_of<VarNode, T>::value &&
+        update_missing_type_annotation_ &&
+        !new_var->type_annotation.defined());
+
+    bool need_update_fn = (
+        std::is_base_of<FunctionNode, T>::value &&
+        update_missing_type_annotation_ &&
+        !new_fn->ret_type.defined());
+
+    if (!need_update_type &&
+        !need_update_var &&
+        !need_update_call &&
+        !need_update_fn) {
+      return new_e;
+    }
+
+    if (!new_e.node_.unique()) {
+      // Copy on write optimization
+      // If new_e is an old expression,
+      // we make a copy mutating an existing reference.
+      new_e = Expr(make_node<T>(*new_e.as<T>()));
+      new_call = (
+          std::is_base_of<CallNode, T>::value ?
+          static_cast<CallNode*>(new_e.node_.get()) : nullptr);
+      new_var = (
+          std::is_base_of<VarNode, T>::value ?
+          static_cast<VarNode*>(new_e.node_.get()) : nullptr);
+      new_fn = (
+          std::is_base_of<FunctionNode, T>::value ?
+          static_cast<FunctionNode*>(new_e.node_.get()) : nullptr);
+    }
+
+    // attach the information.
+    if (need_update_type) {
+      new_e->checked_type_ = checked_type;
+    }
+
+    if (need_update_call) {
+      new_call->type_args = it->second.type_args;
+      for (size_t i = 0; i < new_call->type_args.size(); i++) {
+        new_call->type_args.Set(i, solver_->Resolve(new_call->type_args[i]));
+      }
+    }
+    if (need_update_var) {
+      new_var->type_annotation = checked_type;
+    }
+    if (need_update_fn) {
+      auto* fn_type = checked_type.as<FuncTypeNode>();
+      CHECK(fn_type != nullptr);
+      new_fn->ret_type = fn_type->ret_type;
+    }
+    return new_e;
+  }
+
+  Type VisitType(const Type &t) final {
+    return solver_->Resolve(t);
+  }
+
+ private:
+  const std::unordered_map<Expr, ResolvedTypeInfo, NodeHash, NodeEqual>& tmap_;
+  TypeSolver* solver_;
+  // whether attach the checked type as type_annotation
+  // if original type anntation is missing.
+  bool update_missing_type_annotation_{true};
+};
+
+
+Expr TypeInferencer::Infer(Expr expr) {
+  // Step 0: Populate the constraints.
+  GetType(expr);
+  // Step 1: Solve the constraints.
+  solver_.Solve();
+  // Step 2: Attach resolved types to checked_type field.
+  auto resolved_expr = Resolver(type_map_, &solver_).VisitExpr(expr);
+  CHECK(WellFormed(resolved_expr));
+  return resolved_expr;
+}
+
+
+Expr InferType(const Expr& expr, const Module& mod) {
+  auto e = TypeInferencer(mod).Infer(expr);
+  CHECK(WellFormed(e));
+  return e;
+}
+
+Function InferType(const Function& func,
+                   const Module& mod,
+                   const GlobalVar& var) {
+  Function func_copy = Function(make_node<FunctionNode>(*func.operator->()));
+  func_copy->checked_type_ = func_copy->func_type_annotation();
+  mod->functions.Set(var, func_copy);
+  Expr func_ret = TypeInferencer(mod).Infer(func_copy);
+  auto map_node = mod->functions.CopyOnWrite();
+  map_node->data.erase(var.node_);
+  CHECK(WellFormed(func_ret));
+  return Downcast<Function>(func_ret);
+}
+
+TVM_REGISTER_API("relay._ir_pass.infer_type")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = InferType(args[0], args[1]);
+  });
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/type_solver.cc b/src/relay/pass/type_solver.cc
new file mode 100644
index 000000000000..e1efcbbdd0b9
--- /dev/null
+++ b/src/relay/pass/type_solver.cc
@@ -0,0 +1,183 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file type_solver.cc
+ * \brief Type solver implementations.
+ */
+#include <string>
+#include "type_solver.h"
+
+namespace tvm {
+namespace relay {
+
+class TypeSolver::Reporter : public TypeReporterNode {
+ public:
+  explicit Reporter(TypeSolver* solver)
+      : solver_(solver) {}
+
+  void Assign(const Type& dst, const Type& src) final {
+    solver_->Unify(dst, src);
+  }
+
+  bool Assert(const IndexExpr& cond) final {
+    if (const uint64_t* pdiff = as_const_uint(cond)) {
+      return pdiff[0];
+    }
+    return true;
+  }
+
+  bool AssertEQ(const IndexExpr& lhs, const IndexExpr& rhs) final {
+    // early warning constant case.
+    IndexExpr diff = lhs - rhs;
+    if (const int64_t* pdiff = as_const_int(diff)) {
+      return pdiff[0] == 0;
+    }
+    return true;
+  }
+
+ private:
+  TypeSolver* solver_;
+};
+
+// constructor
+TypeSolver::TypeSolver()
+    : reporter_(make_node<Reporter>(this)) {
+}
+
+// destructor
+TypeSolver::~TypeSolver() {
+  // call destructor of all non-POD arena object
+  for (TypeNode* ptr : type_nodes_) {
+    ptr->~TypeNode();
+  }
+  for (RelationNode* ptr : rel_nodes_) {
+    ptr->~RelationNode();
+  }
+}
+
+// Add equality constraint
+Type TypeSolver::Unify(const Type& dst, const Type& src) {
+  // Known limitation
+  // - handle composite types whose component can be unknown.
+  // - handle shape pattern matching
+  TypeNode* lhs = GetTypeNode(dst);
+  TypeNode* rhs = GetTypeNode(src);
+
+  // do occur check so we don't create self-referencing structure
+  if (lhs->FindRoot() == rhs->FindRoot()) {
+    return lhs->resolved_type;
+  }
+  if (lhs->resolved_type.as<IncompleteTypeNode>()) {
+    MergeFromTo(lhs, rhs);
+    return rhs->resolved_type;
+  } else if (rhs->resolved_type.as<IncompleteTypeNode>()) {
+    MergeFromTo(rhs, lhs);
+    return lhs->resolved_type;
+  } else {
+    lhs->parent = rhs;
+    CHECK(AlphaEqual(lhs->resolved_type, rhs->resolved_type))
+        << "Incompatible parent types in UF:"
+        << lhs->resolved_type << " and " << rhs->resolved_type;
+    return rhs->resolved_type;
+  }
+}
+
+// Add type constraint to the solver.
+void TypeSolver::AddConstraint(const TypeConstraint& constraint) {
+  if (auto *op = constraint.as<TypeRelationNode>()) {
+    // create a new relation node.
+    RelationNode* rnode = arena_.make<RelationNode>();
+    rnode->rel = GetRef<TypeRelation>(op);
+    rel_nodes_.push_back(rnode);
+    // populate the type information.
+    for (size_t i = 0; i < op->args.size(); ++i) {
+      // insert link to the type list
+      LinkNode<TypeNode*>* tlink = arena_.make<LinkNode<TypeNode*> >();
+      TypeNode* tnode = GetTypeNode(op->args[i]);
+      tlink->value = tnode;
+      rnode->type_list.Push(tlink);
+      // insert type->relation node
+      LinkNode<RelationNode*>* rlink = arena_.make<LinkNode<RelationNode*> >();
+      rlink->value = rnode;
+      tnode->rel_list.Push(rlink);
+    }
+    // add the relation to the working queue.
+    this->AddToQueue(rnode);
+  } else {
+    LOG(FATAL) << "Do not know how to handle constraint type"
+               << constraint->type_key();
+  }
+}
+
+// Resolve a type in the solver context.
+Type TypeSolver::Resolve(const Type& type) {
+  auto it = tmap_.find(type);
+  if (it != tmap_.end()) {
+    return it->second->FindRoot()->resolved_type;
+  } else {
+    return type;
+  }
+}
+
+bool TypeSolver::Solve() {
+  // update until queue is empty
+  while (!update_queue_.empty()) {
+    RelationNode* rnode = update_queue_.front();
+    const auto& rel = rnode->rel;
+    update_queue_.pop();
+    CHECK(!rnode->resolved);
+    // update the relation with given evidence.
+    Array<Type> args;
+    for (auto* tlink = rnode->type_list.head; tlink != nullptr; tlink = tlink->next) {
+      args.push_back(tlink->value->FindRoot()->resolved_type);
+      CHECK_LE(args.size(), rel->args.size());
+    }
+    // call the function
+    bool resolved = rel->func(args, rel->num_inputs, rel->attrs, reporter_);
+    // mark inqueue as false after the function call
+    // so that rnode itself won't get enqueued again.
+    rnode->inqueue = false;
+
+    if (resolved) {
+      ++num_resolved_rels_;
+    }
+    rnode->resolved = resolved;
+  }
+  // This criterion is not necessarily right for all the possible cases
+  // TODO(tqchen): We should also count the number of in-complete types.
+  return num_resolved_rels_ == rel_nodes_.size();
+}
+
+
+// Expose type solver only for debugging purposes.
+TVM_REGISTER_API("relay._ir_pass._test_type_solver")
+.set_body([](runtime::TVMArgs args, runtime::TVMRetValue* ret) {
+    using runtime::PackedFunc;
+    using runtime::TypedPackedFunc;
+    auto solver = std::make_shared<TypeSolver>();
+
+    auto mod = [solver](std::string name) -> PackedFunc {
+      if (name == "Solve") {
+        return TypedPackedFunc<bool()>([solver]() {
+            return solver->Solve();
+          });
+      } else if (name == "Unify") {
+        return TypedPackedFunc<void(Type, Type)>([solver](Type lhs, Type rhs) {
+            solver->Unify(lhs, rhs);
+          });
+      } else if (name == "Resolve") {
+        return TypedPackedFunc<Type(Type)>([solver](Type t) {
+            return solver->Resolve(t);
+          });
+      } else if (name == "AddConstraint") {
+        return TypedPackedFunc<void(TypeConstraint)>([solver](TypeConstraint c) {
+            return solver->AddConstraint(c);
+          });
+      } else {
+        return PackedFunc();
+      }
+    };
+    *ret = runtime::TypedPackedFunc<runtime::PackedFunc(std::string)>(mod);
+  });
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/type_solver.h b/src/relay/pass/type_solver.h
new file mode 100644
index 000000000000..2f311c9b9810
--- /dev/null
+++ b/src/relay/pass/type_solver.h
@@ -0,0 +1,186 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file type_solver.h
+ * \brief Solver logic for type inference.
+ */
+#ifndef TVM_RELAY_PASS_TYPE_SOLVER_H_
+#define TVM_RELAY_PASS_TYPE_SOLVER_H_
+
+#include <tvm/relay/type.h>
+#include <tvm/relay/pass.h>
+#include <vector>
+#include <queue>
+#include "../../common/arena.h"
+
+
+namespace tvm {
+namespace relay {
+
+using common::LinkNode;
+using common::LinkedList;
+/*!
+ * \brief Interface of type solver used in type inference.
+ *
+ * TypeSolver works on a list of constraints among incomplete types.
+ * The user will populate the constraints by AddConstraint and Assign.
+ * Then we can call Solve to trying to resolve the unknown.
+ *
+ * This can be viewed as "type program(computational graph)" of types, where
+ * the type constraint are operators of the graph and the incomplete
+ * types are intermediate value of the graph.
+ * If all the input types are concretely known, we should be able to
+ * just run a forward pass on the "type program" to get all the types.
+ *
+ * The list of constraints representation means we are storing it as a bipartite
+ * graph instead of a DAG. This is because some constraints might go both direction.
+ * TypeSolver could take advantage of bidirectional constraints to deduce input
+ * value given output ones. Never-the-less, we should keep in mind that
+ * there is a "forward direction" that the TypeSolver should take advantage of.
+ */
+class TypeSolver {
+ public:
+  TypeSolver();
+  ~TypeSolver();
+  /*!
+   * \brief Add a type constraint to the solver.
+   * \param constraint The constraint to be added.
+   */
+  void AddConstraint(const TypeConstraint& constraint);
+  /*!
+   * \brief Resolve type to the solution type in the solver.
+   * \param type The type to be resolved.
+   * \return The resolved type.
+   */
+  Type Resolve(const Type& type);
+  /*!
+   * \brief Start to solve the types using the current known information.
+   * \return Whether all the incomplete types has been fully resolved.
+   */
+  bool Solve();
+  /*!
+   * \brief Unify lhs and rhs.
+   * \param lhs The left operand.
+   * \param rhs The right operand
+   */
+  Type Unify(const Type& lhs, const Type& rhs);
+
+ private:
+  class Reporter;
+  struct TypeNode;
+  struct RelationNode;
+  // Internally the solver maintains a bipartite graph of Relation and Types.
+  // All the object in the structure is managed by a arena allocator
+  // which releases the memory upon distruction of the type solver.
+  /*!
+   * \brief type node struct
+   *  TypeNode implements a union-find data structure(via parent)
+   *  that can unifies the same types to the name resolved_type.
+   *
+   *  It also contains collection of links to related Relations,
+   *  which is stored in rel_list.
+   */
+  struct TypeNode {
+    /*! \brief The final resolved type */
+    Type resolved_type;
+    /*! \brief type node in the union find algorithm */
+    TypeNode* parent{nullptr};
+    /*! \brief list of relations that is related to this type node */
+    LinkedList<RelationNode*> rel_list;
+    /*!
+     * \brief Find the root type node, perform path compression
+     * \return The root type node.
+     */
+    TypeNode* FindRoot() {
+      // fast path
+      if (this->parent == nullptr) return this;
+      // slow path with path compression.
+      TypeNode* root = this;
+      while (root->parent != nullptr) {
+        root = root->parent;
+      }
+      for (TypeNode* p = this; p != root;) {
+        TypeNode* parent = p->parent;
+        p->parent = root;
+        p = parent;
+      }
+      return root;
+    }
+  };
+  /*! \brief relation node */
+  struct RelationNode {
+    /*! \brief Whether the relation is in the queue to be solved */
+    bool inqueue{false};
+    /*! \brief Whether the relation is resolved */
+    bool resolved{false};
+    /*! \brief The corresponding type relation */
+    TypeRelation rel;
+    /*! \brief list types to this relation */
+    LinkedList<TypeNode*> type_list;
+  };
+  /*! \brief List of all allocated type nodes */
+  std::vector<TypeNode*> type_nodes_;
+  /*! \brief List of all allocated relation nodes */
+  std::vector<RelationNode*> rel_nodes_;
+  /*! \brief Number of resolved relations */
+  size_t num_resolved_rels_{0};
+  /*! \brief map from type node to types. */
+  std::unordered_map<Type, TypeNode*, NodeHash, NodeEqual> tmap_;
+  /*! \breif Internal queue to update the relation */
+  std::queue<RelationNode*> update_queue_;
+  /*! \brief allocator of all the internal node obhect*/
+  common::Arena arena_;
+  /*! \brief Reporter that reports back to self */
+  TypeReporter reporter_;
+  /*!
+   * \brief GetTypeNode that is corresponds to t.
+   * if it do not exist, create a new one.
+   * \return The type node.
+   */
+  TypeNode* GetTypeNode(const Type& t) {
+    auto it = tmap_.find(t);
+    if (it != tmap_.end()) {
+      return it->second->FindRoot();
+    } else {
+      TypeNode* n = arena_.make<TypeNode>();
+      type_nodes_.push_back(n);
+      n->resolved_type = t;
+      tmap_[t] = n;
+      return n;
+    }
+  }
+  /*!
+   * \brief Add relation node rel to the update queue
+   * \param rel The relation node
+   */
+  void AddToQueue(RelationNode* rel) {
+    if (rel->inqueue) return;
+    CHECK(!rel->resolved);
+    rel->inqueue = true;
+    update_queue_.push(rel);
+  }
+  /*!
+   * \brief Merge rhs type node to lhs
+   * \param src The source operand
+   * \param dst The dst operand.
+   */
+  void MergeFromTo(TypeNode* src, TypeNode* dst) {
+    if (src == dst) return;
+    src->parent = dst;
+    // move the link to the to dst
+    for (auto* rlink = src->rel_list.head; rlink != nullptr;) {
+      // store next pointer first before rlink get moved
+      auto* next = rlink->next;
+      // if the relation is not yet resolved
+      // send the relation to the new
+      if (!rlink->value->resolved) {
+        this->AddToQueue(rlink->value);
+        dst->rel_list.Push(rlink);
+      }
+      rlink = next;
+    }
+  }
+};
+
+}  // namespace relay
+}  // namespace tvm
+#endif  // TVM_RELAY_PASS_TYPE_SOLVER_H_
diff --git a/src/relay/pass/util.cc b/src/relay/pass/util.cc
new file mode 100644
index 000000000000..b99d975135be
--- /dev/null
+++ b/src/relay/pass/util.cc
@@ -0,0 +1,212 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ *
+ * \file util.cc
+ *
+ * \brief Utility functions for Relay.
+ */
+#include <tvm/relay/pass.h>
+#include <tvm/relay/expr_functor.h>
+#include "../ir/type_functor.h"
+
+namespace tvm {
+namespace relay {
+
+// FreeTypeVar
+class FreeTypeVarTVisitor : public TypeVisitor {
+ public:
+  FreeTypeVarTVisitor(
+      Array<TypeVar>* free_vars,
+      std::unordered_set<TypeVar, NodeHash, NodeEqual>* bound_vars)
+      : free_vars_(free_vars), bound_vars_(bound_vars) { }
+
+  void VisitType_(const TypeVarNode* tp) final {
+    TypeVar var = GetRef<TypeVar>(tp);
+    if (bound_vars_->count(var) == 0) {
+      free_vars_->push_back(var);
+    }
+  }
+
+  void VisitType_(const FuncTypeNode* f) final {
+    for (auto type_param : f->type_params) {
+      bound_vars_->insert(type_param);
+    }
+    TypeVisitor::VisitType_(f);
+  }
+
+ private:
+  Array<TypeVar>* free_vars_;
+  std::unordered_set<TypeVar, NodeHash, NodeEqual>* bound_vars_;
+};
+
+class FreeTypeVarEVisitor : private ExprVisitor {
+ public:
+  Array<TypeVar> Find(const Expr& expr) {
+    this->VisitExpr(expr);
+    return free_vars_;
+  }
+
+  Array<TypeVar> Find(const Type& type) {
+    this->VisitType(type);
+    return free_vars_;
+  }
+
+  void VisitExpr_(const FunctionNode* f) final {
+    for (const auto& tp : f->type_params) {
+      bound_vars_.insert(tp);
+    }
+    ExprVisitor::VisitExpr_(f);
+  }
+
+  void VisitType(const Type& t) final {
+    FreeTypeVarTVisitor(&free_vars_, &bound_vars_)
+        .VisitType(t);
+  }
+
+ private:
+  // The result list
+  Array<TypeVar> free_vars_;
+  std::unordered_set<TypeVar, NodeHash, NodeEqual> bound_vars_;
+};
+
+class FreeVarVisitor : protected ExprVisitor {
+ public:
+  Array<Var> Find(const Expr& expr) {
+    this->VisitExpr(expr);
+    return free_vars_;
+  }
+
+  void VisitExpr_(const VarNode* var) final {
+    if (bound_vars_.count(var) == 0) {
+      free_vars_.push_back(GetRef<Var>(var));
+    }
+  }
+
+  void VisitExpr_(const FunctionNode* op) final {
+    for (const auto& param : op->params) {
+      bound_vars_.insert(param.operator->());
+    }
+    VisitExpr(op->body);
+  }
+
+  void VisitExpr_(const LetNode* op) final {
+    bound_vars_.insert(op->var.operator->());
+    VisitExpr(op->value);
+    VisitExpr(op->body);
+  }
+
+ private:
+  // The result list
+  Array<Var> free_vars_;
+  std::unordered_set<const VarNode*> bound_vars_;
+};
+
+tvm::Array<TypeVar> FreeTypeVars(const Expr& expr) {
+  return FreeTypeVarEVisitor().Find(expr);
+}
+
+tvm::Array<TypeVar> FreeTypeVars(const Type& type) {
+  return FreeTypeVarEVisitor().Find(type);
+}
+
+tvm::Array<Var> FreeVars(const Expr& expr) {
+  return FreeVarVisitor().Find(expr);
+}
+
+TVM_REGISTER_API("relay._ir_pass.free_vars")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    *ret = FreeVars(args[0]);
+  });
+
+TVM_REGISTER_API("relay._ir_pass.free_type_vars")
+.set_body([](TVMArgs args, TVMRetValue* ret) {
+    NodeRef x = args[0];
+    if (x.as<TypeNode>()) {
+      *ret = FreeTypeVars(Downcast<Type>(x));
+    } else {
+      *ret = FreeTypeVars(Downcast<Expr>(x));
+    }
+  });
+
+/*!
+ * \brief Get reference counter of each internal ExprNode in body.
+ * \param body The body expression.
+ * \return The reference count mapping.
+ */
+std::unordered_map<const Node*, size_t>
+GetExprRefCount(const Expr& body) {
+  class ExprRefCounter : private ExprVisitor {
+   public:
+    std::unordered_map<const Node*, size_t>
+    Get(const Expr& body) {
+      this->VisitExpr(body);
+      return std::move(this->visit_counter_);
+    }
+  };
+  return ExprRefCounter().Get(body);
+}
+
+template <typename T>
+bool IsNDArrayAllGreaterEqual(const runtime::NDArray& tensor, T value) {
+  CHECK_EQ(tensor->ctx.device_type, kDLCPU);
+  CHECK(tensor->strides == nullptr);
+  CHECK_EQ(tensor->byte_offset, 0);
+  const T* data = static_cast<const T*>(tensor->data);
+  int64_t num_elems = 1;
+  for (int i = 0; i < tensor->ndim; ++i) {
+    num_elems *= tensor->shape[i];
+  }
+
+  for (int64_t i = 0; i < num_elems; i++) {
+    if (*data < value) {
+      return false;
+    }
+    data++;
+  }
+  return true;
+}
+
+bool IsAllPositiveConstant(const Expr& expr) {
+  // peel through a few common transform ops.
+  static const auto& expand_dims = Op::Get("expand_dims");
+  static const auto& reshape = Op::Get("reshape");
+  static const auto& transpose = Op::Get("transpose");
+  static const auto& squeeze = Op::Get("squeeze");
+
+  if (const auto* constant = expr.as<ConstantNode>()) {
+    const auto& tensor = constant->data;
+    const auto& dtype = tensor->dtype;
+    if (dtype.lanes != 1) {
+      return false;
+    } else if (dtype.code == kDLFloat && dtype.bits == 32) {
+      return IsNDArrayAllGreaterEqual<float>(tensor, 0);
+    } else if (dtype.code == kDLFloat && dtype.bits == 64) {
+      return IsNDArrayAllGreaterEqual<double>(tensor, 0);
+    } else if (dtype.code == kDLInt && dtype.bits == 8) {
+      return IsNDArrayAllGreaterEqual<int8_t>(tensor, 0);
+    } else if (dtype.code == kDLInt && dtype.bits == 32) {
+      return IsNDArrayAllGreaterEqual<int32_t>(tensor, 0);
+    } else if (dtype.code == kDLUInt && dtype.bits == 8) {
+      return IsNDArrayAllGreaterEqual<uint8_t>(tensor, 0);
+    } else if (dtype.code == kDLUInt && dtype.bits == 32) {
+      return IsNDArrayAllGreaterEqual<uint32_t>(tensor, 0);
+    } else {
+      return false;
+    }
+  } else if (const auto* op = expr.as<CallNode>()) {
+    // tail recursion.
+    if (op->op.same_as(expand_dims) ||
+        op->op.same_as(reshape) ||
+        op->op.same_as(transpose) ||
+        op->op.same_as(squeeze)) {
+      return IsAllPositiveConstant(op->args[0]);
+    } else {
+      return false;
+    }
+  } else {
+    return false;
+  }
+}
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/pass/well_formed.cc b/src/relay/pass/well_formed.cc
new file mode 100644
index 000000000000..d9c6b617ca5f
--- /dev/null
+++ b/src/relay/pass/well_formed.cc
@@ -0,0 +1,60 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file well_formed.cc
+ * \brief check that expression is well formed.
+ */
+#include <tvm/relay/pass.h>
+#include <tvm/relay/expr_functor.h>
+#include <unordered_set>
+
+namespace tvm {
+namespace relay {
+
+
+//! brief make sure each Var is bind at most once.
+class WellFormedChecker : private ExprVisitor {
+  bool well_formed = true;
+
+  std::unordered_set<Var, NodeHash, NodeEqual> s;
+
+  void Check(const Var& v) {
+    if (s.count(v) != 0) {
+      well_formed = false;
+    }
+    s.insert(v);
+  }
+
+  void VisitExpr_(const LetNode* l) final {
+    // we do letrec only for FunctionNode,
+    // but shadowing let in let binding is likely programming error, and we should forbidden it.
+    Check(l->var);
+    CheckWellFormed(l->value);
+    CheckWellFormed(l->body);
+  }
+
+  void VisitExpr_(const FunctionNode* f) final {
+    for (const Var& param : f->params) {
+      Check(param);
+    }
+    CheckWellFormed(f->body);
+  }
+
+ public:
+  bool CheckWellFormed(const Expr& e) {
+    this->VisitExpr(e);
+    return well_formed;
+  }
+};
+
+bool WellFormed(const Expr& e) {
+  return WellFormedChecker().CheckWellFormed(e);
+}
+
+TVM_REGISTER_API("relay._ir_pass.well_formed")
+  .set_body([](TVMArgs args, TVMRetValue *ret) {
+      Expr e = args[0];
+      *ret = WellFormed(e);
+    });
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/runtime/builtin_fp16.cc b/src/runtime/builtin_fp16.cc
new file mode 100644
index 000000000000..c920c9571f38
--- /dev/null
+++ b/src/runtime/builtin_fp16.cc
@@ -0,0 +1,23 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file builtin_fp16.cc
+ * \brief Functions for conversion between fp32 and fp16
+*/
+#include <builtin_fp16.h>
+#include <tvm/runtime/c_runtime_api.h>
+
+extern "C" {
+
+// disable under msvc
+#ifndef _MSC_VER
+
+TVM_WEAK uint16_t __gnu_f2h_ieee(float a) {
+  return __truncXfYf2__<float, uint32_t, 23, uint16_t, uint16_t, 10>(a);
+}
+
+TVM_WEAK float __gnu_h2f_ieee(uint16_t a) {
+  return __extendXfYf2__<uint16_t, uint16_t, 10, float, uint32_t, 23>(a);
+}
+
+#endif
+}
diff --git a/src/runtime/c_dsl_api.cc b/src/runtime/c_dsl_api.cc
index 6ae8b9911a4c..ae39a1266d06 100644
--- a/src/runtime/c_dsl_api.cc
+++ b/src/runtime/c_dsl_api.cc
@@ -5,8 +5,8 @@
  */
 #include <tvm/runtime/registry.h>
 #include <tvm/c_dsl_api.h>
-#include "./dsl_api.h"
-#include "./runtime_base.h"
+#include "dsl_api.h"
+#include "runtime_base.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc
index 916dfadecb4c..d9435d33903d 100644
--- a/src/runtime/c_runtime_api.cc
+++ b/src/runtime/c_runtime_api.cc
@@ -17,32 +17,11 @@
 #include <algorithm>
 #include <string>
 #include <cstdlib>
-#include "./runtime_base.h"
+#include "runtime_base.h"
 
 namespace tvm {
 namespace runtime {
 
-/*!
- * \brief The name of Device API factory.
- * \param type The device type.
- */
-inline std::string DeviceName(int type) {
-  switch (type) {
-    case kDLCPU: return "cpu";
-    case kDLGPU: return "gpu";
-    case kDLOpenCL: return "opencl";
-    case kDLSDAccel: return "sdaccel";
-    case kDLAOCL: return "aocl";
-    case kDLVulkan: return "vulkan";
-    case kDLMetal: return "metal";
-    case kDLVPI: return "vpi";
-    case kDLROCM: return "rocm";
-    case kOpenGL: return "opengl";
-    case kExtDev: return "ext_dev";
-    default: LOG(FATAL) << "unknown type =" << type; return "Unknown";
-  }
-}
-
 class DeviceAPIManager {
  public:
   static const int kMaxDeviceAPI = 32;
diff --git a/src/runtime/cpu_device_api.cc b/src/runtime/cpu_device_api.cc
index e3434e01813e..d166a3a43dfa 100644
--- a/src/runtime/cpu_device_api.cc
+++ b/src/runtime/cpu_device_api.cc
@@ -8,7 +8,7 @@
 #include <tvm/runtime/device_api.h>
 #include <cstdlib>
 #include <cstring>
-#include "./workspace_pool.h"
+#include "workspace_pool.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc
index 98accdf1b0aa..8309b45a7963 100644
--- a/src/runtime/cuda/cuda_device_api.cc
+++ b/src/runtime/cuda/cuda_device_api.cc
@@ -8,7 +8,7 @@
 #include <dmlc/thread_local.h>
 #include <tvm/runtime/registry.h>
 #include <cuda_runtime.h>
-#include "./cuda_common.h"
+#include "cuda_common.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/cuda/cuda_module.cc b/src/runtime/cuda/cuda_module.cc
index a0e613107bae..f818a78345bb 100644
--- a/src/runtime/cuda/cuda_module.cc
+++ b/src/runtime/cuda/cuda_module.cc
@@ -2,7 +2,7 @@
  *  Copyright (c) 2017 by Contributors
  * \file cuda_module.cc
  */
-#include "./cuda_module.h"
+#include "cuda_module.h"
 
 #include <tvm/runtime/registry.h>
 #include <cuda.h>
@@ -11,7 +11,7 @@
 #include <array>
 #include <string>
 #include <mutex>
-#include "./cuda_common.h"
+#include "cuda_common.h"
 #include "../pack_args.h"
 #include "../thread_storage_scope.h"
 #include "../meta_data.h"
@@ -141,7 +141,7 @@ class CUDAModuleNode : public runtime::ModuleNode {
   std::mutex mutex_;
 };
 
-// a wrapped function class to get packed fucn.
+// a wrapped function class to get packed func.
 class CUDAWrappedFunc {
  public:
   // initialize the CUDA function.
diff --git a/src/runtime/dsl_api.h b/src/runtime/dsl_api.h
index a1d6e48ceb2f..3e1299bd8c96 100644
--- a/src/runtime/dsl_api.h
+++ b/src/runtime/dsl_api.h
@@ -16,6 +16,7 @@ namespace runtime {
  */
 class DSLAPI {
  public:
+  virtual ~DSLAPI() = default;
   virtual void NodeFree(NodeHandle handle) const = 0;
 
   virtual void NodeTypeKey2Index(const char* type_key,
diff --git a/src/runtime/dso_module.cc b/src/runtime/dso_module.cc
index 60fdb427c246..fe7c362472d1 100644
--- a/src/runtime/dso_module.cc
+++ b/src/runtime/dso_module.cc
@@ -6,7 +6,7 @@
 #include <tvm/runtime/module.h>
 #include <tvm/runtime/registry.h>
 #include <tvm/runtime/packed_func.h>
-#include "./module_util.h"
+#include "module_util.h"
 
 #if defined(_WIN32)
 #include <windows.h>
diff --git a/src/runtime/file_util.cc b/src/runtime/file_util.cc
index 7606bf89cd92..ff579d12112d 100644
--- a/src/runtime/file_util.cc
+++ b/src/runtime/file_util.cc
@@ -6,8 +6,9 @@
 #include <dmlc/logging.h>
 #include <tvm/runtime/serializer.h>
 #include <fstream>
+#include <vector>
 
-#include "./file_util.h"
+#include "file_util.h"
 
 namespace tvm {
 namespace runtime {
@@ -141,5 +142,9 @@ void LoadMetaDataFromFile(
   fs.close();
 }
 
+void RemoveFile(const std::string& file_name) {
+  std::remove(file_name.c_str());
+}
+
 }  // namespace runtime
 }  // namespace tvm
diff --git a/src/runtime/file_util.h b/src/runtime/file_util.h
index b3357271856e..2b797614281b 100644
--- a/src/runtime/file_util.h
+++ b/src/runtime/file_util.h
@@ -7,7 +7,7 @@
 #define TVM_RUNTIME_FILE_UTIL_H_
 
 #include <string>
-#include "./meta_data.h"
+#include "meta_data.h"
 
 namespace tvm {
 namespace runtime {
@@ -71,6 +71,12 @@ void SaveMetaDataToFile(
 void LoadMetaDataFromFile(
     const std::string& file_name,
     std::unordered_map<std::string, FunctionInfo>* fmap);
+
+/*!
+ * \brief Remove (unlink) a file.
+ * \param file_name The file name.
+ */
+void RemoveFile(const std::string& file_name);
 }  // namespace runtime
 }  // namespace tvm
 #endif  // TVM_RUNTIME_FILE_UTIL_H_
diff --git a/src/runtime/graph/debug/graph_runtime_debug.cc b/src/runtime/graph/debug/graph_runtime_debug.cc
new file mode 100644
index 000000000000..452a48408ccf
--- /dev/null
+++ b/src/runtime/graph/debug/graph_runtime_debug.cc
@@ -0,0 +1,163 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file graph_runtime_debug.cc
+ */
+#include <tvm/runtime/packed_func.h>
+#include <tvm/runtime/registry.h>
+#include <tvm/runtime/ndarray.h>
+#include <chrono>
+#include "../graph_runtime.h"
+
+namespace tvm {
+namespace runtime {
+
+/*!
+ * \brief Graph runtime with debug .
+ *
+ *  This is the extension of GraphRuntime class used for debugging
+ *  TVM runtime PackedFunc API.
+ */
+class GraphRuntimeDebug : public GraphRuntime {
+ public:
+  /*!
+   * \brief Run each operation and get the output.
+   * \param index The index of op which needs to be run.
+   * \return the elapsed time.
+   */
+  double DebugRun(size_t index) {
+    CHECK(index < op_execs().size());
+    TVMContext ctx = data_entry()[GetEntryId(index, 0)].operator->()->ctx;
+    auto tbegin = std::chrono::high_resolution_clock::now();
+    if (op_execs()[index]) {
+      op_execs()[index]();
+    }
+    TVMSynchronize(ctx.device_type, ctx.device_id, nullptr);
+    auto tend = std::chrono::high_resolution_clock::now();
+    double time = std::chrono::duration_cast<std::chrono::duration<double> >(
+        tend - tbegin).count();
+    return time;
+  }
+
+  /*!
+   * \brief Run each operation and get the output.
+   * \param index The index of op which needs to be returned.
+   * \param eid The Entry id of the op.
+   */
+  NDArray GetOutputByLayer(int index, int eid) {
+    return data_entry()[GetEntryId(index, eid)];
+  }
+
+  /*!
+   * \brief GetFunction Get the function based on input.
+   * \param name The function which needs to be invoked.
+   * \param sptr_to_self Packed function pointer.
+   */
+  PackedFunc GetFunction(const std::string& name,
+                         const std::shared_ptr<ModuleNode>& sptr_to_self);
+
+  /*!
+   * \brief Get the node index given the name of node.
+   * \param name The name of the node.
+   * \return The index of node.
+   */
+  int GetNodeIndex(const std::string& name) const {
+    for (size_t nid = 0; nid < GetNumOfNodes(); ++nid) {
+      if (GetNodeName(nid) == name) {
+        return static_cast<int>(nid);
+      }
+    }
+    LOG(FATAL) << "cannot find " << name << " among nodex";
+    return -1;
+}
+
+/*!
+ * \brief Copy index-th node to data_out.
+ *
+ * This method will do a partial run of the the graph
+ * from begining upto the index-th node and return output of index-th node.
+ * This is costly operation and suggest to use only for debug porpose.
+ *
+ * \param index: The  index of the node.
+ * \param data_out the node data.
+ */
+void DebugGetNodeOutput(int index, DLTensor* data_out) {
+  CHECK_LT(static_cast<size_t>(index), op_execs().size());
+  uint32_t eid = index;
+
+  for (size_t i = 0; i < op_execs().size(); ++i) {
+    if (op_execs()[i]) op_execs()[i]();
+    if (static_cast<int>(i) == index) break;
+  }
+
+  data_entry()[eid].CopyTo(data_out);
+}
+};
+
+
+/*!
+ * \brief GetFunction Get the function based on input.
+ * \param name The function which needs to be invoked.
+ * \param sptr_to_self Packed function pointer.
+ */
+PackedFunc GraphRuntimeDebug::GetFunction(
+    const std::string& name,
+    const std::shared_ptr<ModuleNode>& sptr_to_self) {
+  // return member functions during query.
+  if (name == "debug_run") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+        *rv = this->DebugRun(static_cast<size_t>(args[0].operator int64_t()));
+      });
+  } else if (name == "get_output_by_layer") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+        *rv = this->GetOutputByLayer(args[0], args[1]);
+      });
+  } else if (name == "debug_get_output") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+        if (args[0].type_code() == kStr) {
+          this->DebugGetNodeOutput(this->GetNodeIndex(args[0]), args[1]);
+        } else {
+          this->DebugGetNodeOutput(args[0], args[1]);
+        }
+      });
+  } else {
+    return GraphRuntime::GetFunction(name, sptr_to_self);
+  }
+}
+
+/*!
+ * \brief GraphRuntimeDebugCreate Get the function based on input.
+ * \param sym_json The graph symbol in json format.
+ * \param m Compiled module which will be loaded.
+ * \param ctxs All devices contexts.
+ */
+Module GraphRuntimeDebugCreate(const std::string& sym_json,
+                               const tvm::runtime::Module& m,
+                               const std::vector<TVMContext>& ctxs) {
+  std::shared_ptr<GraphRuntimeDebug> exec = std::make_shared<GraphRuntimeDebug>();
+  exec->Init(sym_json, m, ctxs);
+  return Module(exec);
+}
+
+TVM_REGISTER_GLOBAL("tvm.graph_runtime_debug.create")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    CHECK_GE(args.num_args, 4)
+        << "The expected number of arguments for graph_runtime.create is "
+           "at least 4, but it has "
+        << args.num_args;
+    *rv = GraphRuntimeDebugCreate(args[0], args[1], GetAllContext(args));
+  });
+
+TVM_REGISTER_GLOBAL("tvm.graph_runtime_debug.remote_create")
+  .set_body([](TVMArgs args, TVMRetValue* rv) {
+    CHECK_GE(args.num_args, 4) << "The expected number of arguments for "
+                                  "graph_runtime.remote_create is "
+                                  "at least 4, but it has "
+                               << args.num_args;
+    void* mhandle = args[1];
+    const auto& contexts = GetAllContext(args);
+    *rv = GraphRuntimeDebugCreate(
+        args[0], *static_cast<tvm::runtime::Module*>(mhandle), contexts);
+  });
+
+}  // namespace runtime
+}  // namespace tvm
diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc
index 7a75771af23b..52bd07b70f75 100644
--- a/src/runtime/graph/graph_runtime.cc
+++ b/src/runtime/graph/graph_runtime.cc
@@ -2,406 +2,135 @@
  *  Copyright (c) 2017 by Contributors
  * \file graph_runtime.cc
  */
+#include "graph_runtime.h"
+
+#include <tvm/runtime/ndarray.h>
 #include <tvm/runtime/packed_func.h>
 #include <tvm/runtime/registry.h>
-#include <tvm/runtime/ndarray.h>
-#include <dmlc/memory_io.h>
-#include <dmlc/json.h>
+#include <tvm/runtime/serializer.h>
+
+#include <algorithm>
+#include <functional>
 #include <numeric>
-#include "./graph_runtime.h"
+#include <vector>
+#include <string>
 
 namespace tvm {
 namespace runtime {
 
-/*! \brief macro to do C API call */
-#define TVM_CCALL(func)                                            \
-  {                                                                \
-    int ret = (func);                                              \
-    CHECK_EQ(ret, 0)                                               \
-        << TVMGetLastError();                                      \
-  }
-
 /*!
- * \brief Tiny graph runtime.
- *
- *  This runtime can be acccesibly in various language via
- *  TVM runtime PackedFunc API.
+ * \brief Run all the operations one by one.
  */
-class GraphRuntime : public ModuleNode {
- public:
-  ~GraphRuntime() {
-    for (DLTensor* t : storage_pool_) {
-      TVM_CCALL(TVMArrayFree(t));
-    }
-  }
-  /*!
-   * \brief Get member function to front-end
-   * \param name The name of the function.
-   * \param sptr_to_self The pointer to the module node.
-   * \return The corresponding member function.
-   */
-  PackedFunc GetFunction(
-      const std::string& name,
-      const std::shared_ptr<ModuleNode>& sptr_to_self) final;
-
-  /*!
-   * \return The type key of the executor.
-   */
-  const char* type_key() const final {
-    return "GraphRuntime";
-  }
-  void Run() {
-    // setup the array and requirements.
-    for (size_t i = 0; i < op_execs_.size(); ++i) {
-      if (op_execs_[i]) op_execs_[i]();
-    }
+void GraphRuntime::Run() {
+  // setup the array and requirements.
+  for (size_t i = 0; i < op_execs_.size(); ++i) {
+    if (op_execs_[i]) op_execs_[i]();
   }
-  /*!
-   * \brief Initialize the graph executor with graph and context.
-   * \param graph_json The execution graph.
-   * \param module The module containing the compiled functions.
-   * \param ctx The context where the graph should sit on
-   */
-  void Init(const std::string& graph_json,
-            tvm::runtime::Module module,
-            TVMContext ctx) {
+}
+/*!
+ * \brief Initialize the graph executor with graph and context.
+ * \param graph_json The execution graph.
+ * \param module The module containing the compiled functions for the host
+ * processor.
+ * \param ctxs The context of the host and devices where graph nodes will be
+ * executed on.
+ */
+void GraphRuntime::Init(const std::string& graph_json,
+                        tvm::runtime::Module module,
+                        const std::vector<TVMContext>& ctxs) {
 #ifndef _LIBCPP_SGX_NO_IOSTREAMS
-    std::istringstream is(graph_json);
+  std::istringstream is(graph_json);
 #else
-    std::string is = graph_json;
+  std::string is = graph_json;
 #endif
-    dmlc::JSONReader reader(&is);
-    this->Load(&reader);
-    module_ = module;
-    ctx_ = ctx;
-    this->SetupStorage();
-    this->SetupOpExecs();
-  }
-  /*!
-   * \brief Get the input index given the name of input.
-   * \param name The name of the input.
-   * \return The index of input.
-   */
-  int GetInputIndex(const std::string& name) {
-    for (size_t i = 0; i< input_nodes_.size(); ++i) {
-      uint32_t nid = input_nodes_[i];
-      if (nodes_[nid].name == name) {
-        return static_cast<int>(i);
-      }
-    }
-    LOG(WARNING) << "Warning: cannot find \"" << name << "\" among input";
-    return -1;
-  }
-  /*!
-   * \brief set index-th input to the graph.
-   * \param index The input index.
-   * \param data_in The input data.
-   */
-  void SetInput(int index, DLTensor* data_in) {
-    CHECK_LT(static_cast<size_t>(index), input_nodes_.size());
-    uint32_t eid = this->entry_id(input_nodes_[index], 0);
-    TVM_CCALL(TVMArrayCopyFromTo(data_in, &data_entry_[eid], nullptr));
-  }
-  /*!
-   * \brief Copy index-th input to data_out
-   * \param index The input index.
-   * \param data_out The output
-   */
-  void GetInput(int index, DLTensor* data_out) {
-    CHECK_LT(static_cast<size_t>(index), input_nodes_.size());
-    uint32_t eid = this->entry_id(input_nodes_[index], 0);
-    TVM_CCALL(TVMArrayCopyFromTo(&data_entry_[eid], data_out, nullptr));
-  }
-  /*!
-   * \brief Copy index-th output to data_out.
-   * \param index The output index.
-   * \param data_out the output data.
-   */
-  void GetOutput(int index, DLTensor* data_out) {
-    CHECK_LT(static_cast<size_t>(index), outputs_.size());
-    uint32_t eid = this->entry_id(outputs_[index]);
-    TVM_CCALL(TVMArrayCopyFromTo(&data_entry_[eid], data_out, nullptr));
-  }
-#ifdef TVM_GRAPH_RUNTIME_DEBUG
-  /*!
-   * \brief Get the node index given the name of node.
-   * \param name The name of the node.
-   * \return The index of node.
-   */
-  int GetNodeIndex(const std::string& name) {
-    for (uint32_t nid = 0; nid< nodes_.size(); ++nid) {
-      if (nodes_[nid].name == name) {
-        return static_cast<int>(nid);
-      }
-    }
-    LOG(FATAL) << "cannot find " << name << " among nodex";
-    return -1;
-  }
-
-  /*!
-   * \brief Copy index-th node to data_out.
-   *
-   * This method will do a partial run of the the graph
-   * from begining upto the index-th node and return output of index-th node.
-   * This is costly operation and suggest to use only for debug porpose.
-   *
-   * \param index: The  index of the node.
-   * \param data_out the node data.
-   */
-  void DebugGetNodeOutput(int index, DLTensor* data_out) {
-    CHECK_LT(static_cast<size_t>(index), nodes_.size());
-    uint32_t eid = index;
-
-    for (size_t i = 0; i < op_execs_.size(); ++i) {
-      if (op_execs_[i]) op_execs_[i]();
-      if (static_cast<int>(i) == index) break;
+  dmlc::JSONReader reader(&is);
+  this->Load(&reader);
+  module_ = module;
+  ctxs_ = ctxs;
+  this->SetupStorage();
+  this->SetupOpExecs();
+}
+/*!
+ * \brief Get the input index given the name of input.
+ * \param name The name of the input.
+ * \return The index of input.
+ */
+int GraphRuntime::GetInputIndex(const std::string& name) {
+  for (size_t i = 0; i< input_nodes_.size(); ++i) {
+    uint32_t nid = input_nodes_[i];
+    if (nodes_[nid].name == name) {
+      return static_cast<int>(i);
     }
-
-    TVM_CCALL(TVMArrayCopyFromTo(&data_entry_[eid], data_out, nullptr));
-  }
-#endif
-  /*!
-   * \brief Load parameters from binary stream
-   * \param strm The input stream.
-   */
-  void LoadParams(dmlc::Stream* strm);
-  /*!
-   * \brief Load parameters from parameter blob.
-   * \param param_blob A binary blob of parameter.
-   */
-  void LoadParams(const std::string& param_blob) {
-    dmlc::MemoryStringStream strm(const_cast<std::string*>(&param_blob));
-    this->LoadParams(&strm);
   }
+  LOG(WARNING) << "Warning: cannot find \"" << name << "\" among input";
+  return -1;
+}
+/*!
+ * \brief set index-th input to the graph.
+ * \param index The input index.
+ * \param data_in The input data.
+ */
+void GraphRuntime::SetInput(int index, DLTensor* data_in) {
+  CHECK_LT(static_cast<size_t>(index), input_nodes_.size());
+  uint32_t eid = this->entry_id(input_nodes_[index], 0);
+  data_entry_[eid].CopyFrom(data_in);
+}
+/*!
+ * \brief Get the number of outputs
+ *
+ * \return The number of outputs from graph.
+ */
+int GraphRuntime::NumOutputs() const {
+  return outputs_.size();
+}
+/*!
+ * \brief Return NDArray for given input index.
+ * \param index The input index.
+ *
+ * \return NDArray corresponding to given input node index.
+ */
+NDArray GraphRuntime::GetInput(int index) const {
+  CHECK_LT(static_cast<size_t>(index), input_nodes_.size());
+  uint32_t eid = this->entry_id(input_nodes_[index], 0);
+  return data_entry_[eid];
+}
+/*!
+ * \brief Return NDArray for given output index.
+ * \param index The output index.
+ *
+ * \return NDArray corresponding to given output node index.
+ */
+NDArray GraphRuntime::GetOutput(int index) const {
+  CHECK_LT(static_cast<size_t>(index), outputs_.size());
+  uint32_t eid = this->entry_id(outputs_[index]);
+  return data_entry_[eid];
+}
+/*!
+ * \brief Copy index-th output to data_out.
+ * \param index The output index.
+ * \param data_out the output data.
+ */
+void GraphRuntime::CopyOutputTo(int index, DLTensor* data_out) {
+  CHECK_LT(static_cast<size_t>(index), outputs_.size());
+  uint32_t eid = this->entry_id(outputs_[index]);
 
- private:
-  // Node entry
-  struct NodeEntry {
-    uint32_t node_id;
-    uint32_t index;
-    uint32_t version;
-    // JSON Loader
-    void Load(dmlc::JSONReader *reader) {
-      reader->BeginArray();
-      CHECK(reader->NextArrayItem()) << "invalid json format";
-      reader->Read(&node_id);
-      CHECK(reader->NextArrayItem()) << "invalid json format";
-      reader->Read(&index);
-      if (reader->NextArrayItem()) {
-        reader->Read(&version);
-        CHECK(!reader->NextArrayItem()) << "invalid json format";
-      } else {
-        version = 0;
-      }
-    }
-  };
-  // Node
-  struct Node {
-    // operator type in string
-    std::string op_type;
-    // name of the op
-    std::string name;
-    // parameters
-    TVMOpParam param;
-    // inputs
-    std::vector<NodeEntry> inputs;
-    // control deps
-    std::vector<uint32_t> control_deps;
-    // JSON Loader
-    void LoadAttrs(dmlc::JSONReader *reader, TVMOpParam* param) {
-      int bitmask = 0;
-      std::string key, value;
-      reader->BeginObject();
-      while (reader->NextObjectItem(&key)) {
-        reader->Read(&value);
-        if (key == "func_name") {
-          param->func_name = value;
-          bitmask |= 1;
-        } else if (key == "num_inputs") {
-          param->num_inputs = strtoul(value.c_str(), nullptr, 10);
-          bitmask |= 2;
-        } else if (key == "num_outputs") {
-          param->num_outputs = strtoul(value.c_str(), nullptr, 10);
-          bitmask |= 4;
-        } else if (key == "flatten_data") {
-          param->flatten_data = strtoul(value.c_str(), nullptr, 10);
-          bitmask |= 8;
-        }
-      }
-      CHECK_EQ(bitmask, 1|2|4|8) << "invalid format";
-    }
-    // JSON Loader
-    void Load(dmlc::JSONReader *reader) {
-      reader->BeginObject();
-      std::unordered_map<std::string, std::string> dict;
-      int bitmask = 0;
-      std::string key;
-      while (reader->NextObjectItem(&key)) {
-        if (key == "op") {
-          reader->Read(&op_type);
-          bitmask |= 1;
-        } else if (key == "name") {
-          reader->Read(&name);
-          bitmask |= 2;
-        } else if (key == "inputs") {
-          reader->Read(&inputs);
-          bitmask |= 4;
-        } else if (key == "attr" || key == "attrs") {
-          this->LoadAttrs(reader, &param);
-        } else if (key == "control_deps") {
-          reader->Read(&control_deps);
-        } else {
-          LOG(FATAL) << "do not support key " << key;
-        }
-      }
-      CHECK_EQ(bitmask, 1|2|4) << "invalid format";
-    }
-  };
-  struct GraphAttr {
-    size_t storage_num_not_alloctaed{0};
-    std::vector<int> storage_id;
-    std::vector<std::string> dltype;
-    std::vector<std::vector<int64_t> > shape;
-    // The graph attribute fields.
-    void Load(dmlc::JSONReader *reader) {
-      reader->BeginObject();
-      int bitmask = 0;
-      std::string key, type;
-      while (reader->NextObjectItem(&key)) {
-        if (key == "dltype") {
-          reader->BeginArray();
-          CHECK(reader->NextArrayItem());
-          reader->Read(&type);
-          CHECK_EQ(type, "list_str");
-          CHECK(reader->NextArrayItem());
-          reader->Read(&dltype);
-          CHECK(!reader->NextArrayItem());
-          bitmask |= 1;
-        } else if (key == "storage_id") {
-          reader->BeginArray();
-          CHECK(reader->NextArrayItem());
-          reader->Read(&type);
-          CHECK_EQ(type, "list_int");
-          CHECK(reader->NextArrayItem());
-          reader->Read(&storage_id);
-          CHECK(!reader->NextArrayItem());
-          bitmask |= 2;
-        } else if (key == "shape") {
-          reader->BeginArray();
-          CHECK(reader->NextArrayItem());
-          reader->Read(&type);
-          CHECK_EQ(type, "list_shape");
-          CHECK(reader->NextArrayItem());
-          reader->Read(&shape);
-          CHECK(!reader->NextArrayItem());
-          bitmask |= 4;
-        } else {
-          reader->BeginArray();
-          CHECK(reader->NextArrayItem());
-          reader->Read(&type);
-          if (type == "list_int") {
-            CHECK(reader->NextArrayItem());
-            std::vector<int> temp;
-            reader->Read(&temp);
-          } else if (type == "size_t") {
-            CHECK(reader->NextArrayItem());
-            size_t temp;
-            reader->Read(&temp);
-          } else {
-            LOG(FATAL) << "cannot skip graph attr " << key;
-          }
-          CHECK(!reader->NextArrayItem());
-        }
-      }
-      CHECK_EQ(bitmask, 1|2|4) << "invalid format";
-    }
-  };
-  // The graph attribute fields.
-  void Load(dmlc::JSONReader *reader) {
-      reader->BeginObject();
-      int bitmask = 0;
-      std::string key;
-      while (reader->NextObjectItem(&key)) {
-        if (key == "nodes") {
-          reader->Read(&nodes_);
-          bitmask |= 1;
-        } else if (key == "arg_nodes") {
-          reader->Read(&input_nodes_);
-          bitmask |= 2;
-        } else if (key == "node_row_ptr") {
-          reader->Read(&node_row_ptr_);
-          bitmask |= 4;
-        } else if (key == "heads") {
-          reader->Read(&outputs_);
-          bitmask |= 8;
-        } else if (key == "attrs") {
-          reader->Read(&attrs_);
-          bitmask |= 16;
-        } else {
-          LOG(FATAL) << "key " << key << " is not supported";
-        }
-      }
-      CHECK_EQ(bitmask, 1|2|4|8|16) << "invalid format";
-  }
-  void LoadDLTensor(dmlc::Stream* strm, DLTensor* tensor);
-  /*! \brief Setup the temporal storage */
-  void SetupStorage();
-  /*! \brief Setup the executors */
-  void SetupOpExecs();
-  /*!
-   * \brief Create a executtion function given input.
-   * \param attrs The node attributes
-   * \param args The arguments to the functor, including inputs and outputs.
-   * \param num_inputs Number of inputs
-   * \return The created executor.
-   */
-  std::function<void()> CreateTVMOp(const TVMOpParam& attrs,
-                                    const std::vector<DLTensor>& args,
-                                    size_t num_inputs);
-  // Get node entry index.
-  uint32_t entry_id(uint32_t nid, uint32_t index) const {
-    return node_row_ptr_[nid] + index;
+  // Check the shapes to avoid receiving in different dimension but same size.
+  const NDArray& data = data_entry_[eid];
+  CHECK_EQ(data->ndim, data_out->ndim);
+  for (int32_t j = 0; j < data->ndim; ++j) {
+    CHECK_EQ(data->shape[j], data_out->shape[j]);
   }
-  // Get node entry index.
-  uint32_t entry_id(const NodeEntry& e) const {
-    return entry_id(e.node_id, e.index);
-  }
-  // Number of node entries
-  uint32_t num_node_entries() const {
-    return node_row_ptr_.back();
-  }
-  // Number of nodes.
-  uint32_t num_nodes() const {
-    return static_cast<uint32_t>(nodes_.size());
-  }
-  // The graph nodes.
-  std::vector<Node> nodes_;
-  // The argument nodes.
-  std::vector<uint32_t> input_nodes_;
-  // used or quick entry indexing
-  std::vector<uint32_t> node_row_ptr_;
-  // output entries
-  std::vector<NodeEntry> outputs_;
-  // Additional graph attributes
-  GraphAttr attrs_;
-  /*! \brief The code module */
-  tvm::runtime::Module module_;
-  /*! \brief execution context */
-  TVMContext ctx_;
-  /*! \brief common storage pool */
-  std::vector<DLTensor*> storage_pool_;
-  /*! \brief data entry of each node */
-  std::vector<DLTensor> data_entry_;
-  /*! \brief operator on each node */
-  std::vector<std::function<void()> > op_execs_;
-};
 
+  data_entry_[eid].CopyTo(data_out);
+}
 
-void GraphRuntime::LoadDLTensor(dmlc::Stream* strm, DLTensor* dst) {
-  // always use strm->Read to maintain endianness conversion
-  NDArray temp;
-  temp.Load(strm);
-  temp.CopyTo(dst);
+/*!
+ * \brief Load parameters from parameter blob.
+ * \param param_blob A binary blob of parameter.
+ */
+void GraphRuntime::LoadParams(const std::string& param_blob) {
+  dmlc::MemoryStringStream strm(const_cast<std::string*>(&param_blob));
+  this->LoadParams(&strm);
 }
 
 void GraphRuntime::LoadParams(dmlc::Stream* strm) {
@@ -426,7 +155,11 @@ void GraphRuntime::LoadParams(dmlc::Stream* strm) {
     CHECK_GE(in_idx, 0) << "Found param for non-existent input: " << names[i];
     uint32_t eid = this->entry_id(input_nodes_[in_idx], 0);
     CHECK_LT(eid, data_entry_.size());
-    LoadDLTensor(strm, &data_entry_[eid]);
+
+    // The data_entry is allocated on device, NDArray.load always load the array into CPU.
+    NDArray temp;
+    temp.Load(strm);
+    data_entry_[eid].CopyFrom(temp);
   }
 }
 
@@ -436,12 +169,17 @@ void GraphRuntime::SetupStorage() {
   for (const std::string& s_type : attrs_.dltype) {
     vtype.push_back(tvm::runtime::String2TVMType(s_type));
   }
-  data_entry_.resize(num_node_entries());
-  // size of each storage pool entry
-  std::vector<size_t> pool_entry_bytes;
+
+  // Size and device type of each storage pool entry.
+  std::vector<PoolEntry> pool_entry;
   // Find the maximum space size.
   for (size_t i = 0; i < attrs_.shape.size(); ++i) {
     int storage_id = attrs_.storage_id[i];
+    // Use the fallback device if no device index is available.
+    int device_type = static_cast<int>(ctxs_[0].device_type);
+    if (!attrs_.device_index.empty()) {
+      device_type = attrs_.device_index[i];
+    }
     size_t size = 1;
     for (int64_t sz : attrs_.shape[i]) {
       size *= static_cast<size_t>(sz);
@@ -449,50 +187,64 @@ void GraphRuntime::SetupStorage() {
     CHECK_GE(storage_id, 0) << "Do not support runtime shape op";
     DLDataType t = vtype[i];
     size_t bits = t.bits * t.lanes;
-    CHECK_EQ(bits % 8U, 0U);
-    size_t bytes = (bits / 8U) * size;
+    CHECK(bits % 8U ==  0U || bits ==1U);
+    size_t bytes = ((bits + 7U) / 8U) * size;
 
-    size_t sid = static_cast<size_t>(storage_id);
-    if (sid >= pool_entry_bytes.size()) {
-      pool_entry_bytes.resize(sid + 1, 0);
+    uint32_t sid = static_cast<uint32_t>(storage_id);
+    if (sid >= pool_entry.size()) {
+      pool_entry.resize(sid + 1, {0, -1});
+    } else {
+      CHECK(pool_entry[sid].device_type == -1 ||
+            pool_entry[sid].device_type == device_type)
+          << "The same pool entry cannot be assigned to multiple devices";
     }
-    pool_entry_bytes[sid] = std::max(pool_entry_bytes[sid], bytes);
+    pool_entry[sid].size = std::max(pool_entry[sid].size, bytes);
+    pool_entry[sid].device_type = device_type;
   }
+
   // Allocate the space.
-  for (size_t i = 0; i < pool_entry_bytes.size(); ++i) {
-    int64_t shape[] = {static_cast<int64_t>(pool_entry_bytes[i] + 3) / 4};
-    DLTensor* tensor;
-    TVM_CCALL(TVMArrayAlloc(
-        shape, 1, kDLFloat, 32, 1, ctx_.device_type, ctx_.device_id, &tensor));
-    storage_pool_.push_back(tensor);
+  for (const auto& pit : pool_entry) {
+    std::vector<int64_t> shape;
+    // This for loop is very fast since there are usually only a couple of
+    // devices available on the same hardware.
+    const auto& cit =
+        std::find_if(ctxs_.begin(), ctxs_.end(), [&pit](const TVMContext& c) {
+          return pit.device_type == static_cast<int>(c.device_type);
+        });
+    TVMContext ctx = cit == ctxs_.end() ? ctxs_[0] : *cit;
+    shape.push_back(static_cast<int64_t>(pit.size + 3) / 4);
+    storage_pool_.push_back(
+        NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, ctx));
   }
-  // Assign the pooled entries.
+
+  // Assign the pooled entries. A unified memory pool is used to simplifiy
+  // memory assignment for each node entry. The allocated memory on each device
+  // is mapped to this pool.
+  data_entry_.resize(num_node_entries());
   for (size_t i = 0; i < data_entry_.size(); ++i) {
     int storage_id = attrs_.storage_id[i];
     CHECK_LT(static_cast<size_t>(storage_id), storage_pool_.size());
-    data_entry_[i] = *storage_pool_[storage_id];
-    data_entry_[i].shape = const_cast<int64_t*>(attrs_.shape[i].data());
-    data_entry_[i].ndim = static_cast<int>(attrs_.shape[i].size());
-    data_entry_[i].dtype = vtype[i];
+    data_entry_[i] =
+        storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]);
   }
 }
 
 void GraphRuntime::SetupOpExecs() {
-  op_execs_.resize(this->num_nodes());
+  op_execs_.resize(this->GetNumOfNodes());
   // setup the array and requirements.
-  for (uint32_t nid = 0; nid < this->num_nodes(); ++nid) {
+  for (uint32_t nid = 0; nid < this->GetNumOfNodes(); ++nid) {
     const auto& inode = nodes_[nid];
     if (inode.op_type == "null") continue;
     std::vector<DLTensor> args;
     for (const auto& e : inode.inputs) {
-      args.push_back(data_entry_[this->entry_id(e)]);
+      args.push_back(*(data_entry_[this->entry_id(e)].operator->()));
     }
     for (uint32_t index = 0; index < inode.param.num_outputs; ++index) {
       uint32_t eid = this->entry_id(nid, index);
-      args.push_back(data_entry_[eid]);
+      args.push_back(*(data_entry_[eid].operator->()));
     }
-    CHECK_EQ(inode.op_type, "tvm_op")
-        << "Can only take tvm_op as op";
+    CHECK(inode.op_type == "tvm_op") << "Can only take tvm_op as op";
+
     op_execs_[nid] = CreateTVMOp(inode.param, args, inode.inputs.size());
   }
 }
@@ -526,13 +278,26 @@ std::function<void()> GraphRuntime::CreateTVMOp(
       t->shape = &(arg_ptr->shape_data[i]);
     }
   }
+
   if (param.func_name == "__nop") {
     return [](){};
+  } else if (param.func_name == "__copy") {
+    // Perform cross device data copy.
+    // Directly copy data from the input to the output.
+    auto fexec = [arg_ptr]() {
+      DLTensor* from = static_cast<DLTensor*>(arg_ptr->arg_values[0].v_handle);
+      DLTensor* to = static_cast<DLTensor*>(arg_ptr->arg_values[1].v_handle);
+      TVM_CCALL(TVMArrayCopyFromTo(from, to, nullptr));
+    };
+    return fexec;
   }
-  // get compiled function from module.
+
+  // Get compiled function from the module that contains both host and device
+  // code.
   tvm::runtime::PackedFunc pf = module_.GetFunction(param.func_name, false);
   CHECK(pf != nullptr) << "no such function in module: " << param.func_name;
-  auto fexec = [arg_ptr, pf] () {
+
+  auto fexec = [arg_ptr, pf]() {
     TVMRetValue rv;
     TVMArgs targs(arg_ptr->arg_values.data(),
                   arg_ptr->arg_tcodes.data(),
@@ -545,7 +310,7 @@ std::function<void()> GraphRuntime::CreateTVMOp(
 PackedFunc GraphRuntime::GetFunction(
     const std::string& name,
     const std::shared_ptr<ModuleNode>& sptr_to_self) {
-  // return member functions during query.
+  // Return member functions during query.
   if (name == "set_input") {
     return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
         if (args[0].type_code() == kStr) {
@@ -557,28 +322,27 @@ PackedFunc GraphRuntime::GetFunction(
       });
   } else if (name == "get_output") {
     return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
-        this->GetOutput(args[0], args[1]);
+        if (args.num_args == 2) {
+          this->CopyOutputTo(args[0], args[1]);
+        } else {
+          *rv = this->GetOutput(args[0]);
+        }
       });
   } else if (name == "get_input") {
     return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+        int in_idx = 0;
         if (args[0].type_code() == kStr) {
-          int in_idx = this->GetInputIndex(args[0]);
-          CHECK_GE(in_idx, 0);
-          this->GetInput(in_idx, args[1]);
+          in_idx = this->GetInputIndex(args[0]);
         } else {
-          this->GetInput(args[0], args[1]);
+          in_idx = args[0];
         }
+        CHECK_GE(in_idx, 0);
+        *rv = this->GetInput(in_idx);
       });
-#ifdef TVM_GRAPH_RUNTIME_DEBUG
-  } else if (name == "debug_get_output") {
+  } else if (name == "get_num_outputs") {
     return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
-        if (args[0].type_code() == kStr) {
-          this->DebugGetNodeOutput(this->GetNodeIndex(args[0]), args[1]);
-        } else {
-          this->DebugGetNodeOutput(args[0], args[1]);
-        }
+        *rv = this->NumOutputs();
       });
-#endif
   } else if (name == "run") {
     return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
         this->Run();
@@ -592,29 +356,53 @@ PackedFunc GraphRuntime::GetFunction(
   }
 }
 
-Module GraphRuntimeCreate(std::string sym_json,
-                          tvm::runtime::Module m,
-                          int device_type,
-                          int device_id) {
-  TVMContext ctx;
-  ctx.device_type = static_cast<DLDeviceType>(device_type);
-  ctx.device_id   = device_id;
+Module GraphRuntimeCreate(const std::string& sym_json,
+                          const tvm::runtime::Module& m,
+                          const std::vector<TVMContext>& ctxs) {
   std::shared_ptr<GraphRuntime> exec = std::make_shared<GraphRuntime>();
-  exec->Init(sym_json, m, ctx);
+  exec->Init(sym_json, m, ctxs);
   return Module(exec);
 }
 
+// Get all context for the host and other runtime devices.
+std::vector<TVMContext> GetAllContext(const TVMArgs& args) {
+  // Reserve the first item as the fallback device.
+  std::vector<TVMContext> ret;
+  TVMContext ctx;
+  for (int i = 2; i < args.num_args; i += 2) {
+    int dev_type = args[i];
+    ctx.device_type = static_cast<DLDeviceType>(dev_type);
+    ctx.device_id = args[i + 1];
+    ret.push_back(ctx);
+  }
+  return ret;
+}
+
+// 4-argument version is currently reserved to keep support of calling
+// from tvm4j and javascript, since they don't have heterogeneous
+// execution support yet. For heterogenenous execution, at least 5 arguments will
+// be passed in. The third one is the number of devices.
+// Eventually, we will only probably pass TVMContext for all the languages.
 TVM_REGISTER_GLOBAL("tvm.graph_runtime.create")
-.set_body([](TVMArgs args, TVMRetValue *rv) {
-    *rv = GraphRuntimeCreate(args[0], args[1], args[2], args[3]);
+  .set_body([](TVMArgs args, TVMRetValue* rv) {
+    CHECK_GE(args.num_args, 4)
+        << "The expected number of arguments for graph_runtime.create is "
+           "at least 4, but it has "
+        << args.num_args;
+    const auto& contexts = GetAllContext(args);
+    *rv = GraphRuntimeCreate(args[0], args[1], contexts);
   });
 
 TVM_REGISTER_GLOBAL("tvm.graph_runtime.remote_create")
-.set_body([](TVMArgs args, TVMRetValue *rv) {
+  .set_body([](TVMArgs args, TVMRetValue* rv) {
+    CHECK_GE(args.num_args, 4) << "The expected number of arguments for "
+                                  "graph_runtime.remote_create is "
+                                  "at least 4, but it has "
+                               << args.num_args;
     void* mhandle = args[1];
-    *rv = GraphRuntimeCreate(args[0],
-                             *static_cast<tvm::runtime::Module*>(mhandle),
-                             args[2], args[3]);
+    const auto& contexts = GetAllContext(args);
+    *rv = GraphRuntimeCreate(
+        args[0], *static_cast<tvm::runtime::Module*>(mhandle), contexts);
   });
 }  // namespace runtime
 }  // namespace tvm
diff --git a/src/runtime/graph/graph_runtime.h b/src/runtime/graph/graph_runtime.h
index 7ebcf7d30b33..d9e6ef18860a 100644
--- a/src/runtime/graph/graph_runtime.h
+++ b/src/runtime/graph/graph_runtime.h
@@ -8,11 +8,26 @@
 #ifndef TVM_RUNTIME_GRAPH_GRAPH_RUNTIME_H_
 #define TVM_RUNTIME_GRAPH_GRAPH_RUNTIME_H_
 
+#include <dlpack/dlpack.h>
+#include <dmlc/memory_io.h>
+#include <dmlc/json.h>
+#include <tvm/runtime/ndarray.h>
+#include <tvm/runtime/packed_func.h>
+
+#include <vector>
 #include <string>
 
 namespace tvm {
 namespace runtime {
 
+/*! \brief macro to do C API call */
+#define TVM_CCALL(func)                                            \
+  {                                                                \
+    int ret = (func);                                              \
+    CHECK_EQ(ret, 0)                                               \
+        << TVMGetLastError();                                      \
+  }
+
 /*! \brief Magic number for NDArray list file  */
 constexpr uint64_t kTVMNDArrayListMagic = 0xF7E58D4F05049CB7;
 
@@ -24,6 +39,362 @@ struct TVMOpParam {
   uint32_t flatten_data;
 };
 
+/*!
+ * \brief Tiny graph runtime.
+ *
+ *  This runtime can be acccesibly in various language via
+ *  TVM runtime PackedFunc API.
+ */
+class GraphRuntime : public ModuleNode {
+ public:
+  /*!
+   * \brief Get member function to front-end
+   * \param name The name of the function.
+   * \param sptr_to_self The pointer to the module node.
+   * \return The corresponding member function.
+   */
+  virtual PackedFunc GetFunction(const std::string& name,
+                                 const std::shared_ptr<ModuleNode>& sptr_to_self);
+
+  /*!
+   * \return The type key of the executor.
+   */
+  const char* type_key() const final {
+    return "GraphRuntime";
+  }
+  void Run();
+
+  /*!
+   * \brief Initialize the graph executor with graph and context.
+   * \param graph_json The execution graph.
+   * \param module The module containing the compiled functions for the host
+   *  processor.
+   * \param ctxs The context of the host and devices where graph nodes will be
+   *  executed on.
+   */
+
+  void Init(const std::string& graph_json,
+            tvm::runtime::Module module,
+            const std::vector<TVMContext>& ctxs);
+
+  /*!
+   * \brief Get the input index given the name of input.
+   * \param name The name of the input.
+   * \return The index of input.
+   */
+  int GetInputIndex(const std::string& name);
+
+  /*!
+   * \brief set index-th input to the graph.
+   * \param index The input index.
+   * \param data_in The input data.
+   */
+  void SetInput(int index, DLTensor* data_in);
+  /*!
+   * \brief Get the number of outputs
+   *
+   * \return The number of outputs from graph.
+   */
+  int NumOutputs() const;
+  /*!
+   * \brief Return NDArray for given input index.
+   * \param index The input index.
+   *
+   * \return NDArray corresponding to given input node index.
+   */
+  NDArray GetInput(int index) const;
+  /*!
+   * \brief Return NDArray for given output index.
+   * \param index The output index.
+   *
+   * \return NDArray corresponding to given output node index.
+   */
+  NDArray GetOutput(int index) const;
+  /*!
+   * \brief Copy index-th output to data_out.
+   * \param index The output index.
+   * \param data_out the output data.
+   */
+  void CopyOutputTo(int index, DLTensor* data_out);
+  /*!
+   * \brief Load parameters from binary stream
+   * \param strm The input stream.
+   */
+  void LoadParams(dmlc::Stream* strm);
+  /*!
+   * \brief Load parameters from parameter blob.
+   * \param param_blob A binary blob of parameter.
+   */
+  void LoadParams(const std::string& param_blob);
+
+  /*!
+   * \brief Get the tensor vector pointer.
+   */
+  std::vector<NDArray>& data_entry() {
+      return data_entry_;
+  }
+
+  /*!
+   * \brief Get the execution function pointer.
+   */
+  std::vector<std::function<void()> >& op_execs() {
+        return op_execs_;
+  }
+
+  /*!
+   * \brief Get node entry index.
+   * \param nid Node id.
+   * \param index Index of the nodes.
+   */
+  uint32_t GetEntryId(uint32_t nid, uint32_t index) const {
+    return node_row_ptr_[nid] + index;
+  }
+
+ /*!
+  * \brief Get total number of nodes.
+  * \return Total number of nodes.
+  */
+  uint32_t GetNumOfNodes() const {
+    return static_cast<uint32_t>(nodes_.size());
+  }
+
+  std::string GetNodeName(uint32_t nid) const {
+    return nodes_[nid].name;
+  }
+
+
+ private:
+  // Memory pool entry.
+  struct PoolEntry {
+    size_t size;
+    int device_type;
+    PoolEntry(int s, int dev_type) : size(s), device_type(dev_type) {}
+  };
+  // Node entry
+  struct NodeEntry {
+    uint32_t node_id;
+    uint32_t index;
+    uint32_t version;
+    // JSON Loader
+    void Load(dmlc::JSONReader *reader) {
+      reader->BeginArray();
+      CHECK(reader->NextArrayItem()) << "invalid json format";
+      reader->Read(&node_id);
+      CHECK(reader->NextArrayItem()) << "invalid json format";
+      reader->Read(&index);
+      if (reader->NextArrayItem()) {
+        reader->Read(&version);
+        CHECK(!reader->NextArrayItem()) << "invalid json format";
+      } else {
+        version = 0;
+      }
+    }
+  };
+  // Node
+  struct Node {
+    // operator type in string
+    std::string op_type;
+    // name of the op
+    std::string name;
+    // parameters
+    TVMOpParam param;
+    // inputs
+    std::vector<NodeEntry> inputs;
+    // control deps
+    std::vector<uint32_t> control_deps;
+    // JSON Loader
+    void LoadAttrs(dmlc::JSONReader *reader, TVMOpParam* param) {
+      int bitmask = 0;
+      std::string key, value;
+      reader->BeginObject();
+      while (reader->NextObjectItem(&key)) {
+        reader->Read(&value);
+        if (key == "func_name") {
+          param->func_name = value;
+          bitmask |= 1;
+        } else if (key == "num_inputs") {
+          param->num_inputs = strtoul(value.c_str(), nullptr, 10);
+          bitmask |= 2;
+        } else if (key == "num_outputs") {
+          param->num_outputs = strtoul(value.c_str(), nullptr, 10);
+          bitmask |= 4;
+        } else if (key == "flatten_data") {
+          param->flatten_data = strtoul(value.c_str(), nullptr, 10);
+          bitmask |= 8;
+        }
+      }
+      CHECK_EQ(bitmask, 1|2|4|8) << "invalid format";
+    }
+    // JSON Loader
+    void Load(dmlc::JSONReader *reader) {
+      reader->BeginObject();
+      int bitmask = 0;
+      std::string key;
+      while (reader->NextObjectItem(&key)) {
+        if (key == "op") {
+          reader->Read(&op_type);
+          bitmask |= 1;
+        } else if (key == "name") {
+          reader->Read(&name);
+          bitmask |= 2;
+        } else if (key == "inputs") {
+          reader->Read(&inputs);
+          bitmask |= 4;
+        } else if (key == "attr" || key == "attrs") {
+          this->LoadAttrs(reader, &param);
+        } else if (key == "control_deps") {
+          reader->Read(&control_deps);
+        } else {
+          LOG(FATAL) << "do not support key " << key;
+        }
+      }
+      CHECK_EQ(bitmask, 1|2|4) << "invalid format";
+    }
+  };
+  struct GraphAttr {
+    size_t storage_num_not_alloctaed{0};
+    std::vector<int> storage_id;
+    std::vector<int> device_index;
+    std::vector<std::string> dltype;
+    std::vector<std::vector<int64_t> > shape;
+    // The graph attribute fields.
+    void Load(dmlc::JSONReader *reader) {
+      reader->BeginObject();
+      int bitmask = 0;
+      std::string key, type;
+      while (reader->NextObjectItem(&key)) {
+        if (key == "dltype") {
+          reader->BeginArray();
+          CHECK(reader->NextArrayItem());
+          reader->Read(&type);
+          CHECK_EQ(type, "list_str");
+          CHECK(reader->NextArrayItem());
+          reader->Read(&dltype);
+          CHECK(!reader->NextArrayItem());
+          bitmask |= 1;
+        } else if (key == "storage_id") {
+          reader->BeginArray();
+          CHECK(reader->NextArrayItem());
+          reader->Read(&type);
+          CHECK_EQ(type, "list_int");
+          CHECK(reader->NextArrayItem());
+          reader->Read(&storage_id);
+          CHECK(!reader->NextArrayItem());
+          bitmask |= 2;
+        } else if (key == "shape") {
+          reader->BeginArray();
+          CHECK(reader->NextArrayItem());
+          reader->Read(&type);
+          CHECK_EQ(type, "list_shape");
+          CHECK(reader->NextArrayItem());
+          reader->Read(&shape);
+          CHECK(!reader->NextArrayItem());
+          bitmask |= 4;
+        } else if (key == "device_index") {
+          reader->BeginArray();
+          CHECK(reader->NextArrayItem());
+          reader->Read(&type);
+          CHECK_EQ(type, "list_int");
+          CHECK(reader->NextArrayItem());
+          reader->Read(&device_index);
+          CHECK(!reader->NextArrayItem());
+        } else {
+          reader->BeginArray();
+          CHECK(reader->NextArrayItem());
+          reader->Read(&type);
+          if (type == "list_int") {
+            CHECK(reader->NextArrayItem());
+            std::vector<int> temp;
+            reader->Read(&temp);
+          } else if (type == "size_t") {
+            CHECK(reader->NextArrayItem());
+            size_t temp;
+            reader->Read(&temp);
+          } else {
+            LOG(FATAL) << "cannot skip graph attr " << key;
+          }
+          CHECK(!reader->NextArrayItem());
+        }
+      }
+      CHECK_EQ(bitmask, 1|2|4) << "invalid format";
+    }
+  };
+  // The graph attribute fields.
+  void Load(dmlc::JSONReader *reader) {
+      reader->BeginObject();
+      int bitmask = 0;
+      std::string key;
+      while (reader->NextObjectItem(&key)) {
+        if (key == "nodes") {
+          reader->Read(&nodes_);
+          bitmask |= 1;
+        } else if (key == "arg_nodes") {
+          reader->Read(&input_nodes_);
+          bitmask |= 2;
+        } else if (key == "node_row_ptr") {
+          reader->Read(&node_row_ptr_);
+          bitmask |= 4;
+        } else if (key == "heads") {
+          reader->Read(&outputs_);
+          bitmask |= 8;
+        } else if (key == "attrs") {
+          reader->Read(&attrs_);
+          bitmask |= 16;
+        } else {
+          LOG(FATAL) << "key " << key << " is not supported";
+        }
+      }
+      CHECK_EQ(bitmask, 1|2|4|8|16) << "invalid format";
+  }
+  /*! \brief Setup the temporal storage */
+  void SetupStorage();
+  /*! \brief Setup the executors. */
+  void SetupOpExecs();
+  /*!
+   * \brief Create an execution function given input.
+   * \param attrs The node attributes.
+   * \param args The arguments to the functor, including inputs and outputs.
+   * \param num_inputs Number of inputs.
+   * \return The created executor.
+   */
+  std::function<void()> CreateTVMOp(const TVMOpParam& attrs,
+                                    const std::vector<DLTensor>& args,
+                                    size_t num_inputs);
+  // Get node entry index.
+  uint32_t entry_id(uint32_t nid, uint32_t index) const {
+    return node_row_ptr_[nid] + index;
+  }
+  // Get node entry index.
+  uint32_t entry_id(const NodeEntry& e) const {
+    return entry_id(e.node_id, e.index);
+  }
+  // Number of node entries.
+  uint32_t num_node_entries() const {
+    return node_row_ptr_.back();
+  }
+  /*! \brief The graph nodes. */
+  std::vector<Node> nodes_;
+  /*! \brief The argument nodes. */
+  std::vector<uint32_t> input_nodes_;
+  /*! \brief Used for quick entry indexing. */
+  std::vector<uint32_t> node_row_ptr_;
+  /*! \brief Output entries. */
+  std::vector<NodeEntry> outputs_;
+  /*! \brief Additional graph attributes. */
+  GraphAttr attrs_;
+  /*! \brief The code module that contains both host and device code. */
+  tvm::runtime::Module module_;
+  /*! \brief Execution context of all devices including the host. */
+  std::vector<TVMContext> ctxs_;
+  /*! \brief Common storage pool for all devices. */
+  std::vector<NDArray> storage_pool_;
+  /*! \brief Data entry of each node. */
+  std::vector<NDArray> data_entry_;
+  /*! \brief Operator on each node. */
+  std::vector<std::function<void()> > op_execs_;
+};
+
+std::vector<TVMContext> GetAllContext(const TVMArgs& args);
 }  // namespace runtime
 }  // namespace tvm
 
diff --git a/src/runtime/meta_data.h b/src/runtime/meta_data.h
index 381bf9f60c79..40d08015e8cd 100644
--- a/src/runtime/meta_data.h
+++ b/src/runtime/meta_data.h
@@ -11,7 +11,7 @@
 #include <tvm/runtime/packed_func.h>
 #include <string>
 #include <vector>
-#include "./runtime_base.h"
+#include "runtime_base.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/metal/metal_device_api.mm b/src/runtime/metal/metal_device_api.mm
index 47c2899cea71..fcdbf13138a8 100644
--- a/src/runtime/metal/metal_device_api.mm
+++ b/src/runtime/metal/metal_device_api.mm
@@ -4,7 +4,7 @@
  */
 #include <tvm/runtime/registry.h>
 #include <dmlc/thread_local.h>
-#include "./metal_common.h"
+#include "metal_common.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/metal/metal_module.mm b/src/runtime/metal/metal_module.mm
index c79e2cf11ac5..cf470b6c8a34 100644
--- a/src/runtime/metal/metal_module.mm
+++ b/src/runtime/metal/metal_module.mm
@@ -8,8 +8,8 @@
 #include <array>
 #include <string>
 #include <mutex>
-#include "./metal_module.h"
-#include "./metal_common.h"
+#include "metal_module.h"
+#include "metal_common.h"
 #include "../pack_args.h"
 #include "../thread_storage_scope.h"
 #include "../meta_data.h"
@@ -163,7 +163,7 @@ void SaveToBinary(dmlc::Stream* stream) final {
   std::mutex mutex_;
 };
 
-// a wrapped function class to get packed fucn.
+// a wrapped function class to get packed func.
 class MetalWrappedFunc {
  public:
   // initialize the METAL function.
diff --git a/src/runtime/module.cc b/src/runtime/module.cc
index dbddfde44733..80dc1f3172f8 100644
--- a/src/runtime/module.cc
+++ b/src/runtime/module.cc
@@ -9,7 +9,7 @@
 #include <unordered_set>
 #include <cstring>
 #ifndef _LIBCPP_SGX_CONFIG
-#include "./file_util.h"
+#include "file_util.h"
 #endif
 
 namespace tvm {
diff --git a/src/runtime/module_util.cc b/src/runtime/module_util.cc
index 95da78d23f09..0c6d8ae4058d 100644
--- a/src/runtime/module_util.cc
+++ b/src/runtime/module_util.cc
@@ -8,7 +8,8 @@
 #endif
 #include <tvm/runtime/module.h>
 #include <tvm/runtime/registry.h>
-#include "./module_util.h"
+#include <string>
+#include "module_util.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc
index f862f32f6e99..0ffa4c174544 100644
--- a/src/runtime/ndarray.cc
+++ b/src/runtime/ndarray.cc
@@ -7,7 +7,7 @@
 #include <tvm/runtime/ndarray.h>
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/device_api.h>
-#include "./runtime_base.h"
+#include "runtime_base.h"
 
 // deleter for arrays used by DLPack exporter
 extern "C" void NDArrayDLPackDeleter(DLManagedTensor* tensor);
@@ -20,20 +20,13 @@ inline void VerifyDataType(DLDataType dtype) {
   if (dtype.code == kDLFloat) {
     CHECK_EQ(dtype.bits % 8, 0);
   } else {
+    // allow uint1 as a special flag for bool.
+    if (dtype.bits == 1 && dtype.code == kDLUInt) return;
     CHECK_EQ(dtype.bits % 8, 0);
   }
   CHECK_EQ(dtype.bits & (dtype.bits - 1), 0);
 }
 
-inline size_t GetDataSize(const DLTensor& arr) {
-  size_t size = 1;
-  for (tvm_index_t i = 0; i < arr.ndim; ++i) {
-    size *= arr.shape[i];
-  }
-  size *= (arr.dtype.bits * arr.dtype.lanes + 7) / 8;
-  return size;
-}
-
 inline size_t GetDataAlignment(const DLTensor& arr) {
   size_t align = (arr.dtype.bits / 8) * arr.dtype.lanes;
   if (align < kAllocAlignment) return kAllocAlignment;
@@ -93,6 +86,16 @@ struct NDArray::Internal {
     arr.data_ = nullptr;
     return tensor;
   }
+  // Container to DLManagedTensor
+  static DLManagedTensor* ToDLPack(NDArray::Container* from) {
+    CHECK(from != nullptr);
+    DLManagedTensor* ret = new DLManagedTensor();
+    ret->dl_tensor = from->dl_tensor;
+    ret->manager_ctx = from;
+    from->IncRef();
+    ret->deleter = NDArrayDLPackDeleter;
+    return ret;
+  }
 };
 
 NDArray NDArray::CreateView(std::vector<int64_t> shape,
@@ -115,18 +118,12 @@ NDArray NDArray::CreateView(std::vector<int64_t> shape,
 }
 
 DLManagedTensor* NDArray::ToDLPack() const {
-  CHECK(data_ != nullptr);
-  DLManagedTensor* ret = new DLManagedTensor();
-  ret->dl_tensor = data_->dl_tensor;
-  ret->manager_ctx = const_cast<NDArray*>(this);
-  data_->IncRef();
-  ret->deleter = NDArrayDLPackDeleter;
-  return ret;
+  return Internal::ToDLPack(data_);
 }
 
 NDArray NDArray::Empty(std::vector<int64_t> shape,
-                        DLDataType dtype,
-                        DLContext ctx) {
+                       DLDataType dtype,
+                       DLContext ctx) {
   NDArray ret = Internal::Create(shape, dtype, ctx);
   // setup memory content
   size_t size = GetDataSize(ret.data_->dl_tensor);
@@ -213,6 +210,24 @@ int TVMArrayCopyFromTo(TVMArrayHandle from,
   API_END();
 }
 
+int TVMArrayFromDLPack(DLManagedTensor* from,
+                       TVMArrayHandle* out) {
+  API_BEGIN();
+  *out = NDArray::Internal::MoveAsDLTensor(NDArray::FromDLPack(from));
+  API_END();
+}
+
+int TVMArrayToDLPack(TVMArrayHandle from,
+                     DLManagedTensor** out) {
+  API_BEGIN();
+  *out = NDArray::Internal::ToDLPack(reinterpret_cast<NDArray::Container*>(from));
+  API_END();
+}
+
+void TVMDLManagedTensorCallDeleter(DLManagedTensor* dltensor) {
+  (*(dltensor->deleter))(dltensor);
+}
+
 int TVMArrayCopyFromBytes(TVMArrayHandle handle,
                           void* data,
                           size_t nbytes) {
diff --git a/src/runtime/opencl/aocl/aocl_device_api.cc b/src/runtime/opencl/aocl/aocl_device_api.cc
index e9cbc6b4cda0..61f636df6039 100644
--- a/src/runtime/opencl/aocl/aocl_device_api.cc
+++ b/src/runtime/opencl/aocl/aocl_device_api.cc
@@ -4,7 +4,7 @@
  */
 #include <tvm/runtime/registry.h>
 #include <dmlc/thread_local.h>
-#include "./aocl_common.h"
+#include "aocl_common.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/opencl/aocl/aocl_module.cc b/src/runtime/opencl/aocl/aocl_module.cc
index a056c5cee671..bbf2828fbd79 100644
--- a/src/runtime/opencl/aocl/aocl_module.cc
+++ b/src/runtime/opencl/aocl/aocl_module.cc
@@ -7,8 +7,8 @@
 #include <vector>
 #include <string>
 #include <unordered_map>
-#include "./aocl_common.h"
-#include "./aocl_module.h"
+#include "aocl_common.h"
+#include "aocl_module.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h
index c37dbaa94d7a..d42cc669e742 100644
--- a/src/runtime/opencl/opencl_common.h
+++ b/src/runtime/opencl/opencl_common.h
@@ -260,14 +260,12 @@ class OpenCLModuleNode : public ModuleNode {
                           const std::string& func_name,
                           const KTRefEntry& e);
 
- protected:
+ private:
   // The workspace, need to keep reference to use it in destructor.
   // In case of static destruction order problem.
   std::shared_ptr<cl::OpenCLWorkspace> workspace_;
   // the binary data
   std::string data_;
-
- private:
   // The format
   std::string fmt_;
   // function information table.
diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc
index ac9373f1375b..6bb0948bca91 100644
--- a/src/runtime/opencl/opencl_device_api.cc
+++ b/src/runtime/opencl/opencl_device_api.cc
@@ -4,7 +4,7 @@
  */
 #include <tvm/runtime/registry.h>
 #include <dmlc/thread_local.h>
-#include "./opencl_common.h"
+#include "opencl_common.h"
 
 namespace tvm {
 namespace runtime {
@@ -232,7 +232,6 @@ void OpenCLWorkspace::Init(const std::string& type_key, const std::string& devic
   if (initialized_) return;
   std::lock_guard<std::mutex> lock(this->mu);
   if (initialized_) return;
-  initialized_ = true;
   if (context != nullptr) return;
   // matched platforms
   std::vector<cl_platform_id> platform_ids = cl::GetPlatformIDs();
@@ -246,17 +245,18 @@ void OpenCLWorkspace::Init(const std::string& type_key, const std::string& devic
       continue;
     }
     std::vector<cl_device_id> devices_matched = cl::GetDeviceIDs(platform_id, device_type);
+    if ((devices_matched.size() == 0) && (device_type == "gpu")) {
+      LOG(WARNING) << "Using CPU OpenCL device";
+      devices_matched = cl::GetDeviceIDs(platform_id, "cpu");
+    }
     if (devices_matched.size() > 0) {
       this->type_key = type_key;
       this->platform_id = platform_id;
       this->platform_name = cl::GetPlatformInfo(platform_id, CL_PLATFORM_NAME);
       this->device_type = device_type;
       this->devices = devices_matched;
-      LOG(INFO) << "Initialize OpenCL platform \'" << this->platform_name << '\'';
       break;
     }
-    LOG(INFO) << "\'" << cl::GetPlatformInfo(platform_id, CL_PLATFORM_NAME)
-              << "\' platform has no OpenCL device: " << device_type << " mode";
   }
   if (this->platform_id == nullptr) {
     LOG(WARNING) << "No OpenCL device";
@@ -273,10 +273,8 @@ void OpenCLWorkspace::Init(const std::string& type_key, const std::string& devic
     this->queues.push_back(
         clCreateCommandQueue(this->context, did, 0, &err_code));
     OPENCL_CHECK_ERROR(err_code);
-    LOG(INFO) << type_key << "(" << i
-              << ")=\'" << cl::GetDeviceInfo(did, CL_DEVICE_NAME)
-              << "\' cl_device_id=" << did;
   }
+  initialized_ = true;
 }
 
 TVM_REGISTER_GLOBAL("device_api.opencl")
diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc
index 3efd789513ba..ed5c3c235ac1 100644
--- a/src/runtime/opencl/opencl_module.cc
+++ b/src/runtime/opencl/opencl_module.cc
@@ -7,8 +7,8 @@
 #include <vector>
 #include <string>
 #include <unordered_map>
-#include "./opencl_common.h"
-#include "./opencl_module.h"
+#include "opencl_common.h"
+#include "opencl_module.h"
 
 namespace tvm {
 namespace runtime {
@@ -34,6 +34,7 @@ class OpenCLWrappedFunc {
   void operator()(TVMArgs args,
                   TVMRetValue* rv,
                   void** void_args) const {
+    CHECK(w_->context != nullptr) << "No OpenCL device";
     cl::OpenCLThreadEntry* t = w_->GetThreadEntry();
     // get the kernel from thread local kernel table.
     if (entry_.kernel_id >= t->kernel_table.size()) {
@@ -157,7 +158,6 @@ std::string OpenCLModuleNode::GetSource(const std::string& format) {
 void OpenCLModuleNode::Init() {
   workspace_ = GetGlobalWorkspace();
   workspace_->Init();
-  CHECK(workspace_->context != nullptr) << "No OpenCL device";
   device_built_flag_.resize(workspace_->devices.size(), false);
   // initialize the kernel id, need to lock global table.
   std::lock_guard<std::mutex> lock(workspace_->mu);
diff --git a/src/runtime/opencl/sdaccel/sdaccel_device_api.cc b/src/runtime/opencl/sdaccel/sdaccel_device_api.cc
index 4b057b7e009a..bc98759b9b3f 100644
--- a/src/runtime/opencl/sdaccel/sdaccel_device_api.cc
+++ b/src/runtime/opencl/sdaccel/sdaccel_device_api.cc
@@ -4,7 +4,7 @@
  */
 #include <tvm/runtime/registry.h>
 #include <dmlc/thread_local.h>
-#include "./sdaccel_common.h"
+#include "sdaccel_common.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/opencl/sdaccel/sdaccel_module.cc b/src/runtime/opencl/sdaccel/sdaccel_module.cc
index c99e78c8e347..de9a710fbfe8 100644
--- a/src/runtime/opencl/sdaccel/sdaccel_module.cc
+++ b/src/runtime/opencl/sdaccel/sdaccel_module.cc
@@ -7,8 +7,8 @@
 #include <vector>
 #include <string>
 #include <unordered_map>
-#include "./sdaccel_common.h"
-#include "./sdaccel_module.h"
+#include "sdaccel_common.h"
+#include "sdaccel_module.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/opengl/opengl_device_api.cc b/src/runtime/opengl/opengl_device_api.cc
index 3a21ed6e6d07..191b64b6ce0a 100644
--- a/src/runtime/opengl/opengl_device_api.cc
+++ b/src/runtime/opengl/opengl_device_api.cc
@@ -4,8 +4,8 @@
  */
 #include <tvm/runtime/registry.h>
 #include <cstring>
-#include "./opengl_common.h"
-#include "./opengl_module.h"
+#include "opengl_common.h"
+#include "opengl_module.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/opengl/opengl_module.cc b/src/runtime/opengl/opengl_module.cc
index d800af95f053..976227a2924b 100644
--- a/src/runtime/opengl/opengl_module.cc
+++ b/src/runtime/opengl/opengl_module.cc
@@ -4,8 +4,8 @@
  */
 #include <tvm/runtime/registry.h>
 #include <utility>
-#include "./opengl_common.h"
-#include "./opengl_module.h"
+#include "opengl_common.h"
+#include "opengl_module.h"
 #include "../pack_args.h"
 #include "../thread_storage_scope.h"
 #include "../file_util.h"
diff --git a/src/runtime/pack_args.h b/src/runtime/pack_args.h
index 0a00e79f07df..5170e5fd9e9a 100644
--- a/src/runtime/pack_args.h
+++ b/src/runtime/pack_args.h
@@ -168,7 +168,7 @@ inline PackedFunc PackFuncNonBufferArg_(
       switch (codes[i]) {
         case INT64_TO_INT64:
         case FLOAT64_TO_FLOAT64: {
-          LOG(FATAL) << "Donot support 64bit argument to device function"; break;
+          LOG(FATAL) << "Do not support 64bit argument to device function"; break;
         }
         case INT64_TO_INT32: {
           holder[i].v_int32 = static_cast<int32_t>(args.values[base + i].v_int64);
diff --git a/src/runtime/registry.cc b/src/runtime/registry.cc
index 3f72828390ee..3c792fdb9063 100644
--- a/src/runtime/registry.cc
+++ b/src/runtime/registry.cc
@@ -10,7 +10,7 @@
 #include <mutex>
 #include <memory>
 #include <array>
-#include "./runtime_base.h"
+#include "runtime_base.h"
 
 namespace tvm {
 namespace runtime {
@@ -34,8 +34,11 @@ struct Registry::Manager {
   }
 
   static Manager* Global() {
-    static Manager inst;
-    return &inst;
+    // We deliberately leak the Manager instance, to avoid leak sanitizers
+    // complaining about the entries in Manager::fmap being leaked at program
+    // exit.
+    static Manager* inst = new Manager();
+    return inst;
   }
 };
 
diff --git a/src/runtime/rocm/rocm_device_api.cc b/src/runtime/rocm/rocm_device_api.cc
index 6aff5e56c715..355200a0cbb0 100644
--- a/src/runtime/rocm/rocm_device_api.cc
+++ b/src/runtime/rocm/rocm_device_api.cc
@@ -10,7 +10,7 @@
 #include <tvm/runtime/registry.h>
 #include <hip/hip_runtime_api.h>
 #include <hsa/hsa.h>
-#include "./rocm_common.h"
+#include "rocm_common.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/rocm/rocm_module.cc b/src/runtime/rocm/rocm_module.cc
index 503b04872c82..0607e9938225 100644
--- a/src/runtime/rocm/rocm_module.cc
+++ b/src/runtime/rocm/rocm_module.cc
@@ -8,8 +8,8 @@
 #include <array>
 #include <string>
 #include <mutex>
-#include "./rocm_module.h"
-#include "./rocm_common.h"
+#include "rocm_module.h"
+#include "rocm_common.h"
 #include "../pack_args.h"
 #include "../thread_storage_scope.h"
 #include "../meta_data.h"
@@ -123,7 +123,7 @@ class ROCMModuleNode : public runtime::ModuleNode {
   std::mutex mutex_;
 };
 
-// a wrapped function class to get packed fucn.
+// a wrapped function class to get packed func.
 class ROCMWrappedFunc {
  public:
   // initialize the ROCM function.
diff --git a/src/runtime/rpc/rpc_device_api.cc b/src/runtime/rpc/rpc_device_api.cc
index 5740a393c253..4242f8e1ae58 100644
--- a/src/runtime/rpc/rpc_device_api.cc
+++ b/src/runtime/rpc/rpc_device_api.cc
@@ -5,7 +5,7 @@
 #include <dmlc/logging.h>
 #include <tvm/runtime/registry.h>
 #include <tvm/runtime/device_api.h>
-#include "./rpc_session.h"
+#include "rpc_session.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/rpc/rpc_event_impl.cc b/src/runtime/rpc/rpc_event_impl.cc
index fc5ecca1f421..e553c6fad4a0 100644
--- a/src/runtime/rpc/rpc_event_impl.cc
+++ b/src/runtime/rpc/rpc_event_impl.cc
@@ -5,7 +5,7 @@
  */
 #include <tvm/runtime/registry.h>
 #include <memory>
-#include "./rpc_session.h"
+#include "rpc_session.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc
index 251871bf0cc1..80a8cc93ce19 100644
--- a/src/runtime/rpc/rpc_module.cc
+++ b/src/runtime/rpc/rpc_module.cc
@@ -6,19 +6,19 @@
 #include <tvm/runtime/registry.h>
 #include <memory>
 #include <cstring>
-#include "./rpc_session.h"
+#include "rpc_session.h"
 
 namespace tvm {
 namespace runtime {
 
 // Wrapped remote function to packed func.
-struct RPCWrappedFunc {
+class RPCWrappedFunc {
  public:
   RPCWrappedFunc(void* handle,
                  std::shared_ptr<RPCSession> sess)
       : handle_(handle), sess_(sess) {
     fwrap_ = PackedFunc([sess](TVMArgs args, TVMRetValue* rv) {
-        WrapRemote(sess, args.values[0].v_handle, args.type_codes[0], rv);
+        WrapRemote(sess, args, rv);
       });
   }
 
@@ -34,10 +34,47 @@ struct RPCWrappedFunc {
   }
 
   static void WrapRemote(std::shared_ptr<RPCSession> sess,
-                         void* handle,
-                         int tcode,
+                         TVMArgs args,
                          TVMRetValue* rv);
 
+  // deleter of RPC remote array
+  static void RemoteNDArrayDeleter(NDArray::Container* ptr) {
+    RemoteSpace* space = static_cast<RemoteSpace*>(ptr->dl_tensor.data);
+    space->sess->CallRemote(RPCCode::kNDArrayFree, ptr->manager_ctx);
+    delete space;
+    delete ptr;
+  }
+  // wrap return value as remote NDArray.
+  static NDArray WrapRemoteNDArray(std::shared_ptr<RPCSession> sess,
+                                   DLTensor* tensor,
+                                   void* nd_handle) {
+    NDArray::Container* data = new NDArray::Container();
+    data->manager_ctx = nd_handle;
+    data->deleter = RemoteNDArrayDeleter;
+    RemoteSpace* space = new RemoteSpace();
+    space->sess = sess;
+    space->data = tensor->data;
+    data->dl_tensor.data = space;
+    NDArray ret(data);
+    // RAII now in effect
+    data->shape_ = std::vector<int64_t>(
+        tensor->shape, tensor->shape + tensor->ndim);
+    data->dl_tensor.shape = dmlc::BeginPtr(data->shape_);
+    data->dl_tensor.ndim = static_cast<int>(data->shape_.size());
+    // setup dtype
+    data->dl_tensor.dtype = tensor->dtype;
+    // setup ctx, encode as remote session
+    data->dl_tensor.ctx.device_id = tensor->ctx.device_id;
+    data->dl_tensor.ctx.device_type = static_cast<DLDeviceType>(
+        static_cast<int>(tensor->ctx.device_type) +
+        kRPCSessMask * (sess->table_index() + 1));
+    // check strides.
+    CHECK(tensor->strides == nullptr);
+    // setup byteoffset
+    data->dl_tensor.byte_offset = tensor->byte_offset;
+    return ret;
+  }
+
  private:
   PackedFunc fwrap_;
   void* handle_{nullptr};
@@ -126,20 +163,28 @@ class RPCModuleNode final : public ModuleNode {
 };
 
 void RPCWrappedFunc::WrapRemote(std::shared_ptr<RPCSession> sess,
-                                void* handle,
-                                int tcode,
+                                TVMArgs args,
                                 TVMRetValue *rv) {
+  void* handle = args.values[0].v_handle;
+  int tcode = args.type_codes[0];
+
   if (handle == nullptr) return;
   if (tcode == kFuncHandle) {
     auto wf = std::make_shared<RPCWrappedFunc>(handle, sess);
     *rv = PackedFunc([wf](TVMArgs args, TVMRetValue* rv) {
         return wf->operator()(args, rv);
       });
-  } else {
-    CHECK_EQ(tcode, kModuleHandle);
+  } else if (tcode == kModuleHandle) {
     std::shared_ptr<RPCModuleNode> n =
         std::make_shared<RPCModuleNode>(handle, sess);
     *rv = Module(n);
+  } else if (tcode == kArrayHandle || tcode == kNDArrayContainer) {
+    CHECK_EQ(args.size(), 2);
+    DLTensor* tensor = args[0];
+    void* nd_handle = args[1];
+    *rv = WrapRemoteNDArray(sess, tensor, nd_handle);
+  } else {
+    LOG(FATAL) << "Cannot wrap tcode=" << tcode;
   }
 }
 
diff --git a/src/runtime/rpc/rpc_server_env.cc b/src/runtime/rpc/rpc_server_env.cc
index ca91b88247e5..fb8d95d60b95 100644
--- a/src/runtime/rpc/rpc_server_env.cc
+++ b/src/runtime/rpc/rpc_server_env.cc
@@ -35,5 +35,12 @@ TVM_REGISTER_GLOBAL("tvm.rpc.server.download")
     *rv = arr;
   });
 
+TVM_REGISTER_GLOBAL("tvm.rpc.server.remove")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+    std::string file_name = RPCGetPath(args[0]);
+    LOG(INFO) << "Remove " << file_name;
+    RemoveFile(file_name);
+  });
+
 }  // namespace runtime
 }  // namespace tvm
diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc
index 21fff7b29882..208944a69dce 100644
--- a/src/runtime/rpc/rpc_session.cc
+++ b/src/runtime/rpc/rpc_session.cc
@@ -11,7 +11,9 @@
 #include <array>
 #include <string>
 #include <chrono>
-#include "./rpc_session.h"
+#include <vector>
+#include <utility>
+#include "rpc_session.h"
 #include "../../common/ring_buffer.h"
 
 namespace tvm {
@@ -130,19 +132,22 @@ class RPCSession::EventHandler : public dmlc::Stream {
           break;
         }
         case kReturnReceived: {
-          CHECK_EQ(arg_buf_->value.size(), 1U);
+          CHECK_GE(arg_buf_->value.size(), 1U);
+
           TVMArgValue argv = arg_buf_->AsTVMArgs()[0];
           if (argv.type_code() == kFuncHandle ||
-              argv.type_code() == kModuleHandle) {
+              argv.type_code() == kModuleHandle ||
+              argv.type_code() == kArrayHandle) {
             CHECK(fwrap != nullptr) << "function/module wrapper not available";
             fwrap->CallPacked(arg_buf_->AsTVMArgs(), rv);
           } else {
+            CHECK_EQ(arg_buf_->value.size(), 1U);
             *rv = argv;
           }
           arg_buf_.reset();
           this->SwitchToState(kRecvCode);
           std::swap(client_mode_, client_mode);
-          return  RPCCode::kReturn;
+          return RPCCode::kReturn;
         }
         case kCopyAckReceived: {
           std::swap(client_mode_, client_mode);
@@ -172,15 +177,22 @@ class RPCSession::EventHandler : public dmlc::Stream {
     ctx.device_type = static_cast<DLDeviceType>(dev_type % kRPCSessMask);
     return ctx;
   }
-  // send Packed sequence to writer.
-  void SendPackedSeq(const TVMValue* arg_values, const int* type_codes, int n) {
+  // Send Packed sequence to writer.
+  // return_ndarray is a special flag to handle returning of ndarray
+  //    In this case, we return the shape, context and data of the array,
+  //    as well as a customized PackedFunc that handles deletion of
+  //    the array in the remote.
+  void SendPackedSeq(const TVMValue* arg_values,
+                     const int* type_codes,
+                     int n,
+                     bool return_ndarray = false) {
     this->Write(n);
-    // only handles .
     for (int i = 0; i < n; ++i) {
       int tcode = type_codes[i];
       if (tcode == kNDArrayContainer) tcode = kArrayHandle;
       this->Write(tcode);
     }
+
     // Argument packing.
     for (int i = 0; i < n; ++i) {
       int tcode = type_codes[i];
@@ -215,18 +227,32 @@ class RPCSession::EventHandler : public dmlc::Stream {
         case kNDArrayContainer:
         case kArrayHandle: {
           DLTensor* arr = static_cast<DLTensor*>(value.v_handle);
-          TVMContext ctx = StripSessMask(arr->ctx);
-          uint64_t data = reinterpret_cast<uint64_t>(
-              static_cast<RemoteSpace*>(arr->data)->data);
+          TVMContext ctx;
+          uint64_t data;
+          if (!return_ndarray) {
+            // in the client mode
+            // ctx contains the remote table index
+            // the space is wrapped by an RemoteSpace
+            // that holds reference to the session.
+            ctx = StripSessMask(arr->ctx);
+            data = reinterpret_cast<uint64_t>(
+                static_cast<RemoteSpace*>(arr->data)->data);
+          } else {
+            // When we return NDArray, we directly return
+            // the space and the context
+            // The client will be further wrapping
+            ctx = arr->ctx;
+            data = reinterpret_cast<uint64_t>(arr->data);
+          }
           this->Write(data);
           this->Write(ctx);
           this->Write(arr->ndim);
           this->Write(arr->dtype);
           this->WriteArray(arr->shape, arr->ndim);
           CHECK(arr->strides == nullptr)
-              << "Donot support strided remote array";
+              << "Do not support strided remote array";
           CHECK_EQ(arr->byte_offset, 0)
-              << "Donot support send byte offset";
+              << "Do not support send byte offset";
           break;
         }
         case kNull: break;
@@ -701,6 +727,21 @@ class RPCSession::EventHandler : public dmlc::Stream {
               << "Only server can send function and module handle back.";
         rv.MoveToCHost(&ret_value, &ret_tcode);
         SendPackedSeq(&ret_value, &ret_tcode, 1);
+      } else if (rv.type_code() == kNDArrayContainer) {
+        // always send handle in 64 bit.
+        CHECK(!client_mode_)
+            << "Only server can send NDArray back";
+        // We follow a special protocol to return NDArray to client side
+        // The first pack value is the NDArray handle as DLTensor
+        // The second pack value is a customized deleter that deletes the NDArray.
+        TVMValue ret_value_pack[2];
+        int ret_tcode_pack[2];
+        rv.MoveToCHost(&ret_value_pack[0], &ret_tcode_pack[0]);
+
+        NDArray::Container* nd = static_cast<NDArray::Container*>(ret_value_pack[0].v_handle);
+        ret_value_pack[1].v_handle = nd;
+        ret_tcode_pack[1] = kHandle;
+        SendPackedSeq(ret_value_pack, ret_tcode_pack, 2, true);
       } else {
         ret_value = rv.value();
         ret_tcode = rv.type_code();
@@ -1090,6 +1131,11 @@ void RPCModuleGetSource(TVMArgs args, TVMRetValue *rv) {
   *rv = (*static_cast<Module*>(mhandle))->GetSource(fmt);
 }
 
+void RPCNDArrayFree(TVMArgs args, TVMRetValue *rv) {
+  void* handle = args[0];
+  static_cast<NDArray::Container*>(handle)->DecRef();
+}
+
 void RPCGetTimeEvaluator(TVMArgs args, TVMRetValue *rv) {
   PackedFunc *pf = static_cast<PackedFunc*>(args[0].operator void*());
   void *fhandle = new PackedFunc(WrapTimeEvaluator(*pf, args[1], args[2], args[3]));
@@ -1138,6 +1184,7 @@ void RPCSession::EventHandler::HandlePackedCall() {
     case RPCCode::kModuleFree: CallHandler(RPCModuleFree); break;
     case RPCCode::kModuleGetFunc: CallHandler(RPCModuleGetFunc); break;
     case RPCCode::kModuleGetSource: CallHandler(RPCModuleGetSource); break;
+    case RPCCode::kNDArrayFree: CallHandler(RPCNDArrayFree); break;
     default: LOG(FATAL) << "Unknown event " << static_cast<int>(code_);
   }
   CHECK_EQ(state_, kRecvCode);
diff --git a/src/runtime/rpc/rpc_session.h b/src/runtime/rpc/rpc_session.h
index 68f6763ae6db..4b736de0e041 100644
--- a/src/runtime/rpc/rpc_session.h
+++ b/src/runtime/rpc/rpc_session.h
@@ -48,6 +48,7 @@ enum class RPCCode : int {
   kModuleFree,
   kModuleGetFunc,
   kModuleGetSource,
+  kNDArrayFree
 };
 
 /*!
diff --git a/src/runtime/rpc/rpc_socket_impl.cc b/src/runtime/rpc/rpc_socket_impl.cc
index 22f221d46526..6b2fa6c1f608 100644
--- a/src/runtime/rpc/rpc_socket_impl.cc
+++ b/src/runtime/rpc/rpc_socket_impl.cc
@@ -5,7 +5,7 @@
  */
 #include <tvm/runtime/registry.h>
 #include <memory>
-#include "./rpc_session.h"
+#include "rpc_session.h"
 #include "../../common/socket.h"
 
 namespace tvm {
diff --git a/src/runtime/sgx/trusted/runtime.cc b/src/runtime/sgx/trusted/runtime.cc
index a863327f956c..b7f66efbc97c 100644
--- a/src/runtime/sgx/trusted/runtime.cc
+++ b/src/runtime/sgx/trusted/runtime.cc
@@ -12,9 +12,9 @@
 #include "../../system_lib_module.cc"
 #include "../../thread_pool.cc"
 #include "../../workspace_pool.cc"
-#include "./ecall_registry.h"
-#include "./runtime.h"
-#include "./threading_backend.cc"
+#include "ecall_registry.h"
+#include "runtime.h"
+#include "threading_backend.cc"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/sgx/tvm.edl b/src/runtime/sgx/tvm.edl
index b4d9852f8499..8466d78af72f 100644
--- a/src/runtime/sgx/tvm.edl
+++ b/src/runtime/sgx/tvm.edl
@@ -1,5 +1,7 @@
 enclave {
     from "sgx_tstdc.edl" import *;
+    from "sgx_stdio.edl" import *;
+    from "sgx_backtrace.edl" import *;
 
     trusted {
         public void tvm_ecall_init([isptr, user_check] TVMRetValueHandle ret);
@@ -7,7 +9,8 @@ enclave {
                                           [in, count=num_args] const TVMValue* arg_values,
                                           [in, count=num_args] const int* type_codes,
                                           int num_args,
-                                          [isptr, user_check] TVMRetValueHandle ret);
+                                          [out] TVMValue* ret_val,
+                                          [out] int* ret_type_code);
     };
 
     untrusted {
@@ -17,12 +20,7 @@ enclave {
                                    int num_args,
                                    [out] TVMValue* ret_val,
                                    [out] int* ret_type_code);
-        void tvm_ocall_set_return([isptr, user_check] TVMRetValueHandle ret,
-                                   [in, count=num_ret] const TVMValue* value,
-                                   [in, count=num_ret] const int* type_code,
-                                   int num_ret);
         void tvm_ocall_register_export([in, string] const char* name, int func_id);
-        void* tvm_ocall_reserve_space(size_t num_bytes, size_t alignment);
     };
 };
 
diff --git a/src/runtime/sgx/untrusted/sgx_module.cc b/src/runtime/sgx/untrusted/sgx_module.cc
index 8dd696349b05..fc0710ae3a53 100644
--- a/src/runtime/sgx/untrusted/sgx_module.cc
+++ b/src/runtime/sgx/untrusted/sgx_module.cc
@@ -4,11 +4,11 @@
  * \brief SGX enclave module.
  */
 #include <dmlc/logging.h>
+#include <sgx_urts.h>
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/registry.h>
 #include <tvm/runtime/threading_backend.h>
-#include <sgx_urts.h>
 #include <algorithm>
 #include <fstream>
 #include <iostream>
@@ -18,6 +18,7 @@
 #include <unordered_map>
 #include "../common.h"
 #include "../../file_util.h"
+#include "./tvm_u.h"
 
 namespace tvm {
 namespace runtime {
@@ -109,15 +110,18 @@ class SGXModuleNode : public ModuleNode {
     int func_id = exported->second;
     return PackedFunc([this, func_id](TVMArgs args, TVMRetValue* rv) {
         sgx::EnclaveContext ctx(this);
+        TVMValue ret_value;
+        int ret_type_code;
         TVM_SGX_CHECKED_CALL(tvm_ecall_packed_func(eid_, func_id,
-              args.values, args.type_codes, args.num_args, rv));
+              args.values, args.type_codes, args.num_args, &ret_value, &ret_type_code));
+        *rv = TVMArgValue(ret_value, ret_type_code);
       });
   }
 
-  void RunWorkers(int num_tasks, void* tg) {
-    std::function<void(int)> runner = [this, tg](int _worker_id) {
+  void RunWorkers(int num_tasks) {
+    std::function<void(int)> runner = [this](int _worker_id) {
       this->GetFunction("__tvm_run_worker__",
-                        std::shared_ptr<SGXModuleNode>(nullptr))(tg);
+                        std::shared_ptr<SGXModuleNode>(nullptr))();
     };
     thread_group_.reset(new tvm::runtime::threading::ThreadGroup(
           num_tasks, runner, false /* include_main_thread */));
@@ -143,7 +147,7 @@ namespace sgx {
 
 TVM_REGISTER_GLOBAL("__sgx_thread_group_launch__")
 .set_body([](TVMArgs args, TVMRetValue* rv) {
-  EnclaveContext::GetModule()->RunWorkers(args[0], args[1]);
+  EnclaveContext::GetModule()->RunWorkers(args[0]);
 });
 
 TVM_REGISTER_GLOBAL("__sgx_thread_group_join__")
@@ -198,31 +202,25 @@ void tvm_ocall_packed_func(const char* name,
 
 // Allocates space for return values. The returned pointer is only valid between
 // successive calls to `tvm_ocall_reserve_space`.
-void* tvm_ocall_reserve_space(size_t num_bytes, size_t alignment) {
+TVM_REGISTER_GLOBAL("__sgx_reserve_space__")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+  size_t num_bytes = args[0];
+  size_t alignment = args[1];
+
   static TVMContext ctx = { kDLCPU, 0 };
   static thread_local void* buf = nullptr;
   static thread_local size_t buf_size = 0;
   static thread_local size_t buf_align = 0;
 
-  if (buf_size >= num_bytes && buf_align >= alignment) return buf;
+  if (buf_size >= num_bytes && buf_align >= alignment) *rv = nullptr;
 
   DeviceAPI::Get(ctx)->FreeDataSpace(ctx, buf);
   buf = DeviceAPI::Get(ctx)->AllocDataSpace(ctx, num_bytes, alignment, {});
   buf_size = num_bytes;
   buf_align = alignment;
 
-  return buf;
-}
-
-void tvm_ocall_set_return(TVMRetValueHandle ret,
-                           const TVMValue* value,
-                           const int* type_code,
-                           int num_ret) {
-  CHECK_EQ(num_ret, 1) << "Only one return value is currently supported.";
-  CHECK(type_code[0] != kStr) << "Return kBytes, not kStr.";
-  TVMRetValue* rv = static_cast<TVMRetValue*>(ret);
-  *rv = TVMArgValue(value[0], type_code[0]);
-}
+  *rv = buf;
+});
 
 }  // extern "C"
 }  // namespace sgx
diff --git a/src/codegen/stack_vm/stack_vm.cc b/src/runtime/stackvm/stackvm.cc
similarity index 90%
rename from src/codegen/stack_vm/stack_vm.cc
rename to src/runtime/stackvm/stackvm.cc
index 95feeae3679e..f45d83027467 100644
--- a/src/codegen/stack_vm/stack_vm.cc
+++ b/src/runtime/stackvm/stackvm.cc
@@ -1,15 +1,16 @@
 /*!
  *  Copyright (c) 2017 by Contributors
  * Implementation stack VM.
- * \file stack_vm.cc
+ * \file stackvm.cc
  */
 #include <dmlc/thread_local.h>
-#include <tvm/ir.h>
+#include <tvm/runtime/util.h>
 #include <tvm/runtime/c_backend_api.h>
-#include "./stack_vm.h"
+#include <algorithm>
+#include "stackvm.h"
 
 namespace tvm {
-namespace codegen {
+namespace runtime {
 
 typedef dmlc::ThreadLocalStore<StackVM::State> StackVMStateStore;
 
@@ -172,28 +173,64 @@ std::ostream& operator<<(std::ostream& os, const StackVM& vm) {  // NOLINT(*)
   return os;
 }
 
-void StackVM::operator()(const runtime::TVMArgs& args) const {
+void StackVM::Run(const runtime::TVMArgs& args,
+                  runtime::ModuleNode* mod_ctx) const {
   StackVM::State* s = StackVM::ThreadLocalState();
+  if (s->heap.size() < heap_size) {
+    s->heap.resize(heap_size);
+  }
   s->sp = 0;
   s->pc = 0;
-  if (s->heap.size() < this->heap_size) {
-    s->heap.resize(this->heap_size);
-  }
-
+  s->mod_ctx = mod_ctx;
   s->heap[0].v_handle = (void*)args.values;  // NOLINT(*)
   s->heap[1].v_handle = (void*)args.type_codes;  // NOLINT(*)
   s->heap[2].v_int64 = args.num_args;
   this->Run(s);
 }
 
+void StackVM::InitCache() {
+  extern_func_cache_.clear();
+  extern_func_cache_.resize(
+      extern_func_name.size(), PackedFunc(nullptr));
+}
+
+void StackVM::Save(dmlc::Stream* strm) const {
+  // to be endian invariant.
+  std::vector<int32_t> code_copy(code.size());
+  std::transform(code.begin(), code.end(), code_copy.begin(), [](Code c) {
+      return c.v_int;
+    });
+  strm->Write(code_copy);
+  strm->Write(str_data);
+  strm->Write(extern_func_name);
+  strm->Write(heap_id_name);
+  strm->Write(heap_size);
+  strm->Write(stack_size);
+}
+
+bool StackVM::Load(dmlc::Stream* strm)  {
+  // to be endian invariant.
+  std::vector<int32_t> code_copy;
+  if (!strm->Read(&code_copy)) return false;
+  code.resize(code_copy.size());
+  std::transform(code_copy.begin(), code_copy.end(), code.begin(), [](int v) {
+      Code code; code.v_int = v; return code;
+    });
+  if (!strm->Read(&str_data)) return false;
+  if (!strm->Read(&extern_func_name)) return false;
+  if (!strm->Read(&heap_id_name)) return false;
+  if (!strm->Read(&heap_size)) return false;
+  if (!strm->Read(&stack_size)) return false;
+  this->InitCache();
+  return true;
+}
+
 void StackVM::Run(State* s) const {
   int64_t sp = s->sp;
   int64_t pc = s->pc;
   int64_t alloca_sp = s->sp;
   std::vector<TVMValue>& stack = s->stack;
   std::vector<TVMValue>& heap = s->heap;
-  s->extern_func.clear();
-  s->extern_func.resize(extern_func_name.size());
   if (stack.size() < stack_size) {
     stack.resize(stack_size);
   }
@@ -488,17 +525,19 @@ void StackVM::Run(State* s) const {
 }
 
 const PackedFunc& StackVM::GetExtern(State* s, int fid) const {
-  PackedFunc& f = s->extern_func[fid];
+  CHECK_LT(static_cast<size_t>(fid), extern_func_cache_.size());
+  // allow race write in this, since write is idempotent
+  PackedFunc& f = extern_func_cache_[fid];
   if (f == nullptr) {
-    CHECK(mod_ctx != nullptr)
+    CHECK(s->mod_ctx != nullptr)
         << "No local context is set in stackvm";
-    const PackedFunc* pf = mod_ctx->GetFuncFromEnv(extern_func_name[fid]);
+    CHECK(s->mod_ctx != nullptr);
+    const PackedFunc* pf = s->mod_ctx->GetFuncFromEnv(extern_func_name[fid]);
     CHECK(pf != nullptr);
     f = *pf;
-    CHECK(f != nullptr);
   }
   return f;
 }
 
-}  // namespace codegen
+}  // namespace runtime
 }  // namespace tvm
diff --git a/src/codegen/stack_vm/stack_vm.h b/src/runtime/stackvm/stackvm.h
similarity index 89%
rename from src/codegen/stack_vm/stack_vm.h
rename to src/runtime/stackvm/stackvm.h
index 54972d39a5df..b2ce975b2c73 100644
--- a/src/codegen/stack_vm/stack_vm.h
+++ b/src/runtime/stackvm/stackvm.h
@@ -1,36 +1,36 @@
 /*!
  *  Copyright (c) 2016 by Contributors
- * \file stack_vm.h
+ * \file stackvm.h
  * \brief A simple stack-based virtual machine.
  *
  *  This can be used to interepret host side code
  *  to setup calls into device functions
  *  when only Runtime compilation for device is available(via NVRTC or OpenCL).
  */
-#ifndef TVM_CODEGEN_STACK_VM_STACK_VM_H_
-#define TVM_CODEGEN_STACK_VM_STACK_VM_H_
+#ifndef TVM_RUNTIME_STACKVM_STACKVM_H_
+#define TVM_RUNTIME_STACKVM_STACKVM_H_
 
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/packed_func.h>
 #include <tvm/runtime/module.h>
-#include <tvm/packed_func_ext.h>
 #include <string>
 #include <vector>
 
 namespace tvm {
-namespace codegen {
+namespace runtime {
 
 using runtime::operator<<;
 /*!
- * \brief A simple stack-based virtual machine.
+ * \brief A simple stack-based virtual machine program.
  */
 class StackVM {
  public:
   /*!
-   * \brief Invoke the StackVM as PackedFunc
+   * \brief Invoke the StackVM program.
    * \param args The arguments to the StackVM.
+   * \param mod_ctx The module context used in running.
    */
-  void operator()(const TVMArgs& args) const;
+  void Run(const TVMArgs& args, runtime::ModuleNode* mod_ctx) const;
   /*!
    * \brief The opcode of stack vm
    * \note Notation
@@ -276,21 +276,25 @@ class StackVM {
     std::vector<TVMValue> stack;
     /*! \brief The global heap space */
     std::vector<TVMValue> heap;
-    /*! \brief extern functions */
-    std::vector<PackedFunc> extern_func;
     /*! \brief stack pointer  */
     int64_t sp{0};
     /*! \brief program counter */
     int64_t pc{0};
+    /*! \brief The current module context of stackvm */
+    runtime::ModuleNode* mod_ctx{nullptr};
   };
-  /*! \brief The external function entries. */
-  struct ExternFuncEntry {
-    std::string name;
-    runtime::PackedFunc func;
-  };
-
-  /*! \brief execute the stack vm with given state */
-  void Run(State* state) const;
+  /*! \brief Initialize local cache*/
+  void InitCache();
+  /*!
+   * \brief Save stackvm program to an output stream
+   * \param strm The output stream
+   */
+  void Save(dmlc::Stream* strm) const;
+  /*!
+   * \brief Load stackvm program from output stream
+   * \param strm The output stream
+   */
+  bool Load(dmlc::Stream* strm);
   /*!
    * \brief Print instruction at location pc
    * \param os The ostream
@@ -300,12 +304,11 @@ class StackVM {
   int64_t PrintCode(std::ostream&os, int64_t pc) const;  // NOLINT(*)
   /*! \brief Get thread local state of the stack VM */
   static State* ThreadLocalState();
+  // The code below are programs
   /*! \brief The instructions */
   std::vector<Code> code;
   /*! \brief constant error messages */
   std::vector<std::string> str_data;
-  /*! \brief The current module context of stackvm */
-  runtime::ModuleNode* mod_ctx{nullptr};
   /*! \brief Extern functions */
   std::vector<std::string> extern_func_name;
   /*! \brief name of each heap id */
@@ -385,10 +388,18 @@ class StackVM {
   friend std::ostream& operator<<(std::ostream& os, const StackVM& vm);  // NOLINT(*)
 
  private:
+  //  execute the stack vm with given state
+  void Run(State* state) const;
   // get extern function.
   const PackedFunc& GetExtern(State* s, int fid) const;
+  // cached extern function
+  mutable std::vector<PackedFunc> extern_func_cache_;
 };
 
-}  // namespace codegen
+}  // namespace runtime
 }  // namespace tvm
-#endif  // TVM_CODEGEN_STACK_VM_STACK_VM_H_
+
+namespace dmlc {
+DMLC_DECLARE_TRAITS(has_saveload, ::tvm::runtime::StackVM, true);
+}
+#endif  // TVM_RUNTIME_STACKVM_STACKVM_H_
diff --git a/src/runtime/stackvm/stackvm_module.cc b/src/runtime/stackvm/stackvm_module.cc
new file mode 100644
index 000000000000..7256c47862e5
--- /dev/null
+++ b/src/runtime/stackvm/stackvm_module.cc
@@ -0,0 +1,128 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file stackvm_module.cc
+ */
+#include <tvm/runtime/registry.h>
+#include <tvm/runtime/module.h>
+#include <dmlc/memory_io.h>
+#include "stackvm_module.h"
+#include "../file_util.h"
+#include "../module_util.h"
+
+namespace tvm {
+namespace runtime {
+
+class StackVMModuleNode : public runtime::ModuleNode {
+ public:
+  const char* type_key() const {
+    return "stackvm";
+  }
+
+  PackedFunc GetFunction(
+      const std::string& name,
+      const std::shared_ptr<ModuleNode>& sptr_to_self) final {
+    if (name == runtime::symbol::tvm_module_main) {
+      return GetFunction(entry_func_, sptr_to_self);
+    }
+    auto it = fmap_.find(name);
+    if (it == fmap_.end()) return PackedFunc();
+    const StackVM& vm = it->second;
+    // capture sptr_to_self to keep module node alive.
+    return PackedFunc([vm, sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+        vm.Run(args, this);
+      });
+  }
+
+  std::string GetSource(const std::string& format) final {
+    std::ostringstream os;
+    for (const auto& kv : fmap_) {
+      os << "Function: " << kv.first << '\n';
+      os << kv.second;
+    }
+    return os.str();
+  }
+
+  void SaveToFile(const std::string& file_name,
+                  const std::string& format) final {
+    std::string data, mblob;
+    dmlc::MemoryStringStream writer(&data);
+    dmlc::Stream* strm = &writer;
+    strm->Write(fmap_);
+    strm->Write(entry_func_);
+    // also save imports
+    uint64_t num_imports = static_cast<uint64_t>(imports_.size());
+    strm->Write(num_imports);
+
+    for (runtime::Module im : imports_) {
+      CHECK_EQ(im->imports().size(), 0U)
+          << "Only support simply one-level hierarchy";
+      std::string tkey = im->type_key();
+      strm->Write(tkey);
+      LOG(INFO) << "save " << tkey;
+      im->SaveToBinary(strm);
+      LOG(INFO) << "FInish save " << tkey;
+    }
+    SaveBinaryToFile(file_name, data);
+  }
+
+  static Module Create(std::unordered_map<std::string, StackVM> fmap,
+                       std::string entry_func) {
+    std::shared_ptr<StackVMModuleNode> n =
+        std::make_shared<StackVMModuleNode>();
+    n->fmap_ = std::move(fmap);
+    n->entry_func_ = std::move(entry_func);
+    return Module(n);
+  }
+
+  static Module Load(dmlc::Stream* strm) {
+    std::unordered_map<std::string, StackVM> fmap;
+    std::string entry_func, data;
+    strm->Read(&fmap);
+    strm->Read(&entry_func);
+    std::shared_ptr<StackVMModuleNode> n =
+        std::make_shared<StackVMModuleNode>();
+    n->fmap_ = std::move(fmap);
+    n->entry_func_ = std::move(entry_func);
+    uint64_t num_imports;
+    strm->Read(&num_imports);
+    for (uint64_t i = 0; i < num_imports; ++i) {
+      std::string tkey;
+      CHECK(strm->Read(&tkey));
+      std::string fkey = "module.loadbinary_" + tkey;
+      const PackedFunc* f = Registry::Get(fkey);
+      CHECK(f != nullptr)
+          << "Loader of " << tkey << "("
+          << fkey << ") is not presented.";
+      Module m = (*f)(static_cast<void*>(strm));
+      n->imports_.emplace_back(std::move(m));
+    }
+    return Module(n);
+  }
+
+  static Module LoadFromFile(std::string file_name,
+                             std::string format) {
+    std::string data;
+    LoadBinaryFromFile(file_name, &data);
+    dmlc::MemoryStringStream reader(&data);
+    return Load(&reader);
+  }
+
+ private:
+  // internal function map
+  std::unordered_map<std::string, StackVM> fmap_;
+  // entry function.
+  std::string entry_func_;
+};
+
+Module StackVMModuleCreate(std::unordered_map<std::string, StackVM> fmap,
+                           std::string entry_func) {
+  return StackVMModuleNode::Create(fmap, entry_func);
+}
+
+TVM_REGISTER_GLOBAL("module.loadfile_stackvm")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    *rv = StackVMModuleNode::LoadFromFile(args[0], args[1]);
+  });
+
+}  // namespace runtime
+}  // namespace tvm
diff --git a/src/runtime/stackvm/stackvm_module.h b/src/runtime/stackvm/stackvm_module.h
new file mode 100644
index 000000000000..918228faea1f
--- /dev/null
+++ b/src/runtime/stackvm/stackvm_module.h
@@ -0,0 +1,27 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file stackvm_module.h
+ * \brief StackVM module
+ */
+#ifndef TVM_RUNTIME_STACKVM_STACKVM_MODULE_H_
+#define TVM_RUNTIME_STACKVM_STACKVM_MODULE_H_
+
+#include <tvm/runtime/packed_func.h>
+#include <string>
+#include "stackvm.h"
+
+namespace tvm {
+namespace runtime {
+/*!
+ * \brief create a stackvm module
+ *
+ * \param fmap The map from name to function
+ * \param entry_func The entry function name.
+ * \return The created module
+ */
+Module StackVMModuleCreate(std::unordered_map<std::string, StackVM> fmap,
+                           std::string entry_func);
+
+}  // namespace runtime
+}  // namespace tvm
+#endif  // TVM_RUNTIME_STACKVM_STACKVM_MODULE_H_
diff --git a/src/runtime/system_lib_module.cc b/src/runtime/system_lib_module.cc
index 01ff99d7da87..ed48cb1a9d44 100644
--- a/src/runtime/system_lib_module.cc
+++ b/src/runtime/system_lib_module.cc
@@ -6,7 +6,7 @@
 #include <tvm/runtime/registry.h>
 #include <tvm/runtime/c_backend_api.h>
 #include <mutex>
-#include "./module_util.h"
+#include "module_util.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/vulkan/vulkan_device_api.cc b/src/runtime/vulkan/vulkan_device_api.cc
index 45f8549d54f2..cc89804806d2 100644
--- a/src/runtime/vulkan/vulkan_device_api.cc
+++ b/src/runtime/vulkan/vulkan_device_api.cc
@@ -5,7 +5,7 @@
 #include <tvm/runtime/registry.h>
 #include <dmlc/thread_local.h>
 #include <cstring>
-#include "./vulkan_common.h"
+#include "vulkan_common.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/vulkan/vulkan_module.cc b/src/runtime/vulkan/vulkan_module.cc
index b5425dd8fbc5..4afe8cc782ce 100644
--- a/src/runtime/vulkan/vulkan_module.cc
+++ b/src/runtime/vulkan/vulkan_module.cc
@@ -8,8 +8,8 @@
 #include <array>
 #include <string>
 #include <mutex>
-#include "./vulkan_common.h"
-#include "./vulkan_module.h"
+#include "vulkan_common.h"
+#include "vulkan_module.h"
 #include "../pack_args.h"
 #include "../thread_storage_scope.h"
 #include "../meta_data.h"
@@ -223,7 +223,7 @@ class VulkanModuleNode final :public runtime::ModuleNode {
   std::mutex mutex_;
 };
 
-// a wrapped function class to get packed fucn.
+// a wrapped function class to get packed func.
 class VulkanWrappedFunc {
  public:
   // initialize the VULKAN function.
diff --git a/src/runtime/workspace_pool.cc b/src/runtime/workspace_pool.cc
index c903a8621206..d43b4641192c 100644
--- a/src/runtime/workspace_pool.cc
+++ b/src/runtime/workspace_pool.cc
@@ -3,7 +3,7 @@
  * \file workspace_pool.h
  * \brief Workspace pool utility.
  */
-#include "./workspace_pool.h"
+#include "workspace_pool.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/schedule/bound.cc b/src/schedule/bound.cc
index 7929969a8502..05c04834e78c 100644
--- a/src/schedule/bound.cc
+++ b/src/schedule/bound.cc
@@ -9,8 +9,8 @@
 #include <tvm/ir_pass.h>
 #include <unordered_map>
 #include <unordered_set>
-#include "./graph.h"
-#include "./message_passing.h"
+#include "graph.h"
+#include "message_passing.h"
 #include "../runtime/thread_storage_scope.h"
 
 namespace tvm {
diff --git a/src/schedule/graph.cc b/src/schedule/graph.cc
index da0aeb0eccaa..d92e7730b313 100644
--- a/src/schedule/graph.cc
+++ b/src/schedule/graph.cc
@@ -8,7 +8,7 @@
 #include <tvm/operation.h>
 #include <unordered_set>
 #include <unordered_map>
-#include "./graph.h"
+#include "graph.h"
 
 namespace tvm {
 namespace schedule {
diff --git a/src/schedule/message_passing.cc b/src/schedule/message_passing.cc
index b13dcefb1b9f..dff2895cd42d 100644
--- a/src/schedule/message_passing.cc
+++ b/src/schedule/message_passing.cc
@@ -6,7 +6,7 @@
 #include <tvm/arithmetic.h>
 #include <tvm/ir.h>
 #include <tvm/ir_pass.h>
-#include "./message_passing.h"
+#include "message_passing.h"
 #include "../arithmetic/compute_expr.h"
 
 namespace tvm {
@@ -475,27 +475,32 @@ std::vector<Expr> MakeBoundCheck(
     iset_dmap[kv.first->var.get()] = IntSet::range(kv.second);
   }
 
-  for (IterVar iv : stage->op->root_iter_vars()) {
+  for (const IterVar& iv : stage->all_iter_vars) {
     if (skip_iter.count(iv) || iv->iter_type == kOpaque) continue;
-    Range dom = dom_map.at(iv);
     if (bound_state.at(iv)) {
+      Range dom = dom_map.at(iv);
       Expr value = ComputeExpr<Sub>(value_map.at(iv), dom->min);
       Expr vmax = EvalSet(value, iset_dmap).max();
       if (vmax.type() != value.type() || !can_prove(vmax < dom->extent)) {
         preds.emplace_back(value < dom->extent);
       }
     }
+  }
+  for (const IterVar& iv : stage->op->root_iter_vars()) {
+    if (skip_iter.count(iv) || iv->iter_type == kOpaque) continue;
+    Range dom = dom_map.at(iv);
     CHECK(iv->dom.defined());
     if (!skip_ivar_domain && !iv->dom.same_as(dom)) {
       Expr value = ComputeExpr<Sub>(value_map.at(iv), iv->dom->min);
       IntSet s = EvalSet(value, iset_dmap);
       Expr vmin = s.min();
       Expr vmax = s.max();
-      if (vmin.type() != value.type() || !can_prove(vmin >= iv->dom->min)) {
+      // The range of `value` resides in [vmin, vmax]
+      if (vmin.type() != value.type() || !can_prove(vmin >= 0)) {
         preds.emplace_back(value >= 0);
       }
       if (vmax.type() != value.type() || !can_prove(vmax < iv->dom->extent)) {
-        preds.emplace_back(value < (iv->dom->extent - iv->dom->min));
+        preds.emplace_back(value < iv->dom->extent);
       }
     }
   }
diff --git a/src/schedule/schedule_dataflow_rewrite.cc b/src/schedule/schedule_dataflow_rewrite.cc
index e9fbcba088fe..ccf7fd617194 100644
--- a/src/schedule/schedule_dataflow_rewrite.cc
+++ b/src/schedule/schedule_dataflow_rewrite.cc
@@ -7,7 +7,7 @@
 #include <tvm/ir_mutator.h>
 #include <tvm/ir_pass.h>
 #include <unordered_set>
-#include "./message_passing.h"
+#include "message_passing.h"
 #include "../pass/ir_util.h"
 #include "../arithmetic/compute_expr.h"
 
@@ -46,7 +46,7 @@ Expr InjectPredicate(const Array<Expr>& predicates,
   if (predicates.size() == 0) return body;
   const Reduce* reduce = body.as<Reduce>();
   if (reduce) {
-    std::shared_ptr<Reduce> n = std::make_shared<Reduce>(*reduce);
+    auto n = make_node<Reduce>(*reduce);
     n->condition = n->condition && arith::ComputeReduce<ir::And>(predicates, Expr());
     return Expr(n);
   }
@@ -135,29 +135,29 @@ Tensor Schedule::cache_read(const Tensor& tensor,
   return cache;
 }
 
-// Cache write and relayout the data according to loop pattern
-Array<Tensor> CacheWriteWithReLayout(Schedule sch,
-                              const Array<Tensor>& tensor_array,
-                              const std::string& scope) {
-  size_t tensor_size = tensor_array.size();
-  sch->InvalidateCache();
-  Tensor tensor = tensor_array[0];
-  Stage orig_stage = sch[tensor->op];
-  const ComputeOpNode* compute = orig_stage->op.as<ComputeOpNode>();
-  std::unordered_set<IterVar> red_axis;
-  for (IterVar iv : compute->reduce_axis) {
+template<typename OpType>
+void PrepareAxisMapping(Stage orig_stage,
+                        OpType* op,
+                        std::unordered_set<IterVar>* p_red_axis,
+                        Array<IterVar>* p_new_axis,
+                        std::unordered_map<IterVar, Range>* p_dom_map,
+                        std::unordered_map<const Variable*, Expr>* p_vsub,
+                        std::unordered_map<const Variable*, Expr>* p_vsub2newvar,
+                        std::vector<Expr>* p_predicates) {
+  auto& red_axis = *p_red_axis;
+  auto& new_axis = *p_new_axis;
+  auto& dom_map = *p_dom_map;
+  auto& vsub = *p_vsub;
+  auto& vsub2newvar = *p_vsub2newvar;
+  auto& predicates = *p_predicates;
+
+  for (IterVar iv : op->reduce_axis) {
     red_axis.insert(iv);
   }
-  std::unordered_map<IterVar, Range> dom_map;
-  Array<IterVar> new_axis;
-
-  for (IterVar iv : compute->axis) {
+  for (IterVar iv : op->axis) {
     dom_map[iv] = iv->dom;
   }
   schedule::PassDownDomain(orig_stage, &dom_map, true);
-  std::unordered_map<const Variable*, Expr> vsub;
-  std::unordered_map<const Variable*, Expr> vsub2newvar;
-  std::vector<Expr> predicates;
   {
     // The source->cache
     std::unordered_map<IterVar, Expr> value_map;
@@ -178,17 +178,85 @@ Array<Tensor> CacheWriteWithReLayout(Schedule sch,
     }
     // skip reduction iteration.
     std::unordered_set<IterVar> skip_bound_check;
-    for (IterVar iv : compute->reduce_axis) {
+    for (IterVar iv : op->reduce_axis) {
       skip_bound_check.insert(iv);
     }
     schedule::PassUpIndex(orig_stage, dom_map, &value_map, true);
     predicates = schedule::MakeBoundCheck(
         orig_stage, dom_map, value_map, true, skip_bound_check);
     // The root axis
-    for (IterVar iv : compute->axis) {
-      vsub[iv->var.get()] = value_map.at(iv);
+    for (IterVar iv : op->axis) {
+      if (value_map.count(iv)) {
+        vsub[iv->var.get()] = value_map.at(iv);
+      }  // to handle tensor axis
     }
   }
+}
+
+Array<Tensor> ReplaceOriginalOp(Schedule sch,
+                                Stage orig_stage,
+                                const std::string& scope,
+                                Operation cache_op,
+                                Operation orig_new_op,
+                                size_t tensor_size) {
+  Array<Tensor> cache_tensor_list;
+  for (size_t i = 0; i < tensor_size; i++) {
+    Tensor cache_tensor = cache_op.output(i);
+    cache_tensor_list.push_back(cache_tensor);
+  }
+  // The replace of the dataflow
+  std::unordered_map<Tensor, Tensor> vmap;
+  std::unordered_map<Tensor, Tensor> rvmap;
+  vmap[orig_stage->op.output(0)] = orig_new_op.output(0);
+  rvmap[orig_new_op.output(0)] = orig_stage->op.output(0);
+  for (size_t i = 0; i < tensor_size; i++) {
+    vmap[orig_stage->op.output(0)] = orig_new_op.output(0);
+    rvmap[orig_new_op.output(0)] = orig_stage->op.output(0);
+  }
+  ReplaceDataFlow(sch->stages, &vmap, &rvmap);
+  // mutate orig stage
+  orig_stage->op = orig_new_op;
+  orig_stage->all_iter_vars = orig_stage->op->root_iter_vars();
+  orig_stage->leaf_iter_vars = orig_stage->all_iter_vars;
+  orig_stage->relations = Array<IterVarRelation>();
+  // create schedule for new cached stage.
+  ArrayNode* stages = sch->stages.CopyOnWrite();
+  size_t pos = FindNodeRef(stages, orig_stage);
+  Stage cache_stage = Stage(cache_op);
+  cache_stage.set_scope(scope);
+  CHECK_LT(pos, stages->data.size());
+  stages->data.insert(stages->data.begin() + pos,
+                      cache_stage.node_);
+  sch->stage_map.Set(cache_op, cache_stage);
+  // Update group
+  cache_stage->group = orig_stage->group;
+  if (cache_stage->group.defined()) {
+    ++cache_stage->group->num_child_stages;
+  }
+  return cache_tensor_list;
+}
+
+
+// Cache write and relayout the data according to loop pattern
+Array<Tensor> CacheWriteWithReLayout(Schedule sch,
+                                     const Array<Tensor>& tensor_array,
+                                     const std::string& scope) {
+  size_t tensor_size = tensor_array.size();
+  sch->InvalidateCache();
+  Tensor tensor = tensor_array[0];
+  Stage orig_stage = sch[tensor->op];
+  const ComputeOpNode* compute = orig_stage->op.as<ComputeOpNode>();
+
+  std::unordered_set<IterVar> red_axis;
+  Array<IterVar> new_axis;
+  std::unordered_map<IterVar, Range> dom_map;
+
+  std::unordered_map<const Variable*, Expr> vsub;
+  std::unordered_map<const Variable*, Expr> vsub2newvar;
+  std::vector<Expr> predicates;
+
+  PrepareAxisMapping(orig_stage, compute,
+    &red_axis, &new_axis, &dom_map, &vsub, &vsub2newvar, &predicates);
 
   Expr body;
   Array<Expr> body_list;
@@ -198,7 +266,7 @@ Array<Tensor> CacheWriteWithReLayout(Schedule sch,
     body = InjectPredicate(predicates, body);
     body = VarReplacer(vsub2newvar).Mutate(body);
     // Reduce nodes in ONE computeOp must be the same except value_index
-    // This is right only if the oringinal body ensures Reduce nodes are the same
+    // This is right only if the original body ensures Reduce nodes are the same
     if (body->is_type<ir::Reduce>()) {
       const ir::Reduce* reduce_body = body.as<ir::Reduce>();
       if (first_reduce != nullptr) {
@@ -234,48 +302,107 @@ Array<Tensor> CacheWriteWithReLayout(Schedule sch,
   Operation cache_op = ComputeOpNode::make(
       compute->name + "." + scope, compute->tag, compute->attrs,
       new_axis, body_list);
-  Array<Tensor> cache_tensor_list;
+
   Array<Expr> cache_expr_list;
   for (size_t i = 0; i < tensor_size; i++) {
     Tensor cache_tensor = cache_op.output(i);
-    cache_tensor_list.push_back(cache_tensor);
     cache_expr_list.push_back(cache_tensor(args));
   }
   Operation orig_new_op = ComputeOpNode::make(
       compute->name, compute->tag, compute->attrs,
       compute->axis, cache_expr_list);
-  // The replace of the dataflow
-  std::unordered_map<Tensor, Tensor> vmap;
-  std::unordered_map<Tensor, Tensor> rvmap;
-  vmap[orig_stage->op.output(0)] = orig_new_op.output(0);
-  rvmap[orig_new_op.output(0)] = orig_stage->op.output(0);
-  for (size_t i = 0; i < tensor_size; i++) {
-    vmap[orig_stage->op.output(0)] = orig_new_op.output(0);
-    rvmap[orig_new_op.output(0)] = orig_stage->op.output(0);
+  return ReplaceOriginalOp(sch, orig_stage, scope,
+    cache_op, orig_new_op, tensor_size);
+}
+
+
+// for tensor compute op
+Array<Tensor> CacheWriteWithReLayoutTensor(Schedule sch,
+                                           const Array<Tensor>& tensor_array,
+                                           const std::string& scope) {
+  size_t tensor_size = tensor_array.size();
+  sch->InvalidateCache();
+  Tensor tensor = tensor_array[0];
+  Stage orig_stage = sch[tensor->op];
+  const TensorComputeOpNode* tensor_op = orig_stage->op.as<TensorComputeOpNode>();
+  CHECK_EQ(tensor_op->num_outputs(), 1)
+      << "cache write only support single output tensor_compute_op";
+
+  std::unordered_set<IterVar> red_axis;
+  Array<IterVar> new_axis;
+  std::unordered_map<IterVar, Range> dom_map;
+
+  std::unordered_map<const Variable*, Expr> vsub;
+  std::unordered_map<const Variable*, Expr> vsub2newvar;
+  std::vector<Expr> predicates;
+
+  PrepareAxisMapping(orig_stage, tensor_op,
+    &red_axis, &new_axis, &dom_map, &vsub, &vsub2newvar, &predicates);
+
+
+  for (int i = tensor_op->schedulable_ndim; i < static_cast<int>(tensor_op->axis.size()); ++i) {
+    IterVar iv = tensor_op->axis[i];
+    IterVar new_iv = IterVarNode::make(
+      iv->dom, iv->var.copy_with_suffix(".c"), iv->iter_type);
+    new_axis.push_back(new_iv);
+  }
+  Array<Region> new_regions;
+  for (Region old_region : tensor_op->input_regions) {
+    Region region;
+    for (Range r : old_region) {
+      Expr min = VarReplacer(vsub2newvar).Mutate(r->min);
+      Expr extent = VarReplacer(vsub2newvar).Mutate(r->extent);
+      region.push_back(Range::make_by_min_extent(min, extent));
+    }
+    new_regions.push_back(region);
   }
-  ReplaceDataFlow(sch->stages, &vmap, &rvmap);
-  // mutate orig stage
-  orig_stage->op = orig_new_op;
-  orig_stage->all_iter_vars = orig_stage->op->root_iter_vars();
-  orig_stage->leaf_iter_vars = orig_stage->all_iter_vars;
-  orig_stage->relations = Array<IterVarRelation>();
-  // create schedule for new cached stage.
-  ArrayNode* stages = sch->stages.CopyOnWrite();
-  size_t pos = FindNodeRef(stages, orig_stage);
-  Stage cache_stage = Stage(cache_op);
-  cache_stage.set_scope(scope);
-  CHECK_LT(pos, stages->data.size());
-  stages->data.insert(stages->data.begin() + pos,
-                      cache_stage.node_);
-  sch->stage_map.Set(cache_op, cache_stage);
-  // Update group
-  cache_stage->group = orig_stage->group;
-  if (cache_stage->group.defined()) {
-    ++cache_stage->group->num_child_stages;
+
+  Operation cache_op = TensorComputeOpNode::make(
+      tensor_op->name + "." + scope, tensor_op->tag, new_axis,
+      tensor_op->reduce_axis, tensor_op->schedulable_ndim,
+      tensor_op->intrin, tensor_op->inputs, new_regions);
+
+  // axis will be used in generating compute op
+  Array<IterVar> compute_axis = tensor_op->axis;
+  for (size_t i = tensor_op->schedulable_ndim; i < tensor_op->axis.size(); ++i) {
+    IterVar iv = tensor_op->axis[i];
+    IterVar aiv = IterVarNode::make(iv->dom, iv->var, kDataPar);
+    compute_axis.Set(i, aiv);
   }
-  return cache_tensor_list;
+
+  // The reader args
+  Array<Expr> args;
+  {
+    // cache->compute
+    std::unordered_map<IterVar, Expr> value_map;
+    for (IterVar iv : compute_axis) {
+      value_map[iv] = iv->var;
+    }
+    schedule::PassDownIndex(orig_stage, dom_map, &value_map, true);
+    for (IterVar iv : orig_stage->leaf_iter_vars) {
+      if (red_axis.count(iv)) continue;
+      args.push_back(value_map.at(iv));
+    }
+    // tensorized region axis
+    for (size_t i = tensor_op->schedulable_ndim; i < tensor_op->axis.size(); ++i) {
+      IterVar iv = compute_axis[i];
+      args.push_back(value_map.at(iv));
+    }
+  }
+
+  Array<Expr> cache_expr_list;
+  for (size_t i = 0; i < tensor_size; i++) {
+    Tensor cache_tensor = cache_op.output(i);
+    cache_expr_list.push_back(cache_tensor(args));
+  }
+  Operation orig_new_op = ComputeOpNode::make(
+      tensor_op->name, tensor_op->tag, {},
+      compute_axis, cache_expr_list);
+  return ReplaceOriginalOp(sch, orig_stage, scope,
+    cache_op, orig_new_op, tensor_size);
 }
 
+
 Array<Tensor> Schedule::cache_write(const Array<Tensor>& tensor_array,
                              const std::string& scope) {
   (*this)->InvalidateCache();
@@ -291,23 +418,26 @@ Array<Tensor> Schedule::cache_write(const Array<Tensor>& tensor_array,
     CHECK(orig_stage.same_as(tmp_stage))
         << "Input tensor list must be generated by ONE computeOp";
   }
-
   return CacheWriteWithReLayout(*this, tensor_array, scope);
 }
 
+
 Tensor Schedule::cache_write(const Tensor& tensor,
                              const std::string& scope) {
+  // support original compute and tensor compute both
   (*this)->InvalidateCache();
-  Stage orig_stage = operator[](tensor->op);
-  const ComputeOpNode* compute = tensor->op.as<ComputeOpNode>();
-  CHECK(compute)
-      << "cache write only take ComputeOp as writers";
-  CHECK_EQ(compute->num_outputs(), 1)
-      << "cache write only support single output ComputeOp";
-
-  return (CacheWriteWithReLayout(*this, {tensor}, scope))[0];
+  const char* type_key = tensor->op->type_key();
+  if (!strcmp(type_key, "ComputeOp")) {
+    return (CacheWriteWithReLayout(*this, {tensor}, scope))[0];
+  } else if (!strcmp(type_key, "TensorComputeOp")) {
+    return (CacheWriteWithReLayoutTensor(*this, {tensor}, scope))[0];
+  } else {
+    LOG(FATAL) << "cache write only take ComputeOp or TensorComputeOp as writers";
+    return Tensor();
+  }
 }
 
+
 void RebaseNonZeroMinLoop(const Schedule& sch) {
   std::unordered_map<IterVar, IterVar> rebase_map;
   for (Stage s : sch->stages) {
@@ -400,7 +530,7 @@ void InjectInline(ScheduleNode* sch) {
               CHECK_EQ(new_body[j].size(), r->source.size());
               CHECK(r != nullptr);
               for (size_t k = 0; k < new_body[j].size(); ++k) {
-                std::shared_ptr<ir::Reduce> n = std::make_shared<ir::Reduce>(*r);
+                auto n = make_node<ir::Reduce>(*r);
                 n->value_index = static_cast<int>(k);
                 n->type = r->source[k].type();
                 new_body[j].Set(k, Expr(n));
@@ -520,11 +650,11 @@ Array<Tensor> Schedule::rfactor(const Tensor& tensor,
   const int factor_axis_pos = \
       factor_axis >= 0 ? factor_axis : static_cast<int>(compute_op->axis.size() + 1) + factor_axis;
   CHECK_LE(factor_axis_pos, compute_op->axis.size());
-  auto n = std::make_shared<ComputeOpNode>();
+  auto n = make_node<ComputeOpNode>();
   n->name = compute_op->name + ".rf";
   {
     // axis relacement.
-    auto iv_node = std::make_shared<IterVarNode>();
+    auto iv_node = make_node<IterVarNode>();
     iv_node->dom = dom_map.at(axis);
     CHECK(is_zero(iv_node->dom->min))
         << "Can only factor reduction domain starting from 0";
@@ -565,7 +695,7 @@ Array<Tensor> Schedule::rfactor(const Tensor& tensor,
   for (IterVar iv : reduce_stage->leaf_iter_vars) {
     if (touch_map.count(iv) && !iv.same_as(axis)) {
       CHECK_EQ(iv->iter_type, kCommReduce);
-      auto ncpy = std::make_shared<IterVarNode>(*iv.operator->());
+      auto ncpy = make_node<IterVarNode>(*iv.operator->());
       ncpy->dom = dom_map.at(iv);
       n->reduce_axis.push_back(IterVar(ncpy));
     }
diff --git a/src/schedule/schedule_lang.cc b/src/schedule/schedule_lang.cc
index eea8aa1aae80..29265f2e94b8 100644
--- a/src/schedule/schedule_lang.cc
+++ b/src/schedule/schedule_lang.cc
@@ -6,7 +6,7 @@
 #include <tvm/operation.h>
 #include <tvm/ir_mutator.h>
 #include <unordered_set>
-#include "./graph.h"
+#include "graph.h"
 
 namespace tvm {
 
@@ -70,7 +70,7 @@ void Split(StageNode* self,
 }  // namespace
 
 Stage::Stage(Operation op) {
-  auto n = std::make_shared<StageNode>();
+  auto n = make_node<StageNode>();
   n->op = op;
   n->origin_op = op;
   n->all_iter_vars = op->root_iter_vars();
@@ -164,16 +164,16 @@ Stage& Stage::bind(IterVar ivar, IterVar thread_ivar) {   // NOLINT(*)
   FindLeafVar(all_vars, leaf_vars, ivar);
 
   auto it = self->iter_var_attrs.find(ivar);
-  std::shared_ptr<IterVarAttrNode> n;
+  NodePtr<IterVarAttrNode> n;
   if (it != self->iter_var_attrs.end()) {
-    n = std::make_shared<IterVarAttrNode>(*(*it).second.operator->());
+    n = make_node<IterVarAttrNode>(*(*it).second.operator->());
     if (n->bind_thread.defined() &&
         !n->bind_thread.same_as(thread_ivar)) {
       LOG(WARNING) << "Axis " << ivar
                    << " is already bind to another thread " << n->bind_thread;
     }
   } else {
-    n = std::make_shared<IterVarAttrNode>();
+    n = make_node<IterVarAttrNode>();
   }
   n->bind_thread = thread_ivar;
   self->iter_var_attrs.Set(ivar, IterVarAttr(n));
@@ -188,7 +188,7 @@ Stage& Stage::env_threads(Array<IterVar> threads) {
       << "Already set env_threads";
   ArrayNode* leaf_vars = self->leaf_iter_vars.CopyOnWrite();
   ArrayNode* all_vars = self->all_iter_vars.CopyOnWrite();
-  std::vector<std::shared_ptr<Node> > temp;
+  std::vector<NodePtr<Node> > temp;
   for (IterVar iv : threads) {
     temp.push_back(iv.node_);
   }
@@ -303,7 +303,7 @@ Stage& Stage::reorder(const Array<IterVar>& order) {  // NOLINT(*)
   for (size_t i = 0; i < order.size(); ++i) {
     pos.push_back(FindLeafVar(all_vars, leaf_vars, order[i]));
   }
-  std::vector<std::shared_ptr<Node> > temp;
+  std::vector<NodePtr<Node> > temp;
   for (size_t i = 0; i < pos.size(); ++i) {
     temp.emplace_back(leaf_vars->data[pos[i]]);
   }
@@ -335,11 +335,11 @@ inline void UpdateIterVarAttr(StageNode* self,
     FindLeafVar(all_vars, leaf_vars, var);
   }
   auto it = self->iter_var_attrs.find(var);
-  std::shared_ptr<IterVarAttrNode> n;
+  NodePtr<IterVarAttrNode> n;
   if (it != self->iter_var_attrs.end()) {
-    n = std::make_shared<IterVarAttrNode>(*(*it).second.operator->());
+    n = make_node<IterVarAttrNode>(*(*it).second.operator->());
   } else {
-    n = std::make_shared<IterVarAttrNode>();
+    n = make_node<IterVarAttrNode>();
   }
   fupdate(n.get());
   self->iter_var_attrs.Set(var, IterVarAttr(n));
@@ -352,6 +352,13 @@ inline void SetAttrIterType(StageNode* self, IterVar var, IterVarType iter_type)
 }
 
 Stage& Stage::vectorize(IterVar var) {   // NOLINT(*)
+  CHECK(var->iter_type == kDataPar ||
+        var->iter_type == kOpaque ||
+        var->iter_type == kUnrolled ||
+        var->iter_type == kVectorized ||
+        var->iter_type == kTensorized ||
+        var->iter_type == kParallelized)
+      << "Cannot vectorize on " << IterVarType2String(var->iter_type);
   SetAttrIterType(operator->(), var, kVectorized);
   return *this;
 }
@@ -397,11 +404,11 @@ Stage& Stage::prefetch(const Tensor &tensor, IterVar var, Expr offset) {
   ArrayNode* leaf_vars = self->leaf_iter_vars.CopyOnWrite();
   FindLeafVar(all_vars, leaf_vars, var);
   auto it = self->iter_var_attrs.find(var);
-  std::shared_ptr<IterVarAttrNode> n;
+  NodePtr<IterVarAttrNode> n;
   if (it != self->iter_var_attrs.end()) {
-    n = std::make_shared<IterVarAttrNode>(*(*it).second.operator->());
+    n = make_node<IterVarAttrNode>(*(*it).second.operator->());
   } else {
-    n = std::make_shared<IterVarAttrNode>();
+    n = make_node<IterVarAttrNode>();
   }
   n->prefetch_data.push_back(tensor);
   n->prefetch_offset.push_back(offset);
@@ -468,8 +475,8 @@ Stage& Stage::opengl() {
 }
 
 Stage CopyStage(const Stage& s) {
-  std::shared_ptr<StageNode> n =
-      std::make_shared<StageNode>(*s.operator->());
+  NodePtr<StageNode> n =
+      make_node<StageNode>(*s.operator->());
   return Stage(n);
 }
 
@@ -477,7 +484,7 @@ Schedule Schedule::copy() const {
   // map of stages.
   const ScheduleNode* self = operator->();
   std::unordered_map<Stage, Stage, NodeHash, NodeEqual> smap;
-  std::shared_ptr<ScheduleNode> n = std::make_shared<ScheduleNode>();
+  NodePtr<ScheduleNode> n = make_node<ScheduleNode>();
   n->outputs = self->outputs;
   // Copy the stages.
   for (Stage s : self->stages) {
@@ -599,7 +606,7 @@ Stage Schedule::create_group(const Array<Tensor>& outputs,
     }
   }
   // Create the new group stage.
-  Stage gstage(std::make_shared<StageNode>());
+  Stage gstage(make_node<StageNode>());
   gstage->group = parent_group;
   if (parent_group.defined()) {
     ++parent_group->num_child_stages;
@@ -687,7 +694,7 @@ void ScheduleNode::InitCache() {
 }
 
 Schedule ScheduleNode::make(Array<Operation> ops) {
-  auto n = std::make_shared<ScheduleNode>();
+  auto n = make_node<ScheduleNode>();
   Schedule sch(n);
   n->outputs = ops;
   auto g = schedule::CreateReadGraph(n->outputs);
@@ -731,7 +738,7 @@ IterVarRelation SplitNode::make(IterVar parent,
                                 IterVar inner,
                                 Expr factor,
                                 Expr nparts) {
-  auto n = std::make_shared<SplitNode>();
+  auto n = make_node<SplitNode>();
   n->parent = parent;
   n->outer = outer;
   n->inner = inner;
@@ -742,7 +749,7 @@ IterVarRelation SplitNode::make(IterVar parent,
 
 IterVarRelation FuseNode::make(
     IterVar outer, IterVar inner, IterVar fused) {
-  auto n = std::make_shared<FuseNode>();
+  auto n = make_node<FuseNode>();
   n->outer = outer;
   n->inner = inner;
   n->fused = fused;
@@ -750,14 +757,14 @@ IterVarRelation FuseNode::make(
 }
 
 IterVarRelation RebaseNode::make(IterVar parent, IterVar rebased) {
-  auto n = std::make_shared<RebaseNode>();
+  auto n = make_node<RebaseNode>();
   n->parent = parent;
   n->rebased = rebased;
   return IterVarRelation(n);
 }
 
 IterVarRelation SingletonNode::make(IterVar iter) {
-  auto n = std::make_shared<SingletonNode>();
+  auto n = make_node<SingletonNode>();
   n->iter = iter;
   return IterVarRelation(n);
 }
diff --git a/src/schedule/schedule_ops.cc b/src/schedule/schedule_ops.cc
index 6fd2496aeabe..242423695464 100644
--- a/src/schedule/schedule_ops.cc
+++ b/src/schedule/schedule_ops.cc
@@ -11,7 +11,7 @@
 #include <utility>
 #include <unordered_map>
 #include <unordered_set>
-#include "./graph.h"
+#include "graph.h"
 #include "../op/op_util.h"
 #include "../pass/ir_util.h"
 
diff --git a/tests/cpp/attrs_test.cc b/tests/cpp/attrs_test.cc
new file mode 100644
index 000000000000..138e0b242e02
--- /dev/null
+++ b/tests/cpp/attrs_test.cc
@@ -0,0 +1,76 @@
+#include <dmlc/logging.h>
+#include <gtest/gtest.h>
+#include <tvm/attrs.h>
+#include <tvm/ir.h>
+
+namespace tvm {
+namespace test {
+// test example usage docs
+struct TestAttrs : public AttrsNode<TestAttrs> {
+  int axis;
+  std::string name;
+  Expr expr;
+  double learning_rate;
+
+  TVM_DECLARE_ATTRS(TestAttrs, "attrs.cpptest.TestAttrs") {
+    TVM_ATTR_FIELD(axis)
+        .set_default(10)
+        .set_lower_bound(1)
+        .set_upper_bound(10)
+        .describe("axis field");
+    TVM_ATTR_FIELD(name)
+        .describe("name of the field");
+    TVM_ATTR_FIELD(expr)
+        .describe("expression field")
+        .set_default(make_const(Int(32), 1));
+    TVM_ATTR_FIELD(learning_rate)
+        .describe("learning_rate")
+        .set_default(0.1);
+  }
+};
+}
+}
+
+TEST(Attrs, Basic) {
+  using namespace tvm;
+  using namespace tvm::test;
+  std::shared_ptr<TestAttrs> n = std::make_shared<TestAttrs>();
+  try {
+    n->InitBySeq("axis", 10);
+    LOG(FATAL) << "bad";
+  } catch (const tvm::AttrError& e) {
+  }
+  try {
+    n->InitBySeq("axis", 12, "name", "111");
+    LOG(FATAL) << "bad";
+  } catch (const tvm::AttrError& e) {
+  }
+
+  try {
+    n->InitBySeq("axisx", 12, "name", "111");
+    LOG(FATAL) << "bad";
+  } catch (const tvm::AttrError& e) {
+    std::string what = e.what();
+    CHECK(what.find("expr : Expr, default=1") != std::string::npos);
+    CHECK(what.find("axisx") != std::string::npos);
+  }
+  n->InitBySeq("learning_rate", Expr(1), "expr", 128, "name", "xx");
+  CHECK_EQ(n->learning_rate, 1.0);
+
+  n->InitBySeq("name", "xxx", "expr", 128);
+  CHECK_EQ(n->name, "xxx");
+  CHECK_EQ(n->axis, 10);
+  CHECK_EQ(n->expr.as<tvm::ir::IntImm>()->value, 128);
+  // Check docstring
+  std::ostringstream os;
+  n->PrintDocString(os);
+  LOG(INFO) << "docstring\n"<< os.str();
+  CHECK(os.str().find("expr : Expr, default=1") != std::string::npos);
+}
+
+
+int main(int argc, char ** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  testing::FLAGS_gtest_death_test_style = "threadsafe";
+  return RUN_ALL_TESTS();
+}
diff --git a/tests/cpp/expr_test.cc b/tests/cpp/expr_test.cc
index 9cdfef7f6a01..dca76205d79f 100644
--- a/tests/cpp/expr_test.cc
+++ b/tests/cpp/expr_test.cc
@@ -20,7 +20,7 @@ TEST(ExprNodeRef, Basic) {
   Var x("x");
   Expr z = max(x + 1 + 2, 100);
   const ir::Max* op = z.as<ir::Max>();
-  CHECK(op->GetNodeRef().same_as(z));
+  CHECK(NodeRef(op->GetNodePtr()).same_as(z));
 }
 
 
diff --git a/tests/cpp/ir_functor_test.cc b/tests/cpp/ir_functor_test.cc
index f87924d84619..db140f240344 100644
--- a/tests/cpp/ir_functor_test.cc
+++ b/tests/cpp/ir_functor_test.cc
@@ -1,7 +1,7 @@
 #include <dmlc/logging.h>
 #include <gtest/gtest.h>
 #include <tvm/tvm.h>
-#include <tvm/ir_functor.h>
+#include <tvm/node/ir_functor.h>
 #include <tvm/ir_functor_ext.h>
 
 TEST(IRF, Basic) {
diff --git a/tests/cpp/ir_mutator_test.cc b/tests/cpp/ir_mutator_test.cc
index fd5a60756f1c..0802d405bbe4 100644
--- a/tests/cpp/ir_mutator_test.cc
+++ b/tests/cpp/ir_mutator_test.cc
@@ -1,6 +1,7 @@
 #include <dmlc/logging.h>
 #include <gtest/gtest.h>
 #include <tvm/ir_mutator.h>
+#include <tvm/ir_operator.h>
 
 namespace {
 using namespace tvm::ir;
diff --git a/tests/cpp/ir_simplify_test.cc b/tests/cpp/ir_simplify_test.cc
index 0667dc27367c..8114bb51b771 100644
--- a/tests/cpp/ir_simplify_test.cc
+++ b/tests/cpp/ir_simplify_test.cc
@@ -1,5 +1,6 @@
 #include <dmlc/logging.h>
 #include <gtest/gtest.h>
+#include <tvm/ir_pass.h>
 #include <tvm/tvm.h>
 #include <arithmetic/Simplify.h>
 
@@ -8,6 +9,24 @@ TEST(IRSIMPLIFY, Basic) {
   simplify_test();
 }
 
+TEST(IRSIMPLIFY, MinMax) {
+  auto x = tvm::var("x");
+  auto e1 = (tvm::max(x, 1) - tvm::max(x, 1)) ;
+  auto e1s = tvm::ir::CanonicalSimplify(e1);
+  CHECK(is_zero(e1s));
+
+  auto e2 = (x * tvm::min(x, 1)) - (x * tvm::min(x, 1));
+  auto e2s = tvm::ir::CanonicalSimplify(e2);
+  CHECK(is_zero(e2s));
+}
+
+TEST(IRSIMPLIFY, Mul) {
+  auto x = tvm::var("x");
+  auto e = (x * x) - (x * x) ;
+  auto es = tvm::ir::CanonicalSimplify(e);
+  CHECK(is_zero(es));
+}
+
 int main(int argc, char ** argv) {
   testing::InitGoogleTest(&argc, argv);
   testing::FLAGS_gtest_death_test_style = "threadsafe";
diff --git a/tests/cpp/packed_func_test.cc b/tests/cpp/packed_func_test.cc
index 9b2f1df73731..abe26fabe9ea 100644
--- a/tests/cpp/packed_func_test.cc
+++ b/tests/cpp/packed_func_test.cc
@@ -135,6 +135,29 @@ TEST(PackedFunc, Type) {
   CHECK(get_type2("float32x2").operator Type() == Float(32, 2));
 }
 
+TEST(TypedPackedFunc, HighOrder) {
+  using namespace tvm;
+  using namespace tvm::runtime;
+  using Int1Func = TypedPackedFunc<int(int)>;
+  using Int2Func = TypedPackedFunc<int(int, int)>;
+  using BindFunc = TypedPackedFunc<Int1Func(Int2Func, int value)>;
+  BindFunc ftyped;
+  ftyped = [](Int2Func f1, int value) -> Int1Func {
+    auto binded = [f1, value](int x) {
+      return f1(value, x);
+    };
+    Int1Func x(binded);
+    return x;
+  };
+  auto add = [](int x, int y) { return x + y; };
+  CHECK_EQ(ftyped(Int2Func(add), 1)(2), 3);
+  PackedFunc f = ftyped(Int2Func(add), 1);
+  CHECK_EQ(f(3).operator int(), 4);
+  // call the type erased version.
+  Int1Func f1 = ftyped.packed()(Int2Func(add), 1);
+  CHECK_EQ(f1(3), 4);
+}
+
 // new namespoace
 namespace test {
 // register int vector as extension type
diff --git a/tests/cpp/relay_pass_type_infer_test.cc b/tests/cpp/relay_pass_type_infer_test.cc
new file mode 100644
index 000000000000..385bde974014
--- /dev/null
+++ b/tests/cpp/relay_pass_type_infer_test.cc
@@ -0,0 +1,22 @@
+#include <gtest/gtest.h>
+#include <tvm/tvm.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/type.h>
+#include <tvm/relay/pass.h>
+
+TEST(Relay, SelfReference) {
+  using namespace tvm;
+  auto type_a = relay::TypeVarNode::make("a", relay::TypeVarNode::kType);
+  auto type_b = relay::TypeVarNode::make("b", relay::TypeVarNode::kType);
+  auto x = relay::VarNode::make("x", type_a);
+  auto f = relay::FunctionNode::make(tvm::Array<relay::Var>{ x }, x, type_b, Array<relay::TypeVar>{});
+  auto fx = relay::CallNode::make(f, Array<relay::Expr>{ x });
+  auto type_fx = relay::InferType(fx, relay::ModuleNode::make(Map<relay::GlobalVar, relay::Function>{}));
+  CHECK_EQ(type_fx->checked_type(), type_a);
+}
+
+int main(int argc, char ** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  testing::FLAGS_gtest_death_test_style = "threadsafe";
+  return RUN_ALL_TESTS();
+}
diff --git a/tests/lint/pylintrc b/tests/lint/pylintrc
index f5c4452cfa16..18f526702ad8 100644
--- a/tests/lint/pylintrc
+++ b/tests/lint/pylintrc
@@ -290,10 +290,10 @@ variable-rgx=[a-z_][a-z0-9_]{2,30}$
 variable-name-hint=[a-z_][a-z0-9_]{2,30}$
 
 # Regular expression matching correct function names
-function-rgx=[a-z_][a-z0-9_]{2,30}$
+function-rgx=[a-z_][a-z0-9_]{2,48}$
 
 # Naming hint for function names
-function-name-hint=[a-z_][a-z0-9_]{2,30}$
+function-name-hint=[a-z_][a-z0-9_]{2,48}$
 
 # Regular expression matching correct class names
 class-rgx=[A-Z_][a-zA-Z0-9]+$
diff --git a/tests/python/contrib/test_cblas.py b/tests/python/contrib/test_cblas.py
index f02000ea7e7b..890820ba4519 100644
--- a/tests/python/contrib/test_cblas.py
+++ b/tests/python/contrib/test_cblas.py
@@ -18,7 +18,7 @@ def verify(target="llvm"):
             print("skip because %s is not enabled..." % target)
             return
         if not tvm.get_global_func("tvm.contrib.cblas.matmul", True):
-            print("skip because extern function is not avalable")
+            print("skip because extern function is not available")
             return
         ctx = tvm.cpu(0)
         f = tvm.build(s, [A, B, D, bias], target)
@@ -27,7 +27,7 @@ def verify(target="llvm"):
         d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
         bb = 10.0
         f(a, b, d, bb)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + bb, rtol=1e-5)
     verify()
 
diff --git a/tests/python/contrib/test_cublas.py b/tests/python/contrib/test_cublas.py
index c488c8c680e1..07c7e9224fcb 100644
--- a/tests/python/contrib/test_cublas.py
+++ b/tests/python/contrib/test_cublas.py
@@ -16,7 +16,7 @@ def verify(target="cuda"):
             print("skip because %s is not enabled..." % target)
             return
         if not tvm.get_global_func("tvm.contrib.cublas.matmul", True):
-            print("skip because extern function is not avalable")
+            print("skip because extern function is not available")
             return
         ctx = tvm.gpu(0)
         f = tvm.build(s, [A, B, C], target)
@@ -24,7 +24,7 @@ def verify(target="cuda"):
         b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
         c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
         f(a, b, c)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-5)
     verify()
 
diff --git a/tests/python/contrib/test_dlpack.py b/tests/python/contrib/test_dlpack.py
new file mode 100644
index 000000000000..f97b002368ab
--- /dev/null
+++ b/tests/python/contrib/test_dlpack.py
@@ -0,0 +1,44 @@
+import tvm
+import numpy as np
+from tvm.contrib.dlpack import to_pytorch_func
+
+def test():
+    a = np.random.randn(1337)
+    tvm_a = tvm.nd.array(a)
+    np.testing.assert_equal(tvm.nd.from_dlpack(tvm_a.to_dlpack()).asnumpy(), a)
+
+    try:
+        import torch
+        import torch.utils.dlpack
+
+        x = torch.rand(56, 56)
+        tvm_x = tvm.nd.from_dlpack(torch.utils.dlpack.to_dlpack(x))
+        np.testing.assert_equal(x.numpy(), tvm_x.asnumpy())
+        y = tvm.nd.from_dlpack(tvm_x.to_dlpack())
+        np.testing.assert_equal(y.asnumpy(), tvm_x.asnumpy())
+        np.testing.assert_equal(torch.utils.dlpack.from_dlpack(y.to_dlpack()).numpy(), tvm_x.asnumpy())
+
+        n = tvm.convert(137)
+        xx = torch.rand(137,137)
+        yy = torch.rand(137,137)
+        zz2 = torch.empty(137,137)
+        zz = xx.mm(yy)
+        XX = tvm.placeholder((n,n), name='X')
+        YY = tvm.placeholder((n,n), name='Y')
+
+        k = tvm.reduce_axis((0, n), name='k')
+        ZZ = tvm.compute((n,n), lambda i,j : tvm.sum(XX[i,k]*YY[k,j], axis=k))
+        s = tvm.create_schedule(ZZ.op)
+        f = tvm.build(s, [XX, YY, ZZ], target_host='llvm', name='f')
+
+        f_pytorch = to_pytorch_func(f)
+        zz2 = torch.empty(137,137)
+        f_pytorch(xx, yy, zz2)
+        tvm.testing.assert_allclose(zz.numpy(), zz2.numpy(), rtol=1e-6)
+
+    except ImportError:
+        pass
+
+
+if __name__ ==  '__main__':
+    test()
diff --git a/tests/python/contrib/test_miopen.py b/tests/python/contrib/test_miopen.py
index 4e13b052e616..0d9e6dda2d7a 100644
--- a/tests/python/contrib/test_miopen.py
+++ b/tests/python/contrib/test_miopen.py
@@ -56,7 +56,7 @@ def verify():
         y_ref = tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(np.float32), ctx)
         f_ref(x, w, y_ref)
         print("Max abs diff:", np.max(np.abs(y.asnumpy() - y_ref.asnumpy())))
-        np.testing.assert_allclose(y.asnumpy(), y_ref.asnumpy(), atol=1e-3)
+        tvm.testing.assert_allclose(y.asnumpy(), y_ref.asnumpy(), atol=1e-3)
 
     verify()
 
diff --git a/tests/python/contrib/test_mps.py b/tests/python/contrib/test_mps.py
index 25437605525b..635724921708 100644
--- a/tests/python/contrib/test_mps.py
+++ b/tests/python/contrib/test_mps.py
@@ -33,7 +33,7 @@ def test_matmul():
 
     def verify(A, B, D, s, target="metal"):
         if not tvm.get_global_func("tvm.contrib.mps.matmul", True):
-            print("skip because extern function is not avalable")
+            print("skip because extern function is not available")
             return
         ctx = tvm.metal(0)
         f = tvm.build(s, [A, B, D], "metal")
@@ -41,7 +41,7 @@ def verify(A, B, D, s, target="metal"):
         b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
         c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
         f(a, b, c)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + 1, rtol=1e-5)
     verify(A, B, D, s)
 
@@ -64,7 +64,7 @@ def test_conv2d():
 
     def verify(A, B, C, target="llvm"):
         if not tvm.get_global_func("tvm.contrib.mps.conv2d", True):
-            print("skip because extern function is not avalable")
+            print("skip because extern function is not available")
             return
         ctx = tvm.metal(0)
         f = tvm.build(s1, [A, B, C], "metal")
diff --git a/tests/python/contrib/test_mxnet_bridge.py b/tests/python/contrib/test_mxnet_bridge.py
index 2228f7305c6b..d511ec61d6a3 100644
--- a/tests/python/contrib/test_mxnet_bridge.py
+++ b/tests/python/contrib/test_mxnet_bridge.py
@@ -40,7 +40,7 @@ def mxnet_check():
     mxf(xx, yy, zz, 10.0)
 
 
-    np.testing.assert_allclose(
+    tvm.testing.assert_allclose(
         zz.asnumpy(), (xx.asnumpy() + yy.asnumpy()) * 10)
 
 
diff --git a/tests/python/contrib/test_nnpack.py b/tests/python/contrib/test_nnpack.py
index 2c07afe1ffac..3ebea0e62ce3 100644
--- a/tests/python/contrib/test_nnpack.py
+++ b/tests/python/contrib/test_nnpack.py
@@ -3,35 +3,6 @@
 import scipy.signal
 from tvm.contrib import nnpack
 
-def test_fully_connected_output():
-    n = 1024
-    l = 128
-    m = 235
-    bias = tvm.var('bias', dtype=tvm.float32)
-    A = tvm.placeholder((n, l), name='A')
-    B = tvm.placeholder((m, l), name='B')
-    C = nnpack.fully_connected_output(A, B)
-    D = tvm.compute(C.shape, lambda i, j: C[i,j] + bias, name="D")
-    s = tvm.create_schedule(D.op)
-
-    def verify(target="llvm"):
-        if not tvm.module.enabled(target):
-            print("skip because %s is not enabled..." % target)
-            return
-        if not tvm.get_global_func("tvm.contrib.nnpack.fully_connected_output", True):
-            print("skip because extern function is not avalable")
-            return
-        ctx = tvm.cpu(0)
-        f = tvm.build(s, [A, B, D, bias], target)
-        a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
-        b = tvm.nd.array(np.random.uniform(size=(m, l)).astype(B.dtype), ctx)
-        d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
-        bb = 10.0
-        f(a, b, d, bb)
-        np.testing.assert_allclose(
-            d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy().T) + bb, rtol=1e-5)
-    verify()
-
 
 def test_fully_connected_inference():
     n = 1024
@@ -49,8 +20,11 @@ def verify(target="llvm"):
             print("skip because %s is not enabled..." % target)
             return
         if not tvm.get_global_func("tvm.contrib.nnpack.fully_connected_inference", True):
-            print("skip because extern function is not avalable")
+            print("skip because extern function is not available")
             return
+        if not nnpack.is_available():
+            return
+
         ctx = tvm.cpu(0)
         f = tvm.build(s, [A, B, D, bias], target)
         a = tvm.nd.array(np.random.uniform(size=(l)).astype(A.dtype), ctx)
@@ -58,7 +32,7 @@ def verify(target="llvm"):
         d = tvm.nd.array(np.zeros((m, ), dtype=D.dtype), ctx)
         bb = 10.0
         f(a, b, d, bb)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy().T) + bb, rtol=1e-5)
     verify()
 
@@ -100,7 +74,7 @@ def np_conv(na, nw, padding, stride=1):
     return nb
 
 def test_convolution_inference():
-    BATCH = 32
+    BATCH = 8
     IH = 48
     IW = 48
     IC = 16
@@ -111,26 +85,33 @@ def test_convolution_inference():
 
     OH = (IH + 2*PAD - K) + 1
     OW = (IW + 2*PAD - K) + 1
-    dshape = (IC, IH, IW)
+    dshape = (BATCH, IC, IH, IW)
     kshape = (OC, IC, K, K)
     bshape = (OC, )
-    oshape = (OC, OH, OW)
+    oshape = (BATCH, OC, OH, OW)
 
     data = tvm.placeholder(dshape, name='data')
     kernel = tvm.placeholder(kshape, name='kernel')
     bias = tvm.placeholder(bshape, name='bias')
-    output = nnpack.convolution_inference(data, kernel, bias,
-        [PAD, PAD, PAD, PAD], [STRIDE, STRIDE])
-    s = tvm.create_schedule(output.op)
-
-    def verify(target="llvm"):
+    def verify(target="llvm",
+               algorithm=nnpack.ConvolutionAlgorithm.AUTO,
+               with_bias=True):
         if not tvm.module.enabled(target):
             print("skip because %s is not enabled..." % target)
             return
-        if not tvm.get_global_func("tvm.contrib.nnpack.fully_connected_inference", True):
-            print("skip because extern function is not avalable")
+        if not tvm.get_global_func("tvm.contrib.nnpack.convolution_inference", True):
+            print("skip because extern function is not available")
+            return
+        if not nnpack.is_available():
             return
+
         ctx = tvm.cpu(0)
+        output = nnpack.convolution_inference(
+            data, kernel, bias if with_bias else None,
+            [PAD, PAD, PAD, PAD], [STRIDE, STRIDE],
+            algorithm=algorithm)
+        s = tvm.create_schedule(output.op)
+
         f = tvm.build(s, [data, kernel, bias, output], target)
 
         na = np.random.uniform(size=dshape).astype(data.dtype)
@@ -141,19 +122,30 @@ def verify(target="llvm"):
         tc = tvm.nd.array(nc, ctx)
         td = tvm.nd.array(np.zeros(oshape, dtype=output.dtype), ctx)
         f(ta, tb, tc, td)
-        nd = np_conv(np.reshape(na, (1, IC, IH, IW)), nb, PAD, STRIDE)
-        np.testing.assert_allclose(
-            td.asnumpy(), nd.reshape(IC, IH, IW), rtol=1e-5)
-    verify()
-
-def test_convolution_output():
-    BATCH = 32
+        nd = np_conv(np.reshape(na, (BATCH, IC, IH, IW)), nb, PAD, STRIDE) + nc.reshape(1, bshape[0], 1, 1)
+        tvm.testing.assert_allclose(
+            td.asnumpy(), nd.reshape(BATCH, IC, IH, IW), rtol=1e-5)
+    for algorithm in [
+            nnpack.ConvolutionAlgorithm.AUTO,
+            nnpack.ConvolutionAlgorithm.FFT_8x8,
+            nnpack.ConvolutionAlgorithm.FFT_16x16,
+            nnpack.ConvolutionAlgorithm.WT_8x8,
+            nnpack.ConvolutionAlgorithm.IMPLICIT_GEMM,
+            nnpack.ConvolutionAlgorithm.WT_8x8_FP16,
+    ]:
+        for with_bias in [True, False]:
+            verify(algorithm=algorithm, with_bias=with_bias)
+
+
+def test_convolution_inference_without_weight_transform():
+    BATCH = 6
     IH = 48
     IW = 48
     IC = 16
     OC = 16
     K = 3
     PAD = 1
+    STRIDE = 1
 
     OH = (IH + 2*PAD - K) + 1
     OW = (IW + 2*PAD - K) + 1
@@ -165,31 +157,45 @@ def test_convolution_output():
     data = tvm.placeholder(dshape, name='data')
     kernel = tvm.placeholder(kshape, name='kernel')
     bias = tvm.placeholder(bshape, name='bias')
-    output = nnpack.convolution_output(data, kernel, bias, [PAD, PAD, PAD, PAD])
-    s = tvm.create_schedule(output.op)
-
-    def verify(target="llvm"):
+    def verify(target="llvm",
+               algorithm=nnpack.ConvolutionAlgorithm.AUTO,
+               with_bias=True):
         if not tvm.module.enabled(target):
             print("skip because %s is not enabled..." % target)
             return
-        if not tvm.get_global_func("tvm.contrib.nnpack.fully_connected_inference", True):
-            print("skip because extern function is not avalable")
+        if not tvm.get_global_func("tvm.contrib.nnpack.convolution_inference_without_weight_transform", True):
+            print("skip because extern function is not available")
             return
+        if not nnpack.is_available():
+            return
+
         ctx = tvm.cpu(0)
+        transformed_kernel = nnpack.convolution_inference_weight_transform(
+            kernel, algorithm=algorithm)
+        output = nnpack.convolution_inference_without_weight_transform(
+            data, transformed_kernel, bias if with_bias else None,
+            [PAD, PAD, PAD, PAD], [STRIDE, STRIDE],
+            algorithm=algorithm)
+
+        s = tvm.create_schedule(output.op)
+
         f = tvm.build(s, [data, kernel, bias, output], target)
 
         na = np.random.uniform(size=dshape).astype(data.dtype)
         nb = np.random.uniform(size=kshape).astype(kernel.dtype)
-        nc = np.zeros(bshape, dtype=bias.dtype)
+        nc = np.random.uniform(size=bshape).astype(bias.dtype) if with_bias else np.zeros(bshape, dtype=bias.dtype)
         ta = tvm.nd.array(na, ctx)
         tb = tvm.nd.array(nb, ctx)
         tc = tvm.nd.array(nc, ctx)
         td = tvm.nd.array(np.zeros(oshape, dtype=output.dtype), ctx)
         f(ta, tb, tc, td)
-        nd = np_conv(na, nb, PAD)
-        np.testing.assert_allclose(
-            td.asnumpy(), nd, rtol=1e-5)
-    verify()
+        nd = np_conv(np.reshape(na, (BATCH, IC, IH, IW)), nb, PAD, STRIDE) + nc.reshape(1, bshape[0], 1, 1)
+        tvm.testing.assert_allclose(
+            td.asnumpy(), nd.reshape(BATCH, IC, IH, IW), rtol=1e-5)
+    for algorithm in [nnpack.ConvolutionAlgorithm.WT_8x8]:
+        for with_bias in [True, False]:
+            verify(algorithm=algorithm, with_bias=with_bias)
+
 
 if __name__ == "__main__":
     import nose
diff --git a/tests/python/contrib/test_random.py b/tests/python/contrib/test_random.py
index a74273a0ccba..6f846836043e 100644
--- a/tests/python/contrib/test_random.py
+++ b/tests/python/contrib/test_random.py
@@ -13,7 +13,7 @@ def verify(target="llvm"):
             print("skip because %s is not enabled..." % target)
             return
         if not tvm.get_global_func("tvm.contrib.random.randint", True):
-            print("skip because extern function is not avalable")
+            print("skip because extern function is not available")
             return
         ctx = tvm.cpu(0)
         f = tvm.build(s, [A], target)
@@ -37,7 +37,7 @@ def verify(target="llvm"):
             print("skip because %s is not enabled..." % target)
             return
         if not tvm.get_global_func("tvm.contrib.random.uniform", True):
-            print("skip because extern function is not avalable")
+            print("skip because extern function is not available")
             return
         ctx = tvm.cpu(0)
         f = tvm.build(s, [A], target)
@@ -61,7 +61,7 @@ def verify(target="llvm"):
             print("skip because %s is not enabled..." % target)
             return
         if not tvm.get_global_func("tvm.contrib.random.normal", True):
-            print("skip because extern function is not avalable")
+            print("skip because extern function is not available")
             return
         ctx = tvm.cpu(0)
         f = tvm.build(s, [A], target)
diff --git a/tests/python/contrib/test_rocblas.py b/tests/python/contrib/test_rocblas.py
index 46350f4d6625..5f076a3e8963 100644
--- a/tests/python/contrib/test_rocblas.py
+++ b/tests/python/contrib/test_rocblas.py
@@ -16,7 +16,7 @@ def verify(target="rocm"):
             print("skip because %s is not enabled..." % target)
             return
         if not tvm.get_global_func("tvm.contrib.rocblas.matmul", True):
-            print("skip because extern function is not avalable")
+            print("skip because extern function is not available")
             return
         ctx = tvm.rocm(0)
         f = tvm.build(s, [A, B, C], target)
@@ -24,7 +24,7 @@ def verify(target="rocm"):
         b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
         c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
         f(a, b, c)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-5)
     verify()
 
diff --git a/tests/python/contrib/test_sort.py b/tests/python/contrib/test_sort.py
index 3a99779e58f0..f34dad9e41fb 100644
--- a/tests/python/contrib/test_sort.py
+++ b/tests/python/contrib/test_sort.py
@@ -28,7 +28,7 @@ def test_sort():
     b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx)
     c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), ctx)
     f(a, b, c)
-    np.testing.assert_allclose(c.asnumpy(), np.array(sorted_index).astype(out.dtype), rtol=1e-5)
+    tvm.testing.assert_allclose(c.asnumpy(), np.array(sorted_index).astype(out.dtype), rtol=1e-5)
 
 def test_sort_np():
     dshape = (1, 2, 3, 4, 5, 6)
@@ -55,7 +55,7 @@ def test_sort_np():
     b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx)
     c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), ctx)
     f(a, b, c)
-    np.testing.assert_allclose(c.asnumpy(), np_out, rtol=1e-5)
+    tvm.testing.assert_allclose(c.asnumpy(), np_out, rtol=1e-5)
 
 if __name__ == "__main__":
     test_sort()
diff --git a/tests/python/contrib/test_sparse.py b/tests/python/contrib/test_sparse.py
new file mode 100644
index 000000000000..ed46ba2ea74a
--- /dev/null
+++ b/tests/python/contrib/test_sparse.py
@@ -0,0 +1,100 @@
+import tvm
+import tvm.contrib.sparse as tvmsp
+import tvm.ndarray as _nd
+import numpy as np
+from collections import namedtuple
+
+def test_static_tensor():
+    dtype = 'float32'
+    stype = 'csr'
+    target = 'llvm'
+    ctx = tvm.context(target, 0)
+    m = tvm.var('m')
+    n = tvm.var('n')
+    A = tvmsp.placeholder(shape=(m, n), name='A', dtype=dtype)
+    assert(A.stype == 'csr')
+    n = 3
+    a = np.maximum(np.random.uniform(size=(n,n)).astype(dtype)-.6, 0.)
+    a = tvmsp.array(a, ctx)
+    A.data = tvm.placeholder(a.data.shape, dtype, name='A_data')
+    Ab = tvm.decl_buffer(a.data.shape, dtype, name='A_data')
+    binds = {A.data: Ab}
+    C = tvm.compute(A.data.shape, lambda i: A.data[i] * 2., tag='cs_scatter')
+    s = tvm.create_schedule(C.op)
+    f = tvm.build(s, [A.data, C], target, binds=binds)
+    c = tvmsp.array(np.zeros((n,n), dtype), ctx)
+    c.data = tvm.nd.empty(a.data.shape, dtype)
+    c.indices = a.indices
+    c.indptr = a.indptr
+    f(a.data, c.data)
+    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() * 2., rtol=1e-5)
+
+def test_dynamic_tensor():
+    dtype = 'float32'
+    stype = 'csr'
+    target = 'llvm'
+    ctx = tvm.context(target, 0)
+    nr, nc, n = tvm.var('nr'), tvm.var('nc'), tvm.var('n')
+    A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, name='A', dtype=dtype)
+    assert(A.stype == 'csr')
+    C = tvm.compute(A.data.shape, lambda i: A.data[i] * 2., tag='cs_scatter')
+    s = tvm.create_schedule(C.op)
+    _nr, _nc = 3, 5
+    a = np.maximum(np.random.uniform(size=(_nr, _nc)).astype(dtype)-.6, 0.)
+    a = tvmsp.array(a, ctx)
+    assert a.data.dtype == a.dtype
+    Ab = namedtuple('CSRBuffer', ['data', 'indices', 'indptr'])
+    Ab.data = tvm.decl_buffer(a.data.shape, a.data.dtype, name='A_data')
+    Ab.indices = tvm.decl_buffer(a.data.shape, a.data.dtype, name='A_indices')
+    binds = {A.data: Ab.data, A.indices: Ab.indices}
+    f = tvm.build(s, [nr, A.data, C], target, binds=binds)
+    c = tvmsp.array(np.zeros((_nr, _nc), dtype), ctx)
+    c.data = tvm.nd.empty(a.data.shape, dtype)
+    c.indices = a.indices
+    c.indptr = a.indptr
+    f(a.data.shape[0], a.data, c.data)
+    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() * 2., rtol=1e-5)
+
+def test_sparse_array_tuple():
+    dtype, itype = 'float32', 'int32'
+    stype = 'csr'
+    target = 'llvm'
+    ctx = tvm.context(target, 0)
+    nr, nc, n = tvm.var('nr'), tvm.var('nc'), tvm.var('n')
+    A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, name='A', dtype=dtype)
+    assert(A.stype == 'csr')
+    C = tvm.compute(A.data.shape, lambda i: A.data[i] * 2., tag='cs_scatter')
+    s = tvm.create_schedule(C.op)
+    _nr, _nc = 3, 5
+    a = np.maximum(np.random.uniform(size=(_nr, _nc)).astype(dtype)-.6, 0.)
+    # convert to sparse array tuple
+    source_array = a
+    ridx, cidx = np.nonzero(source_array)
+    data = source_array[ridx, cidx]
+    a_data = _nd.array(data, ctx)
+    indices = np.nonzero(source_array)[1].astype(itype)
+    a_indices = _nd.array(indices, ctx)
+    indptr = [0]+np.apply_along_axis(np.count_nonzero, axis=1, arr=source_array).tolist()
+    indptr = np.cumsum(np.array(indptr, itype)).astype(itype)
+    a_indptr = _nd.array(indptr, ctx)
+    a_init = (a_data, a_indices, a_indptr)
+    # construct tvm sparse array with tuple
+    a = tvmsp.array(a_init, shape=source_array.shape, ctx=ctx)
+    assert a.data.dtype == a.dtype
+    Ab = namedtuple('CSRBuffer', ['data', 'indices', 'indptr'])
+    Ab.data = tvm.decl_buffer(a.data.shape, a.data.dtype, name='A_data')
+    Ab.indices = tvm.decl_buffer(a.data.shape, a.data.dtype, name='A_indices')
+    binds = {A.data: Ab.data, A.indices: Ab.indices}
+    f = tvm.build(s, [nr, A.data, C], target, binds=binds)
+    c = tvmsp.array(np.zeros((_nr, _nc), dtype), ctx)
+    c.data = tvm.nd.empty(a.data.shape, dtype)
+    c.indices = a.indices
+    c.indptr = a.indptr
+    f(a.data.shape[0], a.data, c.data)
+    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() * 2., rtol=1e-5)
+
+if __name__ == "__main__":
+    test_static_tensor()
+    test_dynamic_tensor()
+    test_sparse_array_tuple()
+
diff --git a/tests/python/frontend/mxnet/model_zoo/__init__.py b/tests/python/frontend/mxnet/model_zoo/__init__.py
new file mode 100644
index 000000000000..eba8f8df0bba
--- /dev/null
+++ b/tests/python/frontend/mxnet/model_zoo/__init__.py
@@ -0,0 +1,59 @@
+"""MXNet model zoo for testing purposes."""
+from __future__ import absolute_import
+from . import mlp, vgg, resnet, dqn, inception_v3, squeezenet, dcgan
+import tvm.relay.testing
+
+# mlp
+def mx_mlp():
+    num_class = 10
+    return mlp.get_symbol(num_class)
+
+def relay_mlp():
+    num_class = 10
+    return tvm.relay.testing.mlp.get_workload(1, num_class)[0]
+
+# vgg
+def mx_vgg(num_layers):
+    num_class = 1000
+    return vgg.get_symbol(num_class, num_layers)
+
+def relay_vgg(num_layers):
+    num_class = 1000
+    return tvm.relay.testing.vgg.get_workload(
+        1, num_class, num_layers=num_layers)[0]
+
+# resnet
+def mx_resnet(num_layers):
+    num_class = 1000
+    return resnet.get_symbol(num_class, num_layers, '3,224,224')
+
+def relay_resnet(num_layers):
+    num_class = 1000
+    return tvm.relay.testing.resnet.get_workload(
+        1, num_class, num_layers=num_layers)[0]
+
+
+# dqn
+mx_dqn = dqn.get_symbol
+
+def relay_dqn():
+    return tvm.relay.testing.dqn.get_workload(1)[0]
+
+# squeezenet
+def mx_squeezenet(version):
+    return squeezenet.get_symbol(version=version)
+
+def relay_squeezenet(version):
+    return tvm.relay.testing.squeezenet.get_workload(1, version=version)[0]
+
+# inception
+mx_inception_v3 = inception_v3.get_symbol
+
+def relay_inception_v3():
+    return tvm.relay.testing.inception_v3.get_workload(1)[0]
+
+# dcgan generator
+mx_dcgan = dcgan.get_symbol
+
+def relay_dcgan(batch_size):
+    return tvm.relay.testing.dcgan.get_workload(batch_size=batch_size)[0]
diff --git a/tests/python/frontend/mxnet/model_zoo/dcgan.py b/tests/python/frontend/mxnet/model_zoo/dcgan.py
new file mode 100644
index 000000000000..8af030b6b184
--- /dev/null
+++ b/tests/python/frontend/mxnet/model_zoo/dcgan.py
@@ -0,0 +1,66 @@
+# pylint: disable=unused-argument
+"""
+The MXNet symbol of DCGAN generator
+
+Adopted from:
+https://github.com/tqchen/mxnet-gan/blob/master/mxgan/generator.py
+
+Reference:
+Radford, Alec, Luke Metz, and Soumith Chintala.
+"Unsupervised representation learning with deep convolutional generative adversarial networks."
+arXiv preprint arXiv:1511.06434 (2015).
+"""
+
+import mxnet as mx
+
+def deconv2d(data, ishape, oshape, kshape, name, stride=(2, 2)):
+    """a deconv layer that enlarges the feature map"""
+    target_shape = (oshape[-2], oshape[-1])
+    pad_y = (kshape[0] - 1) // 2
+    pad_x = (kshape[1] - 1) // 2
+    adj_y = (target_shape[0] + 2 * pad_y - kshape[0]) % stride[0]
+    adj_x = (target_shape[1] + 2 * pad_x - kshape[1]) % stride[1]
+
+    net = mx.sym.Deconvolution(data,
+                               kernel=kshape,
+                               stride=stride,
+                               pad=(pad_y, pad_x),
+                               adj=(adj_y, adj_x),
+                               num_filter=oshape[0],
+                               no_bias=True,
+                               name=name)
+    return net
+
+def deconv2d_bn_relu(data, prefix, **kwargs):
+    """a block of deconv + batch norm + relu"""
+    eps = 1e-5 + 1e-12
+
+    net = deconv2d(data, name="%s_deconv" % prefix, **kwargs)
+    net = mx.sym.BatchNorm(net, eps=eps, name="%s_bn" % prefix)
+    net = mx.sym.Activation(net, name="%s_act" % prefix, act_type='relu')
+    return net
+
+def get_symbol(oshape=(3, 64, 64), ngf=128, code=None):
+    """get symbol of dcgan generator"""
+    assert oshape[-1] == 64, "Only support 64x64 image"
+    assert oshape[-2] == 64, "Only support 64x64 image"
+
+    code = mx.sym.Variable("data") if code is None else code
+    net = mx.sym.FullyConnected(code, name="g1", num_hidden=ngf*8*4*4, no_bias=True, flatten=False)
+    net = mx.sym.Activation(net, act_type='relu')
+    # 4 x 4
+    net = mx.sym.reshape(net, shape=(-1, ngf * 8, 4, 4))
+    # 8 x 8
+    net = deconv2d_bn_relu(
+        net, ishape=(ngf * 8, 4, 4), oshape=(ngf * 4, 8, 8), kshape=(4, 4), prefix="g2")
+    # 16x16
+    net = deconv2d_bn_relu(
+        net, ishape=(ngf * 4, 8, 8), oshape=(ngf * 2, 16, 16), kshape=(4, 4), prefix="g3")
+    # 32x32
+    net = deconv2d_bn_relu(
+        net, ishape=(ngf * 2, 16, 16), oshape=(ngf, 32, 32), kshape=(4, 4), prefix="g4")
+    # 64x64
+    net = deconv2d(
+        net, ishape=(ngf, 32, 32), oshape=oshape[-3:], kshape=(4, 4), name="g5_deconv")
+    net = mx.sym.Activation(net, act_type='tanh')
+    return net
diff --git a/tests/python/frontend/mxnet/model_zoo/dqn.py b/tests/python/frontend/mxnet/model_zoo/dqn.py
new file mode 100644
index 000000000000..e037511efdf2
--- /dev/null
+++ b/tests/python/frontend/mxnet/model_zoo/dqn.py
@@ -0,0 +1,27 @@
+"""
+The mxnet symbol of Nature DQN
+
+Reference:
+Mnih, Volodymyr, et al.
+"Human-level control through deep reinforcement learning."
+Nature 518.7540 (2015): 529.
+"""
+
+import mxnet as mx
+
+def get_symbol(num_action=18):
+    data = mx.sym.Variable(name='data')
+    net = mx.sym.Convolution(data, kernel=(8, 8), stride=(4, 4),
+                             num_filter=32, name='conv1')
+    net = mx.sym.Activation(net, act_type='relu', name='relu1')
+    net = mx.sym.Convolution(net, kernel=(4, 4), stride=(2, 2),
+                             num_filter=64, name='conv2')
+    net = mx.sym.Activation(net, act_type='relu', name='relu2')
+    net = mx.sym.Convolution(net, kernel=(3, 3), stride=(1, 1),
+                             num_filter=64, name='conv3')
+    net = mx.sym.Activation(net, act_type='relu', name='relu3')
+    net = mx.sym.FullyConnected(net, num_hidden=512, name='fc4')
+    net = mx.sym.Activation(net, act_type='relu', name='relu4')
+    net = mx.sym.FullyConnected(net, num_hidden=num_action, name='fc5', flatten=False)
+
+    return net
diff --git a/tests/python/frontend/mxnet/model_zoo/inception_v3.py b/tests/python/frontend/mxnet/model_zoo/inception_v3.py
new file mode 100644
index 000000000000..b8585bf05037
--- /dev/null
+++ b/tests/python/frontend/mxnet/model_zoo/inception_v3.py
@@ -0,0 +1,170 @@
+"""
+Inception V3, suitable for images with around 299 x 299
+
+Reference:
+Szegedy, Christian, et al. "Rethinking the Inception Architecture for Computer Vision." arXiv preprint arXiv:1512.00567 (2015).
+
+Adopted from https://github.com/apache/incubator-mxnet/blob/
+             master/example/image-classification/symbols/inception-v3.py
+"""
+import mxnet as mx
+import numpy as np
+
+def Conv(data, num_filter, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name=None, suffix=''):
+    conv = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, no_bias=True, name='%s%s_conv2d' %(name, suffix))
+    bn = mx.sym.BatchNorm(data=conv, eps=2e-5, name='%s%s_batchnorm' % (name, suffix))
+    act = mx.sym.Activation(data=bn, act_type='relu', name='%s%s_relu' %(name, suffix))
+    return act
+
+
+def Inception7A(data,
+                num_1x1,
+                num_3x3_red, num_3x3_1, num_3x3_2,
+                num_5x5_red, num_5x5,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data, num_1x1, name=('%s_conv' % name))
+    tower_5x5 = Conv(data, num_5x5_red, name=('%s_tower' % name), suffix='_conv')
+    tower_5x5 = Conv(tower_5x5, num_5x5, kernel=(5, 5), pad=(2, 2), name=('%s_tower' % name), suffix='_conv_1')
+    tower_3x3 = Conv(data, num_3x3_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_3x3 = Conv(tower_3x3, num_3x3_1, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_3x3 = Conv(tower_3x3, num_3x3_2, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_2')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(pooling, proj, name=('%s_tower_2' %  name), suffix='_conv')
+    concat = mx.sym.Concat(*[tower_1x1, tower_5x5, tower_3x3, cproj], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+# First Downsample
+def Inception7B(data,
+                num_3x3,
+                num_d3x3_red, num_d3x3_1, num_d3x3_2,
+                pool,
+                name):
+    tower_3x3 = Conv(data, num_3x3, kernel=(3, 3), pad=(0, 0), stride=(2, 2), name=('%s_conv' % name))
+    tower_d3x3 = Conv(data, num_d3x3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d3x3 = Conv(tower_d3x3, num_d3x3_1, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name=('%s_tower' % name), suffix='_conv_1')
+    tower_d3x3 = Conv(tower_d3x3, num_d3x3_2, kernel=(3, 3), pad=(0, 0), stride=(2, 2), name=('%s_tower' % name), suffix='_conv_2')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(0,0), pool_type="max", name=('max_pool_%s_pool' % name))
+    concat = mx.sym.Concat(*[tower_3x3, tower_d3x3, pooling], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def Inception7C(data,
+                num_1x1,
+                num_d7_red, num_d7_1, num_d7_2,
+                num_q7_red, num_q7_1, num_q7_2, num_q7_3, num_q7_4,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name))
+    tower_d7 = Conv(data=data, num_filter=num_d7_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d7 = Conv(data=tower_d7, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3), name=('%s_tower' % name), suffix='_conv_1')
+    tower_d7 = Conv(data=tower_d7, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0), name=('%s_tower' % name), suffix='_conv_2')
+    tower_q7 = Conv(data=data, num_filter=num_q7_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_1, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_2, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_2')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_3, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_3')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_4, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_4')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' %  name), suffix='_conv')
+    # concat
+    concat = mx.sym.Concat(*[tower_1x1, tower_d7, tower_q7, cproj], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def Inception7D(data,
+                num_3x3_red, num_3x3,
+                num_d7_3x3_red, num_d7_1, num_d7_2, num_d7_3x3,
+                pool,
+                name):
+    tower_3x3 = Conv(data=data, num_filter=num_3x3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_3x3 = Conv(data=tower_3x3, num_filter=num_3x3, kernel=(3, 3), pad=(0,0), stride=(2, 2), name=('%s_tower' % name), suffix='_conv_1')
+    tower_d7_3x3 = Conv(data=data, num_filter=num_d7_3x3_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_2')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_3x3, kernel=(3, 3), stride=(2, 2), name=('%s_tower_1' % name), suffix='_conv_3')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
+    # concat
+    concat = mx.sym.Concat(*[tower_3x3, tower_d7_3x3, pooling], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def Inception7E(data,
+                num_1x1,
+                num_d3_red, num_d3_1, num_d3_2,
+                num_3x3_d3_red, num_3x3, num_3x3_d3_1, num_3x3_d3_2,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name))
+    tower_d3 = Conv(data=data, num_filter=num_d3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d3_a = Conv(data=tower_d3, num_filter=num_d3_1, kernel=(1, 3), pad=(0, 1), name=('%s_tower' % name), suffix='_mixed_conv')
+    tower_d3_b = Conv(data=tower_d3, num_filter=num_d3_2, kernel=(3, 1), pad=(1, 0), name=('%s_tower' % name), suffix='_mixed_conv_1')
+    tower_3x3_d3 = Conv(data=data, num_filter=num_3x3_d3_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_3x3_d3 = Conv(data=tower_3x3_d3, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_3x3_d3_a = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_1, kernel=(1, 3), pad=(0, 1), name=('%s_tower_1' % name), suffix='_mixed_conv')
+    tower_3x3_d3_b = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_2, kernel=(3, 1), pad=(1, 0), name=('%s_tower_1' % name), suffix='_mixed_conv_1')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' %  name), suffix='_conv')
+    # concat
+    concat = mx.sym.Concat(*[tower_1x1, tower_d3_a, tower_d3_b, tower_3x3_d3_a, tower_3x3_d3_b, cproj], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def get_symbol(num_classes=1000, **kwargs):
+    data = mx.sym.Variable(name="data")
+    # stage 1
+    conv = Conv(data, 32, kernel=(3, 3), stride=(2, 2), name="conv")
+    conv_1 = Conv(conv, 32, kernel=(3, 3), name="conv_1")
+    conv_2 = Conv(conv_1, 64, kernel=(3, 3), pad=(1, 1), name="conv_2")
+    pool = mx.sym.Pooling(data=conv_2, kernel=(3, 3), stride=(2, 2), pool_type="max", name="pool")
+    # stage 2
+    conv_3 = Conv(pool, 80, kernel=(1, 1), name="conv_3")
+    conv_4 = Conv(conv_3, 192, kernel=(3, 3), name="conv_4")
+    pool1 = mx.sym.Pooling(data=conv_4, kernel=(3, 3), stride=(2, 2), pool_type="max", name="pool1")
+
+    # # stage 3
+    in3a = Inception7A(pool1, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 32, "mixed")
+    in3b = Inception7A(in3a, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 64, "mixed_1")
+    in3c = Inception7A(in3b, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 64, "mixed_2")
+    in3d = Inception7B(in3c, 384,
+                       64, 96, 96,
+                       "max", "mixed_3")
+    # stage 4
+    in4a = Inception7C(in3d, 192,
+                       128, 128, 192,
+                       128, 128, 128, 128, 192,
+                       "avg", 192, "mixed_4")
+    in4b = Inception7C(in4a, 192,
+                       160, 160, 192,
+                       160, 160, 160, 160, 192,
+                       "avg", 192, "mixed_5")
+    in4c = Inception7C(in4b, 192,
+                       160, 160, 192,
+                       160, 160, 160, 160, 192,
+                       "avg", 192, "mixed_6")
+    in4d = Inception7C(in4c, 192,
+                       192, 192, 192,
+                       192, 192, 192, 192, 192,
+                       "avg", 192, "mixed_7")
+    in4e = Inception7D(in4d, 192, 320,
+                       192, 192, 192, 192,
+                       "max", "mixed_8")
+    # stage 5
+    in5a = Inception7E(in4e, 320,
+                       384, 384, 384,
+                       448, 384, 384, 384,
+                       "avg", 192, "mixed_9")
+    in5b = Inception7E(in5a, 320,
+                       384, 384, 384,
+                       448, 384, 384, 384,
+                       "max", 192, "mixed_10")
+    # pool
+    pool = mx.sym.Pooling(data=in5b, kernel=(8, 8), stride=(1, 1), pool_type="avg", name="global_pool")
+    flatten = mx.sym.Flatten(data=pool, name="flatten")
+    fc1 = mx.sym.FullyConnected(data=flatten, num_hidden=num_classes, name='fc1', flatten=False)
+    softmax = mx.sym.SoftmaxOutput(data=fc1, name='softmax')
+    return softmax
diff --git a/tests/python/frontend/mxnet/model_zoo/mlp.py b/tests/python/frontend/mxnet/model_zoo/mlp.py
new file mode 100644
index 000000000000..922b208749bf
--- /dev/null
+++ b/tests/python/frontend/mxnet/model_zoo/mlp.py
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+a simple multilayer perceptron
+"""
+import mxnet as mx
+
+def get_symbol(num_classes=10, **kwargs):
+    data = mx.symbol.Variable('data')
+    data = mx.sym.Flatten(data=data)
+    try:
+        fc1  = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128, flatten=False)
+        act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu")
+        fc2  = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64, flatten=False)
+        act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu")
+        fc3  = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=num_classes, flatten=False)
+        mlp  = mx.symbol.softmax(data = fc3, name = 'softmax')
+    except:
+        fc1  = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128)
+        act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu")
+        fc2  = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64)
+        act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu")
+        fc3  = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=num_classes)
+        mlp  = mx.symbol.softmax(data = fc3, name = 'softmax')
+    return mlp
diff --git a/tests/python/frontend/mxnet/model_zoo/resnet.py b/tests/python/frontend/mxnet/model_zoo/resnet.py
new file mode 100644
index 000000000000..3f9a870d31c0
--- /dev/null
+++ b/tests/python/frontend/mxnet/model_zoo/resnet.py
@@ -0,0 +1,199 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+'''
+Adapted from https://github.com/tornadomeet/ResNet/blob/master/symbol_resnet.py
+Original author Wei Wu
+
+Implemented the following paper:
+
+Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. "Identity Mappings in Deep Residual Networks"
+'''
+import mxnet as mx
+import numpy as np
+
+def residual_unit(data, num_filter, stride, dim_match, name, bottle_neck=True, bn_mom=0.9, workspace=256, memonger=False):
+    """Return ResNet Unit symbol for building ResNet
+    Parameters
+    ----------
+    data : str
+        Input data
+    num_filter : int
+        Number of output channels
+    bnf : int
+        Bottle neck channels factor with regard to num_filter
+    stride : tuple
+        Stride used in convolution
+    dim_match : Boolean
+        True means channel number between input and output is the same, otherwise means differ
+    name : str
+        Base name of the operators
+    workspace : int
+        Workspace used in convolution operator
+    """
+    if bottle_neck:
+        bn1 = mx.sym.BatchNorm(data=data, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn1')
+        act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1')
+        conv1 = mx.sym.Convolution(data=act1, num_filter=int(num_filter*0.25), kernel=(1,1), stride=stride, pad=(0,0),
+                                   no_bias=True, workspace=workspace, name=name + '_conv1')
+        bn2 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn2')
+        act2 = mx.sym.Activation(data=bn2, act_type='relu', name=name + '_relu2')
+        conv2 = mx.sym.Convolution(data=act2, num_filter=int(num_filter*0.25), kernel=(3,3), stride=(1,1), pad=(1,1),
+                                   no_bias=True, workspace=workspace, name=name + '_conv2')
+        bn3 = mx.sym.BatchNorm(data=conv2, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn3')
+        act3 = mx.sym.Activation(data=bn3, act_type='relu', name=name + '_relu3')
+        conv3 = mx.sym.Convolution(data=act3, num_filter=num_filter, kernel=(1,1), stride=(1,1), pad=(0,0), no_bias=True,
+                                   workspace=workspace, name=name + '_conv3')
+        if dim_match:
+            shortcut = data
+        else:
+            shortcut = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True,
+                                            workspace=workspace, name=name+'_sc')
+        if memonger:
+            shortcut._set_attr(mirror_stage='True')
+        return conv3 + shortcut
+    else:
+        bn1 = mx.sym.BatchNorm(data=data, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn1')
+        act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1')
+        conv1 = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(3,3), stride=stride, pad=(1,1),
+                                      no_bias=True, workspace=workspace, name=name + '_conv1')
+        bn2 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn2')
+        act2 = mx.sym.Activation(data=bn2, act_type='relu', name=name + '_relu2')
+        conv2 = mx.sym.Convolution(data=act2, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1,1),
+                                      no_bias=True, workspace=workspace, name=name + '_conv2')
+        if dim_match:
+            shortcut = data
+        else:
+            shortcut = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True,
+                                            workspace=workspace, name=name+'_sc')
+        if memonger:
+            shortcut._set_attr(mirror_stage='True')
+        return conv2 + shortcut
+
+def resnet(units, num_stages, filter_list, num_classes, image_shape, bottle_neck=True, bn_mom=0.9, workspace=256, dtype='float32', memonger=False):
+    """Return ResNet symbol of
+    Parameters
+    ----------
+    units : list
+        Number of units in each stage
+    num_stages : int
+        Number of stage
+    filter_list : list
+        Channel size of each stage
+    num_classes : int
+        Ouput size of symbol
+    dataset : str
+        Dataset type, only cifar10 and imagenet supports
+    workspace : int
+        Workspace used in convolution operator
+    dtype : str
+        Precision (float32 or float16)
+    """
+    num_unit = len(units)
+    assert(num_unit == num_stages)
+    data = mx.sym.Variable(name='data')
+    if dtype == 'float32':
+        # data = mx.sym.identity(data=data, name='id')
+        data = data
+    else:
+        if dtype == 'float16':
+            data = mx.sym.Cast(data=data, dtype=np.float16)
+    data = mx.sym.BatchNorm(data=data, fix_gamma=True, eps=2e-5, momentum=bn_mom, name='bn_data')
+    (nchannel, height, width) = image_shape
+    if height <= 32:            # such as cifar10
+        body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(3, 3), stride=(1,1), pad=(1, 1),
+                                  no_bias=True, name="conv0", workspace=workspace)
+    else:                       # often expected to be 224 such as imagenet
+        body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(7, 7), stride=(2,2), pad=(3, 3),
+                                  no_bias=True, name="conv0", workspace=workspace)
+        body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn0')
+        body = mx.sym.Activation(data=body, act_type='relu', name='relu0')
+        body = mx.sym.Pooling(data=body, kernel=(3, 3), stride=(2,2), pad=(1,1), pool_type='max')
+
+    for i in range(num_stages):
+        body = residual_unit(body, filter_list[i+1], (1 if i==0 else 2, 1 if i==0 else 2), False,
+                             name='stage%d_unit%d' % (i + 1, 1), bottle_neck=bottle_neck, workspace=workspace,
+                             memonger=memonger)
+        for j in range(units[i]-1):
+            body = residual_unit(body, filter_list[i+1], (1,1), True, name='stage%d_unit%d' % (i + 1, j + 2),
+                                 bottle_neck=bottle_neck, workspace=workspace, memonger=memonger)
+    bn1 = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn1')
+    relu1 = mx.sym.Activation(data=bn1, act_type='relu', name='relu1')
+    # Although kernel is not used here when global_pool=True, we should put one
+    pool1 = mx.sym.Pooling(data=relu1, global_pool=True, kernel=(7, 7), pool_type='avg', name='pool1')
+    flat = mx.sym.Flatten(data=pool1)
+    try:
+        fc1 = mx.sym.FullyConnected(data=flat, num_hidden=num_classes, name='fc1', flatten=False)
+    except:
+        fc1 = mx.sym.FullyConnected(data=flat, num_hidden=num_classes, name='fc1')
+    if dtype == 'float16':
+        fc1 = mx.sym.Cast(data=fc1, dtype=np.float32)
+    return mx.sym.softmax(data=fc1, name='softmax')
+
+def get_symbol(num_classes, num_layers, image_shape, conv_workspace=256, dtype='float32', **kwargs):
+    """
+    Adapted from https://github.com/tornadomeet/ResNet/blob/master/train_resnet.py
+    Original author Wei Wu
+    """
+    image_shape = [int(l) for l in image_shape.split(',')]
+    (nchannel, height, width) = image_shape
+    if height <= 28:
+        num_stages = 3
+        if (num_layers-2) % 9 == 0 and num_layers >= 164:
+            per_unit = [(num_layers-2)//9]
+            filter_list = [16, 64, 128, 256]
+            bottle_neck = True
+        elif (num_layers-2) % 6 == 0 and num_layers < 164:
+            per_unit = [(num_layers-2)//6]
+            filter_list = [16, 16, 32, 64]
+            bottle_neck = False
+        else:
+            raise ValueError("no experiments done on num_layers {}, you can do it yourself".format(num_layers))
+        units = per_unit * num_stages
+    else:
+        if num_layers >= 50:
+            filter_list = [64, 256, 512, 1024, 2048]
+            bottle_neck = True
+        else:
+            filter_list = [64, 64, 128, 256, 512]
+            bottle_neck = False
+        num_stages = 4
+        if num_layers == 18:
+            units = [2, 2, 2, 2]
+        elif num_layers == 34:
+            units = [3, 4, 6, 3]
+        elif num_layers == 50:
+            units = [3, 4, 6, 3]
+        elif num_layers == 101:
+            units = [3, 4, 23, 3]
+        elif num_layers == 152:
+            units = [3, 8, 36, 3]
+        elif num_layers == 200:
+            units = [3, 24, 36, 3]
+        elif num_layers == 269:
+            units = [3, 30, 48, 8]
+        else:
+            raise ValueError("no experiments done on num_layers {}, you can do it yourself".format(num_layers))
+
+    return resnet(units       = units,
+                  num_stages  = num_stages,
+                  filter_list = filter_list,
+                  num_classes = num_classes,
+                  image_shape = image_shape,
+                  bottle_neck = bottle_neck,
+                  workspace   = conv_workspace,
+                  dtype       = dtype)
diff --git a/tests/python/frontend/mxnet/model_zoo/squeezenet.py b/tests/python/frontend/mxnet/model_zoo/squeezenet.py
new file mode 100644
index 000000000000..deb896a21385
--- /dev/null
+++ b/tests/python/frontend/mxnet/model_zoo/squeezenet.py
@@ -0,0 +1,76 @@
+"""
+Symbol of SqueezeNet
+
+Reference:
+Iandola, Forrest N., et al.
+"Squeezenet: Alexnet-level accuracy with 50x fewer parameters and< 0.5 mb model size." (2016).
+"""
+
+import mxnet as mx
+
+# Helpers
+def _make_fire(net, squeeze_channels, expand1x1_channels, expand3x3_channels):
+    net = _make_fire_conv(net, squeeze_channels, 1, 0)
+
+    left = _make_fire_conv(net, expand1x1_channels, 1, 0)
+    right = _make_fire_conv(net, expand3x3_channels, 3, 1)
+    # NOTE : Assume NCHW layout here
+    net = mx.sym.concat(left, right, dim=1)
+
+    return net
+
+def _make_fire_conv(net, channels, kernel_size, padding=0):
+    net = mx.sym.Convolution(net, num_filter=channels, kernel=(kernel_size, kernel_size),
+                             pad=(padding, padding))
+    net = mx.sym.Activation(net, act_type='relu')
+    return net
+
+# Net
+def get_symbol(num_classes=1000, version='1.0', **kwargs):
+    """Get symbol of SqueezeNet
+
+    Parameters
+    ----------
+    num_classes: int
+        The number of classification results
+
+    version : str, optional
+        "1.0" or "1.1" of SqueezeNet
+    """
+    assert version in ['1.0', '1.1'], ("Unsupported SqueezeNet version {version}:"
+                                       "1.0 or 1.1 expected".format(version=version))
+    net = mx.sym.Variable("data")
+    if version == '1.0':
+        net = mx.sym.Convolution(net, num_filter=96, kernel=(7, 7), stride=(2, 2), pad=(3, 3))
+        net = mx.sym.Activation(net, act_type='relu')
+        net = mx.sym.Pooling(data=net, kernel=(3, 3), pool_type='max', stride=(2, 2))
+        net = _make_fire(net, 16, 64, 64)
+        net = _make_fire(net, 16, 64, 64)
+        net = _make_fire(net, 32, 128, 128)
+        net = mx.sym.Pooling(data=net, kernel=(3, 3), pool_type='max', stride=(2, 2))
+        net = _make_fire(net, 32, 128, 128)
+        net = _make_fire(net, 48, 192, 192)
+        net = _make_fire(net, 48, 192, 192)
+        net = _make_fire(net, 64, 256, 256)
+        net = mx.sym.Pooling(data=net, kernel=(3, 3), pool_type='max', stride=(2, 2))
+        net = _make_fire(net, 64, 256, 256)
+    else:
+        net = mx.sym.Convolution(net, num_filter=64, kernel=(3, 3), stride=(2, 2), pad=(1, 1))
+        net = mx.sym.Activation(net, act_type='relu')
+        net = mx.sym.Pooling(data=net, kernel=(3, 3), pool_type='max', stride=(2, 2))
+        net = _make_fire(net, 16, 64, 64)
+        net = _make_fire(net, 16, 64, 64)
+        net = mx.sym.Pooling(data=net, kernel=(3, 3), pool_type='max',  stride=(2, 2))
+        net = _make_fire(net, 32, 128, 128)
+        net = _make_fire(net, 32, 128, 128)
+        net = mx.sym.Pooling(data=net, kernel=(3, 3), pool_type='max',  stride=(2, 2))
+        net = _make_fire(net, 48, 192, 192)
+        net = _make_fire(net, 48, 192, 192)
+        net = _make_fire(net, 64, 256, 256)
+        net = _make_fire(net, 64, 256, 256)
+    net = mx.sym.Dropout(net, p=0.5)
+    net = mx.sym.Convolution(net, num_filter=num_classes, kernel=(1, 1))
+    net = mx.sym.Activation(net, act_type='relu')
+    net = mx.sym.Pooling(data=net, global_pool=True, kernel=(13, 13), pool_type='avg')
+    net = mx.sym.flatten(net)
+    return mx.sym.softmax(net)
diff --git a/tests/python/frontend/mxnet/model_zoo/vgg.py b/tests/python/frontend/mxnet/model_zoo/vgg.py
new file mode 100644
index 000000000000..68215bb80aaa
--- /dev/null
+++ b/tests/python/frontend/mxnet/model_zoo/vgg.py
@@ -0,0 +1,85 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""References:
+
+Simonyan, Karen, and Andrew Zisserman. "Very deep convolutional networks for
+large-scale image recognition." arXiv preprint arXiv:1409.1556 (2014).
+"""
+
+import mxnet as mx
+import numpy as np
+
+def get_feature(internel_layer, layers, filters, batch_norm = False, **kwargs):
+    for i, num in enumerate(layers):
+        for j in range(num):
+            internel_layer = mx.sym.Convolution(data = internel_layer, kernel=(3, 3), pad=(1, 1), num_filter=filters[i], name="conv%s_%s" %(i + 1, j + 1))
+            if batch_norm:
+                internel_layer = mx.symbol.BatchNorm(data=internel_layer, name="bn%s_%s" %(i + 1, j + 1))
+            internel_layer = mx.sym.Activation(data=internel_layer, act_type="relu", name="relu%s_%s" %(i + 1, j + 1))
+        internel_layer = mx.sym.Pooling(data=internel_layer, pool_type="max", kernel=(2, 2), stride=(2,2), name="pool%s" %(i + 1))
+    return internel_layer
+
+def get_classifier(input_data, num_classes, **kwargs):
+    flatten = mx.sym.Flatten(data=input_data, name="flatten")
+    try:
+        fc6 = mx.sym.FullyConnected(data=flatten, num_hidden=4096, name="fc6", flatten=False)
+        relu6 = mx.sym.Activation(data=fc6, act_type="relu", name="relu6")
+        drop6 = mx.sym.Dropout(data=relu6, p=0.5, name="drop6")
+        fc7 = mx.sym.FullyConnected(data=drop6, num_hidden=4096, name="fc7", flatten=False)
+        relu7 = mx.sym.Activation(data=fc7, act_type="relu", name="relu7")
+        drop7 = mx.sym.Dropout(data=relu7, p=0.5, name="drop7")
+        fc8 = mx.sym.FullyConnected(data=drop7, num_hidden=num_classes, name="fc8", flatten=False)
+    except:
+        fc6 = mx.sym.FullyConnected(data=flatten, num_hidden=4096, name="fc6")
+        relu6 = mx.sym.Activation(data=fc6, act_type="relu", name="relu6")
+        drop6 = mx.sym.Dropout(data=relu6, p=0.5, name="drop6")
+        fc7 = mx.sym.FullyConnected(data=drop6, num_hidden=4096, name="fc7")
+        relu7 = mx.sym.Activation(data=fc7, act_type="relu", name="relu7")
+        drop7 = mx.sym.Dropout(data=relu7, p=0.5, name="drop7")
+        fc8 = mx.sym.FullyConnected(data=drop7, num_hidden=num_classes, name="fc8")
+    return fc8
+
+def get_symbol(num_classes, num_layers=11, batch_norm=False, dtype='float32', **kwargs):
+    """
+    Parameters
+    ----------
+    num_classes : int, default 1000
+        Number of classification classes.
+    num_layers : int
+        Number of layers for the variant of densenet. Options are 11, 13, 16, 19.
+    batch_norm : bool, default False
+        Use batch normalization.
+    dtype: str, float32 or float16
+        Data precision.
+    """
+    vgg_spec = {11: ([1, 1, 2, 2, 2], [64, 128, 256, 512, 512]),
+                13: ([2, 2, 2, 2, 2], [64, 128, 256, 512, 512]),
+                16: ([2, 2, 3, 3, 3], [64, 128, 256, 512, 512]),
+                19: ([2, 2, 4, 4, 4], [64, 128, 256, 512, 512])}
+    if num_layers not in vgg_spec:
+        raise ValueError("Invalide num_layers {}. Possible choices are 11,13,16,19.".format(num_layers))
+    layers, filters = vgg_spec[num_layers]
+    data = mx.sym.Variable(name="data")
+    if dtype == 'float16':
+        data = mx.sym.Cast(data=data, dtype=np.float16)
+    feature = get_feature(data, layers, filters, batch_norm)
+    classifier = get_classifier(feature, num_classes)
+    if dtype == 'float16':
+        classifier = mx.sym.Cast(data=classifier, dtype=np.float32)
+    symbol = mx.sym.softmax(data=classifier, name='softmax')
+    return symbol
diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py
new file mode 100644
index 000000000000..81a12b041ed7
--- /dev/null
+++ b/tests/python/frontend/mxnet/test_forward.py
@@ -0,0 +1,214 @@
+import numpy as np
+
+import tvm
+from tvm.contrib import graph_runtime
+from tvm.relay.testing.config import ctx_list
+from tvm import relay
+import mxnet as mx
+
+from mxnet import gluon
+from mxnet.gluon.model_zoo import vision
+import model_zoo
+
+
+def verify_mxnet_frontend_impl(mx_symbol,
+                               data_shape=(1, 3, 224, 224),
+                               out_shape=(1, 1000),
+                               gluon_impl=False,
+                               name=None,
+                               dtype='float32'):
+    """Use name different from test to avoid let nose pick it up"""
+    if gluon_impl:
+        def get_gluon_output(name, x):
+            net = vision.get_model(name)
+            net.collect_params().initialize(mx.init.Xavier())
+            net_sym = gluon.nn.SymbolBlock(outputs=net(mx.sym.var('data')),
+                                           inputs=mx.sym.var('data'),
+                                           params=net.collect_params())
+            out = net_sym(mx.nd.array(x.astype(dtype))).asnumpy()
+            return out, net_sym
+    else:
+        def get_mxnet_output(symbol, x, dtype='float32'):
+            from collections import namedtuple
+            Batch = namedtuple('Batch', ['data'])
+            mod = mx.mod.Module(symbol, label_names=None)
+            mod.bind(data_shapes=[('data', x.shape)], for_training=False)
+            mod.init_params()
+            mod.forward(Batch([mx.nd.array(x.astype(dtype))]))
+            out = mod.get_outputs()[0].asnumpy()
+            args, auxs = mod.get_params()
+            return out, args, auxs
+
+    def get_tvm_output(symbol, x, args, auxs, target, ctx, dtype='float32'):
+        shape_dict = {"data": x.shape}
+        if gluon_impl:
+            new_sym, params = relay.frontend.from_mxnet(symbol, shape_dict)
+        else:
+            new_sym, params = relay.frontend.from_mxnet(symbol,
+                                                        shape_dict,
+                                                        arg_params=args,
+                                                        aux_params=auxs)
+        with relay.build_config(opt_level=3):
+            graph, lib, params = relay.build(new_sym, target, params=params)
+        m = graph_runtime.create(graph, lib, ctx)
+        # set inputs
+        m.set_input("data", tvm.nd.array(x.astype(dtype)))
+        m.set_input(**params)
+        m.run()
+        # get outputs
+        out = m.get_output(0, tvm.nd.empty(out_shape, dtype))
+        return out.asnumpy()
+
+    # random input
+    x = np.random.uniform(size=data_shape)
+    if gluon_impl:
+        gluon_out, gluon_sym = get_gluon_output(name, x)
+        for target, ctx in ctx_list():
+            tvm_out = get_tvm_output(gluon_sym, x, None, None, target, ctx, dtype)
+            tvm.testing.assert_allclose(gluon_out, tvm_out, rtol=1e-5, atol=1e-5)
+    else:
+        mx_out, args, auxs = get_mxnet_output(mx_symbol, x, dtype)
+        assert "data" not in args
+        for target, ctx in ctx_list():
+            tvm_out = get_tvm_output(mx_symbol, x, args, auxs, target, ctx, dtype)
+            tvm.testing.assert_allclose(mx_out, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_forward_mlp():
+    mlp = model_zoo.mx_mlp()
+    verify_mxnet_frontend_impl(mlp,
+                               data_shape=(1, 1, 28, 28),
+                               out_shape=(1, 10))
+
+def test_forward_vgg():
+    for n in [11]:
+        mx_sym = model_zoo.mx_vgg(n)
+        verify_mxnet_frontend_impl(mx_sym)
+
+def test_forward_resnet():
+    for n in [18]:
+        mx_sym = model_zoo.mx_resnet(18)
+        verify_mxnet_frontend_impl(mx_sym)
+
+def test_forward_elu():
+    data = mx.sym.var('data')
+    data = mx.sym.concat(data, -data, dim=1)  # negative part explicitly
+    mx_sym = mx.sym.LeakyReLU(data, act_type='elu')
+    verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100))
+
+def test_forward_rrelu():
+    data = mx.sym.var('data')
+    data = mx.sym.concat(data, -data, dim=1)  # negative part explicitly
+    mx_sym = mx.sym.LeakyReLU(data, act_type='rrelu', lower_bound=0.3, upper_bound=0.7)
+    verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100))
+
+def test_forward_prelu():
+    data = mx.sym.var('data')
+    data = mx.sym.concat(data, -data, dim=1)  # negative part explicitly
+    mx_sym = mx.sym.LeakyReLU(data, act_type='prelu')
+    verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100))
+
+def test_forward_softrelu():
+    data = mx.sym.var('data')
+    data = mx.sym.concat(data, -data, dim=1)  # negative part explicitly
+    mx_sym = mx.sym.Activation(data, act_type='softrelu')
+    verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100))
+
+def test_forward_fc_flatten():
+    # test flatten=True option in mxnet 0.11.1
+    data = mx.sym.var('data')
+    try:
+        mx_sym = mx.sym.FullyConnected(data, num_hidden=100, flatten=True)
+        verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 100))
+        mx_sym = mx.sym.FullyConnected(mx.sym.Flatten(data), num_hidden=100, flatten=False)
+        verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 100))
+    except:
+        pass
+
+def test_forward_clip():
+    data = mx.sym.var('data')
+    data = mx.sym.concat(data, -data, dim=1)  # negative part explicity
+    mx_sym = mx.sym.clip(data, a_min=0, a_max=1)
+    verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100))
+
+def test_forward_split():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.split(data, axis=1, num_outputs=4, squeeze_axis=False)
+    verify_mxnet_frontend_impl(mx_sym, (1, 4, 2, 1), (1, 1, 2, 1))
+
+def test_forward_split_squeeze():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.split(data, axis=1, num_outputs=4, squeeze_axis=True)
+    verify_mxnet_frontend_impl(mx_sym, (1, 4, 2, 1), (1, 2, 1))
+
+def test_forward_expand_dims():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.expand_dims(data, axis=1)
+    verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 1, 3, 4))
+
+def test_forward_pooling():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.Pooling(data, kernel=(3, 3), pad=(1, 1), pool_type='avg')
+    verify_mxnet_frontend_impl(mx_sym, (1, 20, 8, 8), (1, 20, 8, 8))
+
+    mx_sym = mx.sym.Pooling(data, kernel=(3, 3), pad=(1, 1), pool_type='max')
+    verify_mxnet_frontend_impl(mx_sym, (1, 20, 8, 8), (1, 20, 8, 8))
+
+def test_forward_lrn():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.LRN(data, alpha=2, beta=2, knorm=1, nsize=5)
+    verify_mxnet_frontend_impl(mx_sym, (1, 10, 24, 24), (1, 10, 24, 24))
+
+def test_forward_ones():
+    data = mx.sym.var('data')
+    ones = mx.sym.ones(shape=(2, 3, 4), dtype='float32')
+    mx_sym = mx.sym.elemwise_add(data, ones)
+    verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4))
+
+def test_forward_zeros():
+    data = mx.sym.var('data')
+    zeros = mx.sym.zeros(shape=(2, 3, 4), dtype='float32')
+    mx_sym = mx.sym.elemwise_add(data, zeros)
+    verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4))
+
+def test_forward_ones_like():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.ones_like(data, dtype='float32')
+    verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4))
+
+def test_forward_zeros_like():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.zeros_like(data, dtype='float32')
+    verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4))
+
+def test_forward_argmax():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.argmax(data, axis=1)
+    verify_mxnet_frontend_impl(mx_sym, (5, 3), (5,))
+
+def test_forward_argmin():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.argmin(data, axis=0)
+    verify_mxnet_frontend_impl(mx_sym, (5, 4), (4,))
+
+
+if __name__ == '__main__':
+    test_forward_mlp()
+    test_forward_vgg()
+    test_forward_resnet()
+    test_forward_elu()
+    test_forward_rrelu()
+    test_forward_prelu()
+    test_forward_softrelu()
+    test_forward_fc_flatten()
+    test_forward_clip()
+    test_forward_split()
+    test_forward_split_squeeze()
+    test_forward_expand_dims()
+    test_forward_pooling()
+    test_forward_lrn()
+    test_forward_ones()
+    test_forward_zeros()
+    test_forward_ones_like()
+    test_forward_zeros_like()
+    test_forward_argmax()
+    test_forward_argmin()
diff --git a/tests/python/frontend/mxnet/test_graph.py b/tests/python/frontend/mxnet/test_graph.py
new file mode 100644
index 000000000000..c2bed8829b81
--- /dev/null
+++ b/tests/python/frontend/mxnet/test_graph.py
@@ -0,0 +1,101 @@
+import mxnet as mx
+from tvm import relay
+import model_zoo
+
+def compare_graph(f1, f2):
+    f1 = relay.ir_pass.infer_type(f1)
+    f2 = relay.ir_pass.infer_type(f2)
+    assert relay.ir_pass.alpha_equal(f1, f2)
+
+def test_mlp():
+    shape = {"data": (1, 1, 28, 28)}
+    mx_fun = model_zoo.mx_mlp()
+    from_mx_fun, _ = relay.frontend.from_mxnet(mx_fun, shape=shape)
+    relay_fun = model_zoo.relay_mlp()
+    compare_graph(from_mx_fun, relay_fun)
+
+
+def test_vgg():
+    shape = {"data": (1, 3, 224, 224)}
+    for n in [11, 13, 16, 19]:
+        mx_sym = model_zoo.mx_vgg(n)
+        from_mx_sym, _ = relay.frontend.from_mxnet(mx_sym, shape=shape)
+        relay_sym = model_zoo.relay_vgg(n)
+        compare_graph(from_mx_sym, relay_sym)
+
+
+def test_resnet():
+    shape = {"data": (1, 3, 224, 224)}
+    for n in [18, 34, 50, 101]:
+        mx_sym = model_zoo.mx_resnet(n)
+        from_mx_sym, _ = relay.frontend.from_mxnet(mx_sym, shape=shape)
+        relay_sym = model_zoo.relay_resnet(n)
+        compare_graph(from_mx_sym, relay_sym)
+
+
+def test_squeezenet():
+    shape = {"data": (1, 3, 224, 224)}
+    for version in ['1.0', '1.1']:
+        mx_sym = model_zoo.mx_squeezenet(version)
+        from_mx_sym, _ = relay.frontend.from_mxnet(mx_sym, shape)
+        relay_sym = model_zoo.relay_squeezenet(version)
+        compare_graph(from_mx_sym, relay_sym)
+
+
+def test_inception_v3():
+    shape = {"data": (1, 3, 299, 299)}
+    mx_sym = model_zoo.mx_inception_v3()
+    from_mx_sym, _ = relay.frontend.from_mxnet(mx_sym, shape)
+    relay_sym = model_zoo.relay_inception_v3()
+    compare_graph(from_mx_sym, relay_sym)
+
+
+def test_dqn():
+    shape = {"data": (1, 4, 84, 84)}
+    mx_sym = model_zoo.mx_dqn()
+    from_mx_sym, _ = relay.frontend.from_mxnet(mx_sym, shape)
+    relay_sym = model_zoo.relay_dqn()
+    compare_graph(from_mx_sym, relay_sym)
+
+
+def test_dcgan():
+    shape = {"data": (2, 100)}
+    mx_sym = model_zoo.mx_dcgan()
+    from_mx_sym, _ = relay.frontend.from_mxnet(mx_sym, shape)
+    relay_sym = model_zoo.relay_dcgan(batch_size=2)
+    compare_graph(from_mx_sym, relay_sym)
+
+
+def test_multi_outputs():
+    xshape = (10, 27)
+    yshape = (10, 9)
+
+    def mx_compose(F, **kwargs):
+        x = F.sym.Variable("x")
+        y = F.sym.Variable("y")
+        z = F.sym.split(x, **kwargs)
+        return F.sym.broadcast_sub(F.sym.broadcast_add(z[0], z[2]), y)
+
+    def relay_compose(F, **kwargs):
+        x = F.var("x", shape=xshape)
+        y = F.var("y", shape=yshape)
+        z = F.split(x, **kwargs)
+        z = F.subtract(F.add(z[0], z[2]), y)
+        return relay.Function(relay.ir_pass.free_vars(z), z)
+
+    mx_sym = mx_compose(mx, num_outputs=3, axis=1)
+    from_mx_sym, _ = relay.frontend.from_mxnet(
+        mx_sym, shape={"x":xshape, "y":yshape})
+    relay_sym = relay_compose(relay, indices_or_sections=3, axis=1)
+    compare_graph(from_mx_sym, relay_sym)
+
+
+if __name__ == "__main__":
+    test_mlp()
+    test_resnet()
+    test_vgg()
+    test_multi_outputs()
+    test_dqn()
+    test_dcgan()
+    test_squeezenet()
+    test_inception_v3()
diff --git a/tests/python/integration/test_dot.py b/tests/python/integration/test_dot.py
index 1073d43bceaa..15243c8b1235 100644
--- a/tests/python/integration/test_dot.py
+++ b/tests/python/integration/test_dot.py
@@ -46,7 +46,7 @@ def verify(target):
         b = tvm.nd.array(np.random.uniform(size=(nn,)).astype(B.dtype), ctx)
         c  = tvm.nd.array(np.zeros((1,), dtype=C.dtype), ctx)
         f(a, b, c)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-4)
 
     verify("llvm")
diff --git a/tests/python/integration/test_ewise.py b/tests/python/integration/test_ewise.py
index f16f15325735..b3f17b7c1bb1 100644
--- a/tests/python/integration/test_ewise.py
+++ b/tests/python/integration/test_ewise.py
@@ -31,13 +31,52 @@ def check_device(device, host="stackvm"):
         a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
         b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
         fexp(a, b)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             b.asnumpy(), np.exp(a.asnumpy()), rtol=1e-5)
 
     check_device("opencl -device=intel_graphics")
     check_device("cuda", "llvm")
     check_device("vulkan")
 
+def test_fmod():
+    # graph
+    def run(dtype):
+        n = tvm.var('n')
+        A = tvm.placeholder((n,), name='A', dtype=dtype)
+        B = tvm.placeholder((n,), name='B', dtype=dtype)
+        C = tvm.compute(A.shape, lambda *i: tvm.fmod(A(*i), B(*i)), name='C')
+        s = tvm.create_schedule(C.op)
+        # create iter var and assign them tags.
+        num_thread = 8
+        bx, tx = s[C].split(C.op.axis[0], factor=num_thread)
+
+        def check_device(device):
+            ctx = tvm.context(device, 0)
+            if not ctx.exist:
+                print("skip because %s is not enabled.." % device)
+                return
+            target = tvm.target.create(device)
+            if "cpu" not in target.keys:
+                s[C].bind(bx, tvm.thread_axis("blockIdx.x"))
+                s[C].bind(tx, tvm.thread_axis("threadIdx.x"))
+            fmod = tvm.build(s, [A, B, C], device, name="myfmod")
+
+            # launch the kernel.
+            n = 1024
+            a = tvm.nd.array((np.random.uniform(size=n) * 256).astype(A.dtype), ctx)
+            b = tvm.nd.array((np.random.uniform(size=n) * 256).astype(B.dtype), ctx)
+            c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
+            ftimer = fmod.time_evaluator(fmod.entry_name, ctx, number=1)
+            tcost = ftimer(a, b, c).mean
+            #fmod(a, b, c)
+            np.testing.assert_allclose(
+                c.asnumpy(), np.mod(a.asnumpy(), b.asnumpy()), rtol=1e-5)
+
+        check_device("cuda")
+        check_device("opencl -device=intel_graphics")
+        check_device("metal")
+
+    run("float32")
 
 def test_multiple_cache_write():
     # graph
@@ -75,7 +114,7 @@ def check_device(device, host="stackvm"):
         a1 = tvm.nd.array(np.random.uniform(size=n).astype(A1.dtype), ctx)
         c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
         func(a0, a1, c)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), a0.asnumpy() + a1.asnumpy() + (a0.asnumpy() * a1.asnumpy()),
             rtol=1e-5)
 
@@ -106,7 +145,7 @@ def test_log_pow_llvm():
     ftimer = flog.time_evaluator(flog.entry_name, ctx, number=1, repeat=repeat)
     res = ftimer(a, b)
     assert(len(res.results) == repeat)
-    np.testing.assert_allclose(
+    tvm.testing.assert_allclose(
         b.asnumpy(), np.power(np.log(a.asnumpy()), 2.0), rtol=1e-5)
 
 
@@ -136,7 +175,7 @@ def check_device(device):
             a = tvm.nd.array(np.random.randint(low=0, high=1000, size=n, dtype=A.dtype), ctx)
             b = tvm.nd.array(np.zeros(shape=n, dtype=B.dtype), ctx)
             func(a, b)
-            np.testing.assert_allclose(
+            tvm.testing.assert_allclose(
                 b.asnumpy(), list(map(lambda x: bin(x).count('1'), a.asnumpy())), rtol=1e-5)
 
         check_device("llvm")
@@ -186,7 +225,7 @@ def check_device(device):
             c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
             ftimer = fadd.time_evaluator(fadd.entry_name, ctx, number=1)
             tcost = ftimer(a, b, c).mean
-            np.testing.assert_allclose(
+            tvm.testing.assert_allclose(
                 c.asnumpy(), a.asnumpy() + b.asnumpy(), rtol=1e-6)
 
         check_device("opencl")
@@ -233,7 +272,7 @@ def check_device(device):
         a = tvm.nd.array((np.random.uniform(size=m) * 256).astype(A.dtype), ctx)
         b = tvm.nd.array(np.zeros(m, dtype=B.dtype), ctx)
         f(a, b)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             b.asnumpy(), a.asnumpy() + 3, rtol=1e-6)
     check_device("cuda")
 
@@ -245,3 +284,4 @@ def check_device(device):
     test_add()
     test_log_pow_llvm()
     test_popcount()
+    test_fmod()
diff --git a/tests/python/integration/test_ewise_fpga.py b/tests/python/integration/test_ewise_fpga.py
index 0abefff02778..493cb5016cfe 100644
--- a/tests/python/integration/test_ewise_fpga.py
+++ b/tests/python/integration/test_ewise_fpga.py
@@ -37,14 +37,14 @@ def check_device(device, host="llvm"):
         a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
         b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx)
         fexp(a, b)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             b.asnumpy(), np.exp(a.asnumpy()), rtol=1e-5)
 
     check_device("sdaccel")
     if "AWS_PLATFORM" in os.environ:
         check_device("sdaccel -device=" + os.environ.get("AWS_PLATFORM"))
 
-    check_device("aocl -device=s5_ref -mattr=emulator")
+    check_device("aocl_sw_emu")
 
 def test_multi_kernel():
     # graph
@@ -78,11 +78,11 @@ def check_device(device, host="llvm"):
         c = tvm.nd.array(np.random.uniform(size=n).astype(C.dtype), ctx)
         d = tvm.nd.array(np.random.uniform(size=n).astype(D.dtype), ctx)
         fadd(a, b, c, d)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             d.asnumpy(), a.asnumpy() * 2 + b.asnumpy(), rtol=1e-5)
 
     check_device("sdaccel")
-    check_device("aocl -device=s5_ref -mattr=emulator")
+    check_device("aocl_sw_emu")
 
 
 if __name__ == "__main__":
diff --git a/tests/python/integration/test_gemm.py b/tests/python/integration/test_gemm.py
index 6e74052d8283..928ba187a4d3 100644
--- a/tests/python/integration/test_gemm.py
+++ b/tests/python/integration/test_gemm.py
@@ -85,7 +85,7 @@ def check_device(device):
         ftimer = f.time_evaluator(f.entry_name, ctx, number=1)
         tcost = ftimer(a, b, c).mean
         print("%s: exec=%g sec/op" % (ctx, tcost))
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), np.dot(a_np, b_np.T), rtol=1e-5)
 
     check_device("vulkan")
diff --git a/tests/python/integration/test_reduce.py b/tests/python/integration/test_reduce.py
index c8fb98746bf6..e2285808ccd3 100644
--- a/tests/python/integration/test_reduce.py
+++ b/tests/python/integration/test_reduce.py
@@ -42,7 +42,7 @@ def check_device(device, host="stackvm"):
             npy[:2] = 0
             res = np_reducer(x.asnumpy(), axis=1)
             res[:2] = 0
-            np.testing.assert_allclose(npy, res, rtol=1e-4)
+            tvm.testing.assert_allclose(npy, res, rtol=1e-4)
 
         check_device("metal")
         check_device("vulkan")
@@ -78,7 +78,7 @@ def check_target(target="llvm"):
         b  = tvm.nd.array(np.zeros(1, dtype=B.dtype), ctx)
         fsum(a, b)
         res = np.sum(a.asnumpy(), axis=0)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             b.asnumpy(), res, rtol=1e-4)
 
     check_target()
@@ -108,7 +108,7 @@ def check_target(target="llvm"):
         b  = tvm.nd.array(np.zeros(1, dtype=B.dtype), ctx)
         fsum(a, b)
         res = np.sum(a.asnumpy(), axis=0)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             b.asnumpy(), res, rtol=1e-4)
 
     check_target()
@@ -155,7 +155,7 @@ def check_target(device, host="stackvm"):
         fsum(a, b)
         res = np.sum(a.asnumpy(), axis=1)
         res[:2] = 0
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             b.asnumpy(), res, rtol=1e-4)
 
     check_target("vulkan")
@@ -206,7 +206,7 @@ def check_target(device, host="stackvm"):
         b  = tvm.nd.array(np.zeros(m, dtype=B.dtype), ctx)
         fsum(a, b)
         res = np.sum(a.asnumpy(), axis=1) + 2
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             b.asnumpy(), res, rtol=1e-4)
 
     check_target("vulkan")
@@ -256,7 +256,7 @@ def check_target():
         nd_res0 = tvm.nd.array(np.zeros(mm, dtype='int32'), ctx)
         nd_res1 = tvm.nd.array(np.zeros(mm, dtype='float32'), ctx)
         fargmax(nd_idx, nd_val, nd_res0, nd_res1)
-        np.testing.assert_allclose(np_res, nd_res0.asnumpy())
+        tvm.testing.assert_allclose(np_res, nd_res0.asnumpy())
 
     check_target()
 
@@ -316,7 +316,7 @@ def check_target(device):
         nd_res0 = tvm.nd.array(np.zeros(mm, dtype='int32'), ctx)
         nd_res1 = tvm.nd.array(np.zeros(mm, dtype='float32'), ctx)
         fargmax(nd_idx, nd_val, nd_res0, nd_res1)
-        np.testing.assert_allclose(np_res, nd_res0.asnumpy())
+        tvm.testing.assert_allclose(np_res, nd_res0.asnumpy())
 
     check_target("cuda")
     check_target("vulkan")
diff --git a/tests/python/integration/test_scan.py b/tests/python/integration/test_scan.py
index 855f3e072133..49d1cf3b75ce 100644
--- a/tests/python/integration/test_scan.py
+++ b/tests/python/integration/test_scan.py
@@ -38,7 +38,7 @@ def check_device(device):
         a = tvm.nd.array(a_np, ctx)
         b = tvm.nd.array(np.zeros((m, n), dtype=res.dtype), ctx)
         fscan(a, b)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             b.asnumpy(), np.cumsum(a_np, axis=0))
 
     check_device("vulkan")
diff --git a/tests/python/integration/test_tuning.py b/tests/python/integration/test_tuning.py
index 87da86a4654f..8e1b458a6d2f 100644
--- a/tests/python/integration/test_tuning.py
+++ b/tests/python/integration/test_tuning.py
@@ -103,34 +103,7 @@ def get_sample_task(target=tvm.target.cuda(), target_host=None):
                                target=target, target_host=target_host)
     return task, target
 
-
-def test_task_tuner_without_measurement():
-    """test task and tuner without measurement"""
-    task, target = get_sample_task()
-
-    def custom_measure(input_pack, build_func, build_args, number, repeat,
-                       ref_input, ref_output):
-        from tvm.autotvm import MeasureResult
-
-        results = []
-        for inp in input_pack:
-            tic = time.time()
-            # do nothing
-            time.sleep(0.001)
-            results.append(MeasureResult([time.time() - tic], 0,
-                                         time.time() - tic, time.time()))
-        return results
-    measure_option = autotvm.measure_option(custom_measure)
-
-    logging.info("%s", task.config_space)
-
-    # new tuner and recorder
-    for tuner_class in [autotvm.tuner.RandomTuner, autotvm.tuner.GridSearchTuner]:
-        tuner = tuner_class(task)
-        tuner.tune(n_trial=10, measure_option=measure_option)
-        assert tuner.best_flops > 1
-
-def test_tuning_with_measure():
+def test_tuning():
     def check(target, target_host):
         ctx = tvm.context(target, 0)
         if not ctx.exist:
@@ -141,12 +114,12 @@ def check(target, target_host):
         task, target = get_sample_task(target, target_host)
         logging.info("%s", task.config_space)
 
-        measure_option = autotvm.measure_option('local',
-                                                timeout=4,
-                                                number=2)
+        measure_option = autotvm.measure_option(
+            autotvm.LocalBuilder(),
+            autotvm.LocalRunner())
 
         tuner = RandomTuner(task)
-        tuner.tune(n_trial=10, measure_option=measure_option)
+        tuner.tune(n_trial=20, measure_option=measure_option)
 
     check("cuda", None)
     check("opencl", None)
@@ -155,6 +128,4 @@ def check(target, target_host):
     # only print log when invoked from main
     logging.basicConfig(level=logging.DEBUG)
 
-    test_task_tuner_without_measurement()
-    test_tuning_with_measure()
-
+    test_tuning()
diff --git a/tests/python/relay/frontend/mxnet/model_zoo/__init__.py b/tests/python/relay/frontend/mxnet/model_zoo/__init__.py
new file mode 100644
index 000000000000..1c796f7810b7
--- /dev/null
+++ b/tests/python/relay/frontend/mxnet/model_zoo/__init__.py
@@ -0,0 +1,46 @@
+"""MXNet and Relay model zoo."""
+from __future__ import absolute_import
+from . import mlp, resnet, vgg, dqn, dcgan, squeezenet, inception_v3
+import tvm.relay.testing
+
+_num_class = 1000
+_batch = 2
+
+# mlp fc
+mx_mlp = mlp.get_symbol(_num_class)
+relay_mlp = tvm.relay.testing.mlp.get_workload(_batch, _num_class)[0]
+
+# vgg fc
+mx_vgg = {}
+relay_vgg = {}
+for num_layers in [11, 13, 16, 19]:
+    mx_vgg[num_layers] = vgg.get_symbol(_num_class, num_layers)
+    relay_vgg[num_layers] = tvm.relay.testing.vgg.get_workload(
+        _batch, _num_class, num_layers=num_layers)[0]
+
+# resnet fc
+mx_resnet = {}
+relay_resnet = {}
+for num_layers in [18, 34, 50, 101, 152, 200, 269]:
+    mx_resnet[num_layers] = resnet.get_symbol(_num_class, num_layers, '3,224,224')
+    relay_resnet[num_layers] = tvm.relay.testing.resnet.get_workload(
+        _batch, _num_class, num_layers=num_layers)[0]
+
+# squeezenet
+mx_squeezenet = {}
+relay_squeezenet = {}
+for version in ['1.0', '1.1']:
+    mx_squeezenet[version] = squeezenet.get_symbol(version=version)
+    relay_squeezenet[version] = tvm.relay.testing.squeezenet.get_workload(_batch, version=version)[0]
+
+# inception
+mx_inception_v3 = inception_v3.get_symbol()
+relay_inception_v3 = tvm.relay.testing.inception_v3.get_workload(_batch)[0]
+
+# dqn
+mx_dqn = dqn.get_symbol()
+relay_dqn = tvm.relay.testing.dqn.get_workload(_batch)[0]
+
+# dcgan generator
+mx_dcgan = dcgan.get_symbol()
+relay_dcgan = tvm.relay.testing.dcgan.get_workload(_batch)[0]
diff --git a/tests/python/relay/frontend/mxnet/model_zoo/dcgan.py b/tests/python/relay/frontend/mxnet/model_zoo/dcgan.py
new file mode 100644
index 000000000000..8af030b6b184
--- /dev/null
+++ b/tests/python/relay/frontend/mxnet/model_zoo/dcgan.py
@@ -0,0 +1,66 @@
+# pylint: disable=unused-argument
+"""
+The MXNet symbol of DCGAN generator
+
+Adopted from:
+https://github.com/tqchen/mxnet-gan/blob/master/mxgan/generator.py
+
+Reference:
+Radford, Alec, Luke Metz, and Soumith Chintala.
+"Unsupervised representation learning with deep convolutional generative adversarial networks."
+arXiv preprint arXiv:1511.06434 (2015).
+"""
+
+import mxnet as mx
+
+def deconv2d(data, ishape, oshape, kshape, name, stride=(2, 2)):
+    """a deconv layer that enlarges the feature map"""
+    target_shape = (oshape[-2], oshape[-1])
+    pad_y = (kshape[0] - 1) // 2
+    pad_x = (kshape[1] - 1) // 2
+    adj_y = (target_shape[0] + 2 * pad_y - kshape[0]) % stride[0]
+    adj_x = (target_shape[1] + 2 * pad_x - kshape[1]) % stride[1]
+
+    net = mx.sym.Deconvolution(data,
+                               kernel=kshape,
+                               stride=stride,
+                               pad=(pad_y, pad_x),
+                               adj=(adj_y, adj_x),
+                               num_filter=oshape[0],
+                               no_bias=True,
+                               name=name)
+    return net
+
+def deconv2d_bn_relu(data, prefix, **kwargs):
+    """a block of deconv + batch norm + relu"""
+    eps = 1e-5 + 1e-12
+
+    net = deconv2d(data, name="%s_deconv" % prefix, **kwargs)
+    net = mx.sym.BatchNorm(net, eps=eps, name="%s_bn" % prefix)
+    net = mx.sym.Activation(net, name="%s_act" % prefix, act_type='relu')
+    return net
+
+def get_symbol(oshape=(3, 64, 64), ngf=128, code=None):
+    """get symbol of dcgan generator"""
+    assert oshape[-1] == 64, "Only support 64x64 image"
+    assert oshape[-2] == 64, "Only support 64x64 image"
+
+    code = mx.sym.Variable("data") if code is None else code
+    net = mx.sym.FullyConnected(code, name="g1", num_hidden=ngf*8*4*4, no_bias=True, flatten=False)
+    net = mx.sym.Activation(net, act_type='relu')
+    # 4 x 4
+    net = mx.sym.reshape(net, shape=(-1, ngf * 8, 4, 4))
+    # 8 x 8
+    net = deconv2d_bn_relu(
+        net, ishape=(ngf * 8, 4, 4), oshape=(ngf * 4, 8, 8), kshape=(4, 4), prefix="g2")
+    # 16x16
+    net = deconv2d_bn_relu(
+        net, ishape=(ngf * 4, 8, 8), oshape=(ngf * 2, 16, 16), kshape=(4, 4), prefix="g3")
+    # 32x32
+    net = deconv2d_bn_relu(
+        net, ishape=(ngf * 2, 16, 16), oshape=(ngf, 32, 32), kshape=(4, 4), prefix="g4")
+    # 64x64
+    net = deconv2d(
+        net, ishape=(ngf, 32, 32), oshape=oshape[-3:], kshape=(4, 4), name="g5_deconv")
+    net = mx.sym.Activation(net, act_type='tanh')
+    return net
diff --git a/tests/python/relay/frontend/mxnet/model_zoo/dqn.py b/tests/python/relay/frontend/mxnet/model_zoo/dqn.py
new file mode 100644
index 000000000000..e037511efdf2
--- /dev/null
+++ b/tests/python/relay/frontend/mxnet/model_zoo/dqn.py
@@ -0,0 +1,27 @@
+"""
+The mxnet symbol of Nature DQN
+
+Reference:
+Mnih, Volodymyr, et al.
+"Human-level control through deep reinforcement learning."
+Nature 518.7540 (2015): 529.
+"""
+
+import mxnet as mx
+
+def get_symbol(num_action=18):
+    data = mx.sym.Variable(name='data')
+    net = mx.sym.Convolution(data, kernel=(8, 8), stride=(4, 4),
+                             num_filter=32, name='conv1')
+    net = mx.sym.Activation(net, act_type='relu', name='relu1')
+    net = mx.sym.Convolution(net, kernel=(4, 4), stride=(2, 2),
+                             num_filter=64, name='conv2')
+    net = mx.sym.Activation(net, act_type='relu', name='relu2')
+    net = mx.sym.Convolution(net, kernel=(3, 3), stride=(1, 1),
+                             num_filter=64, name='conv3')
+    net = mx.sym.Activation(net, act_type='relu', name='relu3')
+    net = mx.sym.FullyConnected(net, num_hidden=512, name='fc4')
+    net = mx.sym.Activation(net, act_type='relu', name='relu4')
+    net = mx.sym.FullyConnected(net, num_hidden=num_action, name='fc5', flatten=False)
+
+    return net
diff --git a/tests/python/relay/frontend/mxnet/model_zoo/inception_v3.py b/tests/python/relay/frontend/mxnet/model_zoo/inception_v3.py
new file mode 100644
index 000000000000..b8585bf05037
--- /dev/null
+++ b/tests/python/relay/frontend/mxnet/model_zoo/inception_v3.py
@@ -0,0 +1,170 @@
+"""
+Inception V3, suitable for images with around 299 x 299
+
+Reference:
+Szegedy, Christian, et al. "Rethinking the Inception Architecture for Computer Vision." arXiv preprint arXiv:1512.00567 (2015).
+
+Adopted from https://github.com/apache/incubator-mxnet/blob/
+             master/example/image-classification/symbols/inception-v3.py
+"""
+import mxnet as mx
+import numpy as np
+
+def Conv(data, num_filter, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name=None, suffix=''):
+    conv = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, no_bias=True, name='%s%s_conv2d' %(name, suffix))
+    bn = mx.sym.BatchNorm(data=conv, eps=2e-5, name='%s%s_batchnorm' % (name, suffix))
+    act = mx.sym.Activation(data=bn, act_type='relu', name='%s%s_relu' %(name, suffix))
+    return act
+
+
+def Inception7A(data,
+                num_1x1,
+                num_3x3_red, num_3x3_1, num_3x3_2,
+                num_5x5_red, num_5x5,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data, num_1x1, name=('%s_conv' % name))
+    tower_5x5 = Conv(data, num_5x5_red, name=('%s_tower' % name), suffix='_conv')
+    tower_5x5 = Conv(tower_5x5, num_5x5, kernel=(5, 5), pad=(2, 2), name=('%s_tower' % name), suffix='_conv_1')
+    tower_3x3 = Conv(data, num_3x3_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_3x3 = Conv(tower_3x3, num_3x3_1, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_3x3 = Conv(tower_3x3, num_3x3_2, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_2')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(pooling, proj, name=('%s_tower_2' %  name), suffix='_conv')
+    concat = mx.sym.Concat(*[tower_1x1, tower_5x5, tower_3x3, cproj], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+# First Downsample
+def Inception7B(data,
+                num_3x3,
+                num_d3x3_red, num_d3x3_1, num_d3x3_2,
+                pool,
+                name):
+    tower_3x3 = Conv(data, num_3x3, kernel=(3, 3), pad=(0, 0), stride=(2, 2), name=('%s_conv' % name))
+    tower_d3x3 = Conv(data, num_d3x3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d3x3 = Conv(tower_d3x3, num_d3x3_1, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name=('%s_tower' % name), suffix='_conv_1')
+    tower_d3x3 = Conv(tower_d3x3, num_d3x3_2, kernel=(3, 3), pad=(0, 0), stride=(2, 2), name=('%s_tower' % name), suffix='_conv_2')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(0,0), pool_type="max", name=('max_pool_%s_pool' % name))
+    concat = mx.sym.Concat(*[tower_3x3, tower_d3x3, pooling], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def Inception7C(data,
+                num_1x1,
+                num_d7_red, num_d7_1, num_d7_2,
+                num_q7_red, num_q7_1, num_q7_2, num_q7_3, num_q7_4,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name))
+    tower_d7 = Conv(data=data, num_filter=num_d7_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d7 = Conv(data=tower_d7, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3), name=('%s_tower' % name), suffix='_conv_1')
+    tower_d7 = Conv(data=tower_d7, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0), name=('%s_tower' % name), suffix='_conv_2')
+    tower_q7 = Conv(data=data, num_filter=num_q7_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_1, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_2, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_2')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_3, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_3')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_4, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_4')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' %  name), suffix='_conv')
+    # concat
+    concat = mx.sym.Concat(*[tower_1x1, tower_d7, tower_q7, cproj], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def Inception7D(data,
+                num_3x3_red, num_3x3,
+                num_d7_3x3_red, num_d7_1, num_d7_2, num_d7_3x3,
+                pool,
+                name):
+    tower_3x3 = Conv(data=data, num_filter=num_3x3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_3x3 = Conv(data=tower_3x3, num_filter=num_3x3, kernel=(3, 3), pad=(0,0), stride=(2, 2), name=('%s_tower' % name), suffix='_conv_1')
+    tower_d7_3x3 = Conv(data=data, num_filter=num_d7_3x3_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_2')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_3x3, kernel=(3, 3), stride=(2, 2), name=('%s_tower_1' % name), suffix='_conv_3')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
+    # concat
+    concat = mx.sym.Concat(*[tower_3x3, tower_d7_3x3, pooling], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def Inception7E(data,
+                num_1x1,
+                num_d3_red, num_d3_1, num_d3_2,
+                num_3x3_d3_red, num_3x3, num_3x3_d3_1, num_3x3_d3_2,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name))
+    tower_d3 = Conv(data=data, num_filter=num_d3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d3_a = Conv(data=tower_d3, num_filter=num_d3_1, kernel=(1, 3), pad=(0, 1), name=('%s_tower' % name), suffix='_mixed_conv')
+    tower_d3_b = Conv(data=tower_d3, num_filter=num_d3_2, kernel=(3, 1), pad=(1, 0), name=('%s_tower' % name), suffix='_mixed_conv_1')
+    tower_3x3_d3 = Conv(data=data, num_filter=num_3x3_d3_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_3x3_d3 = Conv(data=tower_3x3_d3, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_3x3_d3_a = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_1, kernel=(1, 3), pad=(0, 1), name=('%s_tower_1' % name), suffix='_mixed_conv')
+    tower_3x3_d3_b = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_2, kernel=(3, 1), pad=(1, 0), name=('%s_tower_1' % name), suffix='_mixed_conv_1')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' %  name), suffix='_conv')
+    # concat
+    concat = mx.sym.Concat(*[tower_1x1, tower_d3_a, tower_d3_b, tower_3x3_d3_a, tower_3x3_d3_b, cproj], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def get_symbol(num_classes=1000, **kwargs):
+    data = mx.sym.Variable(name="data")
+    # stage 1
+    conv = Conv(data, 32, kernel=(3, 3), stride=(2, 2), name="conv")
+    conv_1 = Conv(conv, 32, kernel=(3, 3), name="conv_1")
+    conv_2 = Conv(conv_1, 64, kernel=(3, 3), pad=(1, 1), name="conv_2")
+    pool = mx.sym.Pooling(data=conv_2, kernel=(3, 3), stride=(2, 2), pool_type="max", name="pool")
+    # stage 2
+    conv_3 = Conv(pool, 80, kernel=(1, 1), name="conv_3")
+    conv_4 = Conv(conv_3, 192, kernel=(3, 3), name="conv_4")
+    pool1 = mx.sym.Pooling(data=conv_4, kernel=(3, 3), stride=(2, 2), pool_type="max", name="pool1")
+
+    # # stage 3
+    in3a = Inception7A(pool1, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 32, "mixed")
+    in3b = Inception7A(in3a, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 64, "mixed_1")
+    in3c = Inception7A(in3b, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 64, "mixed_2")
+    in3d = Inception7B(in3c, 384,
+                       64, 96, 96,
+                       "max", "mixed_3")
+    # stage 4
+    in4a = Inception7C(in3d, 192,
+                       128, 128, 192,
+                       128, 128, 128, 128, 192,
+                       "avg", 192, "mixed_4")
+    in4b = Inception7C(in4a, 192,
+                       160, 160, 192,
+                       160, 160, 160, 160, 192,
+                       "avg", 192, "mixed_5")
+    in4c = Inception7C(in4b, 192,
+                       160, 160, 192,
+                       160, 160, 160, 160, 192,
+                       "avg", 192, "mixed_6")
+    in4d = Inception7C(in4c, 192,
+                       192, 192, 192,
+                       192, 192, 192, 192, 192,
+                       "avg", 192, "mixed_7")
+    in4e = Inception7D(in4d, 192, 320,
+                       192, 192, 192, 192,
+                       "max", "mixed_8")
+    # stage 5
+    in5a = Inception7E(in4e, 320,
+                       384, 384, 384,
+                       448, 384, 384, 384,
+                       "avg", 192, "mixed_9")
+    in5b = Inception7E(in5a, 320,
+                       384, 384, 384,
+                       448, 384, 384, 384,
+                       "max", 192, "mixed_10")
+    # pool
+    pool = mx.sym.Pooling(data=in5b, kernel=(8, 8), stride=(1, 1), pool_type="avg", name="global_pool")
+    flatten = mx.sym.Flatten(data=pool, name="flatten")
+    fc1 = mx.sym.FullyConnected(data=flatten, num_hidden=num_classes, name='fc1', flatten=False)
+    softmax = mx.sym.SoftmaxOutput(data=fc1, name='softmax')
+    return softmax
diff --git a/tests/python/relay/frontend/mxnet/model_zoo/mlp.py b/tests/python/relay/frontend/mxnet/model_zoo/mlp.py
new file mode 100644
index 000000000000..922b208749bf
--- /dev/null
+++ b/tests/python/relay/frontend/mxnet/model_zoo/mlp.py
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+a simple multilayer perceptron
+"""
+import mxnet as mx
+
+def get_symbol(num_classes=10, **kwargs):
+    data = mx.symbol.Variable('data')
+    data = mx.sym.Flatten(data=data)
+    try:
+        fc1  = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128, flatten=False)
+        act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu")
+        fc2  = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64, flatten=False)
+        act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu")
+        fc3  = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=num_classes, flatten=False)
+        mlp  = mx.symbol.softmax(data = fc3, name = 'softmax')
+    except:
+        fc1  = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128)
+        act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu")
+        fc2  = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64)
+        act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu")
+        fc3  = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=num_classes)
+        mlp  = mx.symbol.softmax(data = fc3, name = 'softmax')
+    return mlp
diff --git a/tests/python/relay/frontend/mxnet/model_zoo/resnet.py b/tests/python/relay/frontend/mxnet/model_zoo/resnet.py
new file mode 100644
index 000000000000..3f9a870d31c0
--- /dev/null
+++ b/tests/python/relay/frontend/mxnet/model_zoo/resnet.py
@@ -0,0 +1,199 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+'''
+Adapted from https://github.com/tornadomeet/ResNet/blob/master/symbol_resnet.py
+Original author Wei Wu
+
+Implemented the following paper:
+
+Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. "Identity Mappings in Deep Residual Networks"
+'''
+import mxnet as mx
+import numpy as np
+
+def residual_unit(data, num_filter, stride, dim_match, name, bottle_neck=True, bn_mom=0.9, workspace=256, memonger=False):
+    """Return ResNet Unit symbol for building ResNet
+    Parameters
+    ----------
+    data : str
+        Input data
+    num_filter : int
+        Number of output channels
+    bnf : int
+        Bottle neck channels factor with regard to num_filter
+    stride : tuple
+        Stride used in convolution
+    dim_match : Boolean
+        True means channel number between input and output is the same, otherwise means differ
+    name : str
+        Base name of the operators
+    workspace : int
+        Workspace used in convolution operator
+    """
+    if bottle_neck:
+        bn1 = mx.sym.BatchNorm(data=data, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn1')
+        act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1')
+        conv1 = mx.sym.Convolution(data=act1, num_filter=int(num_filter*0.25), kernel=(1,1), stride=stride, pad=(0,0),
+                                   no_bias=True, workspace=workspace, name=name + '_conv1')
+        bn2 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn2')
+        act2 = mx.sym.Activation(data=bn2, act_type='relu', name=name + '_relu2')
+        conv2 = mx.sym.Convolution(data=act2, num_filter=int(num_filter*0.25), kernel=(3,3), stride=(1,1), pad=(1,1),
+                                   no_bias=True, workspace=workspace, name=name + '_conv2')
+        bn3 = mx.sym.BatchNorm(data=conv2, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn3')
+        act3 = mx.sym.Activation(data=bn3, act_type='relu', name=name + '_relu3')
+        conv3 = mx.sym.Convolution(data=act3, num_filter=num_filter, kernel=(1,1), stride=(1,1), pad=(0,0), no_bias=True,
+                                   workspace=workspace, name=name + '_conv3')
+        if dim_match:
+            shortcut = data
+        else:
+            shortcut = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True,
+                                            workspace=workspace, name=name+'_sc')
+        if memonger:
+            shortcut._set_attr(mirror_stage='True')
+        return conv3 + shortcut
+    else:
+        bn1 = mx.sym.BatchNorm(data=data, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn1')
+        act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1')
+        conv1 = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(3,3), stride=stride, pad=(1,1),
+                                      no_bias=True, workspace=workspace, name=name + '_conv1')
+        bn2 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn2')
+        act2 = mx.sym.Activation(data=bn2, act_type='relu', name=name + '_relu2')
+        conv2 = mx.sym.Convolution(data=act2, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1,1),
+                                      no_bias=True, workspace=workspace, name=name + '_conv2')
+        if dim_match:
+            shortcut = data
+        else:
+            shortcut = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True,
+                                            workspace=workspace, name=name+'_sc')
+        if memonger:
+            shortcut._set_attr(mirror_stage='True')
+        return conv2 + shortcut
+
+def resnet(units, num_stages, filter_list, num_classes, image_shape, bottle_neck=True, bn_mom=0.9, workspace=256, dtype='float32', memonger=False):
+    """Return ResNet symbol of
+    Parameters
+    ----------
+    units : list
+        Number of units in each stage
+    num_stages : int
+        Number of stage
+    filter_list : list
+        Channel size of each stage
+    num_classes : int
+        Ouput size of symbol
+    dataset : str
+        Dataset type, only cifar10 and imagenet supports
+    workspace : int
+        Workspace used in convolution operator
+    dtype : str
+        Precision (float32 or float16)
+    """
+    num_unit = len(units)
+    assert(num_unit == num_stages)
+    data = mx.sym.Variable(name='data')
+    if dtype == 'float32':
+        # data = mx.sym.identity(data=data, name='id')
+        data = data
+    else:
+        if dtype == 'float16':
+            data = mx.sym.Cast(data=data, dtype=np.float16)
+    data = mx.sym.BatchNorm(data=data, fix_gamma=True, eps=2e-5, momentum=bn_mom, name='bn_data')
+    (nchannel, height, width) = image_shape
+    if height <= 32:            # such as cifar10
+        body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(3, 3), stride=(1,1), pad=(1, 1),
+                                  no_bias=True, name="conv0", workspace=workspace)
+    else:                       # often expected to be 224 such as imagenet
+        body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(7, 7), stride=(2,2), pad=(3, 3),
+                                  no_bias=True, name="conv0", workspace=workspace)
+        body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn0')
+        body = mx.sym.Activation(data=body, act_type='relu', name='relu0')
+        body = mx.sym.Pooling(data=body, kernel=(3, 3), stride=(2,2), pad=(1,1), pool_type='max')
+
+    for i in range(num_stages):
+        body = residual_unit(body, filter_list[i+1], (1 if i==0 else 2, 1 if i==0 else 2), False,
+                             name='stage%d_unit%d' % (i + 1, 1), bottle_neck=bottle_neck, workspace=workspace,
+                             memonger=memonger)
+        for j in range(units[i]-1):
+            body = residual_unit(body, filter_list[i+1], (1,1), True, name='stage%d_unit%d' % (i + 1, j + 2),
+                                 bottle_neck=bottle_neck, workspace=workspace, memonger=memonger)
+    bn1 = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn1')
+    relu1 = mx.sym.Activation(data=bn1, act_type='relu', name='relu1')
+    # Although kernel is not used here when global_pool=True, we should put one
+    pool1 = mx.sym.Pooling(data=relu1, global_pool=True, kernel=(7, 7), pool_type='avg', name='pool1')
+    flat = mx.sym.Flatten(data=pool1)
+    try:
+        fc1 = mx.sym.FullyConnected(data=flat, num_hidden=num_classes, name='fc1', flatten=False)
+    except:
+        fc1 = mx.sym.FullyConnected(data=flat, num_hidden=num_classes, name='fc1')
+    if dtype == 'float16':
+        fc1 = mx.sym.Cast(data=fc1, dtype=np.float32)
+    return mx.sym.softmax(data=fc1, name='softmax')
+
+def get_symbol(num_classes, num_layers, image_shape, conv_workspace=256, dtype='float32', **kwargs):
+    """
+    Adapted from https://github.com/tornadomeet/ResNet/blob/master/train_resnet.py
+    Original author Wei Wu
+    """
+    image_shape = [int(l) for l in image_shape.split(',')]
+    (nchannel, height, width) = image_shape
+    if height <= 28:
+        num_stages = 3
+        if (num_layers-2) % 9 == 0 and num_layers >= 164:
+            per_unit = [(num_layers-2)//9]
+            filter_list = [16, 64, 128, 256]
+            bottle_neck = True
+        elif (num_layers-2) % 6 == 0 and num_layers < 164:
+            per_unit = [(num_layers-2)//6]
+            filter_list = [16, 16, 32, 64]
+            bottle_neck = False
+        else:
+            raise ValueError("no experiments done on num_layers {}, you can do it yourself".format(num_layers))
+        units = per_unit * num_stages
+    else:
+        if num_layers >= 50:
+            filter_list = [64, 256, 512, 1024, 2048]
+            bottle_neck = True
+        else:
+            filter_list = [64, 64, 128, 256, 512]
+            bottle_neck = False
+        num_stages = 4
+        if num_layers == 18:
+            units = [2, 2, 2, 2]
+        elif num_layers == 34:
+            units = [3, 4, 6, 3]
+        elif num_layers == 50:
+            units = [3, 4, 6, 3]
+        elif num_layers == 101:
+            units = [3, 4, 23, 3]
+        elif num_layers == 152:
+            units = [3, 8, 36, 3]
+        elif num_layers == 200:
+            units = [3, 24, 36, 3]
+        elif num_layers == 269:
+            units = [3, 30, 48, 8]
+        else:
+            raise ValueError("no experiments done on num_layers {}, you can do it yourself".format(num_layers))
+
+    return resnet(units       = units,
+                  num_stages  = num_stages,
+                  filter_list = filter_list,
+                  num_classes = num_classes,
+                  image_shape = image_shape,
+                  bottle_neck = bottle_neck,
+                  workspace   = conv_workspace,
+                  dtype       = dtype)
diff --git a/tests/python/relay/frontend/mxnet/model_zoo/squeezenet.py b/tests/python/relay/frontend/mxnet/model_zoo/squeezenet.py
new file mode 100644
index 000000000000..deb896a21385
--- /dev/null
+++ b/tests/python/relay/frontend/mxnet/model_zoo/squeezenet.py
@@ -0,0 +1,76 @@
+"""
+Symbol of SqueezeNet
+
+Reference:
+Iandola, Forrest N., et al.
+"Squeezenet: Alexnet-level accuracy with 50x fewer parameters and< 0.5 mb model size." (2016).
+"""
+
+import mxnet as mx
+
+# Helpers
+def _make_fire(net, squeeze_channels, expand1x1_channels, expand3x3_channels):
+    net = _make_fire_conv(net, squeeze_channels, 1, 0)
+
+    left = _make_fire_conv(net, expand1x1_channels, 1, 0)
+    right = _make_fire_conv(net, expand3x3_channels, 3, 1)
+    # NOTE : Assume NCHW layout here
+    net = mx.sym.concat(left, right, dim=1)
+
+    return net
+
+def _make_fire_conv(net, channels, kernel_size, padding=0):
+    net = mx.sym.Convolution(net, num_filter=channels, kernel=(kernel_size, kernel_size),
+                             pad=(padding, padding))
+    net = mx.sym.Activation(net, act_type='relu')
+    return net
+
+# Net
+def get_symbol(num_classes=1000, version='1.0', **kwargs):
+    """Get symbol of SqueezeNet
+
+    Parameters
+    ----------
+    num_classes: int
+        The number of classification results
+
+    version : str, optional
+        "1.0" or "1.1" of SqueezeNet
+    """
+    assert version in ['1.0', '1.1'], ("Unsupported SqueezeNet version {version}:"
+                                       "1.0 or 1.1 expected".format(version=version))
+    net = mx.sym.Variable("data")
+    if version == '1.0':
+        net = mx.sym.Convolution(net, num_filter=96, kernel=(7, 7), stride=(2, 2), pad=(3, 3))
+        net = mx.sym.Activation(net, act_type='relu')
+        net = mx.sym.Pooling(data=net, kernel=(3, 3), pool_type='max', stride=(2, 2))
+        net = _make_fire(net, 16, 64, 64)
+        net = _make_fire(net, 16, 64, 64)
+        net = _make_fire(net, 32, 128, 128)
+        net = mx.sym.Pooling(data=net, kernel=(3, 3), pool_type='max', stride=(2, 2))
+        net = _make_fire(net, 32, 128, 128)
+        net = _make_fire(net, 48, 192, 192)
+        net = _make_fire(net, 48, 192, 192)
+        net = _make_fire(net, 64, 256, 256)
+        net = mx.sym.Pooling(data=net, kernel=(3, 3), pool_type='max', stride=(2, 2))
+        net = _make_fire(net, 64, 256, 256)
+    else:
+        net = mx.sym.Convolution(net, num_filter=64, kernel=(3, 3), stride=(2, 2), pad=(1, 1))
+        net = mx.sym.Activation(net, act_type='relu')
+        net = mx.sym.Pooling(data=net, kernel=(3, 3), pool_type='max', stride=(2, 2))
+        net = _make_fire(net, 16, 64, 64)
+        net = _make_fire(net, 16, 64, 64)
+        net = mx.sym.Pooling(data=net, kernel=(3, 3), pool_type='max',  stride=(2, 2))
+        net = _make_fire(net, 32, 128, 128)
+        net = _make_fire(net, 32, 128, 128)
+        net = mx.sym.Pooling(data=net, kernel=(3, 3), pool_type='max',  stride=(2, 2))
+        net = _make_fire(net, 48, 192, 192)
+        net = _make_fire(net, 48, 192, 192)
+        net = _make_fire(net, 64, 256, 256)
+        net = _make_fire(net, 64, 256, 256)
+    net = mx.sym.Dropout(net, p=0.5)
+    net = mx.sym.Convolution(net, num_filter=num_classes, kernel=(1, 1))
+    net = mx.sym.Activation(net, act_type='relu')
+    net = mx.sym.Pooling(data=net, global_pool=True, kernel=(13, 13), pool_type='avg')
+    net = mx.sym.flatten(net)
+    return mx.sym.softmax(net)
diff --git a/tests/python/relay/frontend/mxnet/model_zoo/vgg.py b/tests/python/relay/frontend/mxnet/model_zoo/vgg.py
new file mode 100644
index 000000000000..68215bb80aaa
--- /dev/null
+++ b/tests/python/relay/frontend/mxnet/model_zoo/vgg.py
@@ -0,0 +1,85 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""References:
+
+Simonyan, Karen, and Andrew Zisserman. "Very deep convolutional networks for
+large-scale image recognition." arXiv preprint arXiv:1409.1556 (2014).
+"""
+
+import mxnet as mx
+import numpy as np
+
+def get_feature(internel_layer, layers, filters, batch_norm = False, **kwargs):
+    for i, num in enumerate(layers):
+        for j in range(num):
+            internel_layer = mx.sym.Convolution(data = internel_layer, kernel=(3, 3), pad=(1, 1), num_filter=filters[i], name="conv%s_%s" %(i + 1, j + 1))
+            if batch_norm:
+                internel_layer = mx.symbol.BatchNorm(data=internel_layer, name="bn%s_%s" %(i + 1, j + 1))
+            internel_layer = mx.sym.Activation(data=internel_layer, act_type="relu", name="relu%s_%s" %(i + 1, j + 1))
+        internel_layer = mx.sym.Pooling(data=internel_layer, pool_type="max", kernel=(2, 2), stride=(2,2), name="pool%s" %(i + 1))
+    return internel_layer
+
+def get_classifier(input_data, num_classes, **kwargs):
+    flatten = mx.sym.Flatten(data=input_data, name="flatten")
+    try:
+        fc6 = mx.sym.FullyConnected(data=flatten, num_hidden=4096, name="fc6", flatten=False)
+        relu6 = mx.sym.Activation(data=fc6, act_type="relu", name="relu6")
+        drop6 = mx.sym.Dropout(data=relu6, p=0.5, name="drop6")
+        fc7 = mx.sym.FullyConnected(data=drop6, num_hidden=4096, name="fc7", flatten=False)
+        relu7 = mx.sym.Activation(data=fc7, act_type="relu", name="relu7")
+        drop7 = mx.sym.Dropout(data=relu7, p=0.5, name="drop7")
+        fc8 = mx.sym.FullyConnected(data=drop7, num_hidden=num_classes, name="fc8", flatten=False)
+    except:
+        fc6 = mx.sym.FullyConnected(data=flatten, num_hidden=4096, name="fc6")
+        relu6 = mx.sym.Activation(data=fc6, act_type="relu", name="relu6")
+        drop6 = mx.sym.Dropout(data=relu6, p=0.5, name="drop6")
+        fc7 = mx.sym.FullyConnected(data=drop6, num_hidden=4096, name="fc7")
+        relu7 = mx.sym.Activation(data=fc7, act_type="relu", name="relu7")
+        drop7 = mx.sym.Dropout(data=relu7, p=0.5, name="drop7")
+        fc8 = mx.sym.FullyConnected(data=drop7, num_hidden=num_classes, name="fc8")
+    return fc8
+
+def get_symbol(num_classes, num_layers=11, batch_norm=False, dtype='float32', **kwargs):
+    """
+    Parameters
+    ----------
+    num_classes : int, default 1000
+        Number of classification classes.
+    num_layers : int
+        Number of layers for the variant of densenet. Options are 11, 13, 16, 19.
+    batch_norm : bool, default False
+        Use batch normalization.
+    dtype: str, float32 or float16
+        Data precision.
+    """
+    vgg_spec = {11: ([1, 1, 2, 2, 2], [64, 128, 256, 512, 512]),
+                13: ([2, 2, 2, 2, 2], [64, 128, 256, 512, 512]),
+                16: ([2, 2, 3, 3, 3], [64, 128, 256, 512, 512]),
+                19: ([2, 2, 4, 4, 4], [64, 128, 256, 512, 512])}
+    if num_layers not in vgg_spec:
+        raise ValueError("Invalide num_layers {}. Possible choices are 11,13,16,19.".format(num_layers))
+    layers, filters = vgg_spec[num_layers]
+    data = mx.sym.Variable(name="data")
+    if dtype == 'float16':
+        data = mx.sym.Cast(data=data, dtype=np.float16)
+    feature = get_feature(data, layers, filters, batch_norm)
+    classifier = get_classifier(feature, num_classes)
+    if dtype == 'float16':
+        classifier = mx.sym.Cast(data=classifier, dtype=np.float32)
+    symbol = mx.sym.softmax(data=classifier, name='softmax')
+    return symbol
diff --git a/tests/python/relay/frontend/mxnet/test_forward.py b/tests/python/relay/frontend/mxnet/test_forward.py
new file mode 100644
index 000000000000..fcc760981ef5
--- /dev/null
+++ b/tests/python/relay/frontend/mxnet/test_forward.py
@@ -0,0 +1,206 @@
+import numpy as np
+
+import topi
+import tvm
+from tvm.contrib import graph_runtime
+from tvm import relay
+from tvm.relay.testing.config import ctx_list
+import mxnet as mx
+from mxnet import gluon
+from mxnet.gluon.model_zoo import vision
+import model_zoo
+
+
+def verify_mxnet_frontend_impl(mx_symbol, data_shape=(1, 3, 224, 224), out_shape=(1, 1000),
+                               gluon_impl=False, name=None, dtype='float32'):
+    """Use name different from test to avoid let nose pick it up"""
+    if gluon_impl:
+        def get_gluon_output(name, x):
+            net = vision.get_model(name)
+            net.collect_params().initialize(mx.init.Xavier())
+            net_sym = gluon.nn.SymbolBlock(outputs=net(mx.sym.var('data')),
+                                           inputs=mx.sym.var('data'),
+                                           params=net.collect_params())
+            out = net_sym(mx.nd.array(x.astype(dtype))).asnumpy()
+            return out, net_sym
+    else:
+        def get_mxnet_output(symbol, x, dtype='float32'):
+            from collections import namedtuple
+            Batch = namedtuple('Batch', ['data'])
+            mod = mx.mod.Module(symbol, label_names=None)
+            mod.bind(data_shapes=[('data', x.shape)], for_training=False)
+            mod.init_params()
+            mod.forward(Batch([mx.nd.array(x.astype(dtype))]))
+            out = mod.get_outputs()[0].asnumpy()
+            args, auxs = mod.get_params()
+            return out, args, auxs
+
+    def get_tvm_output(symbol, x, args, auxs, target, ctx, dtype='float32'):
+        dshape = x.shape
+        shape_dict = {'data': dshape}
+        if gluon_impl:
+            new_sym, params = relay.frontend.from_mxnet(symbol, shape_dict)
+        else:
+            new_sym, params = relay.frontend.from_mxnet(symbol, shape_dict, arg_params=args, aux_params=auxs)
+
+        with relay.build_config(opt_level=3):
+            graph, lib, params = relay.build(new_sym, target, params=params)
+        m = graph_runtime.create(graph, lib, ctx)
+        # set inputs
+        m.set_input("data", tvm.nd.array(x.astype(dtype)))
+        m.set_input(**params)
+        m.run()
+        # get outputs
+        out = m.get_output(0, tvm.nd.empty(out_shape, dtype))
+        return out.asnumpy()
+
+    # random input
+    x = np.random.uniform(size=data_shape)
+    if gluon_impl:
+        gluon_out, gluon_sym = get_gluon_output(name, x)
+        for target, ctx in ctx_list():
+            tvm_out = get_tvm_output(gluon_sym, x, None, None, target, ctx, dtype)
+            tvm.testing.assert_allclose(gluon_out, tvm_out, rtol=1e-5, atol=1e-5)
+    else:
+        mx_out, args, auxs = get_mxnet_output(mx_symbol, x, dtype)
+        assert "data" not in args
+        for target, ctx in ctx_list():
+            tvm_out = get_tvm_output(mx_symbol, x, args, auxs, target, ctx, dtype)
+            tvm.testing.assert_allclose(mx_out, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_forward_mlp():
+    mlp = model_zoo.mx_mlp
+    verify_mxnet_frontend_impl(mlp)
+
+def test_forward_vgg():
+    for n in [11]:
+        mx_sym = model_zoo.mx_vgg[n]
+        verify_mxnet_frontend_impl(mx_sym)
+
+def test_forward_resnet():
+    for n in [18]:
+        mx_sym = model_zoo.mx_resnet[n]
+        verify_mxnet_frontend_impl(mx_sym)
+
+def test_forward_elu():
+    data = mx.sym.var('data')
+    data = mx.sym.concat(data, -data, dim=1)  # negative part explicitly
+    mx_sym = mx.sym.LeakyReLU(data, act_type='elu')
+    verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100))
+
+def test_forward_rrelu():
+    data = mx.sym.var('data')
+    data = mx.sym.concat(data, -data, dim=1)  # negative part explicitly
+    mx_sym = mx.sym.LeakyReLU(data, act_type='rrelu', lower_bound=0.3, upper_bound=0.7)
+    verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100))
+
+def test_forward_prelu():
+    data = mx.sym.var('data')
+    data = mx.sym.concat(data, -data, dim=1)  # negative part explicitly
+    mx_sym = mx.sym.LeakyReLU(data, act_type='prelu')
+    verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100))
+
+def test_forward_softrelu():
+    data = mx.sym.var('data')
+    data = mx.sym.concat(data, -data, dim=1)  # negative part explicitly
+    mx_sym = mx.sym.Activation(data, act_type='softrelu')
+    verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100))
+
+def test_forward_fc_flatten():
+    # test flatten=True option in mxnet 0.11.1
+    data = mx.sym.var('data')
+    try:
+        mx_sym = mx.sym.FullyConnected(data, num_hidden=100, flatten=True)
+        verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 100))
+        mx_sym = mx.sym.FullyConnected(mx.sym.Flatten(data), num_hidden=100, flatten=False)
+        verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 100))
+    except:
+        pass
+
+def test_forward_clip():
+    data = mx.sym.var('data')
+    data = mx.sym.concat(data, -data, dim=1)  # negative part explicity
+    mx_sym = mx.sym.clip(data, a_min=0, a_max=1)
+    verify_mxnet_frontend_impl(mx_sym, (1, 3, 100, 100), (1, 6, 100, 100))
+
+def test_forward_split():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.split(data, axis=1, num_outputs=4, squeeze_axis=False)
+    verify_mxnet_frontend_impl(mx_sym, (1, 4, 2, 1), (1, 1, 2, 1))
+
+def test_forward_split_squeeze():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.split(data, axis=1, num_outputs=4, squeeze_axis=True)
+    verify_mxnet_frontend_impl(mx_sym, (1, 4, 2, 1), (1, 2, 1))
+
+def test_forward_expand_dims():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.expand_dims(data, axis=1)
+    verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 1, 3, 4))
+
+def test_forward_pooling():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.Pooling(data, kernel=(3, 3), pad=(1, 1), pool_type='avg')
+    verify_mxnet_frontend_impl(mx_sym, (1, 20, 8, 8), (1, 20, 8, 8))
+
+    mx_sym = mx.sym.Pooling(data, kernel=(3, 3), pad=(1, 1), pool_type='max')
+    verify_mxnet_frontend_impl(mx_sym, (1, 20, 8, 8), (1, 20, 8, 8))
+
+def test_forward_lrn():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.LRN(data, alpha=2, beta=2, knorm=1, nsize=5)
+    verify_mxnet_frontend_impl(mx_sym, (1, 10, 24, 24), (1, 10, 24, 24))
+
+def test_forward_ones():
+    data = mx.sym.var('data')
+    ones = mx.sym.ones(shape=(2, 3, 4), dtype='float32')
+    mx_sym = mx.sym.elemwise_add(data, ones)
+    verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4))
+    
+def test_forward_zeros():
+    data = mx.sym.var('data')
+    zeros = mx.sym.zeros(shape=(2, 3, 4), dtype='float32')
+    mx_sym = mx.sym.elemwise_add(data, zeros)
+    verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4))
+
+def test_forward_ones_like():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.ones_like(data, dtype='float32')
+    verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4))
+
+def test_forward_zeros_like():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.zeros_like(data, dtype='float32')
+    verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4))
+
+def test_forward_argmax():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.argmax(data, axis=1)
+    verify_mxnet_frontend_impl(mx_sym, (5, 3), (5,))
+
+def test_forward_argmin():
+    data = mx.sym.var('data')
+    mx_sym = mx.sym.argmin(data, axis=0)
+    verify_mxnet_frontend_impl(mx_sym, (5, 4), (4,))
+    
+if __name__ == '__main__':
+    test_forward_mlp()
+    test_forward_vgg()
+    test_forward_resnet()
+    test_forward_elu()
+    test_forward_rrelu()
+    test_forward_prelu()
+    test_forward_softrelu()
+    test_forward_fc_flatten()
+    test_forward_clip()
+    test_forward_split()
+    test_forward_split_squeeze()
+    test_forward_expand_dims()
+    test_forward_pooling()
+    test_forward_lrn()
+    test_forward_ones()
+    test_forward_zeros()
+    test_forward_ones_like()
+    test_forward_zeros_like()
+    test_forward_argmax()
+    test_forward_argmin()
diff --git a/tests/python/relay/frontend/mxnet/test_graph.py b/tests/python/relay/frontend/mxnet/test_graph.py
new file mode 100644
index 000000000000..820e78242808
--- /dev/null
+++ b/tests/python/relay/frontend/mxnet/test_graph.py
@@ -0,0 +1,87 @@
+import mxnet as mx
+import tvm
+from tvm import relay
+import model_zoo
+from model_zoo import _batch
+
+def test_mlp():
+    mx_sym = model_zoo.mx_mlp
+    from_mx_sym, _ = relay.frontend.from_mxnet(mx_sym, {'data': (_batch, 1, 28, 28)})
+    from_mx_sym = relay.ir_pass.infer_type(from_mx_sym)
+    relay_sym = model_zoo.relay_mlp
+    assert relay.ir_pass.alpha_equal(from_mx_sym, relay_sym)
+
+def test_vgg():
+    for n in [11, 13, 16, 19]:
+        mx_sym = model_zoo.mx_vgg[n]
+        from_mx_sym, _ = relay.frontend.from_mxnet(mx_sym, {'data': (_batch, 3, 224, 224)})
+        from_mx_sym = relay.ir_pass.infer_type(from_mx_sym)
+        relay_sym = model_zoo.relay_vgg[n]
+        assert relay.ir_pass.alpha_equal(from_mx_sym, relay_sym)
+
+def test_resnet():
+    for n in [18, 34, 50, 101, 152, 200, 269]:
+        mx_sym = model_zoo.mx_resnet[n]
+        from_mx_sym, _ = relay.frontend.from_mxnet(mx_sym, {'data': (_batch, 3, 224, 224)})
+        from_mx_sym = relay.ir_pass.infer_type(from_mx_sym)
+        relay_sym = model_zoo.relay_resnet[n]
+        assert relay.ir_pass.alpha_equal(from_mx_sym, relay_sym)
+
+def test_squeezenet():
+    for version in ['1.0', '1.1']:
+        mx_sym = model_zoo.mx_squeezenet[version]
+        from_mx_sym, _ = relay.frontend.from_mxnet(mx_sym, {'data': (_batch, 3, 224, 224)})
+        from_mx_sym = relay.ir_pass.infer_type(from_mx_sym)
+        relay_sym = model_zoo.relay_squeezenet[version]
+        assert relay.ir_pass.alpha_equal(from_mx_sym, relay_sym)
+
+def test_inception_v3():
+    mx_sym = model_zoo.mx_inception_v3
+    from_mx_sym, _ = relay.frontend.from_mxnet(mx_sym, {'data': (_batch, 3, 299, 299)})
+    from_mx_sym = relay.ir_pass.infer_type(from_mx_sym)
+    relay_sym = model_zoo.relay_inception_v3
+    assert relay.ir_pass.alpha_equal(from_mx_sym, relay_sym)
+
+def test_dqn():
+    mx_sym = model_zoo.mx_dqn
+    from_mx_sym, _ = relay.frontend.from_mxnet(mx_sym, {'data': (_batch, 4, 84, 84)})
+    from_mx_sym = relay.ir_pass.infer_type(from_mx_sym)
+    relay_sym = model_zoo.relay_dqn
+    assert relay.ir_pass.alpha_equal(from_mx_sym, relay_sym)
+
+def test_dcgan():
+    mx_sym = model_zoo.mx_dcgan
+    from_mx_sym, _ = relay.frontend.from_mxnet(mx_sym, {'data': (_batch, 100)})
+    from_mx_sym = relay.ir_pass.infer_type(from_mx_sym)
+    relay_sym = model_zoo.relay_dcgan
+    assert relay.ir_pass.alpha_equal(from_mx_sym, relay_sym)
+
+def test_multi_outputs():
+    def compose_mxnet(**kwargs):
+        x = mx.sym.Variable('x')
+        y = mx.sym.Variable('y')
+        z = mx.sym.split(x, **kwargs)
+        return mx.sym.broadcast_sub(mx.sym.broadcast_add(z[0], z[2]), y)
+    def compose_relay(**kwargs):
+        x = relay.var("x", shape=(_batch, 3, 224, 224))
+        y = relay.var("y", shape=(1,))
+        z = relay.split(x, **kwargs)
+        ret = z[0] + z[2] - y
+        args = relay.ir_pass.free_vars(ret)
+        return relay.Function(args, ret)
+    mx_sym = compose_mxnet(num_outputs=3, axis=1)
+    from_mx_sym, _ = relay.frontend.from_mxnet(mx_sym, {'x': (_batch, 3, 224, 224), 'y': (1,)})
+    from_mx_sym = relay.ir_pass.infer_type(from_mx_sym)
+    relay_sym = compose_relay(indices_or_sections=3, axis=1)
+    relay_sym = relay.ir_pass.infer_type(relay_sym)
+    assert relay.ir_pass.alpha_equal(from_mx_sym, relay_sym)
+
+if __name__ == '__main__':
+    test_mlp()
+    test_vgg()
+    test_resnet()
+    test_squeezenet()
+    test_inception_v3()
+    test_dqn()
+    test_dcgan()
+    test_multi_outputs()
diff --git a/tests/python/relay/frontend/test_keras.py b/tests/python/relay/frontend/test_keras.py
new file mode 100644
index 000000000000..f508c5b44310
--- /dev/null
+++ b/tests/python/relay/frontend/test_keras.py
@@ -0,0 +1,332 @@
+import numpy as np
+import nnvm
+from nnvm import to_relay
+import tvm
+from tvm import relay
+from tvm.contrib import graph_runtime
+from nnvm.testing.config import ctx_list
+import keras
+
+# prevent keras from using up all gpu memory
+import tensorflow as tf
+from keras.backend.tensorflow_backend import set_session
+config = tf.ConfigProto()
+config.gpu_options.per_process_gpu_memory_fraction = 0.5
+set_session(tf.Session(config=config))
+
+
+def verify_keras_frontend(keras_model, need_transpose=True):
+    # Keras frontend currently supports tensorflow backend only.
+    assert(keras.backend.backend() == 'tensorflow')
+
+    in_shapes = []
+    for layer in keras_model._input_layers:
+        in_shapes.append(tuple(dim.value if dim.value is not None else 1 for dim in layer.input.shape))
+
+    def get_keras_output(xs, dtype='float32'):
+        return keras_model.predict(xs)
+
+    def get_tvm_output(xs, target, ctx, dtype='float32'):
+        sym, params = nnvm.frontend.from_keras(keras_model)
+        shape_dict = {name: x.shape for (name, x) in zip(keras_model.input_names, xs)}
+        with relay.build_module.build_config(opt_level=2):
+            func, params = to_relay.to_relay(sym, shape_dict, dtype, params)
+            graph, lib, params = relay.build(func, target='llvm', params=params)
+        m = graph_runtime.create(graph, lib, ctx)
+        for name, x in zip(keras_model.input_names, xs):
+            m.set_input(name, tvm.nd.array(x.astype(dtype)))
+        m.set_input(**params)
+        m.run()
+
+        return [m.get_output(i).asnumpy() for i in range(m.get_num_outputs())]
+
+    def to_channels_first(arr):
+        return arr.transpose([0, -1] + list(range(1, arr.ndim - 1)))
+
+    def to_channels_last(arr):
+        return arr.transpose([0] + list(range(2, arr.ndim)) + [1])
+
+    xs = [np.random.uniform(size=shape, low=-1.0, high=1.0) for shape in in_shapes]
+    keras_out = get_keras_output(xs)
+
+    keras_out = keras_out if isinstance(keras_out, list) else [keras_out]
+    for target, ctx in ctx_list():
+        inputs = [to_channels_first(x) for x in xs] if need_transpose else xs
+        tvm_out = get_tvm_output(inputs, target, ctx)
+        for kout, tout in zip(keras_out, tvm_out):
+            if need_transpose:
+                tout = to_channels_last(tout)
+            tvm.testing.assert_allclose(kout, tout, rtol=1e-5, atol=1e-5)
+
+def test_forward_elemwise_add():
+    r = []
+    data = keras.layers.Input(shape=(32,32,3))
+    x = keras.layers.Conv2D(8, (3, 3), padding="same")(data)
+    r.append(x)
+    x = keras.layers.Conv2D(8, (3, 3), padding="same")(x)
+    r.append(x)
+    x = keras.layers.Conv2D(8, (3, 3), padding="same")(x)
+    # add two symbols
+    y = keras.layers.add([keras.layers.add([x, r[0]]), r[1]])
+    y = keras.layers.GlobalAveragePooling2D()(y)
+    keras_model = keras.models.Model(data, y)
+    verify_keras_frontend(keras_model)
+    # add three symbols
+    y = keras.layers.add([x, r[0], r[1]])
+    y = keras.layers.GlobalAveragePooling2D()(y)
+    keras_model = keras.models.Model(data, y)
+    verify_keras_frontend(keras_model)
+
+
+def test_forward_dense():
+    data = keras.layers.Input(shape=(32,32,1))
+    x = keras.layers.Flatten()(data)
+    x = keras.layers.Dropout(0.5)(x)
+    x = keras.layers.Dense(10, activation='relu', kernel_initializer='uniform')(x)
+    keras_model = keras.models.Model(data, x)
+    verify_keras_frontend(keras_model)
+
+
+def test_forward_pool():
+    data = keras.layers.Input(shape=(32,32,1))
+    # maxpool
+    x = keras.layers.MaxPooling2D((3, 3), strides=(1, 1), padding='same')(data)
+    keras_model = keras.models.Model(data, x)
+    verify_keras_frontend(keras_model)
+    # avgpool
+    y = keras.layers.AveragePooling2D((3, 3), strides=(1, 1), padding='same')(data)
+    keras_model = keras.models.Model(data, y)
+    verify_keras_frontend(keras_model)
+
+
+def test_forward_conv():
+    data = keras.layers.Input(shape=(32,32,3))
+    conv_funcs = [keras.layers.Conv2D(filters=10, kernel_size=(3,3),
+                                      strides=(2,2), padding='same'),
+                  keras.layers.Conv2D(filters=10, kernel_size=(3,3),
+                                      dilation_rate=(2,2), padding='same'),
+                  keras.layers.DepthwiseConv2D(kernel_size=(3,3), padding='same'),
+                  keras.layers.Conv2DTranspose(filters=10, kernel_size=(3,3), padding='valid'),
+                  keras.layers.SeparableConv2D(filters=10, kernel_size=(3,3), padding='same')]
+    for conv_func in conv_funcs:
+        x = conv_func(data)
+        keras_model = keras.models.Model(data, x)
+        verify_keras_frontend(keras_model)
+
+
+def test_forward_upsample():
+    data = keras.layers.Input(shape=(32,32,3))
+    x = keras.layers.UpSampling2D(size=(3,3))(data)
+    keras_model = keras.models.Model(data, x)
+    verify_keras_frontend(keras_model)
+
+
+def test_forward_reshape():
+    data = keras.layers.Input(shape=(32,32,3))
+    x = keras.layers.Reshape(target_shape=(32,32,3))(data)
+    keras_model = keras.models.Model(data, x)
+    verify_keras_frontend(keras_model)
+
+
+def test_forward_crop():
+    data = keras.layers.Input(shape=(32,32,3))
+    x = keras.layers.Cropping2D(cropping=((1, 1), (1, 1)))(data)
+    x = keras.layers.Cropping2D(cropping=(1, 1))(x)
+    x = keras.layers.Cropping2D(cropping=1)(x)
+    x = keras.layers.Cropping2D(cropping=((0, 1), (1, 0)))(x)
+    x = keras.layers.Cropping2D(cropping=(1, 0))(x)
+    x = keras.layers.Cropping2D(cropping=0)(x)
+    x = keras.layers.Add()([x, x])
+    keras_model = keras.models.Model(data, x)
+    verify_keras_frontend(keras_model)
+
+
+def test_forward_vgg16():
+    keras_model = keras.applications.vgg16.VGG16(include_top=True, weights='imagenet',
+        input_shape=(224,224,3), classes=1000)
+    verify_keras_frontend(keras_model)
+
+
+def test_forward_xception():
+    keras_model = keras.applications.xception.Xception(include_top=True, weights='imagenet',
+        input_shape=(299,299,3), classes=1000)
+    verify_keras_frontend(keras_model)
+
+
+def test_forward_resnet50():
+    keras_model = keras.applications.resnet50.ResNet50(include_top=True, weights='imagenet',
+        input_shape=(224,224,3), classes=1000)
+    verify_keras_frontend(keras_model)
+
+
+def test_forward_mobilenet():
+    keras_model = keras.applications.mobilenet.MobileNet(include_top=True, weights='imagenet',
+        input_shape=(224,224,3), classes=1000)
+    verify_keras_frontend(keras_model)
+
+
+def test_forward_activations():
+    data = keras.layers.Input(shape=(32,32,3))
+    weights = np.random.rand(1, 32, 32, 3)
+    act_funcs = [keras.layers.Activation('softmax'),
+                 keras.layers.Activation('softplus'),
+                 keras.layers.ReLU(),
+                 keras.layers.ReLU(max_value=6.),
+                 keras.layers.LeakyReLU(alpha=0.3),
+                 keras.layers.PReLU(weights=weights, alpha_initializer="zero"),
+                 keras.layers.ELU(alpha=0.5),
+                 keras.layers.Activation('selu'),
+                 keras.layers.ThresholdedReLU(theta=0.5),
+                 keras.layers.Activation('softsign'),
+                 keras.layers.Activation('hard_sigmoid'),
+                 keras.layers.Activation('sigmoid'),
+                 keras.layers.Activation('tanh'),
+                 keras.layers.Activation('linear')]
+    for act_func in act_funcs:
+        x = act_func(data)
+        keras_model = keras.models.Model(data, x)
+        verify_keras_frontend(keras_model)
+
+
+def test_forward_multi_inputs():
+    data1 = keras.layers.Input(shape=(32,32,3))
+    data2 = keras.layers.Input(shape=(32,32,3))
+    x = keras.layers.Conv2D(8, (3, 3), padding="same")(data1)
+    y = keras.layers.Conv2D(8, (3, 3), padding="same")(data2)
+    z = keras.layers.add([x, y])
+    z = keras.layers.GlobalAveragePooling2D()(z)
+    keras_model = keras.models.Model([data1, data2], z)
+    verify_keras_frontend(keras_model)
+
+
+def test_forward_multi_outputs():
+    data = keras.layers.Input(shape=(32,32,3))
+    x = keras.layers.Conv2D(8, (3, 3), padding="same")(data)
+    x = keras.layers.GlobalAveragePooling2D()(x)
+    y = keras.layers.Conv2D(8, (3, 3), padding="same")(data)
+    y = keras.layers.GlobalAveragePooling2D()(y)
+    keras_model = keras.models.Model(data, [x, y])
+    verify_keras_frontend(keras_model)
+
+
+def test_forward_reuse_layers():
+    # reuse conv2d
+    data = keras.layers.Input(shape=(32,32,3))
+    conv2d = keras.layers.Conv2D(8, (3, 3), padding="same")
+    x = conv2d(data)
+    y = conv2d(data)
+    z = keras.layers.add([x, y])
+    z = keras.layers.GlobalAveragePooling2D()(z)
+    keras_model = keras.models.Model(data, z)
+    verify_keras_frontend(keras_model)
+
+    # reuse add
+    data = keras.layers.Input(shape=(32,32,3))
+    x = keras.layers.Conv2D(8, (3, 3), padding="same")(data)
+    add = keras.layers.Add()
+    x = add([x, x])
+    x = add([x, x])
+    z = keras.layers.GlobalAveragePooling2D()(x)
+    keras_model = keras.models.Model(data, z)
+    verify_keras_frontend(keras_model)
+
+def _test_LSTM(inputs, hidden, return_state=True):
+    data = keras.layers.Input(shape=(1, inputs))
+    lstm_out = keras.layers.LSTM(hidden,
+                                 return_state=return_state,
+                                 recurrent_activation='sigmoid',
+                                 activation='tanh')
+    x = lstm_out(data)
+    keras_model = keras.models.Model(data, x)
+    verify_keras_frontend(keras_model, need_transpose=False)
+
+def _test_LSTM_MultiLayer(inputs, hidden):
+    inputs = keras.layers.Input(shape=(1, inputs))
+    layer = keras.layers.LSTM(hidden, return_state=True, return_sequences=True,
+                                 recurrent_activation='sigmoid',
+                                 activation='tanh')
+    outputs = layer(inputs)
+    output, state = outputs[0], outputs[1:]
+    output = keras.layers.LSTM(hidden, recurrent_activation='sigmoid',
+                               activation='tanh')(output, initial_state=state)
+    keras_model = keras.models.Model(inputs, output)
+    verify_keras_frontend(keras_model, need_transpose=False)
+
+
+def test_forward_LSTM():
+    # TODO(@jroesch): need to modify compile engine to fix return_state=True
+    _test_LSTM(8, 8, return_state=False)
+    _test_LSTM(4, 4, return_state=False)
+    _test_LSTM_MultiLayer(4, 4)
+
+def _test_RNN(inputs, units):
+    data = keras.layers.Input(shape=(1, inputs))
+    rnn_out = keras.layers.SimpleRNN(units, return_state=True,
+                                 activation='tanh')
+    x = rnn_out(data)
+    keras_model = keras.models.Model(data, x)
+    verify_keras_frontend(keras_model, need_transpose=False)
+
+def _test_RNN_MultiLayer(inputs, units):
+    inputs = keras.layers.Input(shape=(1, inputs))
+    layer = keras.layers.SimpleRNN(units, return_state=True, return_sequences=True,
+                                   activation='tanh')
+    outputs = layer(inputs)
+    output, state = outputs[0], outputs[1:]
+    output = keras.layers.SimpleRNN(units, activation='tanh')(output, initial_state=state)
+    keras_model = keras.models.Model(inputs, output)
+    verify_keras_frontend(keras_model, need_transpose=False)
+
+def test_forward_RNN():
+    _test_RNN(2, 4)
+    _test_RNN(4, 3)
+    _test_RNN_MultiLayer(4, 12)
+
+def _test_GRU(inputs, units):
+    data = keras.layers.Input(shape=(1, inputs))
+    gru_out = keras.layers.GRU(units,
+                               return_state=True,
+                               recurrent_activation='sigmoid',
+                               activation='tanh')
+    x = gru_out(data)
+    keras_model = keras.models.Model(data, x)
+    verify_keras_frontend(keras_model, need_transpose=False)
+
+def _test_GRU_MultiLayer(inputs, units):
+    inputs = keras.layers.Input(shape=(1, inputs))
+    layer = keras.layers.GRU(units,
+                             return_state=True,
+                             return_sequences=True,
+                             recurrent_activation='sigmoid',
+                             activation='tanh')
+    outputs = layer(inputs)
+    output, state = outputs[0], outputs[1:]
+    output = keras.layers.GRU(units, recurrent_activation='sigmoid',
+                              activation='tanh')(output, initial_state=state)
+    keras_model = keras.models.Model(inputs, output)
+    verify_keras_frontend(keras_model, need_transpose=False)
+
+def test_forward_GRU():
+    _test_GRU(2, 4)
+    _test_GRU(4, 3)
+    _test_GRU_MultiLayer(4, 4)
+
+if __name__ == '__main__':
+    test_forward_elemwise_add()
+    test_forward_activations()
+    test_forward_dense()
+    test_forward_pool()
+    test_forward_conv()
+    test_forward_upsample()
+    test_forward_reshape()
+    test_forward_crop()
+    test_forward_vgg16()
+    test_forward_xception()
+    test_forward_resnet50()
+    test_forward_mobilenet()
+    test_forward_multi_inputs()
+    test_forward_multi_outputs()
+    test_forward_reuse_layers()
+    test_forward_LSTM()
+    test_forward_RNN()
+    test_forward_GRU()
diff --git a/tests/python/relay/test_autotvm_task_extraction.py b/tests/python/relay/test_autotvm_task_extraction.py
new file mode 100644
index 000000000000..8c93e4a56642
--- /dev/null
+++ b/tests/python/relay/test_autotvm_task_extraction.py
@@ -0,0 +1,56 @@
+"""Test task extraction for autotvm"""
+import tvm.relay.testing
+from tvm import relay
+from tvm import autotvm
+
+def get_network(name, batch_size):
+    """Get the symbol definition and random weight of a network"""
+    input_shape = (batch_size, 3, 224, 224)
+
+    if name == 'resnet-18':
+        net, params = relay.testing.resnet.get_workload(num_layers=18, batch_size=batch_size)
+    elif name == 'mobilenet':
+        net, params = relay.testing.mobilenet.get_workload(batch_size=batch_size)
+    elif name == 'dcgan':
+        net, params = relay.testing.dcgan.get_workload(batch_size=batch_size)
+        input_shape = (batch_size, 100)
+    else:
+        raise ValueError("Unsupported network: " + name)
+
+    return net, params, input_shape
+
+def test_task_extraction():
+    target = 'llvm'
+
+    net, params, input_shape = get_network('resnet-18', batch_size=1)
+    tasks = autotvm.task.extract_from_program(net, target=target,
+                                            params=params,
+                                            ops=(relay.op.nn.conv2d,))
+    assert len(tasks) == 12
+
+    net, params, input_shape = get_network('resnet-18', batch_size=1)
+    tasks = autotvm.task.extract_from_program(net, target=target,
+                                            params=params,
+                                            ops=(relay.op.nn.dense,))
+    assert len(tasks) == 1
+
+    net, params, input_shape = get_network('resnet-18', batch_size=1)
+    tasks = autotvm.task.extract_from_program(net, target=target,
+                                            params=params,
+                                            ops=(relay.op.nn.conv2d, relay.op.nn.dense))
+    assert len(tasks) == 13
+
+    net, params, input_shape = get_network('mobilenet', batch_size=1)
+    tasks = autotvm.task.extract_from_program(net, target=target,
+                                            params=params,
+                                            ops=(relay.op.nn.conv2d, relay.op.nn.dense))
+    assert len(tasks) == 20
+
+    net, params, input_shape = get_network('dcgan', batch_size=1)
+    tasks = autotvm.task.extract_from_program(net, target=target,
+                                            params=params,
+                                            ops=(relay.op.nn.conv2d_transpose,))
+    assert len(tasks) == 4
+
+if __name__ == '__main__':
+    test_task_extraction()
diff --git a/tests/python/relay/test_backend_compile_engine.py b/tests/python/relay/test_backend_compile_engine.py
new file mode 100644
index 000000000000..568d7849e7ee
--- /dev/null
+++ b/tests/python/relay/test_backend_compile_engine.py
@@ -0,0 +1,38 @@
+import tvm
+import tvm.testing
+import numpy as np
+from tvm import relay
+
+
+def test_compile_engine():
+    engine = relay.backend.compile_engine.get()
+    def get_func(shape):
+        x = relay.var("x", shape=shape)
+        y = relay.add(x, x)
+        z = relay.add(y, x)
+        f = relay.ir_pass.infer_type(relay.Function([x], z))
+        return f
+    z1 = engine.lower(get_func((10,)), "llvm")
+    z2 = engine.lower(get_func((10,)), "llvm")
+    z3 = engine.lower(get_func(()), "llvm")
+    assert z1.same_as(z2)
+    assert not z3.same_as(z1)
+    if tvm.context("cuda").exist:
+        z4 = engine.lower(get_func(()), "cuda")
+        assert not z3.same_as(z4)
+
+    # Test JIT target
+    for target in ["llvm"]:
+        ctx = tvm.context(target)
+        if ctx.exist:
+            f = engine.jit(get_func((10,)), target)
+            x = tvm.nd.array(np.ones(10).astype("float32"), ctx=ctx)
+            y = tvm.nd.empty((10,), ctx=ctx)
+            f(x, y)
+            tvm.testing.assert_allclose(
+                y.asnumpy(), x.asnumpy() * 3)
+    engine.dump()
+
+
+if __name__ == "__main__":
+    test_compile_engine()
diff --git a/tests/python/relay/test_backend_graph_runtime.py b/tests/python/relay/test_backend_graph_runtime.py
new file mode 100644
index 000000000000..59970dee38f9
--- /dev/null
+++ b/tests/python/relay/test_backend_graph_runtime.py
@@ -0,0 +1,130 @@
+import numpy as np
+
+import tvm
+from tvm import relay
+from tvm.contrib import graph_runtime
+from tvm.relay.ir_pass import infer_type
+from tvm.relay.scope_builder import ScopeBuilder
+from tvm.relay.op import add
+from tvm.relay.module import Module
+
+# @tq, @jr should we put this in testing ns?
+def check_rts(expr, args, expected_result, mod=None):
+    """
+    Check that evaluating `expr` applied to the arguments produces
+    `result` on both the evaluator and TVM runtime.
+
+    Parameters
+    ----------
+    expr:
+        The expression to evaluate
+
+    args: list of Expr
+        The arguments to supply the expr.
+
+    expected_result:
+        The expected result of running the expression.
+    """
+    intrp = relay.create_executor('debug', mod=mod)
+    graph = relay.create_executor('graph', mod=mod)
+    eval_result = intrp.evaluate(expr)(*args)
+    rts_result = graph.evaluate(expr)(*args)
+    tvm.testing.assert_allclose(eval_result.asnumpy(), rts_result.asnumpy())
+    tvm.testing.assert_allclose(eval_result.asnumpy(), expected_result)
+
+def test_add_op_scalar():
+    """
+    Program:
+        fn (x, y) {
+            return x + y;
+        }
+    """
+    x = relay.var('x', shape=())
+    y = relay.var('y', shape=())
+    func = relay.Function([x, y], add(x, y))
+    x_data = np.array(10.0, dtype='float32')
+    y_data = np.array(1.0, dtype='float32')
+    check_rts(func, [x_data, y_data], x_data + y_data)
+
+def test_add_op_tensor():
+    """
+    Program:
+        fn (x, y) {
+            return x + y;
+        }
+    """
+    x = relay.var('x', shape=(10, 5))
+    y = relay.var('y', shape=(10, 5))
+    func = relay.Function([x, y], add(x, y))
+    x_data = np.random.rand(10, 5).astype('float32')
+    y_data = np.random.rand(10, 5).astype('float32')
+    check_rts(func, [x_data, y_data], x_data + y_data)
+
+def test_add_op_broadcast():
+    """
+    Program:
+        fn (x, y) {
+            return x + y;
+        }
+    """
+    x = relay.var('x', shape=(10, 5))
+    y = relay.var('y', shape=(1, 5))
+    func = relay.Function([x, y], add(x, y))
+    x_data = np.random.rand(10, 5).astype('float32')
+    y_data = np.random.rand(1, 5).astype('float32')
+    check_rts(func, [x_data, y_data], x_data + y_data)
+
+
+def test_with_params():
+    x = relay.var('x', shape=(10, 5))
+    y = relay.var('y', shape=(1, 5))
+    z = relay.add(x, y)
+    z = relay.exp(z)
+    func = relay.Function([x, y], z)
+    x_data = np.random.rand(10, 5).astype('float32')
+    y_data = np.random.rand(1, 5).astype('float32')
+    params = {"y": y_data}
+    graph, lib, params = relay.build(func, "llvm", params=params)
+    mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
+    mod.set_input(**params)
+    mod.set_input(x=x_data)
+    mod.run()
+    res = mod.get_output(0).asnumpy()
+    ref_res = np.exp(y_data + x_data)
+    tvm.testing.assert_allclose(res, ref_res)
+
+
+def test_plan_memory():
+    # it is sufficient to cycle through two memories.
+
+    x = relay.var("x", shape=(10,))
+    y = relay.var("x", shape=(1,))
+    y2 = relay.exp(y)
+    z = relay.add(x, y2)
+    z = relay.exp(z)
+    z = relay.exp(z)
+    z = relay.exp(z)
+    z = relay.exp(z)
+    z = relay.exp(z)
+    func = relay.Function([x, y], z)
+    func = relay.ir_pass.infer_type(func)
+    func = relay.ir_pass.fuse_ops(func, opt_level=0)
+    func = relay.ir_pass.infer_type(func)
+    smap = relay.backend._backend.GraphPlanMemory(func)
+    storage_ids = set()
+    for k, v in smap.items():
+        for x in v:
+            storage_ids.add(x.value)
+
+    # Current rule requires vars have unique storage id
+    # because we don't do inplace, we will need another
+    # two alternating temporary space.
+    assert len(storage_ids) == 4
+
+
+if __name__ == "__main__":
+    test_plan_memory()
+    test_with_params()
+    test_add_op_scalar()
+    test_add_op_tensor()
+    test_add_op_broadcast()
diff --git a/tests/python/relay/test_backend_interpreter.py b/tests/python/relay/test_backend_interpreter.py
new file mode 100644
index 000000000000..f53f27192b9e
--- /dev/null
+++ b/tests/python/relay/test_backend_interpreter.py
@@ -0,0 +1,128 @@
+import numpy as np
+import tvm
+import tvm.testing
+from tvm import relay
+from tvm.relay.backend.interpreter import Value, TupleValue
+from tvm.relay.scope_builder import ScopeBuilder
+from tvm.relay import testing, create_executor
+
+
+def check_eval(expr, args, expected_result, mod=None, rtol=1e-07):
+    # TODO(tqchen) add more types once the schedule register is fixed.
+    for target in ["llvm"]:
+        ctx = tvm.context(target, 0)
+        if not ctx.exist:
+            return
+        intrp = create_executor(mod=mod, ctx=ctx, target=target)
+        result = intrp.evaluate(expr)(*args)
+        # use tvm.testing which also set atol
+        tvm.testing.assert_allclose(
+            result.asnumpy(), expected_result, rtol=rtol)
+
+
+def test_from_scalar():
+    np.testing.assert_allclose(Value.from_scalar(1, 'int32').asnumpy(), 1)
+    np.testing.assert_allclose(Value.from_scalar(10.0, 'float32').asnumpy(), 10.0)
+    np.testing.assert_allclose(Value.from_scalar(True).asnumpy(), True)
+
+
+def test_tuple_value():
+    tv = TupleValue(Value.from_scalar(
+        1), Value.from_scalar(2), Value.from_scalar(3))
+    np.testing.assert_allclose(tv[0].asnumpy(), 1)
+    np.testing.assert_allclose(tv[1].asnumpy(), 2)
+    np.testing.assert_allclose(tv[2].asnumpy(), 3)
+
+
+def test_id():
+    x = relay.var('x', 'float32')
+    ident = relay.Function([x], x)
+    check_eval(ident, [1.0], 1.0)
+
+
+def test_add_const():
+    two = relay.add(relay.const(1), relay.const(1))
+    func = relay.Function([], two)
+    check_eval(func, [], 2)
+
+
+def test_mul_param():
+    x = relay.var('x', shape=(10, 10))
+    y = relay.var('y', shape=(1, 10))
+    func = relay.Function([x, y], relay.multiply(x, y))
+    x_data = np.random.rand(10, 10).astype('float32')
+    y_data = np.random.rand(1, 10).astype('float32')
+    check_eval(func, [x_data, y_data], x_data * y_data)
+
+
+def test_equal():
+    i = relay.var('i', shape=[], dtype='int32')
+    j = relay.var('i', shape=[], dtype='int32')
+    z = relay.equal(i, j)
+    func = relay.Function([i, j], z, ret_type=relay.TensorType([], 'bool'))
+    i_data = relay.const(0)
+    j_data = relay.const(0)
+    check_eval(func, [i_data, j_data], True)
+
+
+def test_subtract():
+    i = relay.var('i', shape=[], dtype='int32')
+    sub = relay.subtract(i, relay.const(1, dtype='int32'))
+    func = relay.Function([i], sub, ret_type=relay.TensorType([], 'int32'))
+    i_data = np.array(1, dtype='int32')
+    check_eval(func, [i_data], 0)
+
+
+def test_simple_loop():
+    mod = relay.module.Module({})
+    sum_up = relay.GlobalVar('sum_up')
+    i = relay.var('i', shape=[], dtype='int32')
+    sb = ScopeBuilder()
+    with sb.if_scope(relay.equal(i, relay.const(0, dtype='int32'))):
+        sb.ret(i)
+    with sb.else_scope():
+        one_less = relay.subtract(i, relay.const(1, dtype='int32'))
+        rec_call = relay.Call(sum_up, [one_less])
+        sb.ret(relay.add(rec_call, i))
+    func = relay.Function([i], sb.get(), ret_type=relay.TensorType([], 'int32'))
+    mod[sum_up] = func
+    i_data = np.array(10, dtype='int32')
+    check_eval(sum_up, [i_data], sum(range(1, 11)), mod=mod)
+
+
+def test_loop():
+    mod = relay.module.Module({})
+    sum_up = relay.GlobalVar('sum_up')
+    i = relay.var('i', shape=[], dtype='int32')
+    accum = relay.var('accum', shape=[], dtype='int32')
+    sb = ScopeBuilder()
+    with sb.if_scope(relay.equal(i, relay.const(0))):
+        sb.ret(accum)
+    with sb.else_scope():
+        one_less = relay.subtract(i, relay.const(1))
+        new_accum = relay.add(accum, i)
+        sb.ret(relay.Call(sum_up, [one_less, new_accum]))
+    func = relay.Function([i, accum], sb.get())
+    mod[sum_up] = func
+    i_data = np.array(10, dtype='int32')
+    accum_data = np.array(0, dtype='int32')
+    check_eval(sum_up, [i_data, accum_data], sum(range(1, 11)), mod=mod)
+
+
+def test_binds():
+    x = relay.var("x")
+    y = relay.add(x, x)
+    intrp = create_executor("debug")
+    xx = np.ones((10, 20))
+    res = intrp.evaluate(y, binds={x: xx}).asnumpy()
+    tvm.testing.assert_allclose(xx + xx, res)
+
+
+if __name__ == "__main__":
+    test_id()
+    test_add_const()
+    test_equal()
+    test_subtract()
+    test_simple_loop()
+    test_loop()
+    test_binds()
diff --git a/tests/python/relay/test_debug.py b/tests/python/relay/test_debug.py
new file mode 100644
index 000000000000..3463e2916147
--- /dev/null
+++ b/tests/python/relay/test_debug.py
@@ -0,0 +1,32 @@
+from tvm.relay import var, const, create_executor
+from tvm.relay.op import debug
+
+
+_test_debug_hit = False
+
+def test_debug():
+    global _test_debug_hit
+    ex = create_executor()
+    x = var('x', shape=(), dtype='int32')
+    _test_debug_hit = False
+    def did_exec(x):
+        global _test_debug_hit
+        _test_debug_hit = True
+    prog = debug(x, debug_func=did_exec)
+    result = ex.evaluate(prog, { x: const(1) })
+    assert _test_debug_hit
+    assert result.asnumpy() == 1
+
+def test_debug_with_expr():
+    global _test_debug_hit
+    _test_debug_hit = False
+    ex = create_executor()
+    x = var('x', shape=(), dtype='int32')
+    _test_debug_hit = False
+    def did_exec(x):
+        global _test_debug_hit
+        _test_debug_hit = True
+    prog = debug(x + x * x, debug_func=did_exec)
+    result = ex.evaluate(prog, { x: const(2) })
+    assert _test_debug_hit
+    assert result.asnumpy() == 6
diff --git a/tests/python/relay/test_ir_bind.py b/tests/python/relay/test_ir_bind.py
new file mode 100644
index 000000000000..8377bb9fb953
--- /dev/null
+++ b/tests/python/relay/test_ir_bind.py
@@ -0,0 +1,23 @@
+""" test bind function."""
+import tvm
+from tvm import relay
+
+
+def test_bind_params():
+    x = relay.var("x")
+    y = relay.var("y")
+    z = relay.add(x, y)
+    f = relay.Function([x, y], z)
+    fbinded = relay.bind(f, {x : relay.const(1, "float32")})
+    fexpected =relay.Function(
+        [y],
+        relay.add(relay.const(1, "float32"),  y))
+    assert relay.ir_pass.alpha_equal(fbinded, fexpected)
+
+    zbinded = relay.bind(z, {y: x})
+    zexpected = relay.add(x, x)
+    assert relay.ir_pass.alpha_equal(zbinded, zexpected)
+
+
+if __name__ == "__main__":
+    test_bind_params()
diff --git a/tests/python/relay/test_ir_nodes.py b/tests/python/relay/test_ir_nodes.py
new file mode 100644
index 000000000000..e041acca0e0c
--- /dev/null
+++ b/tests/python/relay/test_ir_nodes.py
@@ -0,0 +1,249 @@
+""" test ir"""
+import tvm
+from tvm import relay
+from tvm.expr import *
+from tvm.relay import op
+from tvm.relay.ir_pass import graph_equal
+
+
+def check_json_roundtrip(node):
+    json_str = tvm.save_json(node)
+    back = tvm.load_json(json_str)
+    assert graph_equal(back, node)
+
+
+def test_bad_constructor():
+    try:
+        x = relay.ty.TensorType("xx", "xx")
+    except tvm.TVMError:
+        pass
+
+
+# Span
+def test_span():
+    span = relay.Span(None, 1, 1)
+    assert span.source == None
+    assert span.lineno == 1
+    assert span.col_offset == 1
+    assert span.same_as(span)
+    assert span == span
+    assert isinstance(span, relay.base.Span)
+    str(span)
+
+    # span is not a node so we can't use graph_equal
+    # to test the round trip
+    back = tvm.load_json(tvm.save_json(span))
+    assert back.source == span.source
+    assert back.lineno == span.lineno
+    assert back.col_offset == span.col_offset
+
+# Types
+
+def test_tensor_type():
+    shape = tvm.convert([1, 2, 3])
+    dtype = 'float32'
+    tt = relay.TensorType(shape, dtype)
+    assert tt.dtype == dtype
+    assert tt.shape == shape
+    assert tt.span == None
+    str(tt)
+    check_json_roundtrip(tt)
+
+
+def test_type_param():
+    tp = relay.TypeVar('name', relay.Kind.Type)
+    assert tp.kind == relay.Kind.Type
+    # assert tp.span  # TODO allow us to set span
+    str(tp)
+    check_json_roundtrip(tp)
+
+
+def test_func_type():
+    type_params = tvm.convert([])
+    type_constraints = tvm.convert([])  # TODO: fill me in
+    arg_types = tvm.convert([])
+    ret_type = relay.TensorType((1, 2, 3), 'float32')
+    tf = relay.FuncType(arg_types, ret_type, type_params, type_constraints)
+    assert tf.type_params == type_params
+    assert tf.type_constraints == type_constraints
+    assert tf.arg_types == arg_types
+    assert tf.ret_type == ret_type
+    assert tf.span == None
+    # TODO make sure we can set span
+    str(tf)
+    check_json_roundtrip(tf)
+
+
+def test_tuple_type():
+    tp = relay.TypeVar('tp', relay.Kind.Type)
+    tf = relay.FuncType(tvm.convert([]), None, tvm.convert([]), tvm.convert([]))
+    tt = relay.TensorType(tvm.convert([1, 2, 3]), 'float32')
+    fields = tvm.convert([tp, tf, tt])
+
+    tup_ty = relay.TupleType(fields)
+    assert tup_ty.fields == fields
+    str(tup_ty)
+    check_json_roundtrip(tup_ty)
+
+
+def test_type_relation():
+    tp = relay.TypeVar('tp', relay.Kind.Type)
+    tf = relay.FuncType(tvm.convert([]), None, tvm.convert([]), tvm.convert([]))
+    tt = relay.TensorType(tvm.convert([1, 2, 3]), 'float32')
+    args = tvm.convert([tp, tf, tt])
+
+    num_inputs = 2
+    func = tvm.get_env_func("tvm.relay.type_relation.Broadcast")
+    attrs = tvm.make.node("attrs.TestAttrs", name="attr", padding=(3,4))
+
+    tr = relay.TypeRelation(func, args, num_inputs, attrs)
+    assert tr.args == args
+    assert tr.num_inputs == num_inputs
+    str(tr)
+    check_json_roundtrip(tr)
+
+
+def test_constant():
+    arr = tvm.nd.array(10)
+    const = relay.Constant(arr)
+    assert const.data == arr
+    assert const.span == None
+    str(const)
+    check_json_roundtrip(const)
+
+
+def test_tuple():
+    fields = tvm.convert([])
+    tup = relay.Tuple(fields)
+    assert tup.fields == fields
+    assert tup.span == None
+    str(tup)
+    check_json_roundtrip(tup)
+
+
+def test_local_var():
+    name_hint = 's'
+    lv = relay.Var(name_hint)
+    assert lv.name_hint == name_hint
+    assert lv.type_annotation is None
+    # assert lv.span == None todo(@jroesch): what do we do about spans
+    str(lv)
+    check_json_roundtrip(lv)
+
+    t1 = relay.ty.TensorType((), "float")
+    lv = relay.Var(name_hint, t1)
+    assert lv.name_hint == name_hint
+    assert lv.type_annotation == t1
+
+
+def test_global_var():
+    name_hint = 'g'
+    gv = relay.GlobalVar(name_hint)
+    gv.name_hint == name_hint
+    # assert lv.span == None todo(@jroesch): what do we do about spans
+    str(gv)
+    check_json_roundtrip(gv)
+
+
+def test_function():
+    param_names = ['a', 'b', 'c', 'd']
+    params = tvm.convert([relay.Var(n) for n in param_names])
+    ret_type = relay.TupleType(tvm.convert([]))
+    body = relay.Tuple(tvm.convert([]))
+    type_params = tvm.convert([])
+    fn = relay.Function(params, body, ret_type, type_params)
+    assert fn.params == params
+    assert fn.body == body
+    assert fn.type_params == type_params
+    assert fn.span == None
+    str(fn)
+    check_json_roundtrip(fn)
+
+
+def test_call():
+    op = relay.Var('f')
+    arg_names = ['a', 'b', 'c', 'd']
+    args = tvm.convert([relay.Var(n) for n in arg_names])
+    call = relay.Call(op, args, None, None)
+    assert call.op == op
+    assert call.args == args
+    assert call.span == None
+    str(call)
+    check_json_roundtrip(call)
+
+
+def test_let():
+    lv = relay.Var('x')
+    ty = None
+    arr = tvm.nd.array(10)
+    value = relay.Constant(arr)
+    # I would prefer that the order of arguments
+    # matches syntax let x: t = v in b
+    let = relay.Let(lv, value, lv)
+    assert let.var == lv
+    assert let.value == value
+    assert let.body == lv
+    assert let.span == None
+    str(let)
+    check_json_roundtrip(let)
+
+
+def test_if():
+    cond = relay.Var('cond')
+    left = relay.Var('left')
+    right = relay.Var('right')
+    ife = relay.If(cond, left, right)
+    assert ife.cond == cond
+    assert ife.true_branch == left
+    assert ife.false_branch == right
+    assert ife.span == None
+    str(ife)
+    check_json_roundtrip(ife)
+
+
+def test_tuple_get_item():
+    tup = relay.Var("tuple")
+    get = relay.TupleGetItem(tup, 1)
+    assert get.tuple_value == tup
+    assert get.index == 1
+    str(get)
+    check_json_roundtrip(get)
+
+
+def test_op():
+    add = op.op.get("add")
+    check_json_roundtrip(add)
+
+
+def test_conv2d_attrs():
+    data = relay.var('data', shape=(1, 3, 224, 224))
+    param = relay.var('param', shape=(64, 3, 7, 7))
+    out = op.nn.conv2d(
+        data,
+        param,
+        strides=(2, 2),
+        padding=(3, 3),
+        channels=64,
+        kernel_size=(7, 7))
+    check_json_roundtrip(out)
+
+
+if __name__ == "__main__":
+    test_bad_constructor()
+    test_span()
+    test_tensor_type()
+    test_type_param()
+    test_func_type()
+    test_tuple_type()
+    test_type_relation()
+    test_constant()
+    test_tuple()
+    test_local_var()
+    test_global_var()
+    test_function()
+    test_call()
+    test_let()
+    test_if()
+    test_tuple_get_item()
+    test_op()
+    test_conv2d_attrs()
diff --git a/tests/python/relay/test_ir_op.py b/tests/python/relay/test_ir_op.py
new file mode 100644
index 000000000000..f1d835d2b43b
--- /dev/null
+++ b/tests/python/relay/test_ir_op.py
@@ -0,0 +1,35 @@
+from tvm import relay
+
+def test_op_attr():
+    log_op = relay.op.get("log")
+
+    @relay.op.register("exp", "ftest")
+    def test(x):
+        return x + 1
+
+    assert log_op.num_inputs  == 1
+    assert log_op.get_attr("ftest") is None
+    assert relay.op.get("exp").get_attr("ftest")(1) == 2
+
+def test_op_level1():
+    x = relay.Var("x")
+
+    for op_name in ["log", "exp", "sqrt", "tanh"]:
+        y = getattr(relay, op_name)(x)
+        assert y.op.name == op_name
+        assert y.op.support_level == 1
+        assert y.args[0] == x
+
+def test_op_level3():
+    x = relay.Var("x")
+
+    for op_name in ["ceil", "floor", "trunc", "round", "abs", "negative"]:
+        y = getattr(relay, op_name)(x)
+        assert y.op.name == op_name
+        assert y.op.support_level == 3
+        assert y.args[0] == x
+
+if __name__ == "__main__":
+    test_op_attr()
+    test_op_level1()
+    test_op_level3()
diff --git a/tests/python/relay/test_ir_parser.py b/tests/python/relay/test_ir_parser.py
new file mode 100644
index 000000000000..c2c83df7ed0c
--- /dev/null
+++ b/tests/python/relay/test_ir_parser.py
@@ -0,0 +1,562 @@
+import tvm
+from tvm import relay
+from tvm.relay.parser import enabled
+from tvm.relay.ir_pass import alpha_equal
+from nose.tools import nottest, raises
+from numpy import isclose
+from typing import Union
+from functools import wraps
+if enabled():
+    from tvm.relay._parser import ParseError
+    raises_parse_error = raises(ParseError)
+else:
+    raises_parse_error = lambda x: x
+
+BINARY_OPS = {
+    "*": relay.multiply,
+    "/": relay.divide,
+    "+": relay.add,
+    "-": relay.subtract,
+    "<": relay.less,
+    ">": relay.greater,
+    "<=": relay.less_equal,
+    ">=": relay.greater_equal,
+    "==": relay.equal,
+    "!=": relay.not_equal,
+}
+
+TYPES = {
+    "int8",
+    "int16",
+    "int32",
+    "int64",
+
+    "uint8",
+    "uint16",
+    "uint32",
+    "uint64",
+
+    "float16",
+    "float32",
+    "float64",
+
+    "bool",
+
+    "int8x4",
+    "uint1x4",
+    "float16x4",
+}
+
+def get_scalar(x):
+    # type: (relay.Constant) -> (Union[float, int, bool])
+    return x.data.asnumpy().item()
+
+int32 = relay.scalar_type("int32")
+
+_ = relay.Var("_")
+X = relay.Var("x")
+Y = relay.Var("y")
+X_ANNO = relay.Var("x", int32)
+Y_ANNO = relay.Var("y", int32)
+
+UNIT = relay.Tuple([])
+
+# decorator to determine if parser is enabled
+def if_parser_enabled(func):
+    # https://stackoverflow.com/q/7727678
+    @wraps(func)
+    def wrapper():
+        if not enabled():
+            return
+        func()
+    return wrapper
+
+@if_parser_enabled
+def test_comments():
+    assert alpha_equal(
+        relay.fromtext("""
+            // This is a line comment!
+            ()
+        """),
+        UNIT
+    )
+
+    assert alpha_equal(
+        relay.fromtext("""
+            /* This is a block comment!
+               This is still a block comment!
+            */
+            ()
+        """),
+        UNIT
+    )
+
+@if_parser_enabled
+def test_int_literal():
+    assert isinstance(relay.fromtext("1"), relay.Constant)
+    assert isinstance(relay.fromtext("1").data, tvm.ndarray.NDArray)
+    
+    assert get_scalar(relay.fromtext("1")) == 1
+    assert get_scalar(relay.fromtext("10")) == 10
+    assert get_scalar(relay.fromtext("0")) == 0
+    assert get_scalar(relay.fromtext("-100")) == -100
+    assert get_scalar(relay.fromtext("-05")) == -5
+
+@if_parser_enabled
+def test_float_literal():
+    assert get_scalar(relay.fromtext("1.0")) == 1.0
+    assert isclose(get_scalar(relay.fromtext("1.56667")), 1.56667)
+    assert get_scalar(relay.fromtext("0.0")) == 0.0
+    assert get_scalar(relay.fromtext("-10.0")) == -10.0
+
+    # scientific notation
+    assert isclose(get_scalar(relay.fromtext("1e-1")), 1e-1)
+    assert get_scalar(relay.fromtext("1e+1")) == 1e+1
+    assert isclose(get_scalar(relay.fromtext("1E-1")), 1E-1)
+    assert get_scalar(relay.fromtext("1E+1")) == 1E+1
+    assert isclose(get_scalar(relay.fromtext("1.0e-1")), 1.0e-1)
+    assert get_scalar(relay.fromtext("1.0e+1")) == 1.0e+1
+    assert isclose(get_scalar(relay.fromtext("1.0E-1")), 1.0E-1)
+    assert get_scalar(relay.fromtext("1.0E+1")) == 1.0E+1
+
+@if_parser_enabled
+def test_bool_literal():
+    assert get_scalar(relay.fromtext("True")) == True
+    assert get_scalar(relay.fromtext("False")) == False
+
+@if_parser_enabled
+def test_negative():
+    assert isinstance(relay.fromtext("let %x = 1; -%x").body, relay.Call)
+    assert get_scalar(relay.fromtext("--10")) == 10
+    assert get_scalar(relay.fromtext("---10")) == -10
+
+@if_parser_enabled
+def test_bin_op():
+    for bin_op in BINARY_OPS.keys():
+        assert alpha_equal(
+            relay.fromtext("1 {} 1".format(bin_op)),
+            BINARY_OPS.get(bin_op)(relay.const(1), relay.const(1))
+        )
+
+@if_parser_enabled
+def test_parens():
+    assert alpha_equal(relay.fromtext("1 * 1 + 1"), relay.fromtext("(1 * 1) + 1"))
+    assert not alpha_equal(relay.fromtext("1 * 1 + 1"), relay.fromtext("1 * (1 + 1)"))
+
+@if_parser_enabled
+def test_op_assoc():
+    assert alpha_equal(relay.fromtext("1 * 1 + 1 < 1 == 1"), relay.fromtext("(((1 * 1) + 1) < 1) == 1"))
+    assert alpha_equal(relay.fromtext("1 == 1 < 1 + 1 * 1"), relay.fromtext("1 == (1 < (1 + (1 * 1)))"))
+
+@nottest
+@if_parser_enabled
+def test_vars():
+    # temp vars won't work b/c they start with a digit
+    # # temp var
+    # temp_var = relay.fromtext("%1")
+    # assert isinstance(temp_var, relay.Var)
+    # assert temp_var.name == "1"
+
+    # var
+    var = relay.fromtext("let %foo = (); %foo")
+    assert isinstance(var.body, relay.Var)
+    assert var.body.name_hint == "foo"
+
+    # global var
+    global_var = relay.fromtext("@foo")
+    assert isinstance(global_var, relay.GlobalVar)
+    assert global_var.name_hint == "foo"
+
+    # operator id
+    op = relay.fromtext("foo")
+    assert isinstance(op, relay.Op)
+    assert op.name == "foo"
+
+@if_parser_enabled
+def test_let():
+    assert alpha_equal(
+        relay.fromtext("let %x = 1; ()"),
+        relay.Let(
+            X,
+            relay.const(1),
+            UNIT
+        )
+    )
+
+@if_parser_enabled
+def test_seq():
+    assert alpha_equal(
+        relay.fromtext("(); ()"),
+        relay.Let(
+            _,
+            UNIT,
+            UNIT)
+    )
+
+    assert alpha_equal(
+        relay.fromtext("let %_ = { 1 }; ()"),
+        relay.Let(
+            X,
+            relay.const(1),
+            UNIT
+        )
+    )
+
+@raises_parse_error
+@if_parser_enabled
+def test_let_global_var():
+    relay.fromtext("let @x = 1; ()")
+
+@raises_parse_error
+@if_parser_enabled
+def test_let_op():
+    relay.fromtext("let x = 1; ()")
+
+@if_parser_enabled
+def test_tuple():
+    assert alpha_equal(relay.fromtext("()"), relay.Tuple([]))
+
+    assert alpha_equal(relay.fromtext("(0,)"), relay.Tuple([relay.const(0)]))
+
+    assert alpha_equal(relay.fromtext("(0, 1)"), relay.Tuple([relay.const(0), relay.const(1)]))
+
+    assert alpha_equal(relay.fromtext("(0, 1, 2)"), relay.Tuple([relay.const(0), relay.const(1), relay.const(2)]))
+
+@if_parser_enabled
+def test_func():
+    # 0 args
+    assert alpha_equal(
+        relay.fromtext("fn () { 0 }"),
+        relay.Function(
+            [],
+            relay.const(0),
+            None,
+            []
+        )
+    )
+
+    # 1 arg
+    assert alpha_equal(
+        relay.fromtext("fn (%x) { %x }"),
+        relay.Function(
+            [X],
+            X,
+            None,
+            []
+        )
+    )
+
+    # 2 args
+    assert alpha_equal(
+        relay.fromtext("fn (%x, %y) { %x + %y }"),
+        relay.Function(
+            [X, Y],
+            relay.add(X, Y),
+            None,
+            []
+        )
+    )
+
+    # annotations
+    assert alpha_equal(
+        relay.fromtext("fn (%x: int32) -> int32 { %x }"),
+        relay.Function(
+            [X_ANNO],
+            X_ANNO,
+            int32,
+            []
+        )
+    )
+
+# TODO(@jmp): Crashes if %x isn't annnotated.
+# @nottest
+@if_parser_enabled
+def test_defn():
+    id_defn = relay.fromtext(
+        """
+        def @id(%x: int32) -> int32 {
+            %x
+        }
+        """)
+    assert isinstance(id_defn, relay.Module)
+
+@if_parser_enabled
+def test_ifelse():
+    assert alpha_equal(
+        relay.fromtext(
+        """
+        if (True) {
+            0
+        } else {
+            1
+        }
+        """
+        ),
+        relay.If(
+            relay.const(True),
+            relay.const(0),
+            relay.const(1)
+        )
+    )
+
+@raises_parse_error
+@if_parser_enabled
+def test_ifelse_scope():
+    relay.fromtext(
+        """
+        if (True) {
+            let %x = ();
+            ()
+        } else {
+            %x
+        }
+        """
+    )
+
+@if_parser_enabled
+def test_call():
+    # 0 args
+    constant = relay.Var("constant")
+    assert alpha_equal(
+        relay.fromtext(
+        """
+        let %constant = fn () { 0 };
+        %constant()
+        """
+        ),
+        relay.Let(
+            constant,
+            relay.Function([], relay.const(0), None, []),
+            relay.Call(constant, [], None, None)
+        )
+    )
+
+    # 1 arg
+    id_var = relay.Var("id")
+    assert alpha_equal(
+        relay.fromtext(
+            """
+            let %id = fn (%x) { %x };
+            %id(1)
+            """
+        ),
+        relay.Let(
+            id_var,
+            relay.Function([X], X, None, []),
+            relay.Call(id_var, [relay.const(1)], None, None)
+        )
+    )
+
+    # 2 args
+    multiply = relay.Var("multiply")
+    assert alpha_equal(
+        relay.fromtext(
+        """
+        let %multiply = fn (%x, %y) { %x * %y };
+        %multiply(0, 0)
+        """
+        ),
+        relay.Let(
+            multiply,
+            relay.Function(
+                [X, Y],
+                relay.multiply(X, Y),
+                None,
+                []
+            ),
+            relay.Call(multiply, [relay.const(0), relay.const(0)], None, None)
+        )
+    )
+
+    # anonymous function
+    assert alpha_equal(
+        relay.fromtext(
+        """
+        (fn (%x) { %x })(0)
+        """
+        ),
+        relay.Call(
+            relay.Function(
+                [X],
+                X,
+                None,
+                []
+            ),
+            [relay.const(0)],
+            None,
+            None
+        )
+    )
+
+    # curried function
+    curried_mult = relay.Var("curried_mult")
+    alpha_equal(
+        relay.fromtext(
+            """
+            let %curried_mult =
+                fn (%x) {
+                fn (%y) {
+                    %x * %y
+                }
+                };
+            %curried_mult(0);
+            %curried_mult(0)(0)
+            """
+        ),
+        relay.Let(
+            curried_mult,
+            relay.Function(
+                [X],
+                relay.Function(
+                    [Y],
+                    relay.multiply(X, Y),
+                    None,
+                    []
+                ),
+                None,
+                []
+            ),
+            relay.Let(
+                _,
+                relay.Call(curried_mult, [relay.const(0)], None, None),
+                relay.Call(relay.Call(curried_mult, [relay.const(0)], None, None), [relay.const(0)], None, None)
+            )
+        )
+    )
+
+    # op
+    alpha_equal(
+        relay.fromtext("abs(1)"),
+        relay.Call(relay.op.get("abs"), [relay.const(1)], None, None)
+    )
+
+# Types
+
+@if_parser_enabled
+def test_incomplete_type():
+    assert alpha_equal(
+        relay.fromtext("let %_ : _ = (); ()"),
+        relay.Let(
+            _,
+            UNIT,
+            UNIT
+        )
+    )
+
+@if_parser_enabled
+def test_builtin_types():
+    for builtin_type in TYPES:
+        relay.fromtext("let %_ : {} = (); ()".format(builtin_type))
+
+@nottest
+@if_parser_enabled
+def test_call_type():
+    assert False
+
+@if_parser_enabled
+def test_tensor_type():
+    assert alpha_equal(
+        relay.fromtext("let %_ : Tensor[(), float32] = (); ()"),
+        relay.Let(
+            relay.Var("_", relay.TensorType((), "float32")),
+            UNIT,
+            UNIT
+        )
+    )
+
+    assert alpha_equal(
+        relay.fromtext("let %_ : Tensor[(1,), float32] = (); ()"),
+        relay.Let(
+            relay.Var("_", relay.TensorType((1,), "float32")),
+            UNIT,
+            UNIT
+        )
+    )
+
+    assert alpha_equal(
+        relay.fromtext("let %_ : Tensor[(1, 1), float32] = (); ()"),
+        relay.Let(
+            relay.Var("_", relay.TensorType((1, 1), "float32")),
+            UNIT,
+            UNIT
+        )
+    )
+
+@if_parser_enabled
+def test_function_type():
+    assert alpha_equal(
+        relay.fromtext(
+            """
+            let %_: fn () -> int32 = fn () -> int32 { 0 }; ()
+            """
+        ),
+        relay.Let(
+            relay.Var("_", relay.FuncType([], int32, [], [])),
+            relay.Function([], relay.const(0), int32, []),
+            UNIT
+        )
+    )
+
+    assert alpha_equal(
+        relay.fromtext(
+            """
+            let %_: fn (int32) -> int32 = fn (%x: int32) -> int32 { 0 }; ()
+            """
+        ),
+        relay.Let(
+            relay.Var("_", relay.FuncType([int32], int32, [], [])),
+            relay.Function([relay.Var("x", int32)], relay.const(0), int32, []),
+            UNIT
+        )
+    )
+
+    assert alpha_equal(
+        relay.fromtext(
+            """
+            let %_: fn (int32, int32) -> int32 = fn (%x: int32, %y: int32) -> int32 { 0 }; ()
+            """
+        ),
+        relay.Let(
+            relay.Var("_", relay.FuncType([int32, int32], int32, [], [])),
+            relay.Function([relay.Var("x", int32), relay.Var("y", int32)], relay.const(0), int32, []),
+            UNIT
+        )
+    )
+
+@if_parser_enabled
+def test_tuple_type():
+    assert alpha_equal(
+        relay.fromtext(
+        """
+        let %_: () = (); ()
+        """),
+        relay.Let(
+            relay.Var("_", relay.TupleType([])),
+            UNIT,
+            UNIT
+        )
+    )
+
+    assert alpha_equal(
+        relay.fromtext(
+        """
+        let %_: (int32,) = (0,); ()
+        """),
+        relay.Let(
+            relay.Var("_", relay.TupleType([int32])),
+            relay.Tuple([relay.const(0)]),
+            UNIT
+        )
+    )
+
+    assert alpha_equal(
+        relay.fromtext(
+        """
+        let %_: (int32, int32) = (0, 1); ()
+        """),
+        relay.Let(
+            relay.Var("_", relay.TupleType([int32, int32])),
+            relay.Tuple([relay.const(0), relay.const(1)]),
+            UNIT
+        )
+    )
diff --git a/tests/python/relay/test_ir_text_printer.py b/tests/python/relay/test_ir_text_printer.py
new file mode 100644
index 000000000000..624ef71ed870
--- /dev/null
+++ b/tests/python/relay/test_ir_text_printer.py
@@ -0,0 +1,164 @@
+import tvm
+import tvm.relay.testing
+import numpy as np
+from tvm import relay
+
+
+do_print = [False]
+
+def show(text):
+    if do_print[0]:
+        print("---------------------------")
+        print(text)
+
+def test_func():
+    x = relay.var("x", shape=(3, 2))
+    y = relay.var("y")
+    one = relay.const(10e10, dtype="float32")
+    z = relay.add(x, one)
+    z = relay.add(z, z)
+    f = relay.Function([x, y], z)
+    show(z.astext())
+    show(f.astext())
+
+
+def test_env():
+    x = relay.var("x", "float32")
+    y = relay.var("y", "float32")
+    z = relay.add(x, y)
+    z = relay.add(z, z)
+    f = relay.Function([x, y], z)
+    env = relay.Module()
+    env["myf"] = f
+    text = env.astext()
+    assert "def @myf" in text
+    assert "%1 = add(%0, %0) # ty=float32" in text
+    show(env.astext(annotate=lambda x: str(x.checked_type.dtype)))
+    show(text)
+
+
+def test_meta_data():
+    n, c, h, w = tvm.var("n"), 10, 224, 224
+    x = relay.var("x", shape=(n, c, h, w))
+    w = relay.var("w")
+    z = relay.nn.conv2d(x, w,
+                        kernel_size=(3, 3),
+                        padding=(1, 1),
+                        channels=2)
+    f = relay.Function([x, w], z)
+    text = f.astext()
+    assert "channels=2" in text
+    assert "meta.Variable(id=0)" in text
+    show(text)
+
+    text = relay.const([1,2,3]).astext()
+    assert "meta.relay.Constant(id=0)" in text
+    show(text)
+
+
+def test_call_attrs():
+    x = relay.var("x")
+    # non default args
+    z = relay.nn.softmax(x, axis=2)
+    assert "axis=2" in z.astext()
+    # default args
+    z = relay.nn.softmax(x)
+    assert "softmax(%x)" in z.astext()
+    # non default args
+    z = relay.expand_dims(x, axis=2, num_newaxis=2)
+    assert "num_newaxis=2" in z.astext()
+
+
+def test_let_if_scope():
+    x = relay.var("x", "float32")
+    y = relay.var("y", "float32")
+    cond = relay.var("cond", "bool")
+
+    sb = relay.ScopeBuilder()
+    with sb.if_scope(cond):
+        v1 = sb.let("v", relay.const(1, "float32"))
+        v2 = sb.let("v", x)
+        sb.ret(relay.subtract(v1, v2))
+    with sb.else_scope():
+        v3 = relay.var("v")
+        let2 = relay.Let(v3, y, v3)
+        sb.ret(relay.add(let2, let2))
+    result = sb.get()
+
+    f = relay.Function([x, y, cond], result)
+    text = f.astext()
+    assert text.count("{") == 4
+    assert "%cond: bool" in text
+    show(f.astext())
+
+
+def test_variable_name():
+    # avoid pure number even if the namehint is pure number
+    v1 = relay.var("1")
+    assert "%v1" in v1.astext()
+
+
+def test_mlp():
+    net, params = tvm.relay.testing.mlp.get_workload(batch_size=1)
+    net.astext()
+
+
+def test_resnet():
+    net, params = tvm.relay.testing.resnet.get_workload(batch_size=1)
+    net.astext()
+
+
+def test_mobilenet():
+    net, params = tvm.relay.testing.mobilenet.get_workload(batch_size=1)
+    net.astext()
+
+
+def test_dqn():
+    net, params = tvm.relay.testing.dqn.get_workload(batch_size=1)
+    net.astext()
+
+
+def test_dcgan():
+    net, params = tvm.relay.testing.dcgan.get_workload(batch_size=1)
+    net.astext()
+
+
+def test_lstm():
+    net, params = tvm.relay.testing.lstm.get_workload(4, 4)
+    net.astext()
+
+def test_inception_v3():
+    net, params = tvm.relay.testing.inception_v3.get_workload(batch_size=1)
+    net.astext()
+
+def test_squeezenet():
+    for version in ['1.0', '1.1']:
+        net, params = tvm.relay.testing.squeezenet.get_workload(batch_size=1, version=version)
+        net.astext()
+
+def test_vgg():
+    net, params = tvm.relay.testing.vgg.get_workload(batch_size=1)
+    net.astext()
+
+def test_densenet():
+    net, params = tvm.relay.testing.densenet.get_workload(batch_size=1)
+    net.astext()
+
+
+if __name__ == "__main__":
+    do_print[0] = True
+    test_resnet()
+    test_mobilenet()
+    test_mlp()
+    test_dqn()
+    test_dcgan()
+    test_squeezenet()
+    test_inception_v3()
+    test_vgg()
+    test_densenet()
+    test_func()
+    test_env()
+    test_meta_data()
+    test_call_attrs()
+    test_let_if_scope()
+    test_variable_name()
diff --git a/tests/python/relay/test_ir_well_formed.py b/tests/python/relay/test_ir_well_formed.py
new file mode 100644
index 000000000000..725b2fbd3c3d
--- /dev/null
+++ b/tests/python/relay/test_ir_well_formed.py
@@ -0,0 +1,32 @@
+import tvm
+from tvm import relay
+from tvm.relay.ir_pass import well_formed
+
+def test_well_formed():
+    x = relay.Var('x')
+    assert well_formed(x)
+    v = relay.Constant(tvm.nd.array(10))
+    ty = None
+    let = relay.Let(x, v, x)
+    assert well_formed(let)
+    assert not well_formed(relay.Let(x, v, let))
+    f = relay.Function([x], x, ty)
+    assert well_formed(f)
+    assert well_formed(
+        relay.Let(relay.Var("y"), f,
+                  relay.Let(relay.Var("z"), f, v)))
+
+
+def test_tuple():
+    x = relay.Var('x')
+    assert well_formed(x)
+    v = relay.Constant(tvm.nd.array(10))
+    let = relay.Let(x, v, x)
+    assert well_formed(let)
+    assert well_formed(relay.Tuple([v, v]))
+    assert not well_formed(relay.Tuple([let, relay.Let(x, v, x)]))
+
+
+def test_tuple_get_item():
+    t = relay.Var('t')
+    assert well_formed(relay.TupleGetItem(t, 2))
diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py
new file mode 100644
index 000000000000..6a1662b65170
--- /dev/null
+++ b/tests/python/relay/test_op_level1.py
@@ -0,0 +1,321 @@
+import math
+import tvm
+import numpy as np
+from tvm import relay
+from tvm.relay.testing import ctx_list
+import topi.testing
+
+def sigmoid(x):
+    one = np.ones_like(x)
+    return one / (one + np.exp(-x))
+
+def relu(x):
+    x_copy = np.copy(x)
+    np.maximum(x_copy, 0, x_copy)
+    return x_copy
+
+def test_unary_op():
+    def check_single_op(opfunc, ref):
+        shape = (10, 4)
+        dtype = 'float32'
+        tp = relay.TensorType(shape, dtype)
+        x = relay.var("x", tp)
+        y = opfunc(x)
+        # test printer
+        assert ("%0 = {}(%x)".format(y.op.name)) in y.astext()
+        # test type inference
+        assert relay.ir_pass.infer_type(y).checked_type == tp
+
+        if ref is not None:
+            data = np.random.rand(*shape).astype(dtype)
+            ref_res = ref(data)
+            func = relay.Function([x], y)
+            for target, ctx in ctx_list():
+                # use graph by execuor default for testing, as we need
+                # create function explicitly to avoid constant-folding.
+                intrp = relay.create_executor("graph", ctx=ctx, target=target)
+                op_res = intrp.evaluate(func)(data)
+                np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
+
+
+    for opfunc, ref in [(tvm.relay.log, np.log),
+                   (tvm.relay.exp, np.exp),
+                   (tvm.relay.sqrt, np.sqrt),
+                   (tvm.relay.sigmoid, sigmoid),
+                   (tvm.relay.tanh, np.tanh),
+                   (relay.nn.relu, relu)]:
+        check_single_op(opfunc, ref)
+
+
+def test_binary_op():
+    def inst(vars, sh):
+        return [vars.get(s, s) for s in sh]
+
+    def check_binary_op(opfunc, ref):
+        # TODO(@jroesch): this piece of code improperly uses type variables.
+        n = tvm.var("n")
+        s1 = (5, n, 5)
+        s2 = (n, 1)
+        t1 = relay.TensorType(s1)
+        t2 = relay.TensorType(s2)
+        x = relay.var("x", t1)
+        y = relay.var("y", t2)
+        z = opfunc(x, y)
+        # test printer
+        assert ("%0 = {}(%x, %y)".format(z.op.name)) in z.astext()
+        assert relay.ir_pass.infer_type(z).checked_type == t1
+
+        if ref is not None:
+            t1 = relay.TensorType((5, 10, 5))
+            t2 = relay.TensorType((5, 10, 5))
+            x = relay.var("x", t1)
+            y = relay.var("y", t2)
+            z = opfunc(x, y)
+            x_data = np.random.rand(5, 10, 5).astype(t1.dtype)
+            y_data = np.random.rand(5, 10, 5).astype(t2.dtype)
+            ref_res = ref(x_data, y_data)
+            func = relay.Function([x, y], z)
+
+            for target, ctx in ctx_list():
+                # use graph by execuor default for testing, as we need
+                # create function explicitly to avoid constant-folding.
+                intrp = relay.create_executor("graph", ctx=ctx, target=target)
+                op_res = intrp.evaluate(func)(x_data, y_data)
+                np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
+
+    for opfunc, ref in [(relay.add, np.add),
+                   (relay.subtract, np.subtract),
+                   (relay.multiply, np.multiply),
+                   (relay.divide, np.divide)]:
+        check_binary_op(opfunc, ref)
+
+
+def test_expand_dims():
+    # based on topi test
+    def verify_expand_dims(dshape, dtype, oshape, axis, num_newaxis):
+        x = relay.Var("x", relay.TensorType(dshape, dtype))
+        func = relay.Function([x], relay.expand_dims(x, axis, num_newaxis))
+        for target, ctx in ctx_list():
+            data = np.random.uniform(size=dshape).astype(dtype)
+            ref_res = data.reshape(oshape)
+            intrp = relay.create_executor("graph", ctx=ctx, target=target)
+            op_res = intrp.evaluate(func)(data)
+            np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
+
+    verify_expand_dims((3, 10), 'float32', (3, 10, 1, 1), 2, 2)
+    verify_expand_dims((3, 10), 'float32', (1, 3, 10), -3, 1)
+
+
+def test_bias_add():
+    xshape=(10, 2, 3, 4)
+    bshape=(2,)
+    dtype="float32"
+    x = relay.var("x", shape=xshape)
+    bias = relay.var("bias")
+    z = relay.nn.bias_add(x, bias)
+    zz = relay.ir_pass.infer_type(z)
+    assert "axis=" not in zz.astext()
+    assert zz.args[1].checked_type == relay.TensorType(bshape)
+
+    func = relay.Function([x, bias], z)
+    x_data = np.random.uniform(size=xshape).astype(dtype)
+    y_data = np.random.uniform(size=bshape).astype(dtype)
+    ref_res = x_data + y_data.reshape((2, 1, 1))
+    for target, ctx in ctx_list():
+        intrp = relay.create_executor("graph", ctx=ctx, target=target)
+        op_res = intrp.evaluate(func)(x_data, y_data)
+        np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
+
+
+def test_expand_dims_infer_type():
+    n, t, d = tvm.var("n"), tvm.var("t"), 100
+    x = relay.var("x", shape=(n, t, d))
+    y = relay.expand_dims(x, axis=2)
+    assert "axis=2" in y.astext()
+    checked = relay.ir_pass.infer_type(y)
+    assert checked.checked_type == relay.TensorType((n, t, 1, 100))
+
+
+def test_softmax():
+    shape = (10, 4)
+    x = relay.var("x", shape=shape)
+    y = relay.nn.softmax(x, axis=1)
+    assert "nn.softmax" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType(shape)
+    func = relay.Function([x], y)
+    x_data = np.random.uniform(size=shape).astype("float32")
+    ref_res = topi.testing.softmax_python(x_data)
+    for target, ctx in ctx_list():
+        intrp = relay.create_executor("graph", ctx=ctx, target=target)
+        op_res = intrp.evaluate(func)(x_data)
+        np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
+
+
+def test_log_softmax():
+    shape = (10, 4)
+    x = relay.var("x", shape=shape)
+    y = relay.nn.log_softmax(x, axis=1)
+    assert "nn.log_softmax" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType(shape)
+    func = relay.Function([x], y)
+    x_data = np.random.uniform(size=shape).astype("float32")
+    ref_res = topi.testing.log_softmax_python(x_data)
+    for target, ctx in ctx_list():
+        intrp = relay.create_executor("graph", ctx=ctx, target=target)
+        op_res = intrp.evaluate(func)(x_data)
+        np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
+
+
+def test_concatenate():
+    n, t, d = tvm.var("n"), tvm.var("t"), 100
+    x = relay.var("x", shape=(n, t, d))
+    y = relay.var("y", shape=(n, t, d))
+    z = relay.concatenate((x, y), axis=-1)
+    assert "axis=" in z.astext()
+    zz = relay.ir_pass.infer_type(z)
+    assert zz.checked_type == relay.TensorType((n, t, 200))
+
+    x = relay.exp(x)
+    z = relay.concatenate((x, y), axis=2)
+    zz = relay.ir_pass.infer_type(z)
+    assert zz.checked_type == relay.TensorType((n, t, 200))
+
+    z = relay.concatenate((x, y), axis=1)
+    zz = relay.ir_pass.infer_type(z)
+    assert zz.checked_type == relay.TensorType((n, t + t, 100))
+
+    x = relay.var("x", shape=(10, 5))
+    y = relay.var("y", shape=(10, 5))
+    t = relay.var("z", shape=())
+    z = relay.concatenate((x, y), axis=1)
+    z = relay.add(z, t)
+    # Check result.
+    func = relay.Function([x, y, t], z)
+    x_data = np.random.rand(10, 5).astype('float32')
+    y_data = np.random.rand(10, 5).astype('float32')
+    t_data = np.random.uniform(size=()).astype('float32')
+    ref_res = np.concatenate((x_data, y_data), axis=1) + t_data
+
+    for target, ctx in ctx_list():
+        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+        op_res1 = intrp1.evaluate(func)(x_data, y_data, t_data)
+        tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=0.01)
+        op_res2 = intrp2.evaluate(func)(x_data, y_data, t_data)
+        tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=0.01)
+
+def test_dropout():
+    n, t, d = tvm.var("n"), tvm.var("t"), tvm.var("d")
+    input_ty = relay.TensorType((n, t, d), "float32")
+    x = relay.var("x", input_ty)
+    y = relay.nn.dropout(x, rate=0.75)
+    assert "rate=" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == input_ty
+
+
+def test_batch_norm():
+    # beta and gamma ignored
+    data = relay.var("data", relay.TensorType((3, 2, 1)))
+    beta = relay.var("beta", relay.TensorType((2,)))
+    gamma = relay.var("gamma", relay.TensorType((2,)))
+    moving_mean = relay.var("moving_mean", relay.TensorType((2,)))
+    moving_var = relay.var("moving_var", relay.TensorType((2,)))
+    y = relay.nn.batch_norm(data, gamma, beta, moving_mean, moving_var,
+                            center=False, scale=False)
+    yy = relay.ir_pass.infer_type(y.astuple())
+    assert "center=" in yy.astext()
+    assert yy.checked_type == relay.ty.TupleType(tvm.convert([
+        relay.TensorType((3, 2, 1), "float32"),
+        relay.TensorType((2,), "float32"),
+        relay.TensorType((2,), "float32")
+    ]))
+
+    beta = relay.var("beta", relay.TensorType((3,)))
+    gamma = relay.var("gamma", relay.TensorType((3,)))
+    moving_mean = relay.var("moving_mean", relay.TensorType((3,)))
+    moving_var = relay.var("moving_var", relay.TensorType((3,)))
+
+    y = relay.nn.batch_norm(data, gamma, beta, moving_mean, moving_var,
+                            axis=0, center=False, scale=False)
+    yy = relay.ir_pass.infer_type(y.astuple())
+    assert yy.checked_type == relay.ty.TupleType(tvm.convert([
+        relay.ty.TensorType((3, 2, 1), "float32"),
+        relay.ty.TensorType((3,), "float32"),
+        relay.ty.TensorType((3,), "float32")
+    ]))
+
+    # axis=-1
+    data = relay.var("data", relay.TensorType((1, 2, 3)))
+    beta = relay.var("beta", relay.TensorType((3,)))
+    gamma = relay.var("gamma", relay.TensorType((3,)))
+    moving_mean = relay.var("moving_mean", relay.TensorType((3,)))
+    moving_var = relay.var("moving_var", relay.TensorType((3,)))
+    y = relay.nn.batch_norm(data, gamma, beta, moving_mean, moving_var,
+                            axis=-1, center=False, scale=False)
+    yy = relay.ir_pass.infer_type(y.astuple())
+    assert yy.checked_type == relay.ty.TupleType(tvm.convert([
+        relay.ty.TensorType((1, 2, 3), "float32"),
+        relay.ty.TensorType((3,), "float32"),
+        relay.ty.TensorType((3,), "float32")
+    ]))
+
+
+def test_dense():
+    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
+    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
+    w = relay.var("w", relay.TensorType((2, w), "float32"))
+    y = relay.nn.dense(x, w, units=2)
+    "units=2" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n, c, h, 2), "float32")
+
+    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), 2
+    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
+    wh, ww = tvm.var("wh"), tvm.var("ww")
+    w = relay.var("w", relay.TensorType((ww, wh), "float32"))
+    y = relay.nn.dense(x, w)
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n, c, h, ww), "float32")
+
+    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), 2
+    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
+    w = relay.var("w", relay.IncompleteType())
+    y = relay.nn.dense(x, w, units=2)
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n, c, h, 2), "float32")
+
+    x = relay.var("x", shape=(10, 5))
+    w = relay.var("w", shape=(2, 5))
+    z = relay.nn.dense(x, w)
+
+    # Check result.
+    func = relay.Function([x, w], z)
+    x_data = np.random.rand(10, 5).astype('float32')
+    w_data = np.random.rand(2, 5).astype('float32')
+    ref_res = np.dot(x_data, w_data.T)
+
+    for target, ctx in ctx_list():
+        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+        op_res1 = intrp1.evaluate(func)(x_data, w_data)
+        tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
+        op_res2 = intrp2.evaluate(func)(x_data, w_data)
+        tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
+
+
+
+if __name__ == "__main__":
+    test_concatenate()
+    test_bias_add()
+    test_unary_op()
+    test_binary_op()
+    test_expand_dims_infer_type()
+    test_expand_dims()
+    test_softmax()
+    test_log_softmax()
+    test_dropout()
+    test_batch_norm()
+    test_dense()
diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py
new file mode 100644
index 000000000000..2c0ed73a7535
--- /dev/null
+++ b/tests/python/relay/test_op_level10.py
@@ -0,0 +1,128 @@
+""" Support level10 operator test cases.
+"""
+import numpy as np
+import tvm
+from tvm import relay
+from tvm.relay.testing import ctx_list
+
+def test_collapse_sum_like():
+    shape = (3, 4, 5, 6)
+    shape_like = (4, 5, 6)
+    dtype = "float32"
+    x = relay.Var("x", relay.ty.TensorType(shape , dtype))
+    y = relay.Var("y", relay.ty.TensorType(shape_like, dtype))
+    z = relay.collapse_sum_like(x, y)
+    zz = relay.ir_pass.infer_type(z)
+    assert zz.checked_type == relay.ty.TensorType(shape_like, dtype)
+
+    func = relay.Function([x, y], z)
+    x = np.random.uniform(size=shape).astype(dtype)
+    y = np.random.uniform(size=shape_like).astype(dtype)
+    ref_res = np.sum(x, 0)
+    for target, ctx in ctx_list():
+        for kind in ["graph", "debug"]:
+            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            op_res = intrp.evaluate(func)(x, y)
+            tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
+
+def test_broadcast_to():
+    shape = (4, 1, 6)
+    shape_like = (3, 4, 5, 6)
+    dtype = "float32"
+    x = relay.Var("x", relay.ty.TensorType(shape , dtype))
+    z = relay.broadcast_to(x, shape=shape_like)
+    zz = relay.ir_pass.infer_type(z)
+    assert zz.checked_type == relay.ty.TensorType(shape_like, dtype)
+
+    func = relay.Function([x], z)
+    x = np.random.uniform(size=shape).astype(dtype)
+    ref_res = np.broadcast_to(x, shape_like)
+    for target, ctx in ctx_list():
+        for kind in ["graph", "debug"]:
+            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            op_res = intrp.evaluate(func)(x)
+            tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
+
+def test_broadcast_to_like():
+    shape = (4, 1, 6)
+    shape_like = (3, 4, 5, 6)
+    dtype = "float32"
+    x = relay.Var("x", relay.ty.TensorType(shape , dtype))
+    y = relay.Var("y", relay.ty.TensorType(shape_like, dtype))
+    z = relay.broadcast_to_like(x, y)
+    zz = relay.ir_pass.infer_type(z)
+    assert zz.checked_type == relay.ty.TensorType(shape_like, dtype)
+
+    func = relay.Function([x, y], z)
+    x = np.random.uniform(size=shape).astype(dtype)
+    y = np.random.uniform(size=shape_like).astype(dtype)
+    ref_res = np.broadcast_to(x, shape_like)
+    for target, ctx in ctx_list():
+        for kind in ["graph", "debug"]:
+            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            op_res = intrp.evaluate(func)(x, y)
+            tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
+
+
+def np_slice_like(np_data, np_shape_like, axis=None):
+    begin_idx = [0 for _ in np_data.shape]
+    end_idx = list(np_data.shape)
+    if axis:
+        for i in axis:
+            if i < 0:
+                i = len(np_data.shape) + i
+            end_idx[i] = np_shape_like.shape[i]
+    else:
+        for i in range(len(np_data.shape)):
+            if i < len(np_shape_like.shape):
+                end_idx[i] = np_shape_like.shape[i]
+    slice_idx = []
+    for b, e in zip(begin_idx, end_idx):
+        slice_idx.append(slice(b, e))
+    np_result = np_data[tuple(slice_idx)]
+    return np_result
+
+
+def verify_slice_like(data, slice_like, axes, output, dtype="float32"):
+    x = relay.var("data", relay.TensorType(data, dtype))
+    y = relay.var("slice_like", relay.TensorType(slice_like, dtype))
+    z = relay.slice_like(x, y, axes)
+    zz = relay.ir_pass.infer_type(z)
+    if axes:
+        assert "axes" in z.astext()
+    assert zz.checked_type == relay.ty.TensorType(output, dtype)
+
+    if all(isinstance(v, int) == 0 for v in data) or \
+        all(isinstance(v, int) == 0 for v in slice_like):
+        return
+
+    func = relay.Function([x, y], z)
+    x_data = np.random.uniform(size=data).astype(dtype)
+    y_data = np.random.uniform(size=slice_like).astype(dtype)
+    ref_res = np_slice_like(x_data, y_data, axes)
+
+    for target, ctx in ctx_list():
+        for kind in ["graph", "debug"]:
+            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            op_res = intrp.evaluate(func)(x_data, y_data)
+            tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
+
+def test_slice_like():
+    d1, d2, d3, d4 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3"), tvm.var("d4")
+    verify_slice_like(data=(d1, d2, d3), slice_like=(1, 2, 3), axes=None, output=(1, 2, 3))
+    verify_slice_like(data=(1, 2, 3), slice_like=(d1, d2, d3), axes=None, output=(d1, d2, d3))
+    verify_slice_like(data=(d2, d3, d4), slice_like=(d1, d2, d3), axes=(1,2), output=(d2, d2, d3))
+    verify_slice_like(data=(3, 4, 5), slice_like=(1, 2, 3), axes=None, output=(1, 2, 3))
+    verify_slice_like(data=(3, 4, 5), slice_like=(1, 2), axes=None, output=(1, 2, 5))
+    verify_slice_like(data=(3, 4, 5), slice_like=(1, 2, 3), axes=(1, 2), output=(3, 2, 3))
+    verify_slice_like(data=(3, 4, 5), slice_like=(1, 2, 3), axes=(-1, -3), output=(1, 4, 3))
+    verify_slice_like(data=(1, 3, 224, 224),
+                      slice_like=(1, 3, 112, 112),
+                      axes=(2, 3),
+                      output=(1, 3, 112, 112))
+
+
+if __name__ == "__main__":
+    test_collapse_sum_like()
+    test_broadcast_to_like()
+    test_slice_like()
diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py
new file mode 100644
index 000000000000..0544ee49d159
--- /dev/null
+++ b/tests/python/relay/test_op_level2.py
@@ -0,0 +1,480 @@
+""" Support level2 operator test cases.
+"""
+import tvm
+from tvm import relay
+from tvm.relay.testing import ctx_list
+import numpy as np
+import topi.testing
+
+def test_conv2d_infer_type():
+    # symbolic in batch dimension
+    n, c, h, w = tvm.var("n"), 10, 224, 224
+    x = relay.var("x", relay.ty.TensorType((n, c, h, w), "float32"))
+    w = relay.var("w")
+    y = relay.nn.conv2d(x, w,
+                        kernel_size=(3, 3),
+                        padding=(1, 1),
+                        channels=2)
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type ==  relay.TensorType(
+        (n, 2, 224, 224), "float32")
+    assert yy.args[1].checked_type == relay.TensorType(
+        (2, 10, 3, 3), "float32")
+
+    # infer by shape of w, mixed precision
+
+    n, c, h, w = tvm.var("n"), 10, 224, 224
+    x = relay.var("x", relay.TensorType((n, c, h, w), "int8"))
+    w = relay.var("w", relay.TensorType((2, 10, 3, 3), "int8"))
+    y = relay.nn.conv2d(x, w, out_dtype="int32")
+    assert "out_dtype=\"int32\"" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type ==  relay.TensorType(
+        (n, 2, 222, 222), "int32")
+
+    # Infer with a different layout
+    n, c, h, w = 4, 32, 224, 224
+    x = relay.var("x", relay.TensorType((n//4, c//4, h, w, 4, 4), "int8"))
+    wt = relay.var("w")
+    y = relay.nn.conv2d(x, wt,
+                        kernel_size=(3, 3),
+                        padding=(1, 1),
+                        channels=16,
+                        data_layout="NCHW4n4c",
+                        weight_layout="OIHW4o4i",
+                        out_dtype="int32")
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type ==  relay.TensorType(
+        (1, 4, 224, 224, 4, 4), "int32")
+    assert yy.args[1].checked_type == relay.TensorType(
+        (4, 8, 3, 3, 4, 4), "int8")
+
+    # Infer with NHWC
+    n, c, h, w = 4, 32, 224, 224
+    x = relay.var("x", relay.TensorType((n, h, w, c), "int8"))
+    wt = relay.var("w")
+    y = relay.nn.conv2d(x, wt,
+                        kernel_size=(3, 3),
+                        padding=(1, 1),
+                        channels=16,
+                        data_layout="NHWC",
+                        out_dtype="int32")
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type ==  relay.TensorType(
+        (n, h, w, 16), "int32")
+
+
+def test_conv2d_run():
+    def run_test_conv2d(dtype, out_dtype, scale, dshape, kshape,
+                        padding=(1, 1),
+                        fref=None,
+                        groups=1,
+                        dilation=(1, 1),
+                        **attrs):
+        x = relay.var("x", shape=dshape)
+        w = relay.var("w")
+        y = relay.nn.conv2d(x, w,
+                            padding=padding,
+                            dilation=dilation,
+                            groups=groups,
+                            **attrs)
+        func = relay.Function([x, w], y)
+        data = np.random.uniform(-scale, scale, size=dshape).astype(dtype)
+        kernel = np.random.uniform(-scale, scale, size=kshape).astype(dtype)
+        dkernel = topi.testing.dilate_python(kernel, (1, 1) + dilation)
+        if fref is None:
+            ref_res = topi.testing.conv2d_nchw_python(
+                data.astype(out_dtype), dkernel.astype(out_dtype), 1, padding)
+        else:
+            ref_res = fref(data.astype(out_dtype), dkernel.astype(out_dtype))
+
+        for target, ctx in ctx_list():
+            intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+            op_res1 = intrp1.evaluate(func)(data, kernel)
+            tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
+
+    # depthwise conv2d
+    dshape = (1, 32, 18, 18)
+    kshape = (32, 1, 3, 3)
+    run_test_conv2d("float32", "float32", 1, dshape, kshape,
+                    padding=(1, 1), channels=32, groups=32, kernel_size=(3 ,3),
+                    fref=lambda x, w: topi.testing.depthwise_conv2d_python_nchw(
+                        x, w, (1, 1), "SAME"))
+
+    # normal conv2d
+    dshape = (1, 3, 224, 224)
+    kshape = (10, 3, 3, 3)
+    run_test_conv2d("float32", "float32", 1, dshape, kshape,
+                    padding=(1, 1), channels=10, kernel_size=(3 ,3))
+    # mixed precision
+    run_test_conv2d("int8", "int32", 1, dshape, kshape,
+                    padding=(1, 1), channels=10, kernel_size=(3 ,3))
+    kshape = (10, 3, 1, 3)
+    # mixed precision.
+    run_test_conv2d("int8", "int32", 1, dshape, kshape,
+                    padding=(0, 1), channels=10, kernel_size=(1 ,3))
+    # dilated conv2d
+    dshape = (1, 3, 18, 18)
+    kshape = (10, 3, 3, 3)
+    run_test_conv2d("float32", "float32", 1, dshape, kshape,
+                    padding=(1, 1), channels=10, kernel_size=(3 ,3), dilation=(3, 3))
+
+
+def test_conv2d_transpose_infer_type():
+    # symbolic in batch dimension
+    n, c, h, w = tvm.var("n"), 10, 10, 12
+    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
+    w = relay.var("w", relay.IncompleteType())
+    y = relay.nn.conv2d_transpose(x, w,
+                                  kernel_size=(3, 3),
+                                  padding=(1, 1),
+                                  channels=15)
+    assert "channels=15" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType(
+        (n, 15, 10, 12), "float32")
+    assert yy.args[1].checked_type == relay.TensorType(
+        (10, 15, 3, 3), "float32")
+
+    # infer by shape of w, mixed precision
+    n, c, h, w = tvm.var("n"), 10, 10, 12
+    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
+    w = relay.var("w", relay.TensorType((12, 11, 5, 5), "float32"))
+    y = relay.nn.conv2d_transpose(x, w,
+                                  output_padding=(1, 1),
+                                  channels=11,
+                                  data_layout="NHWC")
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType(
+        (n, 15, 15, 11), "float32")
+
+
+def test_conv2d_transpose_run():
+    dshape = (1, 3, 18, 18)
+    kshape = (3, 10, 3, 3)
+    oshape = (1, 10, 37, 37)
+    x = relay.var("x", shape=dshape)
+    w = relay.var("w")
+    y = relay.nn.conv2d_transpose(x, w,
+                                  channels=10, kernel_size=(3,3), strides=(2,2),
+                                  padding=(1,1), output_padding=(2, 2))
+    func = relay.Function([x, w], y)
+    dtype = "float32"
+    data = np.random.uniform(size=dshape).astype(dtype)
+    kernel = np.random.uniform(size=kshape).astype(dtype)
+    c_np = topi.testing.conv2d_transpose_nchw_python(
+        data, kernel, 2, 1)
+    d_np = np.zeros(shape=oshape)
+    d_np[:,:,0:c_np.shape[2],0:c_np.shape[3]] = c_np
+    ref_res = d_np
+
+    for target, ctx in ctx_list():
+        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+        op_res1 = intrp1.evaluate(func)(data, kernel)
+        tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
+
+
+
+def test_upsampling_infer_type():
+    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
+    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
+    y = relay.nn.upsampling(x, scale=2, layout="NCHW", method="BILINEAR")
+    "method=\"BINLINEAR\"" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n, c, h*2, w*2), "float32")
+    n, c = tvm.var("n"), tvm.var("c")
+    x = relay.var("x", relay.TensorType((n, c, 100, 200), "float32"))
+    y = relay.nn.upsampling(x, scale=2, layout="NCHW", method="BILINEAR")
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n, c, 200, 400), "float32")
+
+
+def _test_pool2d(opfunc, reffunc):
+    n, c, h, w = tvm.var("n"), 10, 224, 224
+    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
+    y = opfunc(x, pool_size=(1, 1))
+    assert "pool_size=" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n, 10, 224, 224), "float32")
+    # test execution
+    dtype = "float32"
+    dshape = (1, 3, 28, 28)
+    x = relay.var("x", shape=dshape)
+    y = opfunc(x, pool_size=(2, 2), strides=(2, 2), padding=(0, 0))
+    func = relay.Function([x], y)
+    data = np.random.uniform(size=dshape).astype(dtype)
+    ref_res = reffunc(data.reshape(1,3,14,2,14,2), axis=(3,5))
+    for target, ctx in ctx_list():
+        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+        op_res1 = intrp1.evaluate(func)(data)
+        tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
+
+
+def _test_global_pool2d(opfunc, reffunc):
+    n, c, h, w = tvm.var("n"), tvm.var("c"), 224, 224
+    x = relay.var("x", relay.TensorType((n, h, w, c), "float32"))
+    y = opfunc(x, layout="NHWC")
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n, 1, 1, c), "float32")
+
+    n, c, h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
+    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
+    y = opfunc(x)
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n, c, 1, 1), "float32")
+    # test execution
+    dtype = "float32"
+    dshape = (1, 1024, 7, 7)
+    x = relay.var("x", shape=dshape)
+    y = opfunc(x)
+    func = relay.Function([x], y)
+    data = np.random.uniform(size=dshape).astype(dtype)
+    ref_res = reffunc(data, axis=(2,3), keepdims=True)
+    for target, ctx in ctx_list():
+        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+        op_res1 = intrp1.evaluate(func)(data)
+        tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
+
+
+def test_pool2d():
+    _test_pool2d(relay.nn.max_pool2d, np.max)
+    _test_pool2d(relay.nn.avg_pool2d, np.mean)
+    _test_global_pool2d(relay.nn.global_max_pool2d, np.max)
+    _test_global_pool2d(relay.nn.global_avg_pool2d, np.mean)
+
+
+def test_avg_pool2d_no_count_pad():
+    kh, kw = (4, 4)
+    sh, sw = (2, 2)
+    ph, pw = (2, 2)
+    n = 1
+    (ic, ih, iw) = (3, 28, 28)
+    (oc, oh, ow) = (3, 15, 15)
+    dshape = (n, ic, ih, iw)
+    x = relay.var("x", shape=dshape)
+    y = relay.nn.avg_pool2d(x,
+                            pool_size=(kh, kw),
+                            strides=(sw, sw),
+                            padding=(ph, pw),
+                            count_include_pad=False)
+    func = relay.Function([x], y)
+    dtype = "float32"
+    a_np = np.random.uniform(low=0.001, size=(n, ic, ih, iw)).astype(dtype)
+    pad_np = np.zeros(shape=(n, ic, ih+2*ph, iw+2*pw)).astype(dtype)
+    no_zero = (range(n), range(ic), (range(ph, ih+ph)), (range(pw, iw+pw)))
+    pad_np[np.ix_(*no_zero)] = a_np
+    b_np = np.zeros(shape=(n, oc, oh, ow)).astype(dtype)
+    for i in range(oh):
+        for j in range(ow):
+            pad_count = np.sum(pad_np[:, :, i*sh:i*sh+kh, j*sw:j*sw+kw] > 0, axis=(2,3))
+            b_np[:,:,i,j] = np.sum(pad_np[:, :, i*sh:i*sh+kh, j*sw:j*sw+kw],
+                                   axis=(2,3)) / np.maximum(pad_count, 1)
+    ref_res = np.maximum(b_np, 0.0)
+    data = a_np
+
+    for target, ctx in ctx_list():
+        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+        op_res1 = intrp1.evaluate(func)(data)
+        tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
+
+
+def test_flatten_infer_type():
+    d1, d2, d3, d4 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3"), tvm.var("d4")
+    x = relay.var("x", relay.TensorType((d1, d2, d3, d4), "float32"))
+    y = relay.nn.batch_flatten(x)
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((d1, ((d2*d3)*d4)), "float32")
+
+    x = relay.var("x", relay.TensorType((3, 2, 4, 3), "float32"))
+    y = relay.nn.batch_flatten(x)
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((3, 24), "float32")
+
+    x = relay.var("x", relay.TensorType((d1, 2, d3, 3), "float32"))
+    y = relay.nn.batch_flatten(x)
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((d1, ((2*d3)*3)), "float32")
+
+    shape = (1, 5, 10, 10)
+    o_shape = (1, 500)
+    dtype = "float32"
+    x = relay.var("x", relay.TensorType(shape, dtype))
+    z = relay.nn.batch_flatten(x)
+    yy = relay.ir_pass.infer_type(z)
+    assert yy.checked_type == relay.TensorType(o_shape, dtype)
+    func = relay.Function([x], z)
+    x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype)
+    ref_res = x_data.flatten().reshape(o_shape)
+
+    for target, ctx in ctx_list():
+        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+        op_res1 = intrp1.evaluate(func)(x_data)
+        tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
+        op_res2 = intrp2.evaluate(func)(x_data)
+        tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
+
+def test_pad_infer_type():
+    # entirely concrete case
+    n, c, h, w = 1, 2, 3, 4
+    t = relay.var("t", relay.TensorType((n, c, h, w), "float32"))
+    y = relay.nn.pad(t, ((1, 1), (2, 2), (3, 3), (4, 4)))
+    "pad_width=" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((3, 6, 9, 12), "float32")
+
+    # some symbolic values
+    n, c, h, w = tvm.var("n"), 2, 3, tvm.var("w")
+    t = relay.var("t", relay.TensorType((n, c, h, w), "float32"))
+    y = relay.nn.pad(t, ((1, 1), (2, 2), (3, 3), (4, 4)))
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n + 2, 6, 9, w + 8), "float32")
+
+def test_pad_run():
+    def _test_run(dtype):
+        dshape = (4, 10, 7, 7)
+        x = relay.var("x", shape=dshape)
+        y = relay.nn.pad(x, ((1, 1), (2, 2), (3, 3), (4, 4)))
+        func = relay.Function([x], y)
+        data = np.random.uniform(size=dshape).astype(dtype)
+        ref_res = np.pad(data, ((1, 1), (2, 2), (3, 3), (4, 4)), 'constant')
+        for target, ctx in ctx_list():
+            intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+            op_res1 = intrp1.evaluate(func)(data)
+            tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5)
+
+    _test_run('float32')
+    _test_run('int32')
+
+def test_lrn():
+    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
+    x = relay.var("x", shape=(n, c , h, w))
+    y = relay.nn.lrn(x, size=10, axis=2, bias=0.5, alpha=.00001, beta=0.75)
+    "alpha=" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n, c , h, w))
+
+    shape = (1, 5, 10, 10)
+    dtype = "float32"
+    x = relay.var("x", relay.TensorType(shape, dtype))
+    size=5
+    axis=1
+    bias=0.5
+    alpha=.00001
+    beta=0.75
+    z = relay.nn.lrn(x, size=size, axis=axis, bias=bias, alpha=alpha, beta=beta)
+    yy = relay.ir_pass.infer_type(z)
+    assert yy.checked_type == relay.TensorType(shape, dtype)
+    func = relay.Function([x], z)
+    x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype)
+    ref_res = topi.testing.lrn_python(x_data, size, axis, bias, alpha, beta)
+
+    for target, ctx in ctx_list():
+        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+        op_res1 = intrp1.evaluate(func)(x_data)
+        tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
+        op_res2 = intrp2.evaluate(func)(x_data)
+        tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
+
+def test_l2_normalize():
+    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
+    x = relay.var("x", shape=(n, c , h, w))
+    y = relay.nn.l2_normalize(x, eps=0.001, axis=[1])
+    "axis=" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n, c , h, w))
+
+    shape = (1, 5, 10, 10)
+    dtype = "float32"
+    x = relay.var("x", relay.TensorType(shape, dtype))
+    eps=0.001
+    axis=1
+    z = relay.nn.l2_normalize(x, eps=0.001, axis=[axis])
+    yy = relay.ir_pass.infer_type(z)
+    assert yy.checked_type == relay.TensorType(shape, dtype)
+    func = relay.Function([x], z)
+    x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype)
+    ref_res = topi.testing.l2_normalize_python(x_data, eps, axis)
+
+    for target, ctx in ctx_list():
+        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+        op_res1 = intrp1.evaluate(func)(x_data)
+        tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
+        op_res2 = intrp2.evaluate(func)(x_data)
+        tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
+
+
+def batch_flatten(data):
+    shape = data.shape
+    target_dim = 1
+    for i in range(len(shape) - 1):
+        target_dim = target_dim * shape[i + 1]
+    return np.reshape(data, (shape[0], target_dim))
+
+
+def test_batch_flatten():
+    t1 = relay.TensorType((5, 10, 5))
+    x = relay.Var("x", t1)
+    func = relay.Function([x], relay.nn.batch_flatten(x))
+
+    data = np.random.rand(5, 10, 5).astype(t1.dtype)
+    ref_res = batch_flatten(data)
+    for target, ctx in ctx_list():
+        intrp = relay.create_executor("graph", ctx=ctx, target=target)
+        op_res = intrp.evaluate(func)(data)
+        np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
+
+
+def _test_upsampling(layout, method):
+    n, c, h, w = tvm.var("n"), 16, 32, 32
+    scale = 2
+    dtype = "float32"
+    def get_shape():
+        if layout == "NCHW":
+            return (c, h, w), (c, h*scale, w*scale)
+        else:
+            return (h, w, c), (h*scale, w*scale, c)
+    ishape, oshape = get_shape()
+    x = relay.var("x", relay.TensorType((n,) + ishape, dtype))
+    y = relay.nn.upsampling(x, scale=scale, layout=layout, method=method)
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n,) + oshape, dtype)
+    dshape = (1,) + ishape
+    x = relay.var("x", shape=dshape)
+    y = relay.nn.upsampling(x, scale=scale, layout=layout, method=method)
+    func = relay.Function([x], y)
+    data = np.random.uniform(size=dshape).astype(dtype)
+    if method == "NEAREST_NEIGHBOR":
+        ref = topi.testing.upsampling_python(data, scale, layout)
+    else:
+        ref = topi.testing.bilinear_resize_python(data, (h*scale, w*scale), layout)
+    for target, ctx in ctx_list():
+        executor = relay.create_executor("graph", ctx=ctx, target=target)
+        out = executor.evaluate(func)(data)
+        tvm.testing.assert_allclose(out.asnumpy(), ref, rtol=1e-5, atol=1e-5)
+
+
+def test_upsampling():
+    _test_upsampling("NCHW", "NEAREST_NEIGHBOR")
+    _test_upsampling("NCHW", "BILINEAR")
+    _test_upsampling("NHWC", "NEAREST_NEIGHBOR")
+    _test_upsampling("NHWC", "BILINEAR")
+
+
+if __name__ == "__main__":
+    test_pool2d()
+    test_avg_pool2d_no_count_pad()
+    test_lrn()
+    test_l2_normalize()
+    test_conv2d_infer_type()
+    test_upsampling_infer_type()
+    test_flatten_infer_type()
+    test_pad_infer_type()
+    test_pad_run()
+    test_conv2d_transpose_infer_type()
+    test_conv2d_transpose_run()
+    test_conv2d_run()
+    test_batch_flatten()
+    test_upsampling()
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
new file mode 100644
index 000000000000..31e87ef04856
--- /dev/null
+++ b/tests/python/relay/test_op_level3.py
@@ -0,0 +1,471 @@
+""" Support level3 operator test cases.
+"""
+import tvm
+import numpy as np
+from tvm import relay
+from tvm.relay import create_executor
+from tvm.relay.testing import ctx_list
+from nose.tools import raises
+
+def test_zeros_ones():
+    for op, ref in [(relay.zeros, np.zeros), (relay.ones, np.ones)]:
+        y = op(shape=(124, 50), dtype="float64")
+        yy = relay.ir_pass.infer_type(y)
+        assert yy.checked_type == relay.TensorType((124, 50), "float64")
+        intrp = create_executor()
+        intrp_res = intrp.evaluate(y).asnumpy()
+        np.testing.assert_allclose(intrp_res, ref((124, 50), 'float64'))
+
+def test_unary_identity():
+    for op, ref in [(relay.zeros_like, np.zeros_like),
+               (relay.ones_like, np.ones_like),
+               (relay.ceil, np.ceil),
+               (relay.floor, np.floor),
+               (relay.trunc, np.trunc),
+               (relay.round, np.round),
+               (relay.abs, np.abs),
+               (relay.copy, None), # np.copy
+               (relay.negative, np.negative)]:
+        shape = (8, 9, 4)
+        x = relay.var("x", relay.TensorType(shape, "float32"))
+        y = op(x)
+        yy = relay.ir_pass.infer_type(y)
+        assert yy.checked_type == relay.TensorType(shape, "float32")
+
+        if ref is not None:
+            data = np.random.rand(*shape).astype('float32')
+            intrp = create_executor()
+            op_res = intrp.evaluate(y, { x: relay.const(data) })
+            ref_res = ref(data)
+            np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
+
+def test_cast():
+    x = relay.var("x", relay.TensorType((8, 9, 4), "float32"))
+    y = x.astype("int32")
+    yy = relay.ir_pass.infer_type(y)
+    assert "dtype=" in yy.astext()
+    assert yy.checked_type == relay.TensorType((8, 9, 4), "int32")
+
+    x = relay.var("x", relay.TensorType((8, 9, 4), "float32"))
+    y = relay.cast(x, "int32")
+    yy = relay.ir_pass.infer_type(y)
+    assert "dtype=" in yy.astext()
+    assert yy.checked_type == relay.TensorType((8, 9, 4), "int32")
+
+def test_clip():
+    a = relay.var("a", relay.TensorType((10, 4), "float32"))
+    y = relay.clip(a, 1., 4.)
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((10, 4), "float32")
+
+    data = np.random.rand(10, 4).astype('float32')
+    intrp = create_executor()
+    op_res = intrp.evaluate(y, { a: relay.const(data) })
+    ref_res = np.clip(data, 1., 4.)
+    np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
+
+
+def test_squeeze():
+    def verify_squeeze(shape, dtype, axis):
+        x = relay.var("x", relay.TensorType(shape, dtype))
+        squeeze = relay.squeeze(x, axis=axis)
+
+        np_axis = tuple(axis) if axis is not None else None
+
+        data = np.random.random_sample(shape).astype(dtype)
+        intrp = create_executor()
+        op_res = intrp.evaluate(squeeze, { x : relay.const(data) })
+        ref_res = np.squeeze(data, axis=np_axis)
+        np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
+
+    verify_squeeze((1, 3, 2, 5), "float32", None)
+    verify_squeeze((1, 3, 1), "float32", [0])
+    verify_squeeze((1, 2, 1, 2, 1), "float32", [0, 2])
+
+
+def test_transpose_infer_type():
+    n, t, d = tvm.var("n"), tvm.var("t"), 100
+    x = relay.var("x", relay.TensorType((n, t, d), "float32"))
+    y = relay.transpose(x, axes=(1, 0, 2))
+    assert "axes=" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType(
+        (t, n, 100), "float32")
+
+    y = relay.transpose(x)
+    assert "axes=" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType(
+        (100, t, n), "float32")
+
+
+def test_transpose():
+    def verify_transpose(dshape, axes):
+        x = relay.var("x", relay.TensorType(dshape, "float32"))
+        z = relay.transpose(x, axes=axes)
+
+        func = relay.Function([x], z)
+        x_data = np.random.uniform(low=-1, high=1, size=dshape).astype("float32")
+        ref_res = np.transpose(x_data, axes=axes)
+
+        for target, ctx in ctx_list():
+            for kind in ["graph", "debug"]:
+                intrp = relay.create_executor(kind, ctx=ctx, target=target)
+                op_res = intrp.evaluate(func)(x_data)
+                tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
+    verify_transpose((2, 3, 4), (0, 2, 1))
+
+
+def test_squeeze_infer_type():
+    n, t, d = 1, 4, 1
+    x = relay.var("x", relay.TensorType((n, t, d), "float32"))
+    y = relay.squeeze(x, axis=(2,))
+    assert "axis=" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType(
+        (1, 4), "float32")
+
+    n, t, d = 1, 4, 1
+    x = relay.var("x", relay.TensorType((n, t, d), "float32"))
+    y = relay.squeeze(x)
+    assert "axis=" not in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType(
+        (4,), "float32")
+
+
+@raises(tvm._ffi.base.TVMError)
+def test_squeeze_bad_axes_infer_type():
+    n, t, d = 1, 4, 1
+    x = relay.var("x", relay.TensorType((n, t, d), "float32"))
+    y = relay.squeeze(x, axis=(1,))
+    yy = relay.ir_pass.infer_type(y)
+
+
+def test_reshape_infer_type():
+    n, t, d1, d2 = 10, 20, 100, 20
+    x = relay.var("x", relay.TensorType((n, t, d1, d2), "float32"))
+    y = relay.reshape(x, newshape=(n, t, 2000))
+    assert "newshape=" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType(
+        (n, t, 2000), "float32")
+
+def test_reshape():
+    def verify_reshape(shape, oshape):
+        x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32")
+        ref_res = np.reshape(x_data, oshape)
+
+        x = relay.var("x", relay.TensorType(shape, "float32"))
+        z = relay.reshape(x, newshape=ref_res.shape)
+        zz = relay.ir_pass.infer_type(z)
+        assert "newshape=" in z.astext()
+        assert zz.checked_type == relay.ty.TensorType(oshape, "float32")
+
+        func = relay.Function([x], z)
+
+        for target, ctx in ctx_list():
+            for kind in ["graph", "debug"]:
+                intrp = relay.create_executor(kind, ctx=ctx, target=target)
+                op_res = intrp.evaluate(func)(x_data)
+                tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
+    verify_reshape((2, 3, 4), (8, 3))
+    verify_reshape((4, 7), (2, 7, 2))
+
+def test_reshape_like_infer_type():
+    # concrete shape
+    x = relay.var("x", relay.TensorType((1, 2, 3), "float32"))
+    y = relay.var("y", relay.TensorType((1,6), "float32"))
+    z = relay.reshape_like(x, y)
+    zz = relay.ir_pass.infer_type(z)
+    assert zz.checked_type == relay.TensorType((1, 6), "float32")
+
+    # symbolic shape
+    n, c, h, w = tvm.var("n"), 2, 3, tvm.var("w")
+    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
+    y = relay.var("y", relay.TensorType((1, 8, 8), "float32"))
+    z = relay.reshape_like(x, y)
+    zz = relay.ir_pass.infer_type(z)
+    assert zz.checked_type == relay.TensorType((1, 8, 8), "float32")
+
+
+def test_reshape_like():
+    def verify_reshape_like(shape, oshape):
+        x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32")
+        y_data = np.random.uniform(low=-1, high=1, size=oshape).astype("float32")
+        ref_res = np.reshape(x_data, y_data.shape)
+
+        x = relay.var("x", relay.TensorType(shape, "float32"))
+        y = relay.var("x", relay.TensorType(oshape, "float32"))
+        z = relay.reshape_like(x, y)
+        zz = relay.ir_pass.infer_type(z)
+        assert zz.checked_type == relay.ty.TensorType(ref_res.shape, "float32")
+
+        func = relay.Function([x, y], z)
+
+        for target, ctx in ctx_list():
+            for kind in ["graph", "debug"]:
+                intrp = relay.create_executor(kind, ctx=ctx, target=target)
+                op_res = intrp.evaluate(func)(x_data, y_data)
+                tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
+
+    verify_reshape_like((2, 3, 4), (1, 8, 3))
+    verify_reshape_like((4, 7), (2, 7, 2))
+
+def test_take_infer_type():
+    def verify_take(dshape, indices_shape, oshape, axis=None):
+        x = relay.var("x", relay.TensorType(dshape, "float32"))
+        indices = relay.var("indices", relay.TensorType(indices_shape, "int32"))
+        y = relay.take(x, indices, axis=axis)
+        y.astext()
+        yy = relay.ir_pass.infer_type(y)
+        assert yy.checked_type == relay.TensorType(oshape, "float32")
+
+    d1, d2, d3 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3")
+    d4, d5, d6 = tvm.var("d4"), tvm.var("d5"), tvm.var("d6")
+    verify_take((d1,), (1,), (1,), 0)
+    verify_take((4,), (d1, d2), (d1, d2))
+    verify_take((3, 3, 3), (1, d2), (1, d2))
+    verify_take((d1, d2), (d3, d4, d5), (d3, d4, d5, d2), 0)
+    verify_take((d1, d2), (d3, d4, d5), (d1, d3, d4, d5), 1)
+    verify_take((d1, d2, d3, d4), (d5, d6), (d1, d2, d5, d6, d4), -2)
+
+def test_take():
+    def verify_take(src_shape, indices_src, axis=None):
+        src_dtype = "float32"
+        indices_dtype = "int32"
+        indices_src = np.array(indices_src, dtype=indices_dtype)
+        x = relay.var("x", relay.TensorType(src_shape, src_dtype))
+        indices = relay.var("indices", relay.TensorType(indices_src.shape, indices_dtype))
+        z = relay.take(x, indices, axis=axis)
+
+        func = relay.Function([x, indices], z)
+        x_data = np.random.uniform(low=-1, high=1, size=src_shape).astype(src_dtype)
+        ref_res = np.take(x_data, indices=indices_src, axis=axis)
+
+        for target, ctx in ctx_list():
+            for kind in ["graph", "debug"]:
+                intrp = relay.create_executor(kind, ctx=ctx, target=target)
+                op_res = intrp.evaluate(func)(x_data, indices_src)
+                tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
+
+    verify_take((4,), [1])
+    verify_take((4,), [[0,1,2,3]])
+    verify_take((3,3,3), [[11,25]])
+    verify_take((4,), [[0,1],[2,3]])
+    verify_take((4,), [1], 0)
+    verify_take((2,2), [[[1,0],[0,1]]], 0)
+    verify_take((2,2), [[[1,0],[0,1]]], 1)
+    verify_take((4,3,5,6), [[2,1,0,0]], -2)
+
+
+def test_split_infer_type():
+    def verify_split(dshape, indices_or_sections, ret_type, axis=None):
+        x = relay.var("x", relay.ty.TensorType(dshape, "float32"))
+        y = relay.split(x, indices_or_sections, axis=axis)
+        y.astext()
+        yy = relay.ir_pass.infer_type(y.astuple())
+        assert yy.checked_type == ret_type
+
+    d1, d2, d3, d4 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3"), tvm.var("d4")
+    axis = tvm.var("axis")
+    verify_split((5, 5, 2, 2), 5,
+                 relay.ty.TupleType(tvm.convert([
+                     relay.ty.TensorType((5, 1, 2, 2), "float32"),
+                     relay.ty.TensorType((5, 1, 2, 2), "float32"),
+                     relay.ty.TensorType((5, 1, 2, 2), "float32"),
+                     relay.ty.TensorType((5, 1, 2, 2), "float32"),
+                     relay.ty.TensorType((5, 1, 2, 2), "float32")])),
+                  axis=1)
+    verify_split((5, 5, 2, 2), 5,
+                 relay.ty.TupleType(tvm.convert([
+                     relay.ty.TensorType((1, 5, 2, 2), "float32"),
+                     relay.ty.TensorType((1, 5, 2, 2), "float32"),
+                     relay.ty.TensorType((1, 5, 2, 2), "float32"),
+                     relay.ty.TensorType((1, 5, 2, 2), "float32"),
+                     relay.ty.TensorType((1, 5, 2, 2), "float32")])),
+                  axis=0)
+    verify_split((d1, d2, d3, d4), 4,
+                 relay.ty.TupleType(tvm.convert([
+                     relay.ty.TensorType((d1, d2, d3/4, d4), "float32"),
+                     relay.ty.TensorType((d1, d2, d3/4, d4), "float32"),
+                     relay.ty.TensorType((d1, d2, d3/4, d4), "float32"),
+                     relay.ty.TensorType((d1, d2, d3/4, d4), "float32")])),
+                  axis=2)
+    verify_split((d1, d2, d3, d4), 2,
+                 relay.ty.TupleType(tvm.convert([
+                     relay.ty.TensorType((d1/2, d2, d3, d4), "float32"),
+                     relay.ty.TensorType((d1/2, d2, d3, d4), "float32")])),
+                  axis=0)
+    verify_split((d1, d2, d3, d4), (2, 4, 7),
+                 relay.ty.TupleType(tvm.convert([
+                     relay.ty.TensorType((d1, 2, d3, d4), "float32"),
+                     relay.ty.TensorType((d1, 2, d3, d4), "float32"),
+                     relay.ty.TensorType((d1, 3, d3, d4), "float32"),
+                     relay.ty.TensorType((d1, (d2-7), d3, d4), "float32")])),
+                  axis=1)
+
+def test_full_infer_type():
+    # default settings: match input dtype
+    x = relay.var("x", relay.TensorType((), "int8"))
+    y = relay.full(x, ())
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((), "int8")
+
+    # change the shape and dtype
+    x = relay.var("x", relay.TensorType((), "float32"))
+    y = relay.full(x, (1, 2), "int8")
+    "shape=" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((1, 2), "int8")
+
+
+def test_full():
+    def verify_full(fill_value, src_shape, dtype):
+        x = relay.var("x", relay.scalar_type(dtype))
+        z = relay.full(x, src_shape, dtype)
+        func = relay.Function([x], z)
+        ref_res = np.full(src_shape, fill_value)
+        for target, ctx in ctx_list():
+            for kind in ["graph", "debug"]:
+                intrp = relay.create_executor(kind, ctx=ctx, target=target)
+                op_res = intrp.evaluate(func)(fill_value)
+                tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
+    verify_full(4, (1, 3, 4, 4), "int32")
+    verify_full(4.0, (1, 4), "float32")
+
+
+def test_full_like_infer_type():
+    # concrete shape
+    base = relay.var("base", relay.TensorType((1, 2, 3), "float32"))
+    fill = relay.var("fill", relay.TensorType((), "float32"))
+    y = relay.full_like(base, fill)
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((1, 2, 3), "float32")
+
+    # symbolic shape
+    n, c, h, w = tvm.var("n"), 2, 3, tvm.var("w")
+    base = relay.var("base", relay.TensorType((n, c, h, w), "float32"))
+    fill = relay.var("fill", relay.TensorType((), "float32"))
+    y = relay.full_like(base, fill)
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n, c, h, w), "float32")
+
+
+def test_full_like():
+    def verify_full_like(base, fill_value, dtype):
+        x_data = np.random.uniform(low=-1, high=1, size=base).astype(dtype)
+        x = relay.var("x", relay.TensorType(base, dtype))
+        y = relay.var("y", relay.scalar_type(dtype))
+        z = relay.full_like(x, y)
+
+        func = relay.Function([x, y], z)
+        ref_res = np.full_like(x_data, fill_value)
+
+        for target, ctx in ctx_list():
+            for kind in ["graph", "debug"]:
+                intrp = relay.create_executor(kind, ctx=ctx, target=target)
+                op_res = intrp.evaluate(func)(x_data, fill_value)
+                tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
+    verify_full_like((1, 3, 4, 4), 4, "int32")
+    verify_full_like((1, 1), 44.0, "float32")
+
+
+def test_infer_type_leaky_relu():
+    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
+    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
+    y = relay.nn.leaky_relu(x, alpha=0.1)
+    "alpha=0.1" in y.astext()
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.TensorType((n, c, h, w), "float32")
+
+    shape = (1, 5, 10, 10)
+    dtype = "float32"
+    x = relay.var("x", relay.TensorType(shape, dtype))
+    z = relay.nn.leaky_relu(x, alpha=0.1)
+    assert "alpha=0.1" in z.astext()
+    yy = relay.ir_pass.infer_type(z)
+    assert yy.checked_type == relay.TensorType(shape, dtype)
+    func = relay.Function([x], z)
+    x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype)
+    ref_res = np.where(x_data > 0, x_data, x_data * 0.1)
+
+    for target, ctx in ctx_list():
+        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+        op_res1 = intrp1.evaluate(func)(x_data)
+        tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
+        op_res2 = intrp2.evaluate(func)(x_data)
+        tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
+
+def verify_infer_type_prelu(data, alpha, axis, output, dtype="float32"):
+    x = relay.var("data", relay.TensorType(data, dtype))
+    if alpha:
+        y = relay.var("alpha", relay.TensorType(alpha, dtype))
+    else:
+        y = relay.var("alpha", relay.IncompleteType())
+    z = relay.nn.prelu(x, y, axis=axis)
+    zz = relay.ir_pass.infer_type(z)
+    if axis != 1:
+        assert "axis" in z.astext()
+    assert zz.checked_type == relay.ty.TensorType(output, dtype)
+    if not alpha:
+        axis = axis if axis else 1
+        alpha_shape = (data[axis],)
+        assert zz.args[1].checked_type == relay.TensorType(alpha_shape, "float32")
+
+    if all(isinstance(v, tvm.expr.Var) == 1 for v in data) or not alpha:
+        return
+
+    func = relay.Function([x, y], z)
+    x_data = np.random.uniform(low=-1, high=1, size=data).astype(dtype)
+    a_data = np.random.uniform(low=-1, high=1, size=alpha).astype(dtype)
+
+    if axis == 1:
+        ref_res = (x_data < 0) * (x_data * a_data.reshape(3, 1, 1)) + (x_data>=0) * x_data
+    else:
+        ref_res = (x_data < 0) * (x_data * a_data.reshape(1, 1, 3)) + (x_data>=0) * x_data
+
+    for target, ctx in ctx_list():
+        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+        op_res1 = intrp1.evaluate(func)(x_data, a_data)
+        tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
+        op_res2 = intrp2.evaluate(func)(x_data, a_data)
+        tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
+
+
+def test_infer_type_prelu():
+    n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
+    verify_infer_type_prelu((n, c, h, w), (c,), 1, (n, c, h, w))
+    verify_infer_type_prelu((n, h, w, c), (c,), 3, (n, h, w, c))
+    verify_infer_type_prelu((n, c, h, w), None, 1, (n, c, h, w))
+    verify_infer_type_prelu((n, h, w, c), None, 3, (n, h, w, c))
+    verify_infer_type_prelu((1, 3, 2, 2), (3,), 1, (1, 3, 2, 2))
+    verify_infer_type_prelu((1, 2, 2, 3), (3,), 3, (1, 2, 2, 3))
+    verify_infer_type_prelu((1, 3, 2, 2), None, 1, (1, 3, 2, 2))
+    verify_infer_type_prelu((1, 2, 2, 3), None, 3, (1, 2, 2, 3))
+
+if __name__ == "__main__":
+    test_cast()
+    test_zeros_ones()
+    test_unary_identity()
+    test_clip()
+    test_transpose_infer_type()
+    test_transpose()
+    test_reshape_infer_type()
+    test_reshape()
+    test_reshape_like_infer_type()
+    test_reshape_like()
+    test_take_infer_type()
+    test_take()
+    test_full_infer_type()
+    test_full()
+    test_full_like_infer_type()
+    test_full_like()
+    test_infer_type_leaky_relu()
+    test_infer_type_prelu()
+    test_squeeze()
+    test_squeeze_infer_type()
+    test_squeeze_bad_axes_infer_type()
+    test_split_infer_type()
diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py
new file mode 100644
index 000000000000..db478ff251c5
--- /dev/null
+++ b/tests/python/relay/test_op_level4.py
@@ -0,0 +1,238 @@
+import tvm
+import numpy as np
+from tvm import relay
+from tvm.relay.testing import ctx_list
+import topi.testing
+
+def test_binary_op():
+    def check_binary_op(opfunc, ref):
+        n = tvm.var("n")
+        t1 = relay.TensorType((5, n, 5))
+        t2 = relay.TensorType((n, 1))
+        x = relay.var("x", t1)
+        y = relay.var("y", t2)
+        z = opfunc(x, y)
+        # test printer
+        assert ("%0 = {}(%x, %y)".format(z.op.name)) in z.astext()
+        assert relay.ir_pass.infer_type(z).checked_type == t1
+
+        if ref is not None:
+            t1 = relay.TensorType((5, 10, 5))
+            t2 = relay.TensorType((5, 10, 5))
+            x = relay.var("x", t1)
+            y = relay.var("y", t2)
+            z = opfunc(x, y)
+            x_data = np.random.rand(5, 10, 5).astype(t1.dtype)
+            y_data = np.random.rand(5, 10, 5).astype(t2.dtype)
+            ref_res = ref(x_data, y_data)
+            func = relay.Function([x, y], z)
+
+            for target, ctx in ctx_list():
+                intrp = relay.create_executor("graph", ctx=ctx, target=target)
+                op_res = intrp.evaluate(func)(x_data, y_data)
+                tvm.testing.assert_allclose(op_res.asnumpy(), ref_res)
+
+    for opfunc, ref in [(relay.power, np.power)]:
+        check_binary_op(opfunc, ref)
+
+
+def test_cmp_type():
+    for op, ref in ((relay.greater, np.greater),
+               (relay.greater_equal, np.greater_equal),
+               (relay.less, np.less),
+               (relay.less_equal, np.less_equal),
+               (relay.equal, np.equal),
+               (relay.not_equal, np.not_equal)):
+        x = relay.var("x", relay.TensorType((10, 4), "float32"))
+        y = relay.var("y", relay.TensorType((5, 10, 1), "float32"))
+        z = op(x, y)
+        z.astext()
+        zz = relay.ir_pass.infer_type(z)
+        assert zz.checked_type == relay.TensorType((5, 10, 4), "bool")
+
+        if ref is not None:
+            x_shape = (10, 4)
+            y_shape = (5, 10, 1)
+            t1 = relay.TensorType(x_shape)
+            t2 = relay.TensorType(y_shape)
+            x = relay.var("x", t1)
+            y = relay.var("y", t2)
+            z = op(x, y)
+            x_data = np.random.rand(*x_shape).astype(t1.dtype)
+            y_data = np.random.rand(*y_shape).astype(t2.dtype)
+            ref_res = ref(x_data, y_data)
+            func = relay.Function([x, y], z)
+
+            for target, ctx in ctx_list():
+                intrp = relay.create_executor("graph", ctx=ctx, target=target)
+                op_res = intrp.evaluate(func)(x_data, y_data)
+                tvm.testing.assert_allclose(op_res.asnumpy(), ref_res)
+
+
+def test_binary_int_broadcast():
+    for op, ref in [(relay.right_shift, np.right_shift),
+               (relay.left_shift, np.left_shift),
+                (relay.mod, np.mod),
+               (relay.maximum, np.maximum),
+               (relay.minimum, np.minimum)]:
+        x = relay.var("x", relay.TensorType((10, 4), "int32"))
+        y = relay.var("y", relay.TensorType((5, 10, 1), "int32"))
+        z = op(x, y)
+        zz = relay.ir_pass.infer_type(z)
+        assert zz.checked_type == relay.TensorType((5, 10, 4), "int32")
+
+    if ref is not None:
+        x_shape = (10, 4)
+        y_shape = (5, 10, 1)
+        t1 = relay.TensorType(x_shape, 'int32')
+        t2 = relay.TensorType(y_shape, 'int32')
+        x_data = np.random.rand(*x_shape).astype(t1.dtype)
+        y_data = np.random.rand(*y_shape).astype(t2.dtype)
+        func = relay.Function([x, y], z)
+        ref_res = ref(x_data, y_data)
+
+        for target, ctx in ctx_list():
+            intrp = relay.create_executor("graph", ctx=ctx, target=target)
+            op_res = intrp.evaluate(func)(x_data, y_data)
+            tvm.testing.assert_allclose(op_res.asnumpy(), ref_res)
+
+
+def test_where():
+    shape = (3, 4)
+    dtype = "float32"
+    cond = relay.var("cond", relay.TensorType(shape, dtype))
+    x = relay.var("x", relay.TensorType(shape, dtype))
+    y = relay.var("y", relay.TensorType(shape, dtype))
+    z = relay.where(cond, x, y)
+    zz = relay.ir_pass.infer_type(z)
+    assert zz.checked_type == relay.TensorType(shape, dtype)
+
+    func = relay.Function([cond, x, y], z)
+    condition = np.random.uniform(low=-1, high=1, size=shape).astype(dtype)
+    x = np.random.uniform(size=shape).astype(dtype)
+    y = np.random.uniform(size=shape).astype(dtype)
+    ref_res = np.where(condition, x, y)
+    for target, ctx in ctx_list():
+        for kind in ["graph", "debug"]:
+            intrp = relay.create_executor(kind, ctx=ctx, target=target)
+            op_res = intrp.evaluate(func)(condition, x, y)
+            tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
+
+
+def verify_reduce(funcs, data, axis, keepdims, exclude, output, dtype="float32"):
+    test_func = funcs[0]
+    ref_func = funcs[1]
+
+    x = relay.var("x", relay.TensorType(data, dtype))
+    z = test_func(x, axis, keepdims, exclude)
+    zz = relay.ir_pass.infer_type(z)
+    if axis:
+        assert "axis=" in z.astext()
+    if keepdims:
+        assert "keepdims=" in z.astext()
+    if exclude:
+        assert "exclude=" in z.astext()
+    out_type = "int32" if test_func in [relay.argmin, relay.argmax] else dtype
+    assert zz.checked_type == relay.ty.TensorType(output, out_type)
+
+    if all(isinstance(v, tvm.expr.Var) == 1 for v in data) or len(output) == 0:
+        return
+
+    func = relay.Function([x], z)
+    x_data = np.random.uniform(size=data).astype(dtype)
+    if ref_func in [np.sum]:
+        ref_res = ref_func(x_data + 0, axis=axis, dtype=dtype, keepdims=keepdims)
+    elif ref_func in [np.max, np.min, np.mean, np.prod]:
+        ref_res = ref_func(x_data + 0, axis=axis, keepdims=keepdims)
+    else: #argmin/argmax
+        if axis and not isinstance(axis, int) and len(axis) > 1 :
+            return
+        ref_res = ref_func(x_data + 0, axis=axis, keepdims=keepdims)
+
+    for target, ctx in ctx_list():
+        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+        op_res1 = intrp1.evaluate(func)(x_data)
+        tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
+        op_res2 = intrp2.evaluate(func)(x_data)
+        tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
+
+def test_reduce_functions():
+    def _with_keepdims(func):
+        def _wrapper(data, axis=None, keepdims=False):
+            if not keepdims:
+                return func(data, axis=axis)
+            else:
+                if axis is not None:
+                    axis = axis if isinstance(axis, int) else axis[0]
+                    out_shape = list(data.shape)
+                    out_shape[axis] = 1
+                else:
+                    out_shape = [1 for _ in range(len(data.shape))]
+                return func(data, axis=axis).reshape(out_shape)
+        return _wrapper
+
+    d1, d2, d3, d4 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3"), tvm.var("d4")
+    for func in [[relay.sum, np.sum],
+                 [relay.max, np.max],
+                 [relay.min, np.min],
+                 [relay.mean, np.mean],
+                 [relay.prod, np.prod],
+                 [relay.argmin, _with_keepdims(np.argmin)],
+                 [relay.argmax, _with_keepdims(np.argmax)]]:
+        verify_reduce(func, (d1, d2, d3, d4), 2, True, False, (d1, d2, 1, d4))
+        verify_reduce(func, (d1, d2, d3), 1, True, False, (d1, 1, d3))
+        verify_reduce(func, (d1, d2, d3), None, True, False, (1, 1, 1))
+        verify_reduce(func, (d1, d2, d3), (0, 1), True, False, (1, 1, d3))
+        verify_reduce(func, (2, 3, 4), 1, True, False, (2, 1, 4))
+        verify_reduce(func, (2, 3, 4), (1,), True, False, (2, 1, 4))
+        verify_reduce(func, (2, 3, 4), (0, 1, 2), False, False, ())
+        verify_reduce(func, (4, 4, 3), None, False, True, ())
+        verify_reduce(func, (4, 4, 3), (0, 2), False, False, (4,))
+        verify_reduce(func, (128, 24, 128), (0, 1), False, False, (128,))
+        verify_reduce(func, (128, 24, 128), (0, 2), False, False, (24,))
+        verify_reduce(func, (128, 24, 128), (0, 1), True, False, (1, 1, 128))
+        verify_reduce(func, (128, 24, 128), (0, 2), True, False, (1, 24, 1))
+
+
+def test_strided_slice():
+    def verify(dshape, begin, end, strides, output, test_ref=True):
+        x = relay.var("x", relay.TensorType(dshape, "float32"))
+        z = relay.strided_slice(x, begin=begin, end=end, strides=strides)
+        func = relay.Function([x], z)
+        func = relay.ir_pass.infer_type(func)
+        text = func.astext()
+        assert "begin=" in text
+        assert "end=" in text
+        if output:
+            assert func.body.checked_type == relay.ty.TensorType(output, "float32")
+        if not test_ref:
+            return
+        x_data = np.random.uniform(size=dshape).astype("float32")
+        ref_res = topi.testing.strided_slice_python(
+            x_data, begin, end, strides)
+        for target, ctx in ctx_list():
+            intrp = relay.create_executor("graph", ctx=ctx, target=target)
+            op_res = intrp.evaluate(func)(x_data)
+            tvm.testing.assert_allclose(op_res.asnumpy(), ref_res)
+
+    d1, d2, d3, d4 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3"), tvm.var("d4")
+    verify((d1, d2, 3), [None, None, 1], [None, None, 2], None, (d1, d2, 1), False)
+    verify((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2], (3, 1, 2))
+    verify((3, 4, 3), [1, 1, 0], [4, 4, 3], [2, 1, 1], (1, 3, 3))
+    verify((3, 4, 3), [1, -1, 0], [4, -5, 3], [2, -1, 1], (1, 4, 3))
+    verify((3, 4, 3), [1, 0, 0], [2, 2, 3], [1, 1, 2], (1, 2, 2))
+    verify((3, 4, 3), [1, -1, 0], [2, -3, 3], [1, -1, 1], (1, 2, 3))
+    verify((3, 4, 3), [1, 1, 0], [4, 4, 3], None, (2, 3, 3))
+    verify((3, 4, 3), [1, 1, 0], [4, 1000, 3], None, (2, 3, 3))
+    verify((3, 4, 3), [1, 1, 0], [4, 4], None, (2, 3, 3))
+    verify((3, 4, 3), [1, 1], [4, 4, 3], None, (2, 3, 3))
+
+
+if __name__ == "__main__":
+    test_strided_slice()
+    test_binary_op()
+    test_cmp_type()
+    test_binary_int_broadcast()
+    test_where()
+    test_reduce_functions()
diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py
new file mode 100644
index 000000000000..aa31aa96ef45
--- /dev/null
+++ b/tests/python/relay/test_op_level5.py
@@ -0,0 +1,281 @@
+""" Support level5 operator test cases.
+"""
+import math
+import numpy as np
+import tvm
+from tvm import relay
+from tvm.relay.testing import ctx_list
+import topi.testing
+
+
+def test_resize_infer_type():
+    n, c, h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
+    x = relay.var("x", relay.TensorType((n, c, h, w), "int8"))
+    th, tw = tvm.var("th"), tvm.var("tw")
+    z = relay.image.resize(x, (th, tw))
+    zz = relay.ir_pass.infer_type(z)
+    assert zz.checked_type == relay.TensorType((n, c, th, tw), "int8")
+
+    x = relay.var("x", relay.TensorType((n, c, h, w), "int8"))
+    z= relay.image.resize(x, (100, 200), "NCHW", "BILINEAR", False)
+    assert "size=" in z.astext()
+    zz = relay.ir_pass.infer_type(z)
+    assert zz.checked_type == relay.TensorType((n, c, 100, 200), "int8")
+
+def test_resize():
+    def verify_resize(dshape, scale, method, layout):
+        if layout == "NHWC":
+            size = (dshape[1] * scale, dshape[2] * scale)
+        else:
+            size = (dshape[2] * scale, dshape[3] * scale)
+
+        x_data = np.random.uniform(size=dshape).astype("float32")
+        if method == "BILINEAR":
+            ref_res = topi.testing.bilinear_resize_python(x_data, size, layout)
+        else:
+            ref_res = topi.testing.upsampling_python(x_data, scale, layout)
+        x = relay.var("x", relay.TensorType(dshape, "float32"))
+        z = relay.image.resize(x, size, layout, method, False)
+        assert "size=" in z.astext()
+        zz = relay.ir_pass.infer_type(z)
+        assert zz.checked_type == relay.TensorType(ref_res.shape, "float32")
+        func = relay.Function([x], z)
+
+        for target, ctx in ctx_list():
+            for kind in ["graph", "debug"]:
+                intrp = relay.create_executor(kind, ctx=ctx, target=target)
+                op_res = intrp.evaluate(func)(x_data)
+                tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5)
+    for method in ["BILINEAR", "NEAREST_NEIGHBOR"]:
+        for layout in ["NHWC", "NCHW"]:
+            verify_resize((1, 4, 4, 4), 2, method, layout)
+
+
+def test_multibox_prior():
+    def get_ref_result(dshape, sizes=(1.0,),
+                       ratios=(1.0,), steps=(-1.0, -1.0),
+                       offsets=(0.5, 0.5), clip=True):
+        in_height = dshape[2]
+        in_width = dshape[3]
+        num_sizes = len(sizes)
+        num_ratios = len(ratios)
+        size_ratio_concat = sizes + ratios
+        steps_h = steps[0] if steps[0] > 0 else 1.0 / in_height
+        steps_w = steps[1] if steps[1] > 0 else 1.0 / in_width
+        offset_h = offsets[0]
+        offset_w = offsets[1]
+
+        oshape = (1, in_height * in_width * (num_sizes + num_ratios - 1), 4)
+        dtype = "float32"
+        np_out = np.zeros(oshape).astype(dtype)
+
+        for i in range(in_height):
+            center_h = (i + offset_h) * steps_h
+            for j in range(in_width):
+                center_w = (j + offset_w) * steps_w
+                for k in range(num_sizes + num_ratios - 1):
+                    w = size_ratio_concat[k] * in_height / in_width / 2.0 if k < num_sizes else \
+                        size_ratio_concat[0] * in_height / in_width * math.sqrt(size_ratio_concat[k + 1]) / 2.0
+                    h = size_ratio_concat[k] / 2.0 if k < num_sizes else \
+                        size_ratio_concat[0] / math.sqrt(size_ratio_concat[k + 1]) / 2.0
+                    count = i * in_width * (num_sizes + num_ratios - 1) + j * (num_sizes + num_ratios - 1) + k
+                    np_out[0][count][0] = center_w - w
+                    np_out[0][count][1] = center_h - h
+                    np_out[0][count][2] = center_w + w
+                    np_out[0][count][3] = center_h + h
+        if clip:
+            np_out = np.clip(np_out, 0, 1)
+
+        return np_out
+
+    def verify_multibox_prior(x, dshape, ref_res, sizes=(1.0,),
+                              ratios=(1.0,), steps=(-1.0, -1.0),
+                              offsets=(0.5, 0.5), clip=True, check_size=False,
+                              check_type_only=False):
+
+        z = relay.vision.multibox_prior(x, sizes, ratios, steps, offsets, clip)
+        zz = relay.ir_pass.infer_type(z)
+        if check_size:
+            assert "sizes=" in z.astext()
+        assert zz.checked_type == relay.TensorType(
+            (1, dshape[2] * dshape[3] * (len(sizes) + len(ratios) - 1), 4),
+            "float32")
+
+        if check_type_only:
+            return
+
+        data = np.random.uniform(low=-1, high=1, size=dshape).astype("float32")
+        func = relay.Function([x], z)
+        func = relay.ir_pass.infer_type(func)
+        for target, ctx in ctx_list():
+            intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+            op_res1 = intrp1.evaluate(func)(data)
+            tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
+            intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+            op_res2 = intrp2.evaluate(func)(data)
+            tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
+
+    sizes = (0.3, 1.5, 0.7)
+    ratios = (1.3, 2.4)
+    steps = (2.0, 1.5)
+    offsets = (0.2, 0.3)
+    dshape = (1, 3, 56, 56)
+    ref_res = get_ref_result(dshape, sizes, ratios, steps, offsets)
+    x = relay.var("x", relay.TensorType(dshape, "float32"))
+    verify_multibox_prior(x, dshape, ref_res, sizes, ratios, steps, offsets,
+                          check_size=True)
+    y = relay.var("y", relay.TensorType((tvm.var("n"), 3, 56, 56), "float32"))
+    verify_multibox_prior(x, dshape, ref_res, sizes, ratios, steps, offsets,
+                          check_size=True, check_type_only=True)
+
+    dshape = (1, 24, 32, 32)
+    ref_res = get_ref_result(dshape, clip=False)
+    x = relay.var("x", relay.TensorType(dshape, "float32"))
+    verify_multibox_prior(x, dshape, ref_res, clip=False)
+    y = relay.var("y", relay.TensorType((tvm.var("n"), 24, 32, 32), "float32"))
+    verify_multibox_prior(x, dshape, ref_res, clip=False, check_type_only=True)
+
+
+def test_nms():
+    def verify_nms(x0_data, x1_data, dshape, ref_res, valid_count,
+                   overlap_threshold=0.5, force_suppress=False, topk=-1,
+                   check_type_only=False):
+        x0 = relay.var("x0", relay.ty.TensorType(dshape, "float32"))
+        x1 = relay.var("x1", relay.ty.TensorType((dshape[0],), "int"))
+        z = relay.vision.nms(x0, x1, overlap_threshold, force_suppress, topk)
+        assert "overlap_threshold" in z.astext()
+        zz = relay.ir_pass.infer_type(z)
+        assert zz.checked_type == relay.ty.TensorType(dshape, "float32")
+
+        if check_type_only:
+            return
+
+        func = relay.Function([x0, x1], z)
+        func = relay.ir_pass.infer_type(func)
+        ctx_list = [("llvm", tvm.cpu(0))]
+        for target, ctx in ctx_list:
+            intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+            op_res1 = intrp1.evaluate(func)(x0_data, x1_data)
+            tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
+            intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+            op_res2 = intrp2.evaluate(func)(x0_data, x1_data)
+            tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
+
+    np_data = np.array([[[0, 0.8, 1, 20, 25, 45], [1, 0.7, 30, 60, 50, 80],
+                         [0, 0.4, 4, 21, 19, 40], [2, 0.9, 35, 61, 52, 79],
+                         [1, 0.5, 100, 60, 70, 110]]]).astype("float32")
+    np_valid_count = np.array([4]).astype("int32")
+    np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45],
+                           [0, 0.4, 4, 21, 19, 40], [-1, 0.9, 35, 61, 52, 79],
+                           [-1, -1, -1, -1, -1, -1]]])
+    num_anchors = 5
+
+    dshape = (tvm.var("n"), num_anchors, 6)
+    verify_nms(np_data, np_valid_count, dshape, np_result, dshape[0],
+               force_suppress=True, topk=2, check_type_only=True)
+    dshape = (1, num_anchors, 6)
+    verify_nms(np_data, np_valid_count, dshape, np_result, dshape[0],
+               force_suppress=True, topk=2, check_type_only=False)
+
+    np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45],
+                           [1, 0.7, 30, 60, 50, 80], [-1, 0.9, 35, 61, 52, 79],
+                           [-1, -1, -1, -1, -1, -1]]])
+    dshape = (tvm.var("n"), num_anchors, 6)
+    verify_nms(np_data, np_valid_count, dshape, np_result, dshape[0],
+               check_type_only=True)
+    dshape = (1, num_anchors, 6)
+    verify_nms(np_data, np_valid_count, dshape, np_result, dshape[0],
+               topk=3)
+
+
+def test_multibox_transform_loc():
+    def test_default_value():
+        num_anchors = 3
+        num_classes = 3
+
+        np_cls_prob = np.array(
+            [[[0.2, 0.5, 0.3], [0.25, 0.3, 0.45],
+              [0.7, 0.1, 0.2]]]).astype("float32")
+        np_loc_preds = np.array(
+            [[0.1, -0.2, 0.3, 0.2, 0.2, 0.4, 0.5, -0.3, 0.7, -0.2, -0.4,
+              -0.8]]).astype("float32")
+        np_anchors = np.array(
+            [[[-0.1, -0.1, 0.1, 0.1], [-0.2, -0.2, 0.2, 0.2],
+              [1.2, 1.2, 1.5, 1.5]]]).astype("float32")
+
+        expected_np_out = np.array([[[1, 0.69999999, 0, 0, 0.10818365, 0.10008108],
+                                     [0, 0.44999999, 1, 1, 1, 1],
+                                     [0, 0.30000001, 0, 0, 0.22903419, 0.20435292]]])
+
+
+        cls_prob = relay.var(
+            "cls_prob",
+            relay.ty.TensorType((1, num_anchors, num_classes), "float32"))
+        loc_pred = relay.var(
+            "loc_pred", relay.ty.TensorType((1, num_anchors * 4), "float32"))
+        anchors = relay.var(
+            "anchors", relay.ty.TensorType((1, num_anchors, 4), "float32"))
+
+        mtl = relay.vision.multibox_transform_loc(
+            cls_prob=cls_prob, loc_pred=loc_pred, anchor=anchors)
+        ret = relay.ir_pass.infer_type(mtl.astuple())
+        ref_type = relay.ty.TupleType(
+            tvm.convert([
+                relay.ty.TensorType((1, num_anchors, 6), "float32"),
+                relay.ty.TensorType((1, ), "int")
+            ]))
+
+        assert ret.checked_type == ref_type
+
+        nms = relay.vision.nms(mtl[0], mtl[1])
+        func = relay.Function([cls_prob, loc_pred, anchors], nms)
+        func = relay.ir_pass.infer_type(func)
+        ctx_list = [("llvm", tvm.cpu(0))]
+        for target, ctx in ctx_list:
+            intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
+            op_res1 = intrp1.evaluate(func)(np_cls_prob, np_loc_preds,
+                                            np_anchors)
+            tvm.testing.assert_allclose(op_res1.asnumpy(), expected_np_out, rtol=1e-5)
+            intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
+            op_res2 = intrp2.evaluate(func)(np_cls_prob, np_loc_preds,
+                                            np_anchors)
+            tvm.testing.assert_allclose(op_res2.asnumpy(), expected_np_out, rtol=1e-5)
+
+    def test_threshold():
+        num_anchors = 5
+        num_classes = 5
+        n = tvm.var("n")
+        cls_prob = relay.var(
+            "cls_prob",
+            relay.ty.TensorType((n, num_anchors, num_classes), "float32"))
+        loc_pred = relay.var(
+            "loc_pred", relay.ty.TensorType((n, num_anchors * 4), "float32"))
+        anchors = relay.var(
+            "anchors", relay.ty.TensorType((1, num_anchors, 4), "float32"))
+        threshold = 0.02
+        variances = (0.2, 0.2, 0.3, 0.3)
+
+        ret = relay.vision.multibox_transform_loc(
+            cls_prob=cls_prob,
+            loc_pred=loc_pred,
+            anchor=anchors,
+            threshold=threshold,
+            variances=variances)
+        ret = relay.ir_pass.infer_type(ret.astuple())
+        ref_type = relay.ty.TupleType(
+            tvm.convert([
+                relay.ty.TensorType((n, num_anchors, 6), "float32"),
+                relay.ty.TensorType((n, ), "int")
+            ]))
+        assert ret.checked_type == ref_type
+
+    test_default_value()
+    test_threshold()
+
+
+if __name__ == "__main__":
+    test_resize_infer_type()
+    test_resize()
+    test_multibox_prior()
+    test_multibox_transform_loc()
+    test_nms()
diff --git a/tests/python/relay/test_pass_alpha_equal.py b/tests/python/relay/test_pass_alpha_equal.py
new file mode 100644
index 000000000000..5158d5c7cc9c
--- /dev/null
+++ b/tests/python/relay/test_pass_alpha_equal.py
@@ -0,0 +1,504 @@
+import tvm
+import numpy as np
+from tvm import relay
+from tvm.relay import ir_pass
+
+def alpha_equal(x, y):
+    """
+    Wrapper around alpha equality which ensures that
+    the hash function respects equality.
+    """
+    return ir_pass.alpha_equal(x, y) and ir_pass.structural_hash(x) == ir_pass.structural_hash(y)
+
+def test_tensor_type_alpha_equal():
+    t1 = relay.TensorType((3, 4), "float32")
+    t2 = relay.TensorType((3, 4), "float32")
+    t3 = relay.TensorType((3, 4, 5), "float32")
+    assert t1 == t2
+    assert t1 != t3
+
+    t1 = relay.TensorType((), "float32")
+    t2 = relay.TensorType((), "float32")
+    assert t1 == t2
+
+
+def test_incomplete_type_alpha_equal():
+    t1 = relay.IncompleteType(relay.Kind.Shape)
+    t2 = relay.IncompleteType(relay.Kind.Type)
+    t3 = relay.IncompleteType(relay.Kind.Type)
+
+    # only equal when there is pointer equality
+    assert t2 == t2
+    assert t1 == t1
+    assert t1 != t2
+    assert t2 != t3
+
+
+def test_type_param_alpha_equal():
+    t1 = relay.TypeVar("v1", relay.Kind.Type)
+    t2 = relay.TypeVar("v2", relay.Kind.Shape)
+    t3 = relay.TypeVar("v3", relay.Kind.Type)
+
+    # only pointer equality and eq_map allow equal params
+    assert t1 == t1
+    assert t2 == t2
+    assert t1 != t2 # different kind
+    assert t1 != t3 # not in eq_map
+
+    # function types are the only way to put type params
+    # in eq map
+    ft1 = relay.FuncType(tvm.convert([]), t1, tvm.convert([t1]), tvm.convert([]))
+    ft2 = relay.FuncType(tvm.convert([]), t3, tvm.convert([t3]), tvm.convert([]))
+    # actually an invalid type because t2 is wrong kind
+    ft3 = relay.FuncType(tvm.convert([]), t2, tvm.convert([t2]), tvm.convert([]))
+
+    assert ft1 == ft2
+    assert ft1 != ft3 # kinds still do not match
+
+
+def test_func_type_alpha_equal():
+    t1 = relay.TensorType((1, 2), "float32")
+    t2 = relay.TensorType((1, 2, 3), "float32")
+
+    tp1 = relay.TypeVar("v1", relay.Kind.Type)
+    tp2 = relay.TypeVar("v2", relay.Kind.Type)
+    tp3 = relay.TypeVar("v3", relay.Kind.Shape)
+    tp4 = relay.TypeVar("v3", relay.Kind.Shape)
+
+    broadcast = tvm.get_env_func("tvm.relay.type_relation.Broadcast")
+    identity = tvm.get_env_func("tvm.relay.type_relation.Identity")
+
+    tr1 = relay.TypeRelation(broadcast, tvm.convert([tp1, tp3]), 1, None)
+    tr2 = relay.TypeRelation(broadcast, tvm.convert([tp2, tp4]), 1, None)
+    tr3 = relay.TypeRelation(identity, tvm.convert([tp1, tp3]), 1, None)
+
+    ft = relay.FuncType(tvm.convert([t1, t2]), tp1,
+                         tvm.convert([tp1, tp3]),
+                         tvm.convert([tr1]))
+    translate_vars = relay.FuncType(tvm.convert([t1, t2]), tp1,
+                         tvm.convert([tp2, tp4]),
+                         tvm.convert([tr2]))
+    assert ft == translate_vars
+
+    different_args = relay.FuncType(tvm.convert([t1]), tp1,
+                         tvm.convert([tp1, tp3]),
+                         tvm.convert([tr1]))
+    assert ft != different_args
+
+    different_order = relay.FuncType(tvm.convert([t2, t1]), tp1,
+                         tvm.convert([tp1, tp3]),
+                         tvm.convert([tr1]))
+    assert ft != different_order
+
+    no_rel = relay.FuncType(tvm.convert([t1, t2]), tp1,
+                         tvm.convert([tp1, tp3]),
+                         tvm.convert([]))
+    assert ft != no_rel
+
+    more_vars = relay.FuncType(tvm.convert([t1, t2]), tp2,
+                         tvm.convert([tp1, tp2, tp3]),
+                         tvm.convert([tr1]))
+    assert ft != more_vars
+
+    all_the_vars = relay.FuncType(tvm.convert([t1, t2]), tp1,
+                         tvm.convert([tp1, tp2, tp3, tp4]),
+                         tvm.convert([tr1, tr2]))
+    assert ft != all_the_vars
+
+    different_rel = relay.FuncType(tvm.convert([t1, t2]), tp1,
+                                   tvm.convert([tp1, tp3]),
+                                   tvm.convert([tr3]))
+    assert ft != different_rel
+
+    more_rels = relay.FuncType(tvm.convert([t1, t2]), tp1,
+                                   tvm.convert([tp1, tp3]),
+                                   tvm.convert([tr1, tr3]))
+    assert ft != more_rels
+
+
+def test_tuple_type_alpha_equal():
+    t1 = relay.TensorType((1, 2, 3), "float32")
+    t2 = relay.TensorType((1, 2, 3, 4), "float32")
+    tp1 = relay.TypeVar("v1", relay.Kind.Type)
+    tp2 = relay.TypeVar("v2", relay.Kind.Type)
+
+    tup1 = relay.TupleType(tvm.convert([t1, t2, tp1]))
+    tup2 = relay.TupleType(tvm.convert([t1, t2, tp1]))
+    tup3 = relay.TupleType(tvm.convert([t2, t1, tp1]))
+    tup4 = relay.TupleType(tvm.convert([t1, t2, tp2]))
+
+    # as long as types are alpha-equal and in same order,
+    # tuples should be alpha-equal
+    assert tup1 == tup2
+    assert tup1 != tup3
+    assert tup1 != tup4
+
+
+def test_type_relation_alpha_equal():
+    t1 = relay.TensorType((1, 2), "float32")
+    t2 = relay.TensorType((1, 2, 3), "float32")
+    t3 = relay.TensorType((1, 2, 3, 4), "float32")
+
+    # functions are compared only by pointer equality so
+    # we need to be sure to use the same pointers
+    broadcast = tvm.get_env_func("tvm.relay.type_relation.Broadcast")
+    identity = tvm.get_env_func("tvm.relay.type_relation.Identity")
+
+    # attrs are also compared only by pointer equality
+    attr1 = tvm.make.node("attrs.TestAttrs", name="attr", padding=(3,4))
+    attr1_same = tvm.make.node("attrs.TestAttrs", name="attr", padding=(3,4))
+    attr2 = tvm.make.node("attrs.TestAttrs", name="attr", padding=(3,4,4))
+
+    tr = relay.TypeRelation(broadcast, tvm.convert([t1, t2]), 1, attr1)
+    same = relay.TypeRelation(broadcast, tvm.convert([t1, t2]), 1, attr1)
+    diff_func = relay.TypeRelation(identity, tvm.convert([t1, t2]), 1, attr1)
+    diff_order = relay.TypeRelation(broadcast, tvm.convert([t2, t1]), 1, attr1)
+    diff_args = relay.TypeRelation(broadcast, tvm.convert([t2, t3]), 1, attr1)
+    diff_attr = relay.TypeRelation(broadcast, tvm.convert([t1, t2]), 1, attr2)
+    same_attr = relay.TypeRelation(broadcast, tvm.convert([t1, t2]), 1, attr1_same)
+
+    bigger = relay.TypeRelation(identity, tvm.convert([t1, t3, t2]), 2, attr1)
+    diff_num_inputs = relay.TypeRelation(identity, tvm.convert([t1, t3, t2]), 1, attr2)
+
+    # func, number of args, input count, and order should be the same
+    assert tr == same
+    assert tr != diff_func
+    assert tr != diff_order
+    assert tr != diff_args
+    assert tr != diff_attr
+    assert tr == same_attr
+    assert tr != bigger
+
+    assert bigger != diff_num_inputs
+
+
+def test_constant_alpha_equal():
+    x = relay.const(1)
+    y = relay.const(2)
+    assert alpha_equal(x, x)
+    assert not alpha_equal(x, y)
+    assert alpha_equal(x, relay.const(1))
+
+
+def test_var_alpha_equal():
+    v1 = relay.Var("v1")
+    v2 = relay.Var("v2")
+
+    # normally only pointer equality
+    assert alpha_equal(v1, v1)
+    assert not alpha_equal(v1, v2)
+
+    # let node allows for setting the eq_map
+    l1 = relay.Let(v1, relay.const(1), v1)
+    l2 = relay.Let(v2, relay.const(1), v2)
+    l3 = relay.Let(v1, relay.const(1), v2)
+
+    assert alpha_equal(l1, l2)
+    assert not alpha_equal(l1, l3)
+
+    # type annotations
+    tt1 = relay.TensorType([], "int32")
+    tt2 = relay.TensorType([], "int32")
+    tt3 = relay.TensorType([], "int64")
+    v3 = relay.Var("v3", tt1)
+    v4 = relay.Var("v4", tt2)
+    v5 = relay.Var("v5", tt3)
+
+    l4 = relay.Let(v3, relay.const(1), v3)
+    l5 = relay.Let(v4, relay.const(1), v4)
+    l6 = relay.Let(v5, relay.const(1), v5)
+
+    # same annotations
+    assert alpha_equal(l4, l5)
+    # different annotations
+    assert not alpha_equal(l4, l6)
+    # one null annotation
+    assert not alpha_equal(l1, l4)
+
+
+def test_global_var_alpha_equal():
+    v1 = relay.GlobalVar("v1")
+    v2 = relay.GlobalVar("v2")
+
+    # only pointer equality suffices (smoke test)
+    assert alpha_equal(v1, v1)
+    assert not alpha_equal(v1, v2)
+
+
+def test_tuple_alpha_equal():
+    v0 = relay.Var("v0")
+    v1 = relay.Var("v1")
+    v2 = relay.Var("v2")
+
+    # unit value is a valid tuple
+    assert alpha_equal(relay.Tuple([]), relay.Tuple([]))
+
+    tup = relay.Tuple([v0, relay.const(2), relay.const(3), relay.Tuple([relay.const(4)])])
+    same = relay.Tuple([v0, relay.const(2), relay.const(3), relay.Tuple([relay.const(4)])])
+
+    assert alpha_equal(tup, same)
+
+    # use the eq_map
+
+
+    let_tup = relay.Let(v1, tup, v1)
+    let_mapped = relay.Let(v2, relay.Tuple([v0, relay.const(2), relay.const(3),
+                                            relay.Tuple([relay.const(4)])]),
+                           v2)
+
+    assert alpha_equal(let_tup, let_mapped)
+
+    more_fields = relay.Tuple([v1, relay.const(2), relay.const(3), relay.Tuple([relay.const(4)]), v2])
+    assert not alpha_equal(tup, more_fields)
+
+    fewer_fields = relay.Tuple([v1, relay.const(2), relay.const(3)])
+    assert not alpha_equal(tup, fewer_fields)
+
+    different_end = relay.Tuple([v1, relay.const(2), relay.const(3),
+                           relay.Tuple([relay.const(5)])])
+    assert not alpha_equal(tup, different_end)
+
+    different_start = relay.Tuple([v2, relay.const(2), relay.const(3),
+                                 relay.Tuple([relay.const(4)])])
+    assert not alpha_equal(tup, different_start)
+
+    longer_at_end = relay.Tuple([v1, relay.const(2), relay.const(3),
+                                 relay.Tuple([relay.const(4), relay.const(5)])])
+    assert not alpha_equal(tup, longer_at_end)
+
+
+def test_tuple_get_item_alpha_equal():
+    x = relay.Var('x')
+    y = relay.Var('y')
+    assert not alpha_equal(relay.TupleGetItem(x, 1), relay.TupleGetItem(y, 1))
+    assert not alpha_equal(relay.TupleGetItem(x, 1), relay.TupleGetItem(x, 2))
+    assert alpha_equal(relay.TupleGetItem(x, 1), relay.TupleGetItem(x, 1))
+
+
+def test_function_alpha_equal():
+    tt1 = relay.TensorType((1, 2, 3), "float32")
+    tt2 = relay.TensorType((4, 5, 6), "int8")
+    tt3 = relay.TupleType([tt1, tt2])
+
+    v1 = relay.Var("v1", tt1)
+    v2 = relay.Var("v2", tt2)
+    v3 = relay.Var("v3", tt3)
+    v4 = relay.Var("v4", tt2)
+    vret = relay.Constant(tvm.nd.array(np.ones(1)))
+
+    tp1 = relay.TypeVar("tp1", relay.Kind.Type)
+    tp2 = relay.TypeVar("tp2", relay.Kind.Type)
+    tp3 = relay.TypeVar("tp3", relay.Kind.Shape)
+    tp4 = relay.TypeVar("tp4", relay.Kind.Shape)
+
+    basic_args = [relay.Var("v3", tt1), relay.Var("v4", tt2)]
+    basic_tps = [tp1, tp2]
+
+    func = relay.Function([v1, v2], v1,
+                          tt2, basic_tps)
+    mapped = relay.Function(basic_args, basic_args[0], tt2, basic_tps)
+    assert alpha_equal(func, mapped)
+
+    fewer_params = relay.Function([relay.Var("v4", tt2)], v4, tt2, basic_tps)
+    assert not alpha_equal(func, fewer_params)
+
+    more_params = relay.Function([relay.Var("v3", tt1),
+                                  relay.Var("v4", tt2),
+                                  relay.Var("v2", tt2)], v4, tt2, basic_tps)
+    assert not alpha_equal(func, more_params)
+
+    params_unordered = relay.Function([v2, v1], v1,
+                                      tt2, basic_tps)
+    assert not alpha_equal(func, params_unordered)
+
+    params_mismatch = relay.Function([v1, v3], v1,
+                                     tt2, basic_tps)
+    assert not alpha_equal(func, params_mismatch)
+
+    # also would not typecheck
+    ret_type_mismatch = relay.Function(basic_args, v4, tt1, basic_tps)
+    assert not alpha_equal(func, ret_type_mismatch)
+
+    # also mis-typed
+    different_body = relay.Function(basic_args, v3, tt2, basic_tps)
+    assert not alpha_equal(func, different_body)
+
+    fewer_type_params = relay.Function(basic_args, v4, tt2, [tp1])
+    assert not alpha_equal(func, fewer_type_params)
+
+    more_type_params = relay.Function(basic_args, v4, tt2, [tp1, tp2, tp3])
+    assert not alpha_equal(func, more_type_params)
+
+    type_params_unordered = relay.Function(basic_args, v4, tt2, [tp2, tp1])
+    assert not alpha_equal(func, type_params_unordered)
+
+    different_type_params = relay.Function(basic_args, v4, tt2, [tp3, tp4])
+    assert not alpha_equal(func, different_type_params)
+
+    # a well-typed example that also differs in body, ret type, and type params
+    tupled_example = relay.Function(basic_args, relay.Tuple([v3, v4]), tt3)
+    assert not alpha_equal(func, tupled_example)
+
+    # nullable
+    no_ret_type = relay.Function(basic_args, v4, None, [tp1, tp2])
+    # both null
+    assert alpha_equal(no_ret_type, no_ret_type)
+    # one null
+    assert not alpha_equal(func, no_ret_type)
+    assert not alpha_equal(no_ret_type, func)
+
+
+def test_call_alpha_equal():
+    v1 = relay.Var("v1")
+    v2 = relay.Var("v2")
+
+    # attrs are compared only by pointer equality
+    attr1 = tvm.make.node("attrs.TestAttrs", name="attr", padding=(3,4))
+    attr1_same = tvm.make.node("attrs.TestAttrs", name="attr", padding=(3,4))
+    attr2 = tvm.make.node("attrs.TestAttrs", name="attr", padding=(3,4,4))
+
+    tt1 = relay.TensorType((1, 2, 3), "float32")
+    tt2 = relay.TensorType((), "int8")
+
+    basic_args = [relay.const(1), relay.const(2), v2, relay.Tuple([])]
+
+    # manually writing out args to ensure that args does not rely on
+    # pointer equality
+    call = relay.Call(v1, [relay.const(1), relay.const(2), v2, relay.Tuple([])],
+                      attr1, [tt1])
+    same = relay.Call(v1, basic_args, attr1, [tt1])
+    assert alpha_equal(call, same)
+
+    different_fn = relay.Call(v2, basic_args, attr1, [tt1])
+    assert not alpha_equal(call, different_fn)
+
+    fewer_args = relay.Call(v1, [relay.const(1), relay.const(2), v2], attr1, [tt1])
+    assert not alpha_equal(call, fewer_args)
+
+    reordered_args = relay.Call(v1, [relay.const(2), relay.const(1),
+                                     relay.Tuple([]), v2], attr1, [tt1])
+    assert not alpha_equal(call, reordered_args)
+
+    different_args = relay.Call(v1, [relay.const(1), relay.const(2), relay.const(3)],
+                                attr1, [tt1])
+    assert not alpha_equal(call, different_args)
+
+    more_args = relay.Call(v1, [relay.const(1), relay.const(2), v2, relay.Tuple([]),
+                                relay.const(3), relay.const(4)], attr1, [tt1])
+    assert not alpha_equal(call, more_args)
+
+    different_attrs = relay.Call(v1, basic_args, attr2, [tt1])
+    assert not alpha_equal(call, different_attrs)
+
+    same_attrs = relay.Call(v1, basic_args, attr1_same, [tt1])
+    assert alpha_equal(call, same_attrs)
+
+    no_type_args = relay.Call(v1, basic_args, attr1)
+    assert not alpha_equal(call, no_type_args)
+
+    more_type_args = relay.Call(v1, basic_args, attr1, [tt1, tt2])
+    assert not alpha_equal(call, more_type_args)
+
+    different_type_arg = relay.Call(v1, basic_args, attr1, [tt2])
+    assert not alpha_equal(call, different_type_arg)
+
+
+def test_let_alpha_equal():
+    tt1 = relay.TensorType((), "float32")
+    tt2 = relay.TensorType((), "int8")
+    v1 = relay.Var("v1")
+    v1_wtype = relay.Var("v1", tt1)
+    v2 = relay.Var("v2")
+    v3 = relay.Var("v3")
+
+    let = relay.Let(v1, relay.const(2), v1)
+    mapped = relay.Let(v2, relay.const(2), v2)
+    assert alpha_equal(let, mapped)
+
+    mismatched_var = relay.Let(v2, relay.const(2), v3)
+    assert not alpha_equal(let, mismatched_var)
+
+    different_value = relay.Let(v2, relay.const(3), v2)
+    assert not alpha_equal(let, different_value)
+
+    different_body = relay.Let(v2, relay.const(3), relay.const(12))
+    assert not alpha_equal(let, different_body)
+
+    # specified types must match
+
+    let_with_type = relay.Let(v1_wtype, relay.const(2), v1_wtype)
+    same_type = relay.Let(v1_wtype, relay.const(2), v1_wtype)
+    assert alpha_equal(let_with_type, same_type)
+    assert not alpha_equal(let, let_with_type)
+    v2 = relay.Var("v1", tt2)
+    different_type = relay.Let(v2, relay.const(2), v2)
+    assert not alpha_equal(let_with_type, different_type)
+
+
+def test_if_alpha_equal():
+    v1 = relay.Var("v1")
+    v2 = relay.Var("v2")
+
+    if_sample = relay.If(v1, relay.const(1), relay.Tuple([relay.const(2), relay.const(3)]))
+    same = relay.If(v1, relay.const(1), relay.Tuple([relay.const(2), relay.const(3)]))
+    assert alpha_equal(if_sample, same)
+
+    different_cond = relay.If(v2, relay.const(1), relay.Tuple([relay.const(2), relay.const(3)]))
+    assert not alpha_equal(if_sample, different_cond)
+
+    different_true = relay.If(v1, relay.const(2), relay.Tuple([relay.const(2), relay.const(3)]))
+    assert not alpha_equal(if_sample, different_true)
+
+    different_false = relay.If(v1, relay.const(1), relay.Tuple([]))
+    assert not alpha_equal(if_sample, different_false)
+
+
+def test_op_alpha_equal():
+    # only checks names
+    op1 = relay.op.get("add")
+    op2 = relay.op.get("add")
+    assert alpha_equal(op1, op2)
+
+    op3 = relay.op.get("take")
+    assert not alpha_equal(op1, op3)
+
+
+def test_graph_equal():
+    x = relay.var("x")
+
+    y0 = relay.add(x, x)
+    z0 = relay.add(y0, y0)
+
+    y1 = relay.add(x, x)
+    z1 = relay.add(y1, y1)
+
+    z3 = relay.add(relay.add(x, x), relay.add(x, x))
+
+    assert alpha_equal(z0, z1)
+
+    # z3's dataflow format is different from z0
+    # z0 is computed from a common y0 node
+    # Relay view them as different programs
+    # Check the difference in the text format.
+    assert not alpha_equal(z0, z3)
+
+
+
+if __name__ == "__main__":
+    test_tensor_type_alpha_equal()
+    test_incomplete_type_alpha_equal()
+    test_constant_alpha_equal()
+    test_func_type_alpha_equal()
+    test_tuple_type_alpha_equal()
+    test_type_relation_alpha_equal()
+    test_constant_alpha_equal()
+    test_global_var_alpha_equal()
+    test_tuple_alpha_equal()
+    test_tuple_get_item_alpha_equal()
+    test_function_alpha_equal()
+    test_call_alpha_equal()
+    test_let_alpha_equal()
+    test_if_alpha_equal()
+    test_op_alpha_equal()
+    test_var_alpha_equal()
+    test_graph_equal()
diff --git a/tests/python/relay/test_pass_alter_op_layout.py b/tests/python/relay/test_pass_alter_op_layout.py
new file mode 100644
index 000000000000..6a8be7ea847e
--- /dev/null
+++ b/tests/python/relay/test_pass_alter_op_layout.py
@@ -0,0 +1,316 @@
+"""Test alter op layout pass"""
+
+from tvm import relay
+from tvm.relay.op import register_alter_op_layout
+from tvm.relay.ir_pass import *
+
+def test_alter_op():
+    """Test directly replacing an operator with a new one"""
+    def before():
+        x = relay.var("x", shape=(1, 64, 56, 56))
+        weight = relay.var('weight', shape=(64, 64, 3, 3))
+        y = relay.nn.conv2d(x, weight,
+                            channels=64,
+                            kernel_size=(3, 3),
+                            padding=(1, 1))
+        y = relay.nn.relu(y)
+        y = relay.Function([x, weight], y)
+        return y
+
+    @register_alter_op_layout("nn.conv2d", level=100)
+    def alter_conv2d(attrs, inputs, tinfos):
+        data, weight = inputs
+        weight = relay.multiply(weight, relay.const(2.0))
+        return relay.nn.conv2d(data, weight, **attrs)
+
+    def expected():
+        x = relay.var("x", shape=(1, 64, 56, 56))
+        weight = relay.var('weight', shape=(64, 64, 3, 3))
+        y = relay.nn.conv2d(x, relay.multiply(weight, relay.const(2.0)),
+                            channels=64,
+                            kernel_size=(3, 3),
+                            padding=(1, 1))
+        y = relay.nn.relu(y)
+        y = relay.Function([x, weight], y)
+        return y
+
+    a = before()
+    a = infer_type(a)
+    a = alter_op_layout(a)
+
+    b = expected()
+    b = infer_type(b)
+
+    assert(alpha_equal(a, b))
+
+
+def test_alter_return_none():
+    """Test doing nothing by returning 'None' """
+    def before():
+        x = relay.var("x", shape=(1, 64, 56, 56))
+        y = relay.nn.global_max_pool2d(x)
+        y = relay.Function([x], y)
+        return y
+
+    called = [False]
+
+    @register_alter_op_layout("nn.global_max_pool2d", level=101)
+    def alter_conv2d(attrs, inputs, tinfos):
+        called[0] = True
+        return None
+
+    a = before()
+    a = infer_type(a)
+    a = alter_op_layout(a)
+
+    b = before()
+    b = infer_type(b)
+    assert(alpha_equal(a, b))
+    assert(called[0])
+
+
+def test_alter_layout():
+    """Test alternating the layout of a conv2d.
+    The layout of broadcast operators and the weight should be changed accordingly.
+    """
+    def before():
+        x = relay.var("x", shape=(1, 64, 56, 56))
+        bias = relay.var("bias")
+        weight = relay.var("weight")
+        y = relay.nn.conv2d(x, weight, channels=64, kernel_size=(3, 3), padding=(1, 1))
+        y = relay.nn.bias_add(y, bias)
+        # a useless tuple, which will be eliminated
+        y = relay.Tuple([y])[0]
+        y = relay.nn.relu(y)
+        y = relay.nn.batch_flatten(y)
+        y = relay.Function(free_vars(y), y)
+        return y
+
+    @register_alter_op_layout("nn.conv2d", level=102)
+    def alter_conv2d(attrs, inputs, tinfos):
+        data, weight = inputs
+        new_attrs = dict(attrs)
+        new_attrs['data_layout'] = 'NCHW16c'
+        new_attrs['weight_layout'] = 'OIHW16i'
+        return relay.nn.conv2d(data, weight, **new_attrs)
+
+    def expected():
+        x = relay.var("x", shape=(1, 64, 56, 56))
+        bias = relay.var("bias", shape=(64,))
+        weight = relay.var("weight", shape=(64, 64, 3, 3))
+
+        y = relay.layout_transform(x, "NCHW", "NCHW16c")
+        w = relay.layout_transform(weight, "OIHW", "OIHW16i")
+        y = relay.nn.conv2d(y, w,
+                            channels=64,
+                            kernel_size=(3, 3),
+                            padding=(1, 1),
+                            weight_layout="OIHW16i",
+                            data_layout="NCHW16c")
+        b = relay.expand_dims(bias, axis=1, num_newaxis=2)
+        b = relay.layout_transform(b, "CHW", "CHW16c")
+        y = relay.add(y, b)
+
+        y = relay.nn.relu(y)
+        y = relay.layout_transform(y, "NCHW16c", "NCHW")
+        y = relay.nn.batch_flatten(y)
+        y = relay.Function(free_vars(y), y)
+        return y
+
+    a = before()
+    a = infer_type(a)
+    a = canonicalize_ops(a)
+    a = infer_type(a)
+    a = alter_op_layout(a)
+    a = infer_type(a)
+
+    b = expected()
+    b = infer_type(b)
+
+    assert(alpha_equal(a, b))
+
+
+def test_alter_layout_dual_path():
+    """
+    Test alternating the layout with two outputs.
+    One path continues to use the new layout while one path fall backs to old layout.
+    """
+    def before():
+        x = relay.var("x", shape=(1, 64, 56, 56))
+        weight1 = relay.var('weight1')
+        weight2 = relay.var('weight2')
+        y = relay.nn.conv2d(x, weight1,
+                            channels=32,
+                            kernel_size=(3, 3),
+                            padding=(1, 1))
+        y = relay.nn.relu(y)
+        y1 = relay.nn.conv2d(y, weight2,
+                             channels=32,
+                             kernel_size=(3, 3),
+                             padding=(1, 1))
+        y1 = relay.nn.relu(y1)
+        y2 = relay.nn.batch_flatten(y)
+        ret = relay.Tuple([y1, y2])
+        y = relay.Function(free_vars(ret), ret)
+        return y
+
+    @register_alter_op_layout("nn.conv2d", level=103)
+    def alter_conv2d(attrs, inputs, tinfos):
+        data, weight = inputs
+        new_attrs = dict(attrs)
+        new_attrs['data_layout'] = 'NCHW16c'
+        return relay.nn.conv2d(data, weight, **new_attrs)
+
+    def expected():
+        x = relay.var("x", shape=(1, 64, 56, 56))
+        weight1 = relay.var('weight1')
+        weight2 = relay.var('weight2')
+        y = relay.layout_transform(x, "NCHW", "NCHW16c")
+        y = relay.nn.conv2d(y, weight1,
+                            channels=32,
+                            kernel_size=(3, 3),
+                            padding=(1, 1),
+                            data_layout="NCHW16c")
+        y = relay.nn.relu(y)
+        y1 = relay.nn.conv2d(y, weight2,
+                             channels=32,
+                             kernel_size=(3, 3),
+                             padding=(1, 1),
+                             data_layout='NCHW16c')
+        y1 = relay.nn.relu(y1)
+        y1 = relay.layout_transform(y1, "NCHW16c", "NCHW")
+        y2 = relay.layout_transform(y, "NCHW16c", "NCHW")
+        y2 = relay.nn.batch_flatten(y2)
+        ret = relay.Tuple([y1, y2])
+        y = relay.Function(free_vars(ret), ret)
+        return y
+
+    a = before()
+    a = infer_type(a)
+    a = alter_op_layout(a)
+    a = infer_type(a)
+
+    b = expected()
+    b = infer_type(b)
+
+    assert(alpha_equal(a, b))
+
+def test_alter_layout_resnet():
+    """Test alternating the layout of a residual block
+    This also tests the elimination of duplicated transformation.
+    If a same transformation applies to a same node twice, only one transformation will be created.
+    """
+    def before():
+        x = relay.var("x", shape=(1, 64, 56, 56))
+        weight1 = relay.var('weight1')
+        weight2 = relay.var('weight2')
+        y = relay.nn.conv2d(x, weight1,
+                            channels=32,
+                            kernel_size=(3, 3),
+                            padding=(1, 1))
+        y = relay.nn.relu(y)
+        y2 = relay.nn.conv2d(x, weight2,
+                             channels=32,
+                             kernel_size=(1, 1))
+        y2 = relay.nn.relu(y2)
+        y = y + y2
+        y = relay.nn.global_max_pool2d(y)
+        return relay.Function(free_vars(y), y)
+
+    @register_alter_op_layout("nn.conv2d", level=104)
+    def alter_conv2d(attrs, inputs, tinfos):
+        data, weight = inputs
+        new_attrs = dict(attrs)
+        new_attrs['data_layout'] = 'NCHW16c'
+        return relay.nn.conv2d(data, weight, **new_attrs)
+
+    def expected():
+        x = relay.var("x", shape=(1, 64, 56, 56))
+        weight1 = relay.var('weight1')
+        weight2 = relay.var('weight2')
+        x = relay.layout_transform(x, "NCHW", "NCHW16c")
+        y = relay.nn.conv2d(x, weight1,
+                            channels=32,
+                            kernel_size=(3, 3),
+                            padding=(1, 1),
+                            data_layout="NCHW16c")
+        y = relay.nn.relu(y)
+        y2 = relay.nn.conv2d(x, weight2,
+                             channels=32,
+                             kernel_size=(1, 1),
+                             data_layout='NCHW16c')
+        y2 = relay.nn.relu(y2)
+        y = y + y2
+        y = relay.nn.global_max_pool2d(y, layout="NCHW16c")
+        y = relay.layout_transform(y, "NCHW16c", "NCHW")
+        return relay.Function(free_vars(y), y)
+
+    a = before()
+    a = infer_type(a)
+    a = alter_op_layout(a)
+    a = infer_type(a)
+
+    b = expected()
+    b = infer_type(b)
+
+    assert(alpha_equal(a, b))
+
+
+def test_alter_layout_broadcast_op():
+    """Test boradcast operators """
+    def before():
+        x = relay.var("x", shape=(1, 64, 56, 56))
+        bias = relay.var("bias", shape=(64,))
+        scale = relay.var("scale", shape=(64, 1, 1))
+        weight = relay.var("weight")
+        y = relay.nn.conv2d(x, weight, channels=64, kernel_size=(3, 3), padding=(1, 1))
+        y = relay.nn.bias_add(y, bias) # test broadcasting to lhs
+        y = relay.multiply(scale, y)         # test broadcasting to rhs
+        y = relay.Function(free_vars(y), y)
+        return y
+
+    @register_alter_op_layout("nn.conv2d", level=102)
+    def alter_conv2d(attrs, inputs, tinfos):
+        data, weight = inputs
+        new_attrs = dict(attrs)
+        new_attrs['data_layout'] = 'NCHW16c'
+        return relay.nn.conv2d(data, weight, **new_attrs)
+
+    def expected():
+        x = relay.var("x", shape=(1, 64, 56, 56))
+        bias = relay.var("bias", shape=(64,))
+        scale = relay.var("scale", shape=(64, 1, 1))
+        weight = relay.var("weight")
+        x = relay.layout_transform(x, "NCHW", "NCHW16c")
+        bias = relay.expand_dims(bias, 1, 2)
+        bias = relay.layout_transform(bias, "CHW", "CHW16c")
+        scale = relay.layout_transform(scale, "CHW", "CHW16c")
+        y = relay.nn.conv2d(x, weight, channels=64, kernel_size=(3, 3), padding=(1, 1),
+                            data_layout="NCHW16c")
+        y = relay.add(y, bias)          # test broadcasting to lhs
+        y = relay.multiply(scale, y)      # test broadcasting to rhs
+        y = relay.layout_transform(y, "NCHW16c", "NCHW")
+        y = relay.Function(free_vars(y), y)
+        return y
+
+    a = before()
+    a = infer_type(a)
+    a = canonicalize_ops(a)
+    a = infer_type(a)
+    a = alter_op_layout(a)
+    a = infer_type(a)
+
+    b = expected()
+    b = infer_type(b)
+
+    assert(alpha_equal(a, b))
+
+
+if __name__ == "__main__":
+    test_alter_op()
+    test_alter_return_none()
+    test_alter_layout()
+    test_alter_layout_dual_path()
+    test_alter_layout_resnet()
+    test_alter_layout_broadcast_op()
+
diff --git a/tests/python/relay/test_pass_check_kind.py b/tests/python/relay/test_pass_check_kind.py
new file mode 100644
index 000000000000..5ead501157c5
--- /dev/null
+++ b/tests/python/relay/test_pass_check_kind.py
@@ -0,0 +1,134 @@
+import tvm
+from tvm import relay
+from tvm.relay.ir_pass import check_kind
+
+def test_tuple_kind():
+    # only contain type kinds
+    tp = relay.TypeVar('tp', relay.Kind.Type)
+    tt = relay.TensorType(tvm.convert([1, 2, 3]), 'float32')
+    tf = relay.FuncType(tvm.convert([]), tt, tvm.convert([]), tvm.convert([]))
+    fields = tvm.convert([tp, tf, tt])
+
+    tup_ty = relay.TupleType(fields)
+    assert check_kind(tup_ty)
+
+
+def test_func_kind():
+    # only contain type kinds
+    tp1 = relay.TypeVar('tp1', relay.Kind.Type)
+    tp2 = relay.TypeVar('tp2', relay.Kind.Type)
+
+    shape = tvm.convert([1, 2, 3])
+    dtype = 'float32'
+    tensor_type = relay.TensorType(shape, dtype)
+
+    tr = relay.TypeRelation(None, tvm.convert([tensor_type, tp1]) , 1, None)
+
+    type_params = tvm.convert([tp1, tp2])
+    type_constraints = tvm.convert([tr])
+    arg_types = tvm.convert([tp1, tensor_type])
+    ret_type = relay.TupleType(tvm.convert([tp2, tensor_type]))
+
+    tf = relay.FuncType(arg_types, ret_type, type_params, type_constraints)
+    assert check_kind(tf)
+
+
+def test_relation_kind():
+    # only have type kinds for arguments
+    tp = relay.TypeVar('tp', relay.Kind.Type)
+    tt = relay.TensorType(tvm.convert([1, 2, 3]), 'float32')
+    tf = relay.FuncType(tvm.convert([]), tt, tvm.convert([]), tvm.convert([]))
+    args = tvm.convert([tf, tt, tp])
+
+    tr = relay.TypeRelation(None, args, 2, None)
+    assert check_kind(tr)
+
+
+def test_invalid_tuple_kind():
+    tp1 = relay.TypeVar('tp1', relay.Kind.Shape)
+    tp2 = relay.TypeVar('tp2', relay.Kind.BaseType)
+    tp3 = relay.TypeVar('tp3', relay.Kind.ShapeVar)
+    fields = tvm.convert([tp1, tp2, tp3])
+
+    tup_ty = relay.TupleType(fields)
+    assert not check_kind(tup_ty)
+
+
+def test_invalid_func_kind():
+    tp1 = relay.TypeVar('tp1', relay.Kind.Shape)
+    tp2 = relay.TypeVar('tp2', relay.Kind.BaseType)
+    tp3 = relay.TypeVar('tp3', relay.Kind.ShapeVar)
+
+    type_params = tvm.convert([tp1, tp2, tp3])
+    type_constraints = tvm.convert([])
+    arg_types = tvm.convert([tp1, tp2])
+    ret_type = tp3
+
+    tf = relay.FuncType(arg_types, ret_type, type_params, type_constraints)
+    assert not check_kind(tf)
+
+
+def test_invalid_relation_kind():
+    tp1 = relay.TypeVar('tp1', relay.Kind.Shape)
+    tp2 = relay.TypeVar('tp2', relay.Kind.BaseType)
+    tp3 = relay.TypeVar('tp3', relay.Kind.ShapeVar)
+    args = tvm.convert([tp1, tp2, tp3])
+
+    tr = relay.TypeRelation(None, args, 2, None)
+    assert not check_kind(tr)
+
+
+def test_func_with_invalid_ret_type():
+    tp1 = relay.TypeVar('tp1', relay.Kind.Type)
+    tp2 = relay.TypeVar('tp2', relay.Kind.Shape)
+    tf = relay.FuncType(tvm.convert([tp1]), tp2, tvm.convert([tp1, tp2]), tvm.convert([]))
+
+
+def test_func_with_invalid_arg_types():
+    tp1 = relay.TypeVar('tp1', relay.Kind.Shape)
+    tp2 = relay.TypeVar('tp2', relay.Kind.Type)
+    tf = relay.FuncType(tvm.convert([tp1]), tp2, tvm.convert([tp1, tp2]), tvm.convert([]))
+
+
+def test_func_with_invalid_tuple():
+    tp1 = relay.TypeVar('tp1', relay.Kind.Shape)
+
+    ret_type = relay.TupleType(tvm.convert([tp1, tp1, tp1]))
+
+    tf = relay.FuncType(tvm.convert([]), ret_type, tvm.convert([tp1]), tvm.convert([]))
+    assert not check_kind(tf)
+
+
+def test_func_with_invalid_relation():
+    tp1 = relay.TypeVar('tp1', relay.Kind.Type)
+    tp2 = relay.TypeVar('tp2', relay.Kind.Shape)
+    tp3 = relay.TypeVar('tp3', relay.Kind.ShapeVar)
+
+    tr = relay.TypeRelation(None, tvm.convert([tp2, tp3]), 1, None)
+
+    tf = relay.FuncType(tvm.convert([tp1]), tp1, tvm.convert([tp1, tp2, tp3]), tvm.convert([tr]))
+    assert not check_kind(tf)
+
+
+def test_tuple_with_invalid_func():
+    tensor_type = relay.TensorType(tvm.convert([1, 2, 3]), 'float32')
+
+    tp1 = relay.TypeVar('tp1', relay.Kind.Shape)
+    tf = relay.FuncType(tvm.convert([]), tp1, tvm.convert([tp1]), tvm.convert([]))
+
+    tup_ty = relay.TupleType(tvm.convert([tensor_type, tf]))
+    assert not check_kind(tup_ty)
+
+
+if __name__ == "__main__":
+    test_tuple_kind()
+    test_func_kind()
+    test_relation_kind()
+    test_invalid_tuple_kind()
+    test_invalid_func_kind()
+    test_invalid_relation_kind()
+    test_func_with_invalid_ret_type()
+    test_func_with_invalid_arg_types()
+    test_func_with_invalid_tuple()
+    test_func_with_invalid_relation()
+    test_tuple_with_invalid_func()
diff --git a/tests/python/relay/test_pass_combine_parallel_conv2d.py b/tests/python/relay/test_pass_combine_parallel_conv2d.py
new file mode 100644
index 000000000000..7d0a5a08555e
--- /dev/null
+++ b/tests/python/relay/test_pass_combine_parallel_conv2d.py
@@ -0,0 +1,179 @@
+from tvm import relay
+import numpy as np
+
+
+def test_combine_parallel_conv2d():
+    """Simple testcase."""
+    def before(x, w1, w2, w3, w4):
+        args = [x, w1, w2, w3, w4]
+        y1 = relay.nn.conv2d(x, w1)
+        y2 = relay.nn.conv2d(x, w2)
+        # y3 cannot be combined
+        y3 = relay.nn.conv2d(x, w3)
+        y4 = relay.nn.conv2d(x, w4)
+        y5 = relay.nn.max_pool2d(x)
+        y = relay.Tuple((y1, y2, y3, y4, y5))
+        return relay.Function(args, y)
+
+    def expected(x, w1, w2, w3, w4, channels1, channels2, channels3, channels4):
+        # use a fixed order of args so alpha equal check can pass
+        args = [x, w1, w2, w3, w4]
+        w = relay.concatenate((w1, w2, w4), axis=0)
+        y = relay.nn.conv2d(x, w, channels=channels1 + channels2 + channels4)
+        y1 = relay.strided_slice(y, [0, 0], [None, channels1])
+        y2 = relay.strided_slice(y, [0, channels1], [None, channels1 + channels2])
+        y3 = relay.nn.conv2d(x, w3)
+        y4 = relay.strided_slice(y, [0, channels1 + channels2],
+                                 [None, channels1 + channels2 + channels4])
+        y5 = relay.nn.max_pool2d(x)
+        y = relay.Tuple((y1, y2, y3, y4, y5))
+        return relay.Function(args, y)
+
+    def check(x_shape, channels1, channels2, channels3, channels4):
+        x =  relay.var("x", shape=x_shape)
+        in_c = x_shape[1]
+        w1 = relay.var("w1", shape=(channels1, in_c, 1, 1))
+        w2 = relay.var("w2", shape=(channels2, in_c, 1, 1))
+        w3 = relay.var("w3", shape=(channels3, in_c, 3, 3))
+        w4 = relay.var("w4", shape=(channels4, in_c, 1, 1))
+
+        y_before = before(x, w1, w2, w3, w4)
+        y = relay.ir_pass.infer_type(y_before)
+        y = relay.ir_pass.combine_parallel_conv2d(y)
+        y = relay.ir_pass.infer_type(y)
+        y_expected = expected(x, w1, w2, w3, w4, channels1, channels2, channels3, channels4)
+        y_expected = relay.ir_pass.infer_type(y_expected)
+        assert relay.ir_pass.alpha_equal(y, y_expected)
+
+    check((1, 4, 16, 16), 4, 4, 4, 4)
+    check((1, 4, 16, 16), 4, 8, 4, 7)
+
+
+def test_combine_parallel_conv2d_scale_relu():
+    """Testcase of combining conv2d + scale + relu"""
+    def before(x, w1, w2, scale1, scale2, bias):
+        args = [x, w1, w2, scale1, scale2, bias]
+        y1 = relay.nn.conv2d(x, w1)
+        y1 = relay.multiply(y1, scale1)
+        y1 = relay.nn.relu(y1)
+        y2 = relay.nn.conv2d(x, w2)
+        y2 = relay.multiply(y2, scale2)
+        y2 = relay.nn.relu(y2)
+        y2 = relay.add(y2, bias)
+        y = relay.Tuple((y1, y2))
+        return relay.Function(args, y)
+
+    def expected(x, w1, w2, scale1, scale2, bias, channels1, channels2):
+        args = [x, w1, w2, scale1, scale2, bias]
+        w = relay.concatenate((w1, w2), axis=0)
+        scale = relay.concatenate((scale1, scale2), axis=0)
+        y = relay.nn.conv2d(x, w, channels=channels1 + channels2)
+        y = relay.multiply(y, scale)
+        y = relay.nn.relu(y)
+        y1 = relay.strided_slice(y, [0, 0], [None, channels1])
+        y2 = relay.strided_slice(y, [0, channels1], [None, channels1 + channels2])
+        y2 = relay.add(y2, bias)
+        y = relay.Tuple((y1, y2))
+        return relay.Function(args, y)
+
+    def check(x_shape, channels1, channels2):
+        x = relay.var("x", shape=x_shape)
+        in_c = x_shape[1]
+        w1 = relay.var("w1", shape=(channels1, in_c, 1, 1))
+        w2 = relay.var("w2", shape=(channels2, in_c, 1, 1))
+        scale1 = relay.var("scale1", shape=(channels1, 1, 1))
+        scale2 = relay.var("scale2", shape=(channels2, 1, 1))
+        bias = relay.var("bias", shape=(channels2, 1, 1))
+        y_before = before(x, w1, w2, scale1, scale2, bias)
+        y = relay.ir_pass.infer_type(y_before)
+        y = relay.ir_pass.combine_parallel_conv2d(y)
+        y = relay.ir_pass.infer_type(y)
+        y_expected = expected(x, w1, w2, scale1, scale2, bias, channels1, channels2)
+        y_expected = relay.ir_pass.infer_type(y_expected)
+        assert relay.ir_pass.alpha_equal(y, y_expected)
+
+    check((1, 4, 16, 16), 4, 8)
+
+
+def test_combine_parallel_conv2d_scale():
+    """Testcase of un-combinable scale"""
+    def before(x, w1, w2, scale1, scale2):
+        args = [x, w1, w2, scale1, scale2]
+        y1 = relay.nn.conv2d(x, w1)
+        y1 = relay.multiply(y1, scale1)
+        y2 = relay.nn.conv2d(x, w2)
+        y2 = relay.multiply(y2, scale2)
+        y = relay.Tuple((y1, y2))
+        return relay.Function(args, y)
+
+    def expected(x, w1, w2, scale1, scale2, channels1, channels2):
+        args = [x, w1, w2, scale1, scale2]
+        w = relay.concatenate((w1, w2), axis=0)
+        y = relay.nn.conv2d(x, w, channels=channels1 + channels2)
+        y1 = relay.strided_slice(y, [0, 0], [None, channels1])
+        y2 = relay.strided_slice(y, [0, channels1], [None, channels1 + channels2])
+        y1 = relay.multiply(y1, scale1)
+        y2 = relay.multiply(y2, scale2)
+        y = relay.Tuple((y1, y2))
+        return relay.Function(args, y)
+
+    def check(x_shape, channels1, channels2):
+        x = relay.var("x", shape=x_shape)
+        in_c = x_shape[1]
+        w1 = relay.var("w1", shape=(channels1, in_c, 1, 1))
+        w2 = relay.var("w2", shape=(channels2, in_c, 1, 1))
+        scale1 = relay.var("scale1", shape=(1,))
+        scale2 = relay.var("scale2", shape=(1,))
+        y_before = before(x, w1, w2, scale1, scale2)
+        y = relay.ir_pass.infer_type(y_before)
+        y = relay.ir_pass.combine_parallel_conv2d(y)
+        y = relay.ir_pass.infer_type(y)
+        y_expected = expected(x, w1, w2, scale1, scale2, channels1, channels2)
+        y_expected = relay.ir_pass.infer_type(y_expected)
+        assert relay.ir_pass.alpha_equal(y, y_expected)
+
+    check((1, 4, 16, 16), 4, 8)
+
+
+def test_combine_parallel_conv2d_multiple_blocks():
+    def before(x, w, repeat):
+        args = [x, w]
+        y = x
+        for i in range(repeat):
+            y1 = relay.nn.conv2d(y, w)
+            y2 = relay.nn.conv2d(y, w)
+            y = relay.concatenate((y1, y2), axis=1)
+        return relay.Function(args, y)
+
+    def expected(x, w, channels, repeat):
+        args = [x, w]
+        y = x
+        for i in range(repeat):
+            w_concat = relay.concatenate((w, w), axis=0)
+            y = relay.nn.conv2d(y, w_concat, channels=channels*2)
+            y1 = relay.strided_slice(y, [0, 0], [None, channels])
+            y2 = relay.strided_slice(y, [0, channels], [None, channels * 2])
+            y = relay.concatenate((y1, y2), axis=1)
+        return relay.Function(args, y)
+
+    def check(x_shape, repeat):
+        x = relay.var("x", shape=x_shape)
+        in_c = x_shape[1]
+        out_c = in_c // 2
+        w = relay.var("w", shape=(out_c, in_c, 1, 1))
+        y_before = before(x, w, repeat)
+        y = relay.ir_pass.infer_type(y_before)
+        y = relay.ir_pass.combine_parallel_conv2d(y)
+        y = relay.ir_pass.infer_type(y)
+        y_expected = expected(x, w, out_c, repeat)
+        y_expected = relay.ir_pass.infer_type(y_expected)
+        assert relay.ir_pass.alpha_equal(y, y_expected)
+
+    check((1, 4, 16, 16), 4)
+
+
+if __name__ == "__main__":
+    test_combine_parallel_conv2d()
+    test_combine_parallel_conv2d_scale_relu()
+    test_combine_parallel_conv2d_scale()
+    test_combine_parallel_conv2d_multiple_blocks()
diff --git a/tests/python/relay/test_pass_dead_code_elimination.py b/tests/python/relay/test_pass_dead_code_elimination.py
new file mode 100644
index 000000000000..f74aaf74e474
--- /dev/null
+++ b/tests/python/relay/test_pass_dead_code_elimination.py
@@ -0,0 +1,95 @@
+import tvm
+from tvm import relay
+from tvm.relay.ir_pass import dead_code_elimination, alpha_equal
+from tvm.relay.op import log, add, equal, subtract
+
+
+class env:
+    def __init__(self):
+        self.a = relay.Var("a")
+        self.b = relay.Var("b")
+        self.c = relay.Var("c")
+        self.d = relay.Var("d")
+        self.e = relay.Var("e")
+        self.x = relay.Var("x")
+        self.y = relay.Var("y")
+        self.z = relay.Var("z")
+        self.shape = tvm.convert([1, 2, 3])
+        self.tt = relay.TensorType(self.shape, "float32")
+        self.int32 = relay.TensorType([], "int32")
+        self.float32 = relay.TensorType([], "float32")
+        self.one = relay.const(1.0)
+        self.two = relay.const(2.0)
+        self.three = relay.const(3.0)
+
+
+e = env()
+
+
+def test_let():
+    orig = relay.Let(e.x, e.y, e.z)
+    assert alpha_equal(dead_code_elimination(orig), e.z)
+
+
+def test_used_let():
+    orig = relay.Let(e.a, e.b, relay.Let(e.c, e.d, e.c))
+    assert alpha_equal(dead_code_elimination(orig), relay.Let(e.c, e.d, e.c))
+
+
+def test_chain_unused_let():
+    orig = relay.Let(e.a, e.b, relay.Let(e.c, e.d, e.e))
+    assert alpha_equal(dead_code_elimination(orig), e.e)
+
+
+# make sure we dont infinite loop
+def test_recursion():
+    """
+    Program:
+       let f(n: i32, data: f32) -> f32 = {
+          if (n == 0) {
+              return data;
+          } else {
+              return f(n - 1, log(data));
+          }
+       }
+       f(2, 10000);
+    """
+    f = relay.Var("f")
+    n = relay.Var("n", e.int32)
+    data = relay.Var("data", e.float32)
+    funcbody = relay.If(equal(n, relay.const(0)),
+                        data,
+                        relay.Call(f, [subtract(n, relay.const(1.0)),
+                                       log(data)]))
+    value = relay.Function([n, data], funcbody, e.float32, [])
+    orig = relay.Let(f, funcbody, relay.Call(f, [relay.const(2.0), relay.const(10000.0)]))
+    assert alpha_equal(dead_code_elimination(orig), orig)
+    assert alpha_equal(dead_code_elimination(relay.Let(f, funcbody, e.three)), e.three)
+
+
+def test_op_let():
+    assert alpha_equal(dead_code_elimination(add(relay.Let(e.a, e.one, e.three), e.two)), add(e.three, e.two))
+
+
+def test_if():
+    cond = relay.const(True)
+    orig = relay.If(cond, e.a, e.b)
+    y = dead_code_elimination(orig)
+    assert alpha_equal(y, e.a)
+
+
+def test_tuple_get_item():
+    t = relay.Var('t')
+    g = relay.TupleGetItem(t, 0)
+    assert alpha_equal(dead_code_elimination(g), g)
+    assert alpha_equal(dead_code_elimination(relay.TupleGetItem(relay.Let(e.a, e.one, t), 0)), g)
+
+
+if __name__ == "__main__":
+    test_if()
+    test_let()
+    test_used_let()
+    test_chain_unused_let()
+    test_recursion()
+    test_op_let()
+    test_tuple_get_item()
diff --git a/tests/python/relay/test_pass_fold_constant.py b/tests/python/relay/test_pass_fold_constant.py
new file mode 100644
index 000000000000..6a63d88f052f
--- /dev/null
+++ b/tests/python/relay/test_pass_fold_constant.py
@@ -0,0 +1,102 @@
+import numpy as np
+import tvm
+from tvm import relay
+
+
+def test_fold_const():
+    c_data = np.array([1, 2, 3]).astype("float32")
+    def before():
+        c = relay.const(c_data)
+        x = relay.var("x")
+        y = relay.add(c, c)
+        y = relay.multiply(y, relay.const(2, "float32"))
+        y = relay.add(x, y)
+        z = relay.add(y, c)
+        return relay.Function([x], z)
+
+    def expected():
+        x = relay.var("x")
+        c_folded = (c_data + c_data) * 2
+        y = relay.add(x, relay.const(c_folded))
+        z = relay.add(y, relay.const(c_data))
+        return relay.Function([x], z)
+
+    def fail(x):
+        raise RuntimeError()
+    # the fold constant should work on any context.
+    with tvm.build_config(add_lower_pass=[(0, fail)]):
+        with tvm.target.create("cuda"):
+            zz = relay.ir_pass.fold_constant(before())
+    zexpected = expected()
+    assert relay.ir_pass.alpha_equal(zz, zexpected)
+
+
+def test_fold_let():
+    c_data = np.array(1).astype("float32")
+    def before():
+        sb = relay.ScopeBuilder()
+        x = relay.var("x")
+        t1 = sb.let("t1", relay.const(c_data))
+        t2 = sb.let("t2", relay.add(t1, t1))
+        t3 = sb.let("t3", relay.add(t2, x))
+        sb.ret(t3)
+        return relay.Function([x], sb.get())
+
+    def expected():
+        sb = relay.ScopeBuilder()
+        x = relay.var("x")
+        c_folded = (c_data + c_data)
+        t3 = sb.let("t3", relay.add(relay.const(c_folded), x))
+        sb.ret(t3)
+        return relay.Function([x], sb.get())
+
+    zz = relay.ir_pass.fold_constant(before())
+    zexpected = expected()
+    assert relay.ir_pass.graph_equal(zz, zexpected)
+
+
+def test_fold_tuple():
+    c_data = np.array(1).astype("float32")
+    def before():
+        c = relay.const(c_data)
+        x = relay.var("x")
+        y = relay.Tuple([x, c])
+        z = relay.add(y[1], c)
+        z = relay.add(z, y[0])
+        return relay.Function([x], z)
+
+    def expected():
+        c = relay.const(c_data + c_data)
+        x = relay.var("x")
+        z = relay.add(c, x)
+        return relay.Function([x], z)
+
+    zz = relay.ir_pass.fold_constant(before())
+    zexpected = expected()
+    assert relay.ir_pass.graph_equal(zz, zexpected)
+
+
+def test_fold_concat():
+    c_data = np.array([[1, 2, 3]]).astype("float32")
+
+    def before():
+        a = relay.const(c_data)
+        b = relay.const(c_data)
+        y = relay.concatenate((a, b), axis=0)
+        return relay.Function([], y)
+
+    def expected():
+        y_data = np.concatenate((c_data, c_data), axis=0)
+        y = relay.const(y_data)
+        return relay.Function([], y)
+
+    zz = relay.ir_pass.fold_constant(before())
+    zexpected = expected()
+    assert relay.ir_pass.graph_equal(zz, zexpected)
+
+
+if __name__ == "__main__":
+    test_fold_const()
+    test_fold_let()
+    test_fold_tuple()
+    test_fold_concat()
diff --git a/tests/python/relay/test_pass_fold_scale_axis.py b/tests/python/relay/test_pass_fold_scale_axis.py
new file mode 100644
index 000000000000..57cb7c84b10d
--- /dev/null
+++ b/tests/python/relay/test_pass_fold_scale_axis.py
@@ -0,0 +1,460 @@
+from tvm import relay
+import numpy as np
+
+def _get_positive_scale(size):
+    return np.random.uniform(0.5, 1, size=size).astype('float32')
+
+
+def test_fold_fwd_simple():
+    """Simple testcase."""
+    def before(x, conv_weight, in_bias, in_scale, channels):
+        args = [x, conv_weight, in_bias]
+        in_bias = relay.expand_dims(in_bias, axis=1, num_newaxis=2)
+        x = relay.multiply(x, in_scale)
+        x = relay.nn.relu(x)
+        x = relay.add(x, in_bias)
+        y = relay.nn.conv2d(x, conv_weight,
+                            channels=channels,
+                            kernel_size=(3, 3),
+                            padding=(1, 1))
+
+        return relay.Function(args, y)
+
+    def expected(x, conv_weight, in_bias, in_scale, channels):
+        # use a fixed order of args so alpha equal check can pass
+        args = [x, conv_weight, in_bias]
+        in_bias = relay.expand_dims(in_bias, axis=1, num_newaxis=2)
+        squeezed_scale = relay.squeeze(in_scale, axis=[1,2])
+        x = relay.nn.relu(x)
+        in_bias = relay.divide(in_bias, relay.expand_dims(squeezed_scale, axis=1, num_newaxis=2))
+        x = relay.add(x, in_bias)
+        conv_weight = relay.multiply(
+            conv_weight , relay.expand_dims(squeezed_scale, axis=1, num_newaxis=2))
+        y = relay.nn.conv2d(x, conv_weight,
+                            channels=channels,
+                            kernel_size=(3, 3),
+                            padding=(1, 1))
+        return relay.Function(args, y)
+
+    def check(shape, channels):
+        x =  relay.var("x", shape=shape)
+        in_channels = shape[1]
+        weight = relay.var("weight")
+        in_bias = relay.var("in_bias", shape=(in_channels,))
+        in_scale = relay.const(_get_positive_scale((in_channels, 1, 1)))
+        y1 = before(x, weight, in_bias, in_scale, channels)
+        y1 = relay.ir_pass.infer_type(y1)
+        type_dict = {x.name_hint:x.checked_type for x in y1.params}
+        weight = relay.var("weight", type_dict["weight"])
+        y1_folded = relay.ir_pass.forward_fold_scale_axis(y1)
+        y1_expected = expected(x, weight, in_bias, in_scale, channels)
+
+        y1_folded = relay.ir_pass.infer_type(y1_folded)
+        y1_expected = relay.ir_pass.infer_type(y1_expected)
+        assert relay.ir_pass.alpha_equal(y1_folded, y1_expected)
+
+    check((2, 4, 10, 10), 2)
+
+
+def test_fold_fwd_dual_path():
+    """scale axis being consumed by two consumers"""
+    def before(x, conv_weight, in_bias, in_scale, channels):
+        args = [x, conv_weight, in_bias]
+        x = relay.multiply(in_scale, x)
+        x = relay.nn.relu(x)
+        x = relay.subtract(x, in_bias)
+        y1 = relay.nn.conv2d(x, conv_weight,
+                             channels=channels,
+                             kernel_size=(3, 3),
+                             data_layout="NHWC",
+                             weight_layout="HWIO",
+                             groups=channels,
+                             padding=(1, 1))
+        y2 = relay.nn.conv2d(x, conv_weight,
+                             channels=channels,
+                             kernel_size=(3, 3),
+                             data_layout="NHWC",
+                             weight_layout="HWIO",
+                             groups=channels,
+                             padding=(1, 1))
+        z = relay.add(y1, y2)
+        return relay.Function(args, z)
+
+    def expected(x, conv_weight, in_bias, in_scale, channels):
+        args = [x, conv_weight, in_bias]
+        x = relay.nn.relu(x)
+        in_bias = relay.divide(in_bias, in_scale)
+        x = relay.subtract(x, in_bias)
+        y1 = relay.nn.conv2d(x,
+                             relay.multiply(conv_weight, in_scale),
+                             channels=channels,
+                             kernel_size=(3, 3),
+                             data_layout="NHWC",
+                             weight_layout="HWIO",
+                             groups=channels,
+                             padding=(1, 1))
+        y2 = relay.nn.conv2d(x,
+                             relay.multiply(conv_weight, in_scale),
+                             channels=channels,
+                             kernel_size=(3, 3),
+                             data_layout="NHWC",
+                             weight_layout="HWIO",
+                             groups=channels,
+                             padding=(1, 1))
+        z = relay.add(y1, y2)
+        return relay.Function(args, z)
+
+    def check(shape, channels):
+        x =  relay.var("x", shape=shape)
+        in_channels = shape[-1]
+        # test depthwise
+        assert in_channels == channels
+        weight = relay.var("weight")
+        in_bias = relay.var("in_bias", shape=(in_channels,))
+        in_scale = relay.const(_get_positive_scale(in_channels,))
+        y1 = before(x, weight, in_bias, in_scale, channels)
+        y1 = relay.ir_pass.infer_type(y1)
+        y1_folded = relay.ir_pass.forward_fold_scale_axis(y1)
+        type_dict = {x.name_hint:x.checked_type for x in y1.params}
+        weight = relay.var("weight", type_dict["weight"])
+        y1_expected = expected(x, weight, in_bias, in_scale, channels)
+        y1_folded = relay.ir_pass.infer_type(y1_folded)
+        y1_expected = relay.ir_pass.infer_type(y1_expected)
+        assert relay.ir_pass.alpha_equal(y1_folded, y1_expected)
+
+    check((2, 4, 10, 3), 3)
+
+
+def test_fold_fwd_fail():
+    """testcase where we canont fold"""
+    def before(x, conv_weight, in_bias, in_scale, channels):
+        x = relay.multiply(x, in_scale)
+        xx = relay.nn.leaky_relu(x, alpha=0.1)
+        y1 = relay.nn.conv2d(xx, conv_weight,
+                             channels=channels,
+                             kernel_size=(3, 3),
+                             data_layout="NHWC",
+                             padding=(1, 1))
+        z = relay.add(y1, x)
+        return relay.Function(relay.ir_pass.free_vars(z), z)
+
+    def check(shape, channels):
+        x =  relay.var("x", shape=shape)
+        in_channels = shape[-1]
+        # test depthwise
+        assert in_channels == channels
+        weight = relay.var("weight")
+        in_bias = relay.var("in_bias", shape=(in_channels,))
+        in_scale = relay.const(_get_positive_scale(size=(in_channels,)))
+        y1 = before(x, weight, in_bias, in_scale, channels)
+        y1 = relay.ir_pass.infer_type(y1)
+        y1_folded = relay.ir_pass.forward_fold_scale_axis(y1)
+        assert relay.ir_pass.alpha_equal(y1, y1_folded)
+
+    check((2, 11, 10, 4), 4)
+
+
+def test_fold_fwd_relu_fail():
+    """testcase where we canont fold because scale can not pass relu"""
+    def before(x, conv_weight, in_bias, in_scale, channels):
+        x = relay.multiply(x, in_scale)
+        xx = relay.nn.relu(x)
+        y1 = relay.nn.conv2d(xx, conv_weight,
+                             channels=channels,
+                             kernel_size=(3, 3),
+                             data_layout="NHWC",
+                             padding=(1, 1))
+        z = relay.add(y1, x)
+        return relay.Function(relay.ir_pass.free_vars(z), z)
+
+    def check(shape, channels, in_scale):
+        x =  relay.var("x", shape=shape)
+        in_channels = shape[-1]
+        # test depthwise
+        assert in_channels == channels
+        weight = relay.var("weight")
+        in_bias = relay.var("in_bias", shape=(in_channels,))
+        in_scale = relay.var("in_scale", shape=(in_channels,))
+        y1 = before(x, weight, in_bias, in_scale, channels)
+        y1 = relay.ir_pass.infer_type(y1)
+        y1_folded = relay.ir_pass.forward_fold_scale_axis(y1)
+        assert relay.ir_pass.alpha_equal(y1, y1_folded)
+
+    in_scale = relay.var("in_scale", shape=(4,))
+    check((2, 11, 10, 4), 4, in_scale)
+    in_scale = relay.const(np.random.uniform(size=(4,), low=-1.0, high=0.0)).astype("float32")
+    check((2, 11, 10, 4), 4, in_scale)
+
+
+def test_fold_bwd_simple():
+    """Simple testcase."""
+    def before(x, conv_weight, out_bias, out_scale, channels):
+        args = [x, conv_weight, out_bias]
+        out_bias = relay.expand_dims(out_bias, axis=1, num_newaxis=2)
+        y = relay.nn.conv2d(x, conv_weight,
+                            channels=channels,
+                            kernel_size=(3, 3),
+                            padding=(1, 1))
+        y = relay.add(y, out_bias)
+        y = relay.nn.relu(y)
+        y = relay.multiply(y, out_scale)
+        return relay.Function(args, y)
+
+    def expected(x, conv_weight, out_bias, out_scale, channels):
+        # use a fixed order of args so alpha equal check can pass
+        args = [x, conv_weight, out_bias]
+        out_bias = relay.expand_dims(out_bias, axis=1, num_newaxis=2)
+        squeezed_scale = relay.squeeze(out_scale, axis=[1,2])
+        conv_weight = relay.multiply(
+            conv_weight , relay.expand_dims(squeezed_scale, axis=1, num_newaxis=3))
+
+        y = relay.nn.conv2d(x, conv_weight,
+                            channels=channels,
+                            kernel_size=(3, 3),
+                            padding=(1, 1))
+        out_bias = relay.multiply(out_bias,
+                                  relay.expand_dims(squeezed_scale, axis=1, num_newaxis=2))
+        y = relay.add(y, out_bias)
+        y = relay.nn.relu(y)
+        return relay.Function(args, y)
+
+    def check(shape, channels):
+        x =  relay.var("x", shape=shape)
+        in_channels = shape[1]
+        weight = relay.var("weight")
+        out_bias = relay.var("out_bias", shape=(channels,))
+        out_scale = relay.const(np.random.uniform(size=(channels, 1, 1)).astype("float32"))
+
+        y1 = before(x, weight, out_bias, out_scale, channels)
+        y1 = relay.ir_pass.infer_type(y1)
+        type_dict = {x.name_hint:x.checked_type for x in y1.params}
+        weight = relay.var("weight", type_dict["weight"])
+        y1_folded = relay.ir_pass.backward_fold_scale_axis(y1)
+        y1_expected = expected(x, weight, out_bias, out_scale, channels)
+        y1_folded = relay.ir_pass.infer_type(y1_folded)
+        y1_expected = relay.ir_pass.infer_type(y1_expected)
+        assert relay.ir_pass.alpha_equal(y1_folded, y1_expected)
+
+    check((2, 4, 10, 10), 8)
+
+
+def test_fold_bwd_dual_path():
+    """Dual path testcase."""
+    def before(x, conv_weight, out_bias, out_scale, channels):
+        args = [x, conv_weight, out_bias]
+        y1 = relay.nn.conv2d(x, conv_weight,
+                             channels=channels,
+                             kernel_size=(3, 3),
+                             padding=(1, 1))
+        y1 = relay.nn.relu(y1)
+        y2 = relay.nn.conv2d(x, conv_weight,
+                             channels=channels,
+                             kernel_size=(3, 3),
+                             padding=(1, 1))
+        y2 = relay.nn.relu(y2)
+        y = relay.add(y1, y2)
+        y = relay.multiply(y, out_scale)
+        return relay.Function(args, y)
+
+    def expected(x, conv_weight, out_bias, out_scale, channels):
+        # use a fixed order of args so alpha equal check can pass
+        args = [x, conv_weight, out_bias]
+        out_bias = relay.expand_dims(out_bias, axis=1, num_newaxis=2)
+        squeezed_scale = relay.squeeze(out_scale, axis=[1,2])
+        def fold_conv_weight():
+            return  relay.multiply(
+                conv_weight ,
+                relay.expand_dims(squeezed_scale, axis=1, num_newaxis=3))
+        y1 = relay.nn.conv2d(x, fold_conv_weight(),
+                            channels=channels,
+                            kernel_size=(3, 3),
+                            padding=(1, 1))
+        y1 = relay.nn.relu(y1)
+        y2 = relay.nn.conv2d(x, fold_conv_weight(),
+                            channels=channels,
+                            kernel_size=(3, 3),
+                            padding=(1, 1))
+        y2 = relay.nn.relu(y2)
+        y = relay.add(y1, y2)
+        return relay.Function(args, y)
+
+    def check(shape, channels):
+        x =  relay.var("x", shape=shape)
+        in_channels = shape[1]
+        weight = relay.var("weight")
+        out_bias = relay.var("out_bias", shape=(channels,))
+        out_scale = relay.const(np.random.uniform(size=(channels, 1, 1)).astype("float32"))
+
+        y1 = before(x, weight, out_bias, out_scale, channels)
+        y1 = relay.ir_pass.infer_type(y1)
+        type_dict = {x.name_hint:x.checked_type for x in y1.params}
+        weight = relay.var("weight", type_dict["weight"])
+        y1_folded = relay.ir_pass.backward_fold_scale_axis(y1)
+        y1_expected = expected(x, weight, out_bias, out_scale, channels)
+        y1_folded = relay.ir_pass.infer_type(y1_folded)
+        y1_expected = relay.ir_pass.infer_type(y1_expected)
+        assert relay.ir_pass.alpha_equal(y1_folded, y1_expected)
+
+    check((2, 4, 10, 10), 8)
+
+
+def test_fold_bwd_dual_consumer():
+    def before(x, conv_weight, out_bias, out_scale, channels):
+        args = [x, conv_weight, out_bias]
+        y0 = relay.nn.conv2d(x, conv_weight,
+                             channels=channels,
+                             kernel_size=(3, 3),
+                             padding=(1, 1))
+        y0 = relay.multiply(y0, out_scale)
+        y0 = relay.nn.relu(y0)
+
+        y1 = relay.nn.conv2d(y0, conv_weight,
+                             channels=channels,
+                             kernel_size=(3, 3),
+                             padding=(1, 1))
+        y1 = relay.multiply(y1, out_scale)
+        y1 = relay.nn.relu(y1)
+
+        y2 = relay.nn.conv2d(y0, conv_weight,
+                             channels=channels,
+                             kernel_size=(3, 3),
+                             padding=(1, 1))
+        y2 = relay.multiply(y2, out_scale)
+        y2 = relay.nn.relu(y2)
+
+        y = relay.add(y1, y2)
+        return relay.Function(args, y)
+
+    def expected(x, conv_weight, out_bias, out_scale, channels):
+        # use a fixed order of args so alpha equal check can pass
+        args = [x, conv_weight, out_bias]
+        def fold_conv_weight():
+            squeezed_scale = relay.squeeze(out_scale, axis=[1,2])
+            return  relay.multiply(
+                conv_weight ,
+                relay.expand_dims(squeezed_scale, axis=1, num_newaxis=3))
+        y0 = relay.nn.conv2d(x, fold_conv_weight(),
+                            channels=channels,
+                            kernel_size=(3, 3),
+                            padding=(1, 1))
+        y0 = relay.nn.relu(y0)
+        y1 = relay.nn.conv2d(y0, fold_conv_weight(),
+                            channels=channels,
+                            kernel_size=(3, 3),
+                            padding=(1, 1))
+        y1 = relay.nn.relu(y1)
+        y2 = relay.nn.conv2d(y0, fold_conv_weight(),
+                            channels=channels,
+                            kernel_size=(3, 3),
+                            padding=(1, 1))
+        y2 = relay.nn.relu(y2)
+        y = relay.add(y1, y2)
+        return relay.Function(args, y)
+
+    def check(shape, channels):
+        x =  relay.var("x", shape=shape)
+        in_channels = shape[1]
+        weight = relay.var("weight")
+        out_bias = relay.var("out_bias", shape=(channels,))
+        out_scale = relay.const(np.random.uniform(size=(channels,1, 1)).astype("float32"))
+
+        y1 = before(x, weight, out_bias, out_scale, channels)
+        y1 = relay.ir_pass.infer_type(y1)
+        type_dict = {x.name_hint:x.checked_type for x in y1.params}
+        weight = relay.var("weight", type_dict["weight"])
+        y1_folded = relay.ir_pass.backward_fold_scale_axis(y1)
+        y1_expected = expected(x, weight, out_bias, out_scale, channels)
+        y1_folded = relay.ir_pass.infer_type(y1_folded)
+        y1_expected = relay.ir_pass.infer_type(y1_expected)
+        assert relay.ir_pass.alpha_equal(y1_folded, y1_expected)
+
+    check((2, 4, 10, 10), 4)
+
+
+def test_fold_bwd_fail():
+    """Dual path testcase."""
+    def fail1(x, conv_weight, out_bias, out_scale, channels):
+        args = [x, conv_weight, out_bias]
+        out_bias = relay.expand_dims(out_bias, axis=1, num_newaxis=2)
+        y1 = relay.nn.conv2d(x, conv_weight,
+                             channels=channels,
+                             kernel_size=(3, 3),
+                             padding=(1, 1))
+        y1 = relay.nn.relu(y1)
+        y2 = relay.nn.conv2d(x, conv_weight,
+                             channels=channels,
+                             kernel_size=(3, 3),
+                             padding=(1, 1),
+                             out_layout="CNHW")
+        # fold will fail because the axis from two path
+        # differs from each other.
+        y2 = relay.nn.relu(y2)
+        y = relay.add(y1, y2)
+        y = relay.multiply(y, out_scale)
+        return relay.Function(args, y)
+
+    def fail2(x, conv_weight, out_bias, out_scale, channels):
+        args = [x, conv_weight, out_bias]
+        out_bias = relay.expand_dims(out_bias, axis=1, num_newaxis=2)
+        y1 = relay.nn.conv2d(x, conv_weight,
+                             channels=channels,
+                             kernel_size=(3, 3),
+                             padding=(1, 1))
+        y2 = relay.nn.relu(y1)
+        # fold will fail because y1 is referred also by y2
+        y1 = relay.multiply(y1, out_scale)
+        y = relay.add(y1, y2)
+        return relay.Function(args, y)
+
+    def check(shape, channels, fbefore):
+        x =  relay.var("x", shape=shape)
+        in_channels = shape[1]
+        weight = relay.var("weight")
+        out_bias = relay.var("out_bias", shape=(channels,))
+        out_scale = relay.const(np.random.uniform(size=(channels, 1, 1)).astype("float32"))
+        y1 = fbefore(x, weight, out_bias, out_scale, channels)
+        y1 = relay.ir_pass.infer_type(y1)
+        y1_folded = relay.ir_pass.backward_fold_scale_axis(y1)
+        assert relay.ir_pass.alpha_equal(y1_folded, y1)
+
+    check((4, 4, 10, 10), 4, fail1)
+    check((4, 4, 10, 10), 4, fail2)
+
+
+def test_fold_bwd_relu_fail():
+    """testcase where we canont fold because scale can not pass relu"""
+    def before(x, conv_weight, out_scale, channels):
+        y = relay.nn.conv2d(x, conv_weight,
+                             channels=channels,
+                             kernel_size=(3, 3),
+                             data_layout="NCHW",
+                             padding=(1, 1))
+        y = relay.nn.relu(y)
+        y = relay.multiply(x, out_scale)
+        return relay.Function(relay.ir_pass.free_vars(y), y)
+
+    def check(shape, channels, out_scale):
+        x =  relay.var("x", shape=shape)
+        in_channels = shape[1]
+        weight = relay.var("weight")
+        y1 = before(x, weight, out_scale, channels)
+        y1 = relay.ir_pass.infer_type(y1)
+        y1_folded = relay.ir_pass.forward_fold_scale_axis(y1)
+        assert relay.ir_pass.alpha_equal(y1, y1_folded)
+
+    out_scale = relay.var("in_scale", shape=(4, 1, 1))
+    check((4, 4, 10, 10), 4, out_scale)
+    out_scale = relay.const(np.random.uniform(size=(4, 1, 1), low=-1.0, high=0.0)).astype("float32")
+    check((4, 4, 10, 10), 4, out_scale)
+
+
+if __name__ == "__main__":
+    test_fold_fwd_simple()
+    test_fold_fwd_dual_path()
+    test_fold_fwd_fail()
+    test_fold_fwd_relu_fail()
+    test_fold_bwd_simple()
+    test_fold_bwd_dual_path()
+    test_fold_bwd_dual_consumer()
+    test_fold_bwd_fail()
+    test_fold_bwd_relu_fail()
diff --git a/tests/python/relay/test_pass_free_vars.py b/tests/python/relay/test_pass_free_vars.py
new file mode 100644
index 000000000000..151dbe1412bc
--- /dev/null
+++ b/tests/python/relay/test_pass_free_vars.py
@@ -0,0 +1,41 @@
+import tvm
+from tvm import relay
+from tvm.relay.ir_pass import free_vars, free_type_vars
+
+def test_free_vars():
+    ty = relay.TensorType([], "int32")
+    x = relay.Var("x", ty)
+    fvx = free_vars(x)
+    assert len(fvx) == 1
+    assert fvx[0] == x
+    v = relay.Constant(tvm.nd.array(10))
+
+    let = relay.Let(x, v, x)
+    fvx = free_vars(let)
+    assert len(free_vars(let)) == 0
+    f = relay.Function([x], x, ty)
+    assert len(free_vars(f)) == 0
+
+
+def test_tuple():
+    t = relay.Var('t')
+    fv = free_vars(relay.Tuple([t, t]))
+    assert len(fv) == 1
+    assert fv[0] == t
+    fv = free_vars(relay.TupleGetItem(t, 123))
+    assert len(fv) == 1
+    assert fv[0] == t
+
+
+def test_free_type_vars():
+    tp = relay.TypeVar("")
+    ty = relay.TupleType([tp, relay.TensorType([], "int32")])
+    x = relay.Var("x", ty)
+    y = relay.Var("y")
+    let = relay.Let(x, y, x)
+    fvl = free_vars(let)
+    assert len(fvl) == 1
+    assert fvl[0] == y
+    ftvl = free_type_vars(let)
+    assert len(ftvl) == 1
+    assert ftvl[0] == tp
diff --git a/tests/python/relay/test_pass_fuse_ops.py b/tests/python/relay/test_pass_fuse_ops.py
new file mode 100644
index 000000000000..28ea8dd28988
--- /dev/null
+++ b/tests/python/relay/test_pass_fuse_ops.py
@@ -0,0 +1,189 @@
+import tvm
+from tvm import relay
+
+def test_fuse_simple():
+    """Simple testcase."""
+    def before():
+        x = relay.var("x", shape=(10, 20))
+        y = relay.add(x, relay.const(1, "float32"))
+        z = relay.exp(y)
+        return relay.Function([x], z)
+
+    def expected():
+        x = relay.var("p", shape=(10, 20))
+        y = relay.add(x, relay.const(1, "float32"))
+        z = relay.exp(y)
+        f1 = relay.Function([x], z)
+        x = relay.var("x", shape=(10, 20))
+        y = relay.Call(f1, [x])
+        return relay.Function([x], y)
+
+    z = before()
+    z = relay.ir_pass.infer_type(z)
+    zz = relay.ir_pass.fuse_ops(z, opt_level=2)
+    zz = relay.ir_pass.infer_type(zz)
+    zz = relay.ir_pass.fuse_ops(zz)
+    zz = relay.ir_pass.infer_type(zz)
+    after = relay.ir_pass.infer_type(expected())
+    assert relay.ir_pass.alpha_equal(zz, after)
+
+
+def test_conv2d_fuse():
+    """Test fusion case of conv2d"""
+    def before(dshape):
+        x = relay.var("x", shape=dshape)
+        x = relay.add(x, relay.const(1, "float32"))
+        y = relay.nn.conv2d(x, relay.var("w1"),
+                            kernel_size=(3, 3),
+                            padding=(1, 1),
+                            channels=16)
+        # this is the next dominator.
+        y1 = relay.add(relay.const(1, "float32"), y)
+        y = relay.add(y, y1)
+        # second path
+        z2 = relay.nn.conv2d(y, relay.var("w2"),
+                             kernel_size=(1, 1),
+                             padding=(0,0),
+                             channels=16)
+        z3 = relay.nn.conv2d(y, relay.var("w3"),
+                             kernel_size=(3, 3),
+                             padding=(1,1),
+                             channels=16)
+        # add can only be fused to z1
+        z = relay.add(z2, z3)
+        return relay.Function(relay.ir_pass.free_vars(z), z)
+
+    def expected(dshape):
+        # segment 0
+        x = relay.var("p0", shape=dshape)
+        y = relay.add(x, relay.const(1, "float32"))
+        f0 = relay.Function([x], y)
+        # segment 1
+        x = relay.var("p0", shape=dshape)
+        w = relay.var("p1")
+        y = relay.nn.conv2d(x, w,
+                            kernel_size=(3, 3),
+                            padding=(1, 1),
+                            channels=16)
+        y1 = relay.add(relay.const(1, "float32"), y)
+        y = relay.add(y, y1)
+        f1 = relay.Function([x, w], y)
+        # segment 2
+        x = relay.var("p0", shape=dshape)
+        w = relay.var("p1")
+        z2 = relay.nn.conv2d(x, w,
+                             kernel_size=(3, 3),
+                             padding=(1,1),
+                             channels=16)
+        f2 = relay.Function([x, w], z2)
+        # segment 3
+        x = relay.var("p0", shape=dshape)
+        w = relay.var("p1")
+        offset = relay.var("p2", shape=dshape)
+        z3 = relay.nn.conv2d(x, w,
+                             kernel_size=(1, 1),
+                             padding=(0, 0),
+                             channels=16)
+        z3 = relay.add(z3, offset)
+        f3 = relay.Function([x, w, offset], z3)
+        # compose
+        x = relay.var("x", shape=dshape)
+        y = relay.Call(f0, [x])
+        y = relay.Call(f1, [y, relay.var("w1")])
+        z2 = relay.Call(f2, [y, relay.var("w3")])
+        z3 = relay.Call(f3, [y, relay.var("w2"), z2])
+        z = z3
+        return relay.Function(relay.ir_pass.free_vars(z), z)
+
+    dshape = (1, 16, 64, 64)
+    z = before(dshape)
+    z = relay.ir_pass.infer_type(z)
+    zz = relay.ir_pass.fuse_ops(z, opt_level=2)
+    zz = relay.ir_pass.infer_type(zz)
+    after = relay.ir_pass.infer_type(expected(dshape))
+    assert relay.ir_pass.alpha_equal(zz, after)
+
+
+def test_concatenate():
+    """Test fusion case involving concat op and Tuple node"""
+
+    def before(dshape):
+        x = relay.var("x", shape=dshape)
+        pooled = relay.nn.max_pool2d(x, pool_size=(2, 2), strides=(2, 2), padding=(0, 0))
+        upsampled = relay.nn.upsampling(pooled, scale=2, layout="NCHW")
+        concat = relay.concatenate((upsampled, x), axis=1)
+        out = relay.add(concat, relay.const(1, "float32"))
+        return relay.Function(relay.ir_pass.free_vars(out), out)
+
+    def expected(dshape):
+        x = relay.var("x", shape=dshape)
+        pooled = relay.nn.max_pool2d(x, pool_size=(2, 2), strides=(2, 2), padding=(0, 0))
+        f0 = relay.Function([x], pooled)
+
+        p0 = relay.var("p0", shape=(dshape[0], dshape[1], dshape[2]//2, dshape[3]//2))
+        p1 = relay.var("p1", shape=dshape)
+        upsampled = relay.nn.upsampling(p0, scale=2, layout="NCHW")
+        concat = relay.concatenate((upsampled, p1), axis=1)
+        out = relay.add(concat, relay.const(1, "float32"))
+        f1 = relay.Function([p0, p1], out)
+
+        x = relay.var("x", shape=dshape)
+        y = relay.Call(f0, [x])
+        z = relay.Call(f1, [y, x])
+        return relay.Function([x], z)
+
+    dshape = (1, 16, 64, 64)
+    z = before(dshape)
+    z = relay.ir_pass.infer_type(z)
+    zz = relay.ir_pass.fuse_ops(z, opt_level=0)
+    assert not relay.ir_pass.free_vars(zz)
+    zz = relay.ir_pass.fuse_ops(z, opt_level=2)
+    zz = relay.ir_pass.infer_type(zz)
+    assert not relay.ir_pass.free_vars(zz)
+    after = relay.ir_pass.infer_type(expected(dshape))
+    assert relay.ir_pass.alpha_equal(zz, after)
+
+
+def test_tuple_root():
+    """Test fusion case where Tuple node is the root in its group"""
+
+    def before(dshape):
+        x = relay.var("x", shape=dshape)
+        pooled = relay.nn.max_pool2d(x, pool_size=(2, 2), strides=(2, 2), padding=(0, 0))
+        upsampled = relay.nn.upsampling(pooled, scale=2, layout="NCHW")
+        out = relay.Tuple((upsampled, x))
+        return relay.Function(relay.ir_pass.free_vars(out), out)
+
+    def expected(dshape):
+        x = relay.var("x", shape=dshape)
+        pooled = relay.nn.max_pool2d(x, pool_size=(2, 2), strides=(2, 2), padding=(0, 0))
+        f0 = relay.Function([x], pooled)
+
+        p0 = relay.var("p0", shape=(dshape[0], dshape[1], dshape[2]//2, dshape[3]//2))
+        p1 = relay.var("p1", shape=(dshape[0], dshape[1], dshape[2], dshape[3]))
+        upsampled = relay.nn.upsampling(p0, scale=2, layout="NCHW")
+        out = relay.Tuple((upsampled, p1))
+        f1 = relay.Function([p0, p1], out)
+
+        x = relay.var("x", shape=dshape)
+        y = relay.Call(f0, [x])
+        z = relay.Call(f1, [y, x])
+        return relay.Function([x], z)
+
+    dshape = (1, 16, 64, 64)
+    z = before(dshape)
+    z = relay.ir_pass.infer_type(z)
+    zz = relay.ir_pass.fuse_ops(z, opt_level=0)
+    assert not relay.ir_pass.free_vars(zz)
+    zz = relay.ir_pass.fuse_ops(z, opt_level=2)
+    zz = relay.ir_pass.infer_type(zz)
+    assert not relay.ir_pass.free_vars(zz)
+    after = relay.ir_pass.infer_type(expected(dshape))
+    assert relay.ir_pass.alpha_equal(zz, after)
+
+
+if __name__ == "__main__":
+    test_fuse_simple()
+    test_conv2d_fuse()
+    test_concatenate()
+    test_tuple_root()
diff --git a/tests/python/relay/test_pass_simplify_inference.py b/tests/python/relay/test_pass_simplify_inference.py
new file mode 100644
index 000000000000..7585a88063ab
--- /dev/null
+++ b/tests/python/relay/test_pass_simplify_inference.py
@@ -0,0 +1,47 @@
+from tvm import relay as rly
+from tvm.relay.ir_pass import simplify_inference, alpha_equal
+
+def test_simplify_batchnorm():
+    def simple_bn(x, gamma, beta, moving_mean, moving_var,
+                  axis=1, epsilon=1e-5, shape=None):
+        # expect = (x - moving_mean) / sqrt(moving_var + eps) * gamma + beta
+        scale = rly.multiply(rly.const(1, 'float32') /
+                rly.sqrt(moving_var + rly.const(epsilon, 'float32')), gamma)
+        shift = rly.add(
+            rly.multiply(rly.negative(moving_mean), scale), beta)
+        num_newaxis = len(shape) - (axis + 1)
+        if num_newaxis:
+            scale = rly.expand_dims(scale, axis=1, num_newaxis=num_newaxis)
+            shift = rly.expand_dims(shift, axis=1, num_newaxis=num_newaxis)
+        return x * scale + shift
+
+    def check(dim, axis, nstep):
+        eps = 0.01
+        ttype1 = rly.TensorType(tuple(10 for i in range(dim)), 'float32')
+        ttype2 = rly.TensorType((10,), 'float32')
+        x = rly.var("x", ttype1)
+        beta = rly.var("beta", ttype2)
+        gamma = rly.var("gamma", ttype2)
+        moving_var = rly.var("moving_var", ttype2)
+        moving_mean = rly.var("moving_mean", ttype2)
+        y1, y2 = x, x
+
+        for _ in range(nstep):
+            y1, _, _ = rly.nn.batch_norm(y1 + rly.const(1, 'float32'),
+                gamma, beta, moving_mean, moving_var, epsilon=eps, axis=axis)
+            y1 = rly.nn.dropout(y1)
+            y2 = simple_bn(y2 + rly.const(1, 'float32'),
+                           gamma, beta, moving_mean, moving_var,
+                           epsilon=eps, axis=axis, shape=ttype1.shape)
+        y1 = rly.ir_pass.infer_type(y1)
+        y1 = simplify_inference(y1)
+
+        assert rly.ir_pass.graph_equal(y1, y2)
+
+    check(2, 1, 1)
+    check(4, 1, 1)
+    check(4, 0, 3)
+
+
+if __name__ == "__main__":
+    test_simplify_batchnorm()
diff --git a/tests/python/relay/test_type_infer.py b/tests/python/relay/test_type_infer.py
new file mode 100644
index 000000000000..06cb19639dcf
--- /dev/null
+++ b/tests/python/relay/test_type_infer.py
@@ -0,0 +1,211 @@
+"""Test that type checker correcly computes types
+   for expressions.
+"""
+import tvm
+import numpy as np
+from tvm.relay.ir_pass import infer_type
+from tvm import relay
+from tvm.relay import op
+from tvm.relay.scope_builder import ScopeBuilder
+
+
+def assert_has_type(expr, typ, mod=relay.module.Module({})):
+    checked_expr = infer_type(expr, mod)
+    checked_type = checked_expr.checked_type
+    if checked_type != typ:
+        raise RuntimeError("Type mismatch %s vs %s" % (
+            checked_type, typ))
+
+
+def test_monomorphic_let():
+    "Program: let x = 1; return x"
+    sb = relay.ScopeBuilder()
+    x = sb.let('x', relay.const(1.0, "float64"))
+    sb.ret(x)
+    xchecked = relay.ir_pass.infer_type(sb.get())
+    assert xchecked.checked_type == relay.scalar_type("float64")
+
+
+def test_single_op():
+    "Program: fn (x : float32) { let t1 = f(x); t1 }"
+    x = relay.var('x', shape=[])
+    func = relay.Function([x], op.log(x))
+    ttype = relay.TensorType([], dtype='float32')
+    assert_has_type(func, relay.FuncType([ttype], ttype))
+
+
+def test_add_broadcast_op():
+    """
+    Program:
+        fn (x: Tensor[(10, 4), f32], y: Tensor[(5, 10, 1), f32]) -> Tensor[(5, 10, 4), f32] {
+            return x + y;
+        }
+    """
+    pass
+    # x = relay.var('x', shape=(10, 4))
+    # y = relay.var('y', shape=(5, 10, 1))
+    # z = x + y
+    # func = relay.Function([x, y], z)
+    # ttype = relay.TensorType((5, 5, 5), 'float32')
+    # expected_ty = relay.FuncType([ttype, ttype], ttype)
+    # assert_has_type(func.to_func(), expected_ty)
+
+
+def test_dual_op():
+    """Program:
+       fn (x : Tensor[f32, (10, 10)]) {
+         let t1 = log(x);
+         let t2 = add(t1, x);
+         return t1;
+       }
+    """
+    tp = relay.TensorType((10, 10), "float32")
+    x = relay.var("x", tp)
+    sb = relay.ScopeBuilder()
+    t1 = sb.let("t1", relay.log(x))
+    t2 = sb.let("t2", relay.add(t1, x))
+    sb.ret(t2)
+    f = relay.Function([x], sb.get())
+    fchecked = relay.ir_pass.infer_type(f)
+    assert fchecked.checked_type == relay.FuncType([tp], tp)
+
+
+def test_decl():
+    """Program:
+       def f(x : Tensor[(10, 10), f32]) {
+           return log(x);
+       }
+    """
+    tp = relay.TensorType((10, 10))
+    x = relay.var("x", tp)
+    f = relay.Function([x], relay.log(x))
+    fchecked = relay.ir_pass.infer_type(f)
+    assert fchecked.checked_type == relay.FuncType([tp], tp)
+
+
+def test_recursion():
+    """
+    Program:
+       def f(n: i32, data: f32) -> f32 {
+          if (n == 0) {
+              return data;
+          } else {
+              return f(n - 1, log(data));
+          }
+       }
+    """
+    sb = relay.ScopeBuilder()
+    f = relay.GlobalVar("f")
+    ti32 = relay.scalar_type("int32")
+    tf32 = relay.scalar_type("float32")
+    n = relay.var("n", ti32)
+    data = relay.var("data", tf32)
+
+    with sb.if_scope(relay.equal(n, relay.const(0, ti32))):
+        sb.ret(data)
+    with sb.else_scope():
+        sb.ret(f(relay.subtract(n, relay.const(1, ti32)), relay.log(data)))
+    mod = relay.Module()
+    mod[f] = relay.Function([n, data], sb.get())
+    assert "%3 = @f(%1, %2)" in mod.astext()
+    assert mod[f].checked_type == relay.FuncType([ti32, tf32], tf32)
+
+# This currently fails and should pass under the type system.
+#
+# This test is to illustrate problem with our weak form of
+# unification.
+#
+
+
+def test_incomplete_call():
+    sb = ScopeBuilder()
+    x = relay.var('x', dtype='int32')
+    f = relay.var('f')
+    func = relay.Function([x, f], relay.Call(f, [x]))
+
+    try:
+        relay.ir_pass.infer_type(func)
+        assert False
+    except tvm.TVMError as e:
+        assert True
+
+def test_tuple():
+    tp = relay.TensorType((10,))
+    x = relay.var("x", tp)
+    res = relay.Tuple([x, x])
+    assert (relay.ir_pass.infer_type(res).checked_type ==
+            relay.TupleType([tp, tp]))
+
+def test_free_expr():
+    x = relay.var("x", "float32")
+    y = relay.add(x, x)
+    yy = relay.ir_pass.infer_type(y)
+    assert yy.checked_type == relay.scalar_type("float32")
+    assert x.vid.same_as(yy.args[0].vid)
+
+
+def test_type_args():
+    x = relay.var("x", shape=(10, 10))
+    y = relay.var("y", shape=(1, 10))
+    z = relay.add(x, y)
+    ty_z = relay.ir_pass.infer_type(z)
+    ty_args = ty_z.type_args
+    assert len(ty_args) == 2
+    assert ty_args[0].dtype == "float32"
+    assert ty_args[1].dtype == "float32"
+    sh1 = ty_args[0].shape
+    sh2 = ty_args[1].shape
+    assert sh1[0].value == 10
+    assert sh1[1].value == 10
+    assert sh2[0].value == 1
+    assert sh2[1].value == 10
+
+
+def test_self_reference():
+    """
+    Program:
+       def f(x) {
+           return x;
+       }
+    """
+    a = relay.TypeVar("a")
+    x = relay.var("x", a)
+    sb = relay.ScopeBuilder()
+
+    f = relay.Function([x], x)
+    fx = relay.Call(f, [x])
+    assert relay.ir_pass.infer_type(x).checked_type == a
+    assert relay.ir_pass.infer_type(f).checked_type == relay.FuncType([a], a)
+    assert relay.ir_pass.infer_type(fx).checked_type == a
+
+
+def test_global_var_cow_issue():
+    mod = relay.Module({})
+    gv = relay.GlobalVar("foo")
+    x = relay.var('x', shape=[])
+    func = relay.Function([x], relay.Call(gv, [x]),
+                          relay.TensorType([], 'float32'))
+    mod[gv] = func
+
+
+def test_equal():
+    i = relay.var('i', shape=[], dtype='int32')
+    eq = op.equal(i, relay.const(0, dtype='int32'))
+    # This should fail ....
+    func = relay.Function([i], eq, ret_type=relay.TensorType([], 'int32'))
+
+
+if __name__ == "__main__":
+    test_free_expr()
+    test_dual_op()
+    test_single_op()
+    test_recursion()
+    test_monomorphic_let()
+    test_decl()
+    test_recursion()
+    test_tuple()
+    test_incomplete_call()
+    test_free_expr()
+    test_type_args()
+    test_self_reference()
+    test_global_var_cow_issue()
diff --git a/tests/python/relay/test_type_solver.py b/tests/python/relay/test_type_solver.py
new file mode 100644
index 000000000000..e8ff67756931
--- /dev/null
+++ b/tests/python/relay/test_type_solver.py
@@ -0,0 +1,54 @@
+import tvm
+from tvm import relay
+
+
+def make_rel(name, args, num_inputs=None, attrs=None):
+    func = tvm.get_env_func("tvm.relay.type_relation." + name)
+    if num_inputs is None:
+        num_inputs = len(args) - 1
+    return relay.ty.TypeRelation(func, args, num_inputs, attrs)
+
+def make_solver():
+    solver = relay._ir_pass._test_type_solver()
+    solver.Solve = solver("Solve")
+    solver.Unify = solver("Unify")
+    solver.Resolve = solver("Resolve")
+    solver.AddConstraint = solver("AddConstraint")
+
+    def gen_type(name, args, out=None):
+        out = out if out else relay.ty.IncompleteType()
+        solver.AddConstraint(make_rel(name, args + [out]))
+        return out
+
+    solver.gen_type = gen_type
+    return solver
+
+
+def test_bcast():
+    solver = make_solver()
+    t0 = relay.ty.TensorType((10, 20), "float32")
+    t1 = relay.ty.TensorType((10, 1), "float32")
+    tc = relay.ty.TensorType((10, 1, 1), "float32")
+    t2 = solver.gen_type("Broadcast", [t0, t1])
+    t3 = solver.gen_type("Identity", [t2])
+    t4 = solver.gen_type("Broadcast", [t3, tc])
+    assert solver.Solve()
+    assert solver.Resolve(t2) == relay.ty.TensorType((10, 20), "float32")
+    assert solver.Resolve(t4) == relay.ty.TensorType((10, 10, 20), "float32")
+
+
+def test_backward_solving():
+    solver = make_solver()
+    t0 = relay.ty.TensorType((10, 20), "float32")
+    tc = relay.ty.TensorType((10, 1, 1), "float32")
+    t1 = relay.ty.IncompleteType()
+    t3 = solver.gen_type("Broadcast", [t0, t1])
+    t2 = solver.gen_type("Identity", [t1], out=tc)
+    assert solver.Solve()
+    assert solver.Resolve(t3) == relay.ty.TensorType((10, 10, 20), "float32")
+
+
+
+if __name__ == "__main__":
+    test_bcast()
+    test_backward_solving()
diff --git a/tests/python/unittest/test_arith_detect_linear_equation.py b/tests/python/unittest/test_arith_detect_linear_equation.py
index 9d875c910d1c..2b0f327b65b2 100644
--- a/tests/python/unittest/test_arith_detect_linear_equation.py
+++ b/tests/python/unittest/test_arith_detect_linear_equation.py
@@ -38,6 +38,10 @@ def test_multivariate():
     assert(m[2].value == 2)
     assert(m[len(m)-1].value == 2)
 
+    m = tvm.arith.DetectLinearEquation((v[0] - v[1]), [v[2]])
+    assert(m[0].value == 0)
+    assert(tvm.ir_pass.Simplify(m[1] - (v[0] - v[1])).value == 0)
+
 if __name__ == "__main__":
     test_basic()
     test_multivariate()
diff --git a/tests/python/unittest/test_arith_intset.py b/tests/python/unittest/test_arith_intset.py
index 78589cf3af0e..9b869feddc9d 100644
--- a/tests/python/unittest/test_arith_intset.py
+++ b/tests/python/unittest/test_arith_intset.py
@@ -35,7 +35,7 @@ def test_deduce():
 
     e1 = (a*4+b < c)
     res1 = tvm.arith.DeduceBound(a, e1, {b: b_s, c: c_s, d: d_s}, {})
-    ans1 = (((c - b) + -1)/4) 
+    ans1 = (((c - b) + -1)/4)
     assert str(tvm.ir_pass.Simplify(res1.max())) == str(ans1)
 
     e2 = (tvm.max(5, a * 4) < 0)
@@ -63,7 +63,7 @@ def test_check():
     assert res1.is_nothing()
 
     # multiple compare operators
-    res2 = tvm.arith.DeduceBound(a, (a+b>3)>c , {b: b_s, c: c_s}, {})
+    res2 = tvm.arith.DeduceBound(a, (a+b>3).astype(c.dtype)>c , {b: b_s, c: c_s}, {})
     assert res2.is_nothing()
 
     # multiple target variable
@@ -88,11 +88,11 @@ def test_basic(a1, a2, coff):
         res1 = tvm.arith.DeduceBound(a, e0<=17, {b: b_s}, {b: b_s})
         [x, y] = [res1.max(), b_s.max()] if coff > 0 else [res1.min(), b_s.min()]
         assert (tvm.ir_pass.Simplify((x * coff + 3 + y) <= 17)).value == 1
-      
+
         res1 = tvm.arith.DeduceBound(a, e0>=17, {b: b_s}, {b: b_s})
         [x, y] = [res1.max(), b_s.max()] if coff < 0 else [res1.min(), b_s.min()]
         assert (tvm.ir_pass.Simplify((x * coff + 3 + y) >= 17)).value == 1
-       
+
     test_basic(0, 4, 4)
     test_basic(1, 5, 4)
     test_basic(2, 6, 4)
@@ -137,4 +137,3 @@ def test_complex(a1, a2, coff):
     test_check()
     test_deduce_basic()
     test_deduce_complex()
-
diff --git a/tests/python/unittest/test_arith_simplify.py b/tests/python/unittest/test_arith_simplify.py
index e6689dddf9d0..f6a78b6e3770 100644
--- a/tests/python/unittest/test_arith_simplify.py
+++ b/tests/python/unittest/test_arith_simplify.py
@@ -20,6 +20,16 @@ def test_simplify():
     zz = zz.a
     assert zz.a == x and zz.b.value == 4
 
+    n = tvm.var('n')
+    assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(n % (-1)), tvm.const(0))
+    assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(n % 1), tvm.const(0))
+    assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(n / 1), n)
+    tvm.ir_pass.CanonicalSimplify(n / (-1))
+    # This is not true in the current implementation
+    #  assert tvm.ir_pass.Equal(tvm.ir_pass.CanonicalSimplify(n / (-1)),
+    #                           tvm.ir_pass.CanonicalSimplify(-n))
+
+
 def test_simplify_mod():
     """Not yet working, mock design"""
     ib = tvm.ir_builder.create()
@@ -36,6 +46,21 @@ def test_simplify_mod():
         (j + n * 32) % 16, {j: tvm.Range(0, 6)})
     assert index == j
 
+def test_simplify_minmax():
+    x = tvm.var('x')
+    e1 = tvm.max(x, 1) - tvm.max(x, 1)
+    e1s = tvm.ir_pass.CanonicalSimplify(e1)
+    assert e1s.value == 0
+
+    e2 = tvm.min(x, 1) - tvm.min(x, 1)
+    e2s = tvm.ir_pass.CanonicalSimplify(e2)
+    assert e2s.value == 0
+
+def test_mul():
+    x = tvm.var('x')
+    e = x * x - x * x
+    es = tvm.ir_pass.CanonicalSimplify(e)
+    assert es.value == 0
 
 def test_modular():
     rx = tvm.var("rx")
@@ -52,11 +77,9 @@ def test_modular():
     assert tvm.ir_pass.CanonicalSimplify(z1 - (ry + y)).value == 0
     assert tvm.ir_pass.CanonicalSimplify(z2 - (rx + x)).value == 0
 
-
-
-
-
 if __name__ == "__main__":
     test_simplify_mod()
     test_modular()
     test_simplify()
+    test_mul()
+    test_simplify_minmax()
\ No newline at end of file
diff --git a/tests/python/unittest/test_autotvm_common.py b/tests/python/unittest/test_autotvm_common.py
index 3a6883f69489..ed39c3846c8c 100644
--- a/tests/python/unittest/test_autotvm_common.py
+++ b/tests/python/unittest/test_autotvm_common.py
@@ -32,6 +32,25 @@ def matmul(N, L, M, dtype):
 
     return s, [A, B, C]
 
+@autotvm.template
+def bad_matmul(N, L, M, dtype):
+    if 'bad_device' in tvm.target.current_target().keys:
+        A = tvm.placeholder((N, L), name='A', dtype=dtype)
+        B = tvm.placeholder((L, M), name='B', dtype=dtype)
+
+        k = tvm.reduce_axis((0, L-1), name='k')
+        C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C')
+        s = tvm.create_schedule(C.op)
+
+        # schedule
+        y, x = s[C].op.axis
+        cfg = autotvm.get_config()
+        cfg.define_split("tile_y", y, num_outputs=2)
+        cfg.define_split("tile_x", x, num_outputs=2)
+        return s, [A, B, C]
+
+    return matmul(N, L, M, dtype)
+
 def get_sample_task(n=128):
     """return a sample task for testing"""
     target = tvm.target.create("llvm")
diff --git a/tests/python/unittest/test_autotvm_database.py b/tests/python/unittest/test_autotvm_database.py
index af4704d95e51..aa956f61bbcf 100644
--- a/tests/python/unittest/test_autotvm_database.py
+++ b/tests/python/unittest/test_autotvm_database.py
@@ -1,17 +1,11 @@
 """Test database"""
 import copy
 import logging
-import time
 
-import numpy as np
-import tvm
-
-from tvm import autotvm
 from tvm.autotvm import database
-from tvm.autotvm.measure.measure_methods import HashMismatchError
-from tvm.autotvm.record import encode, MeasureInput, MeasureResult
+from tvm.autotvm.record import encode, MeasureResult
 
-from test_autotvm_common import get_sample_task, get_sample_records
+from test_autotvm_common import get_sample_records
 
 def test_save_load():
     logging.info("test basic db load/save ...")
@@ -35,66 +29,6 @@ def test_save_load():
 
 TRIAL_LIMIT = 2
 
-def test_db_filter():
-    logging.info("test db filter ...")
-
-    # Pick a GPU target because there are more likely to be failures/invalid configs
-    task, target = get_sample_task()
-
-    ctx = tvm.context(str(target))
-    if not ctx.exist:
-        logging.warning("Skip this test because there is no supported device for test")
-
-    batch_size = 2
-
-    measure_option = autotvm.measure_option('local', do_fork=False, timeout=2)
-    measure_batch = autotvm.measure.create_measure_batch(task, measure_option)
-
-    ct = 0
-    all_inputs = list()
-    all_results = list()
-    batches = list()
-    tuner = autotvm.tuner.RandomTuner(task)
-    while ct < TRIAL_LIMIT:
-        inputs = list()
-        for i in range(batch_size):
-            cfg = tuner.next_batch(1)[0]
-            inputs.append((MeasureInput(target, task, cfg)))
-            all_inputs.append(inputs[-1])
-        batches.append(inputs)
-        results = measure_batch(inputs)
-        all_results += results
-        ct += 1
-
-    del measure_batch
-
-    db = database.DummyDatabase()
-    db.flush()
-
-    # First setting, memoize one input at a time, check that each is saved and replayed
-    measure_option = autotvm.measure_option('local', do_fork=False, timeout=2, replay_db=db)
-    measure_batch = autotvm.measure.create_measure_batch(task, measure_option)
-
-    for i in range(len(all_inputs)+1):
-        db.flush()
-        for j in range(i):
-            db.save(all_inputs[j], all_results[j])
-
-        for k in range(len(batches)):
-            batch = batches[k]
-            batch_result = measure_batch(batch)
-            for l in range(batch_size):
-                all_idx = k*batch_size + l
-                assert batch_result[l] is not None
-                if all_idx < i:
-                    assert encode(batch[l], batch_result[l]) == encode(batch[l], all_results[all_idx]), \
-                        "(no retry) EXPECTED MATCH, GOT MISMATCH"
-                else:
-                    assert encode(batch[l], batch_result[l]) != encode(batch[l], all_results[all_idx]), \
-                        "(no retry) EXPECTED MISMATCH, GOT MATCH"
-
-    del measure_batch
-
 def test_db_hash():
     logging.info("test db hash check ...")
     inp1, res1 = get_sample_records(1)[0]
@@ -149,89 +83,8 @@ def test_db_latest_all():
     assert encode(inp1, load4[1]) == encode(inp1, res2)
     assert encode(inp1, load4[2]) == encode(inp1, res3)
 
-def test_db_save_replay():
-    logging.info("test db save (from measure_batch) and replay ...")
-    _db = database.DummyDatabase()
-    _db.flush()
-
-    task, target = get_sample_task()
-
-    ctx = tvm.context(str(target))
-    if not ctx.exist:
-        logging.warning("Skip this test because there is no supported device for test")
-
-    measure_option = autotvm.measure_option('local',
-                                            do_fork=False,
-                                            timeout=2,
-                                            replay_db=_db)
-    measure_batch = autotvm.measure.create_measure_batch(task, measure_option)
-
-    batch_size = 2
-
-    ct = 0
-    all_inputs = list()
-    all_results = list()
-    batches = list()
-    tuner = autotvm.tuner.RandomTuner(task)
-    while ct < TRIAL_LIMIT:
-        inputs = list()
-        for i in range(batch_size):
-            cfg = tuner.next_batch(1)[0]
-            inputs.append((MeasureInput(target, task, cfg)))
-            all_inputs.append(inputs[-1])
-        batches.append(inputs)
-        results = measure_batch(inputs)
-        all_results += results
-        ct += 1
-    callback = autotvm.callback.log_to_database(_db)
-    callback(None, all_inputs, all_results)
-
-    assert len(_db.db.keys()) == batch_size * TRIAL_LIMIT, \
-        "%d vs %d" % (len(_db.db.keys()), batch_size * TRIAL_LIMIT)
-
-    all_results_2 = measure_batch(all_inputs)
-    all_results_3 = measure_batch(all_inputs)
-
-    for i in range(len(all_results)):
-        encr1 = encode(all_inputs[i], all_results[i])
-        encr2 = encode(all_inputs[i], all_results_2[i])
-        encr3 = encode(all_inputs[i], all_results_3[i])
-        assert encr1 == encr2, "EXPECTED MATCH WITH SAVE REPLAY (first replay), got MISMATCH"
-        assert encr2 == encr3, "EXPECTED MATCH WITH SAVE REPLAY (second replay), got MISMATCH"
-
-    del measure_batch
-
-def test_check_hashmismatch():
-    logging.info("test hash mismatch check")
-
-    task, target = get_sample_task()
-
-    ctx = tvm.context(str(target))
-    if not ctx.exist:
-        logging.warning("Skip this test because there is no supported device for test")
-
-    measure_option = autotvm.measure_option('local', do_fork=False)
-    measure_batch = autotvm.measure.create_measure_batch(task, measure_option)
-
-    inputs = list()
-    cfg = task.config_space.get(np.random.randint(len(task.config_space)))
-    # notvalidh is not a valid CRC32 hash (not hex)
-    cfg.code_hash = 'notvalidh'
-    inputs.append((MeasureInput(target, task, cfg)))
-
-    try:
-        results = measure_batch(inputs)
-        assert False, "HashMismatchError should be raised"
-    except HashMismatchError:
-        pass
-
-    del measure_batch
-
 if __name__ == '__main__':
     logging.basicConfig(level=logging.INFO)
     test_save_load()
-    test_db_filter()
     test_db_hash()
     test_db_latest_all()
-    test_db_save_replay()
-    test_check_hashmismatch()
diff --git a/tests/python/unittest/test_autotvm_dispatch_context.py b/tests/python/unittest/test_autotvm_dispatch_context.py
index 6c718e5bd041..1f2a7e276a32 100644
--- a/tests/python/unittest/test_autotvm_dispatch_context.py
+++ b/tests/python/unittest/test_autotvm_dispatch_context.py
@@ -3,34 +3,48 @@
 to the parameters of workload"""
 
 from collections import namedtuple
+from tvm import autotvm
 from tvm.autotvm.task import dispatcher, DispatchContext
 
-SimpleWorkload = namedtuple("SimpleWorkload", ["key"])
-SimpleConfig = namedtuple("SimpleConfig", ["template_key"])
+SimpleConfig = namedtuple('SimpleConfig', ('template_key', 'is_fallback'))
 
 def test_dispatch():
     @dispatcher
     def my_dispatcher(a, b):
-        return SimpleWorkload(key=a + b)
-
-    @my_dispatcher.register("spatial_pack")
-    def _sp_pack_add(cfg, a, b):
-        return b + 100
+        return (a, b)
 
     @my_dispatcher.register("im2col")
-    def _im2col_add(cfg, a, b):
-        return a + 1
+    def _im2col(cfg, a, b):
+        return a
+
+    @my_dispatcher.register("spatial_pack")
+    def _spatial_pack(cfg, a, b):
+        return b
 
     class SimpleDispatcher(DispatchContext):
         def query(self, target, workload):
-            tkey = "spatial_pack" if workload.key > 2 else "im2col"
-            return SimpleConfig(tkey)
+            a, b = workload
+            tkey = "spatial_pack" if a + b > 2 else "im2col"
+            cfg = SimpleConfig(tkey, False)
+            return cfg
 
     with SimpleDispatcher():
-        # im2col
-        assert my_dispatcher(1, 0) == 2
-        # spack
-        assert my_dispatcher(1, 100) == 200
+        # this will call im2col
+        assert my_dispatcher(1, 0) == 1
+
+        # this will call spatial pack
+        assert my_dispatcher(1, 100) == 100
+
+def test_fallback():
+
+    @autotvm.template
+    def simple_template(a, b):
+        cfg = autotvm.get_config()
+        assert cfg.is_fallback
+
+    simple_template(2, 3)
+
 
 if __name__ == "__main__":
     test_dispatch()
+    test_fallback()
diff --git a/tests/python/unittest/test_autotvm_measure.py b/tests/python/unittest/test_autotvm_measure.py
new file mode 100644
index 000000000000..e29cc2c51658
--- /dev/null
+++ b/tests/python/unittest/test_autotvm_measure.py
@@ -0,0 +1,97 @@
+"""Test builder and runner"""
+import logging
+import time
+
+import numpy as np
+
+import tvm
+from tvm import autotvm
+from test_autotvm_common import get_sample_task, bad_matmul
+from tvm.autotvm.measure.measure import Runner, MeasureResult, MeasureErrorNo
+
+def test_task_tuner_without_measurement():
+    """test task and tuner without measurement"""
+    task, target = get_sample_task()
+
+    class DummyRunner(Runner):
+        def __init__(self):
+            super(DummyRunner, self).__init__(1, 1)
+
+        def run(self, measure_inputs, build_results):
+            return [MeasureResult((np.random.random(),), 0, 0.2, time.time())
+                    for _ in range(len(measure_inputs))]
+
+        def get_build_kwargs(self):
+            return {}
+
+    measure_option = autotvm.measure_option(
+        builder=autotvm.LocalBuilder(),
+        runner=DummyRunner()
+    )
+
+    logging.info("%s", task.config_space)
+
+    for tuner_class in [autotvm.tuner.RandomTuner,
+                        autotvm.tuner.GridSearchTuner,
+                        autotvm.tuner.GATuner,
+                        autotvm.tuner.XGBTuner]:
+        tuner = tuner_class(task)
+        tuner.tune(n_trial=10, measure_option=measure_option)
+        assert tuner.best_flops > 1
+
+def test_check_correctness():
+    task, target = get_sample_task()
+
+    measure_option = autotvm.measure_option(
+        builder=autotvm.LocalBuilder(),
+        runner=autotvm.LocalRunner(check_correctness=True)
+    )
+
+    def _callback_correct(tuner, measure_inputs, measure_results):
+        for inp, res in zip(measure_inputs, measure_results):
+            assert res.error_no == 0
+
+    tuner = autotvm.tuner.RandomTuner(task)
+    tuner.tune(n_trial=2, measure_option=measure_option,
+               callbacks=[_callback_correct])
+
+    # a bad template
+    n = 128
+    target = tvm.target.create("llvm -device=bad_device")
+    task = autotvm.task.create(bad_matmul, args=(n, n, n, 'float32'), target=target)
+
+    def _callback_wrong(tuner, measure_inputs, measure_results):
+        for inp, res in zip(measure_inputs, measure_results):
+            assert res.error_no == MeasureErrorNo.WRONG_ANSWER
+
+    tuner = autotvm.tuner.RandomTuner(task)
+    tuner.tune(n_trial=2, measure_option=measure_option,
+               callbacks=[_callback_wrong])
+
+
+def test_min_repeat_ms():
+    task, target = get_sample_task()
+
+    measure_option = autotvm.measure_option(
+        builder=autotvm.LocalBuilder(),
+        runner=autotvm.LocalRunner(number=1, min_repeat_ms=100)
+    )
+
+    def _callback(tuner, measure_inputs, measure_results):
+        for inp, res in zip(measure_inputs, measure_results):
+            if res.error_no != 0:
+                continue
+
+            assert 1000 * np.mean(res.costs) * \
+                   measure_option['runner'].cur_number >= 100
+
+    tuner = autotvm.tuner.RandomTuner(task)
+    tuner.tune(n_trial=5, measure_option=measure_option,
+               callbacks=[_callback])
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.INFO)
+
+    test_task_tuner_without_measurement()
+    test_check_correctness()
+    test_min_repeat_ms()
diff --git a/tests/python/unittest/test_autotvm_space.py b/tests/python/unittest/test_autotvm_space.py
index 0320ef1c6f3c..7866226083cc 100644
--- a/tests/python/unittest/test_autotvm_space.py
+++ b/tests/python/unittest/test_autotvm_space.py
@@ -1,7 +1,7 @@
 """Test space definition primitives"""
 
 import tvm
-from tvm.autotvm.task.space import ConfigSpace
+from tvm.autotvm.task.space import ConfigSpace, FallbackConfigEntity
 
 def gemm_func(cfg, N):
     A = tvm.placeholder((N, N), name='A')
@@ -26,5 +26,25 @@ def test_split():
     assert len(cfg) == 64
     assert len(cfg.space_map['tile_y']) == 8
 
+    # test fallback
+    cfg = FallbackConfigEntity()
+    cfg.define_split('tile_n', cfg.axis(128), num_outputs=3)
+    cfg.fallback_split('tile_n', [-1, 8, 4])
+    assert cfg['tile_n'].size == [4, 8, 4]
+
+    cfg = FallbackConfigEntity()
+    cfg.define_split('tile_n', cfg.axis(49), num_outputs=3)
+    cfg.fallback_split('tile_n', [-1, 8, 4])
+    assert cfg['tile_n'].size == [7, 7, 1]
+
+    cfg = FallbackConfigEntity()
+    cfg.define_split('tile_n', cfg.axis(49), num_outputs=3)
+    try:
+        cfg.fallback_split('tile_n', [-1, 1, 0])
+        assert False
+    except RuntimeError:
+        pass
+
+
 if __name__ == '__main__':
     test_split()
diff --git a/tests/python/unittest/test_autotvm_xgboost_model.py b/tests/python/unittest/test_autotvm_xgboost_model.py
index 3488d0f599a5..58da219f2e48 100644
--- a/tests/python/unittest/test_autotvm_xgboost_model.py
+++ b/tests/python/unittest/test_autotvm_xgboost_model.py
@@ -12,7 +12,7 @@
 
 def test_fit():
     task, target = get_sample_task()
-    records = get_sample_records(n=100)
+    records = get_sample_records(n=500)
 
     base_model = XGBoostCostModel(task, feature_type='itervar', loss_type='rank')
     base_model.fit_log(records, plan_size=32)
@@ -20,8 +20,8 @@ def test_fit():
     upper_model = XGBoostCostModel(task, feature_type='itervar', loss_type='rank')
     upper_model.load_basemodel(base_model)
 
-    xs = np.arange(100)
-    ys = np.arange(100)
+    xs = np.arange(10)
+    ys = np.arange(10)
 
     upper_model.fit(xs, ys, plan_size=32)
 
diff --git a/tests/python/unittest/test_codegen_bool.py b/tests/python/unittest/test_codegen_bool.py
new file mode 100644
index 000000000000..e2592c416345
--- /dev/null
+++ b/tests/python/unittest/test_codegen_bool.py
@@ -0,0 +1,58 @@
+"""codegen related to bool types"""
+
+import tvm
+import numpy as np
+
+def test_cmp_load_store():
+    n = 32
+    A = tvm.placeholder((n,), name='A')
+    B = tvm.placeholder((n,), name='B')
+    C = tvm.compute(A.shape, lambda *i: A(*i) > B(*i), name='C')
+    D = tvm.compute(C.shape, lambda *i: tvm.all(C(*i), A(*i) > 1), name="D")
+
+
+    def check_llvm():
+        if not tvm.module.enabled("llvm"):
+            return
+        s = tvm.create_schedule(D.op)
+        xo, xi = s[C].split(C.op.axis[0], factor=4)
+        xo1, xo2 = s[C].split(xo, factor=13)
+        s[C].parallel(xo2)
+        # BUILD and invoke the kernel.
+        f = tvm.build(s, [A, B, D], "llvm")
+        ctx = tvm.cpu(0)
+        a_np = np.random.uniform(size=n).astype(A.dtype)
+        a = tvm.nd.array(a_np, ctx)
+        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
+        d = tvm.nd.array(np.zeros(n, dtype=D.dtype), ctx)
+        f(a, b, d)
+        np.testing.assert_equal(
+            d.asnumpy(), np.logical_and(a.asnumpy()> b.asnumpy(), a.asnumpy() > 1))
+
+    def check_device(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            return
+        s = tvm.create_schedule(D.op)
+        for stage in [C, D]:
+            xo, xi = s[stage].split(stage.op.axis[0], factor=4)
+            s[stage].bind(xo, tvm.thread_axis("blockIdx.x"))
+            s[stage].bind(xi, tvm.thread_axis("threadIdx.x"))
+        f = tvm.build(s, [A, B, D], device)
+        a_np = np.random.uniform(size=n).astype(A.dtype)
+        a = tvm.nd.array(a_np, ctx)
+        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
+        d = tvm.nd.array(np.zeros(n, dtype=D.dtype), ctx)
+        f(a, b, d)
+        np.testing.assert_equal(
+            d.asnumpy(), np.logical_and(a.asnumpy()> b.asnumpy(), a.asnumpy() > 1))
+
+
+    check_llvm()
+    for device in ["vulkan", "opencl", "cuda", "rocm", "metal"]:
+        check_device(device)
+
+
+
+if __name__ == "__main__":
+    test_cmp_load_store()
diff --git a/tests/python/unittest/test_codegen_c_host.py b/tests/python/unittest/test_codegen_c_host.py
new file mode 100644
index 000000000000..00acbeb88fcf
--- /dev/null
+++ b/tests/python/unittest/test_codegen_c_host.py
@@ -0,0 +1,87 @@
+import tvm
+import numpy as np
+from tvm.contrib import util
+
+def test_add():
+    nn = 1024
+    n = tvm.convert(nn)
+    A = tvm.placeholder((n,), name='A')
+    B = tvm.placeholder((n,), name='B')
+    C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
+    s = tvm.create_schedule(C.op)
+
+    def check_c():
+        f1 = tvm.lower(s, [A, B, C], name="fadd")
+        fsplits = [x for x in tvm.ir_pass.SplitHostDevice(f1)]
+        fsplits[0] = tvm.ir_pass.LowerTVMBuiltin(fsplits[0])
+        mhost = tvm.codegen.build_module(fsplits[0], "c")
+        temp = util.tempdir()
+        path_dso = temp.relpath("temp.so")
+        mhost.export_library(path_dso)
+        m = tvm.module.load(path_dso)
+        fadd = m['fadd']
+        ctx = tvm.cpu(0)
+        # launch the kernel.
+        n = nn
+        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
+        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
+        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
+        fadd(a, b, c)
+        tvm.testing.assert_allclose(
+            c.asnumpy(), a.asnumpy() + b.asnumpy())
+    check_c()
+
+def test_add_pipeline():
+    nn = 1024
+    n = tvm.convert(nn)
+    A = tvm.placeholder((n,), name='A')
+    B = tvm.placeholder((n,), name='B')
+    AA = tvm.compute((n,), lambda *i: A(*i), name='A')
+    BB = tvm.compute((n,), lambda *i: B(*i), name='B')
+    T = tvm.compute(A.shape, lambda *i: AA(*i) + BB(*i), name='T')
+    C = tvm.compute(A.shape, lambda *i: T(*i), name='C')
+    s = tvm.create_schedule(C.op)
+    xo, xi = s[C].split(C.op.axis[0], factor=4)
+    xo1, xo2 = s[C].split(xo, factor=13)
+    s[C].parallel(xo2)
+    s[C].pragma(xo1, "parallel_launch_point")
+    s[C].pragma(xo2, "parallel_stride_pattern")
+    s[C].pragma(xo2, "parallel_barrier_when_finish")
+    s[C].vectorize(xi)
+
+    def check_c():
+        if not tvm.module.enabled("llvm"):
+            return
+        # Specifically allow offset to test codepath when offset is available
+        Ab = tvm.decl_buffer(
+            A.shape, A.dtype,
+            elem_offset=tvm.var('Aoffset'),
+            offset_factor=8,
+            name='A')
+        binds = {A : Ab}
+        # BUILD and invoke the kernel.
+        f1 = tvm.lower(s, [A,B,C], name="fadd_pipeline")
+        fsplits = [x for x in tvm.ir_pass.SplitHostDevice(f1)]
+        fsplits[0] = tvm.ir_pass.LowerTVMBuiltin(fsplits[0])
+        mhost = tvm.codegen.build_module(fsplits[0], "c")
+        temp = util.tempdir()
+        path_dso = temp.relpath("temp.so")
+        mhost.export_library(path_dso)
+        m = tvm.module.load(path_dso)
+        fadd = m["fadd_pipeline"]
+        ctx = tvm.cpu(0)
+        # launch the kernel.
+        n = nn
+        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
+        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
+        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
+        fadd(a, b, c)
+        tvm.testing.assert_allclose(
+            c.asnumpy(), a.asnumpy() + b.asnumpy())
+
+    with tvm.build_config(offset_factor=4):
+        check_c()
+
+if __name__ == "__main__":
+    test_add()
+    test_add_pipeline()
diff --git a/tests/python/unittest/test_codegen_cross_llvm.py b/tests/python/unittest/test_codegen_cross_llvm.py
index aa6f9d708a41..5b9c509aedf2 100644
--- a/tests/python/unittest/test_codegen_cross_llvm.py
+++ b/tests/python/unittest/test_codegen_cross_llvm.py
@@ -67,7 +67,7 @@ def build_arm():
             b = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
             c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
             farm(a, b, c)
-            np.testing.assert_allclose(
+            tvm.testing.assert_allclose(
                 c.asnumpy(), a.asnumpy() + b.asnumpy())
             print("Verification finish on remote..")
 
diff --git a/tests/python/unittest/test_codegen_cuda.py b/tests/python/unittest/test_codegen_cuda.py
index 0b54863d1aab..d3b770790bdb 100644
--- a/tests/python/unittest/test_codegen_cuda.py
+++ b/tests/python/unittest/test_codegen_cuda.py
@@ -27,7 +27,7 @@ def check_cuda(dtype, n, lanes):
             np.random.uniform(size=(n, lanes)))
         c = tvm.nd.empty((n,), B.dtype, ctx)
         fun(a, c)
-        np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)
+        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)
         
     check_cuda("float32", 64, 2)
     check_cuda("float16", 64, 2)
@@ -62,7 +62,7 @@ def check_cuda(dtype, n, lanes):
         c = tvm.nd.empty((n,), C.dtype, ctx).copyfrom(np_c)
         d = tvm.nd.empty((n,), D.dtype, ctx)
         fun(a, b, c, d)
-        np.testing.assert_allclose(d.asnumpy(), np_d)
+        tvm.testing.assert_allclose(d.asnumpy(), np_d)
     check_cuda("int8", 64, 4)
 
 def test_cuda_vectorize_load():
@@ -83,11 +83,34 @@ def check_cuda(dtype, n, lanes):
         a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np_a)
         b = tvm.nd.empty((n,), B.dtype, ctx)
         fun(a,b)
-        np.testing.assert_allclose(a.asnumpy(), b.asnumpy())
+        tvm.testing.assert_allclose(a.asnumpy(), b.asnumpy())
     check_cuda("int8", 64, 8)
     check_cuda("int8", 64, 16)
 
+def test_cuda_make_int8x4():
+    def check_cuda(n, value):
+        if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
+            print("skip because cuda is not enabled..")
+            return
+        lanes = 4
+        dtype = 'int8'
+        ctx = tvm.gpu(0)
+        A = tvm.compute((n, lanes), lambda i,j: tvm.const(value, dtype=dtype))
+        s = tvm.create_schedule(A.op)
+        y, x = s[A].op.axis
+        s[A].vectorize(x)
+        s[A].bind(y, tvm.thread_axis("blockIdx.x"))
+        fun = tvm.build(s, [A], "cuda", name="make_int8x4")
+        np_a = np.full((n, lanes), value, dtype=dtype)
+        a = tvm.nd.empty(np_a.shape, dtype, ctx)
+        fun(a)
+        np.testing.assert_equal(a.asnumpy(), np_a)
+    check_cuda(64, 0xAB)
+    check_cuda(64, 0)
+    check_cuda(64, -3)
+
 if __name__ == "__main__":
     test_cuda_vectorize_add()
     test_cuda_multiply_add()
     test_cuda_vectorize_load()
+    test_cuda_make_int8x4()
diff --git a/tests/python/unittest/test_codegen_device.py b/tests/python/unittest/test_codegen_device.py
index 0bb072ebf0bd..ee7644cea677 100644
--- a/tests/python/unittest/test_codegen_device.py
+++ b/tests/python/unittest/test_codegen_device.py
@@ -51,7 +51,7 @@ def check_target(device, host="stackvm"):
         b = tvm.nd.array(np.random.uniform(size=()).astype(Bb.dtype), ctx)
         d = tvm.nd.array(np.zeros(n, dtype=Db.dtype), ctx)
         f(a, b, d)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             d.asnumpy(), a.asnumpy() + b.asnumpy() + 1)
 
     def check_module_save(device, host="stackvm"):
@@ -75,7 +75,7 @@ def check_module_save(device, host="stackvm"):
         b = tvm.nd.array(np.random.uniform(size=()).astype(Bb.dtype), ctx)
         d = tvm.nd.array(np.zeros(n, dtype=Db.dtype), ctx)
         f(a, b, d)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             d.asnumpy(), a.asnumpy() + b.asnumpy() + 1)
 
     check_target("cuda", host="stackvm")
diff --git a/tests/python/unittest/test_codegen_extern.py b/tests/python/unittest/test_codegen_extern.py
index dfbf1820c21d..7512f0d23634 100644
--- a/tests/python/unittest/test_codegen_extern.py
+++ b/tests/python/unittest/test_codegen_extern.py
@@ -46,7 +46,7 @@ def check_target(target):
         a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
         c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
         f(a, c)
-        np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)
+        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)
         
     check_target("llvm")
     check_target("opencl")
@@ -80,7 +80,7 @@ def check_target(target):
         c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
 
         f(a, c)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), a.asnumpy())
     check_target("stackvm")
     check_target("llvm")
@@ -112,12 +112,12 @@ def check_target(target):
         @tvm.register_func
         def my_extern_array_func2(aa, bb):
             assert aa.shape == a.shape
-            np.testing.assert_allclose(
+            tvm.testing.assert_allclose(
                 aa.asnumpy(), a.asnumpy() + 1)
             aa.copyto(bb)
 
         f(a, c)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), a.asnumpy() + 1)
 
     check_target("llvm")
diff --git a/tests/python/unittest/test_codegen_llvm.py b/tests/python/unittest/test_codegen_llvm.py
index 9f282050df3e..4f3e4e914d55 100644
--- a/tests/python/unittest/test_codegen_llvm.py
+++ b/tests/python/unittest/test_codegen_llvm.py
@@ -2,6 +2,7 @@
 from tvm.contrib import util, clang
 import numpy as np
 import ctypes
+import math
 
 def test_llvm_intrin():
     ib = tvm.ir_builder.create()
@@ -52,7 +53,7 @@ def check_llvm(use_file):
         a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
         b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
         f(a, b)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             b.asnumpy(), a.asnumpy() + 1.0)
     check_llvm(use_file=True)
     check_llvm(use_file=False)
@@ -106,7 +107,7 @@ def check_llvm():
         b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
         c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
         f(a, b, c)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), a.asnumpy() + b.asnumpy())
 
     with tvm.build_config(offset_factor=4):
@@ -138,7 +139,7 @@ def check_llvm():
         a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
         c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
         f(a, c)
-        np.testing.assert_allclose(c.asnumpy(),
+        tvm.testing.assert_allclose(c.asnumpy(),
                                    np.sqrt(a.asnumpy() + 1) * 2 + 2,
                                    rtol=1e-5)
 
@@ -164,7 +165,7 @@ def check_llvm(nn, base):
         a = tvm.nd.array(np.random.uniform(size=(n + base)).astype(A.dtype), ctx)
         c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
         f(a, c)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), a.asnumpy()[::-1][:n])
     check_llvm(4, 0)
     check_llvm(128, 8)
@@ -195,7 +196,7 @@ def check_llvm(n, lanes):
             np.random.uniform(size=(n, lanes)))
         c = tvm.nd.empty((n,), C.dtype, ctx)
         f(a, c)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), a.asnumpy() + 1)
     check_llvm(64, 2)
     check_llvm(512, 2)
@@ -220,7 +221,7 @@ def check_llvm(nn, base, stride):
         a = tvm.nd.array(np.random.uniform(size=(n + base, stride)).astype(A.dtype), ctx)
         c = tvm.nd.array(np.zeros((n, stride), dtype=C.dtype), ctx)
         f(a, c)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), a.asnumpy()[base:] + 1)
     check_llvm(64, 0, 2)
     check_llvm(4, 0, 1)
@@ -247,7 +248,7 @@ def check_llvm():
         a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
         c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
         f(a, c)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), a.asnumpy() + 1 + 1)
     check_llvm()
 
@@ -277,10 +278,10 @@ def check_llvm():
         b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
         c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
         fadd1(a, b, c)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), a.asnumpy() + b.asnumpy())
         fadd2(a, b, c)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), a.asnumpy() + b.asnumpy())
     check_llvm()
 
@@ -302,7 +303,7 @@ def check_llvm(n, offset):
         f(a, c)
         c_np = a.asnumpy()
         c_np[:offset] = 0
-        np.testing.assert_allclose(c.asnumpy(), c_np)
+        tvm.testing.assert_allclose(c.asnumpy(), c_np)
     check_llvm(64, 8)
 
 
@@ -321,7 +322,7 @@ def check_llvm(n):
         c = tvm.nd.empty((n,), C.dtype, ctx)
         f(a, c)
         c_np = a.asnumpy() == 1
-        np.testing.assert_allclose(c.asnumpy(), c_np)
+        tvm.testing.assert_allclose(c.asnumpy(), c_np)
     check_llvm(64)
 
 
@@ -345,7 +346,31 @@ def check_llvm(n):
         d = tvm.nd.empty((), D.dtype, ctx)
         f(a, sc, d)
         d_np = np.sum(a.asnumpy()) * sc.asnumpy() + 1
-        np.testing.assert_allclose(d.asnumpy(), d_np)
+        tvm.testing.assert_allclose(d.asnumpy(), d_np)
+    check_llvm(64)
+
+def test_rank_zero_bound_checkers():
+    def check_llvm(n):
+        if not tvm.module.enabled("llvm"):
+            return
+        with tvm.build_config(instrument_bound_checkers=True):
+            A = tvm.placeholder((n, ), name='A')
+            scale = tvm.placeholder((), name='scale')
+            k = tvm.reduce_axis((0, n), name="k")
+            C = tvm.compute((), lambda : tvm.sum(A[k] * scale, axis=k), name="C")
+            D = tvm.compute((), lambda : C + 1)
+            s = tvm.create_schedule(D.op)
+            # build and invoke the kernel.
+            f = tvm.build(s, [A, scale, D], "llvm")
+            ctx = tvm.cpu(0)
+            # launch the kernel.
+            a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), ctx)
+            sc = tvm.nd.array(
+                np.random.randint(0, 2, size=()).astype(scale.dtype), ctx)
+            d = tvm.nd.empty((), D.dtype, ctx)
+            f(a, sc, d)
+            d_np = np.sum(a.asnumpy()) * sc.asnumpy() + 1
+            tvm.testing.assert_allclose(d.asnumpy(), d_np)
     check_llvm(64)
 
 
@@ -362,11 +387,46 @@ def test_alignment():
         if "align" in l and "4 x float" in l:
             assert "align 32" in l
 
+def test_llvm_div():
+    """Check that the semantics of div and mod is the same as in C/C++"""
+    def check_div(start, end, divisor, dtype):
+        T = tvm.compute((end - start,),
+                        lambda i: tvm.expr.Cast(dtype, (start + i)) / tvm.const(divisor, dtype))
+        s = tvm.create_schedule([T.op])
+        f = tvm.build(s, [T], "llvm")
+        a = tvm.nd.empty((end - start,), dtype)
+        f(a)
+        ref = [int(float(i)/divisor) for i in range(start, end)]
+        tvm.testing.assert_allclose(a.asnumpy(), ref)
+
+    def check_mod(start, end, divisor, dtype):
+        T = tvm.compute((end - start,),
+                        lambda i: tvm.expr.Cast(dtype, (start + i)) % tvm.const(divisor, dtype))
+        s = tvm.create_schedule([T.op])
+        f = tvm.build(s, [T], "llvm")
+        a = tvm.nd.empty((end - start,), dtype)
+        f(a)
+        ref = [int(math.fmod(i, divisor)) for i in range(start, end)]
+        tvm.testing.assert_allclose(a.asnumpy(), ref)
+
+    def check_llvm(start, end, divisor, dtype):
+        check_div(start, end, divisor, dtype)
+        check_mod(start, end, divisor, dtype)
+
+    for d in range(-5, 6):
+        if d != 0:
+            # Note that 11 (and not e.g. 10) is used to avoid issues with the simplifier
+            check_llvm(-11, 11, d, 'int32')
+            check_llvm(-11, 11, d, 'int8')
+            if d > 0:
+                check_llvm(123, 133, d, 'uint8')
+                check_llvm(0, 256, d, 'uint8')
 
 if __name__ == "__main__":
     test_llvm_import()
     test_alignment()
     test_rank_zero()
+    test_rank_zero_bound_checkers()
     test_llvm_bool()
     test_llvm_persist_parallel()
     test_llvm_select()
@@ -378,3 +438,4 @@ def test_alignment():
     test_llvm_madd_pipeline()
     test_llvm_temp_space()
     test_llvm_lookup_intrin()
+    test_llvm_div()
diff --git a/tests/python/unittest/test_hybrid_script.py b/tests/python/unittest/test_hybrid_script.py
index 0f500d7c704f..f87c75f7929d 100644
--- a/tests/python/unittest/test_hybrid_script.py
+++ b/tests/python/unittest/test_hybrid_script.py
@@ -1,9 +1,9 @@
-import tvm, inspect, sys, traceback, numpy, nose
+import tvm, inspect, sys, traceback, numpy, nose, types
 from tvm.hybrid import script
 from tvm.hybrid.intrin import HYBRID_GLOBALS
 
 @nose.tools.nottest
-def run_and_check(func, args, outs, var_dict={}, target='llvm'):
+def run_and_check(func, args, var_dict={}, target='llvm'):
     def tvm_val_2_py_val(val):
         val = tvm.ir_pass.Substitute(val, var_dict)
         val = tvm.ir_pass.Simplify(val)
@@ -11,41 +11,55 @@ def tvm_val_2_py_val(val):
         return val.value
 
     ctx = tvm.context(target, 0)
+    op = None
+
+    outs = func(*args)
+    op = outs[0].op if isinstance(outs, list) else outs.op
 
     emu_args = []
     nd_args = []
-    to_check = []
     for i in args:
         if isinstance(i, tvm.tensor.Tensor):
             shape = [tvm_val_2_py_val(j) for j in i.shape]
-            if i in outs:
-                emu_args.append(numpy.zeros(shape).astype(i.dtype))
-                nd_args.append(tvm.nd.array(emu_args[-1], ctx))
-                to_check.append((nd_args[-1], emu_args[-1]))
-            else:
-                emu_args.append(numpy.random.randn(*shape).astype(i.dtype))
-                nd_args.append(tvm.nd.array(emu_args[-1], ctx))
+            emu_args.append(numpy.random.randn(*shape).astype(i.dtype))
+            nd_args.append(tvm.nd.array(emu_args[-1], ctx))
         else:
             assert isinstance(i, tvm.expr.Var)
             emu_args.append(tvm_val_2_py_val(i))
             nd_args.append(emu_args[-1])
 
-    func(*emu_args)
-
-    lowerd_func = tvm.lower(func(*args), args)
-    module = tvm.build(lowerd_func, target=target)
+    sch = tvm.create_schedule(op)
+    module = tvm.build(sch, args + (outs if isinstance(outs, list) else [outs]), target=target)
     assert module
+    
+    out_tensors = []
+    for i in range(op.num_outputs):
+        output = op.output(i)
+        shape = [tvm_val_2_py_val(j) for j in output.shape]
+        nd_args.append(tvm.nd.array(numpy.zeros(shape).astype(output.dtype), ctx))
+        out_tensors.append(nd_args[-1])
+
+    ref_data = func(*emu_args)
+    if isinstance(ref_data, numpy.ndarray):
+        ref_data = [ref_data]
+    
     module(*nd_args)
 
-    for nd, np in to_check:
-        numpy.testing.assert_allclose(nd.asnumpy(), np, rtol=1e-5, atol=1e-5)
+    for nd, np in zip(out_tensors, ref_data):
+        tvm.testing.assert_allclose(nd.asnumpy(), np, rtol=1e-5, atol=1e-5)
 
 
 @script
-def outer_product(n, m, a, b, c):
+def outer_product(n, m, a, b):
+    """This is a simple outer product.
+    Actually this function is not required to be documented.
+    I write this docstring to test skipping docstring functionality.
+    """
+    c = output_tensor((n, m), a.dtype)
     for i in range(n):
         for j in range(m):
             c[i, j] = a[i] * b[j]
+    return c
 
 #Test global function
 #Test bridge between frontend and backend
@@ -54,8 +68,14 @@ def test_outer_product():
     m = tvm.var('m')
     a = tvm.placeholder((n, ), name='a')
     b = tvm.placeholder((m, ), name='b')
-    c = tvm.placeholder((n, m), name='c')
-    ir = outer_product(n, m, a, b, c)
+
+    try:
+        c = outer_product(n, m, a, b)
+        ir = c.op.body
+    except IOError as err:
+        assert sys.version_info[0] == 2 and str(err) == 'could not get source code'
+        return
+
     #Check for i in (0, n)
     assert isinstance(ir, tvm.stmt.For)
     assert ir.loop_var.name == 'i'
@@ -80,10 +100,8 @@ def test_outer_product():
     assert mul.a.name == 'a'
     assert mul.b.name == 'b'
 
-    func = tvm.lower(ir, [n, m, a, b, c])
-    func = tvm.build(func)
 
-    run_and_check(outer_product, [n, m, a, b, c], [c], {n: 999, m: 1001})
+    run_and_check(outer_product, [n, m, a, b], {n: 99, m: 101})
 
     for key, _ in HYBRID_GLOBALS.items():
         assert key not in globals().keys()
@@ -93,19 +111,25 @@ def test_outer_product():
 #Test allocation of local variable
 def test_fanout():
     @script
-    def fanout(n, a, b):
+    def fanout(n, a):
         three = 3.0
+        b = output_tensor((a.shape[0] - 3, ), a.dtype)
         for i in range(a.shape[0] - 3):
             sigma = 0.0
             for j in range(3):
-                sigma = sigma + a[i + j]
+                sigma += a[i + j]
             sigma = sigma / three
             b[i] = sigma
+        return b
 
     n = tvm.var('n')
     a = tvm.placeholder((n, ), 'float32', name='a')
-    b = tvm.placeholder((n-3, ), 'float32', name='b')
-    ir = fanout(n, a, b)
+    try:
+        b = fanout(n, a)
+        ir = b.op.body
+    except IOError as err:
+        assert sys.version_info[0] == 2 and str(err) == 'could not get source code'
+        return
 
     #Check for i in (0, n-3)
     assert isinstance(ir, tvm.stmt.For)
@@ -162,38 +186,31 @@ def fanout(n, a, b):
     assert len(write.value.args) == 1
     assert write.value.args[0].value == 0
 
-    run_and_check(fanout, [n, a, b], [b], {n: 10})
-
-
-@script
-def failure():
-    for i in range(1, 100):
-        i = 0
-
-def test_failure():
-    try:
-        tvm.hybrid.parse(failure, [])
-    except IOError as err:
-        assert sys.version_info[0] == 2
-        print('[Warning] Case test_failure is skipped by Python2 because "%s"' % str(err))
-    except Exception as err:
-        assert str(err) == 'You CAN NEVER overwrite a loop variable!'
+    run_and_check(fanout, [n, a], {n: 10})
 
 
 def test_looptype():
     @script
     def looptype(a, b, c):
+        d = output_tensor((8, ), 'int32')
+        e = output_tensor((8, ), 'int32')
+        f = output_tensor((8, ), 'int32')
         for i in parallel(8):
-            a[i] = i
+            d[i] = a[i]
         for j in vectorize(8):
-            b[j] = j
+            e[j] = b[j]
         for k in unroll(8):
-            c[k] = k
+            f[k] = c[k]
+        return d, e, f
 
     a = tvm.placeholder((8, ), name='a', dtype='int32')
     b = tvm.placeholder((8, ), name='b', dtype='int32')
     c = tvm.placeholder((8, ), name='c', dtype='int32')
-    ir = looptype(a, b, c)
+    try:
+        d, e, f = looptype(a, b, c)
+        ir = d.op.body
+    except:
+        return
     iloop = ir.first
     jloop = ir.rest.first
     kloop = ir.rest.rest
@@ -201,24 +218,50 @@ def looptype(a, b, c):
     assert jloop.for_type == tvm.stmt.For.Vectorized
     assert kloop.for_type == tvm.stmt.For.Unrolled
 
-    run_and_check(looptype, [a, b, c], [a, b, c])
+    run_and_check(looptype, [a, b, c])
 
 
 def test_if():
     @script
-    def if_then_else(a, b):
+    def if_then_else(a):
+        b = output_tensor((10, ), 'int32')
+        c = output_tensor((10, ), 'int32')
         for i in range(10):
             if i % 2 == 0:
-                a[i] = -1
+                c[i] = a[i]
             else:
-                a[i] = 1
+                c[i] = b[i]
         for i in unroll(10):
             b[i] = -1 if i % 2 == 0 else 1
+        return b, c
 
     a = tvm.placeholder((10, ), dtype='int32', name='a')
-    b = tvm.placeholder((10, ), dtype='int32', name='b')
 
-    run_and_check(if_then_else, [a, b], [a, b])
+    run_and_check(if_then_else, [a])
+
+    @script
+    def if_triple_condition(a):
+        b = output_tensor((10, ), 'int32')
+        for i in range(10):
+            if 0 <= i < 5:
+                b[i] = a[i]
+            else:
+                b[i] = a[i] + 1
+        return b
+
+    run_and_check(if_triple_condition, [a])
+
+    @script
+    def if_and(a):
+        b = output_tensor((10, ), 'int32')
+        for i in range(10):
+            if i >= 0 and i < 5:
+                b[i] = a[i]
+            else:
+                b[i] = a[i] + 1
+        return b
+
+    run_and_check(if_and, [a])
 
 
 def test_bind():
@@ -226,85 +269,95 @@ def test_bind():
         print('[Warning] No GPU found! Skip bind test!')
         return
     @script
-    def vec_add(a, b, c):
+    def vec_add(a, b):
+        c = output_tensor((1000, ), 'float32')
         for tx in bind('threadIdx.x', 1000):
-            c[tx] = b[tx] + c[tx]
+            c[tx] = a[tx] + b[tx]
+        return c
 
     a = tvm.placeholder((1000, ), dtype='float32', name='a')
     b = tvm.placeholder((1000, ), dtype='float32', name='b')
-    c = tvm.placeholder((1000, ), dtype='float32', name='c')
 
-    run_and_check(vec_add, [a, b, c], [c], target='cuda')
+    run_and_check(vec_add, [a, b], target='cuda')
 
 def test_math_intrin():
     @script
     def intrin_real(a):
-        a[0] = sqrt(a[0])
-        a[1] = log(a[1])
-        a[2] = exp(a[2])
-        a[3] = sigmoid(a[3])
-        a[4] = power(a[4], a[5])
-        a[5] = tanh(a[5])
-        a[6] = min(a[4], a[5])
-        a[7] = max(a[5], a[6])
+        b = output_tensor((8, ), 'float32')
+        b[0] = sqrt(a[0])
+        b[1] = log(a[1])
+        b[2] = exp(a[2])
+        b[3] = sigmoid(a[3])
+        b[4] = power(a[4], a[5])
+        b[5] = tanh(a[5])
+        b[6] = min(a[4], a[5])
+        b[7] = max(a[5], a[6])
+        return b
 
     a8 = tvm.placeholder((8, ), dtype='float32', name='a')
-    ir = intrin_real(a8)
-    func = tvm.build(tvm.lower(ir, [a8]))
+    b8 = intrin_real(a8)
+    sch = tvm.create_schedule(b8.op)
+    func = tvm.build(sch, [a8, b8])
     assert func
     a = numpy.arange(2, 10).astype('float32')
     tvm_a = tvm.ndarray.array(a)
-    func(tvm_a)
-    intrin_real(a)
-    numpy.testing.assert_allclose(a, tvm_a.asnumpy(), rtol=1e-5)
+    tvm_b = tvm.ndarray.array(numpy.zeros((8, ), dtype='float32'))
+    b = intrin_real(a)
+    func(tvm_a, tvm_b)
+    tvm.testing.assert_allclose(b, tvm_b.asnumpy(), rtol=1e-5)
 
     @script
     def intrin_int(a):
-        a[0] = popcount(a[0])
+        b = output_tensor((1, ), 'int32')
+        b[0] = popcount(a[0])
+        return b
 
     a1 = tvm.placeholder((1, ), dtype='int32')
-    ir = intrin_int(a1)
-    func = tvm.build(tvm.lower(ir, [a1]))
+    b1 = intrin_int(a1)
+    sch = tvm.create_schedule(b1.op)
+    func = tvm.build(sch, [a1, b1])
     assert func
-    a = numpy.array([1234567890]).astype('int32')
+    a = numpy.array([114514]).astype('int32')
     tvm_a = tvm.ndarray.array(a)
-    intrin_int(a)
-    func(tvm_a)
-    assert tvm_a.asnumpy()[0] == a[0]
+    tvm_b = tvm.ndarray.array(numpy.array([0]).astype('int32'))
+    b = intrin_int(a)
+    func(tvm_a, tvm_b)
+    assert tvm_b.asnumpy()[0] == b[0]
 
+# test non caconical loops
 def test_non_zero():
     @tvm.hybrid.script
-    def blur(a, b):
+    def blur(a):
+        b = output_tensor((30, 30), 'float32')
         for i in range(2, 32):
             for j in range(2, 32):
                 s = 0.0
                 for di in range(3):
                     for dj in range(3):
-                        s = s + a[i-di, j-dj]
+                        s += a[i-di, j-dj]
                 b[i-2, j-2] = s / 9.0
-    try:
-        a = tvm.placeholder((32, 32), 'float32', 'a')
-        b = tvm.placeholder((30, 30), 'float32', 'b')
-        run_and_check(blur, [a, b], [b])
-    except IOError as err:
-        assert sys.version_info[0] == 2
-        print('[Warning] Case test_non_zero is skipped by Python2 because "%s"' % str(err))
+        return b
+
+    a = tvm.placeholder((32, 32), 'float32', 'a')
+    run_and_check(blur, [a])
 
     @tvm.hybrid.script
-    def triangle(a, b, c):
+    def triangle(a, b):
+        c = output_tensor((10, 10), dtype='float32')
         for i in range(10):
             for j in range(i, 10):
                 c[i, j] = a[i] * b[j]
+        return c
 
     a = tvm.placeholder((10, ), dtype='float32', name='a')
     b = tvm.placeholder((10, ), dtype='float32', name='b')
-    c = tvm.placeholder((10, 10), dtype='float32', name='c')
 
-    run_and_check(triangle, [a, b, c], [c])
+    run_and_check(triangle, [a, b])
 
 def test_allocate():
     @tvm.hybrid.script
-    def blur2d(a, b):
+    def blur2d(a):
+        b = output_tensor((30, 30), 'float32')
         for i in range(30):
             ha = allocate((3, 30), 'float32')
             for j in range(3):
@@ -312,15 +365,15 @@ def blur2d(a, b):
                     ha[j, k] = a[i+j, k] + a[i+j, k+1] + a[i+j, k+2]
             for j in range(30):
                 b[i, j] = (ha[0, j] + ha[1, j] + ha[2, j]) / 9.0
+        return b
 
     a = tvm.placeholder((32, 32), 'float32', 'a')
-    b = tvm.placeholder((30, 30), 'float32', 'b')
-
-    run_and_check(blur2d, [a, b], [b])
+    run_and_check(blur2d, [a])
 
     if tvm.gpu().exist:
         @tvm.hybrid.script
-        def share_vec_add(a, b, c):
+        def share_vec_add(a, b):
+            c = output_tensor((256, ), 'float32')
             shared = allocate((256, ), 'float32', 'shared')
             for i in bind("threadIdx.x", 256):
                 shared[i] = a[i]
@@ -329,23 +382,176 @@ def share_vec_add(a, b, c):
                 local[i] = b[i]
             for i in bind("threadIdx.x", 256):
                 c[i] = shared[i] + local[i]
+            return c
 
         a = tvm.placeholder((256, ), dtype='float32', name='a')
         b = tvm.placeholder((256, ), dtype='float32', name='b')
-        c = tvm.placeholder((256, ), dtype='float32', name='c')
-        run_and_check(share_vec_add, [a, b, c], [c], target='cuda')
+        run_and_check(share_vec_add, [a, b], target='cuda')
     else:
         print('[Warning] No GPU found! Skip shared mem test!')
 
+def test_upstream():
+    @tvm.hybrid.script
+    def upstream(a):
+        b = output_tensor((20, ), 'float32')
+        for i in range(20):
+            b[i] = a[i] * i
+        return b
+
+    a = tvm.placeholder((20, ), 'float32')
+    b = tvm.placeholder((20, ), 'float32')
+    c = tvm.compute((20, ), lambda x: a[x] + b[x])
+    d = upstream(c)
+    sch = tvm.create_schedule([c.op, d.op])
+    ir = tvm.lower(sch, [a, b, d], simple_mode=True)
+    func = tvm.build(sch, [a, b, d])
+    assert(func)
+
+    a = numpy.random.randn(20).astype('float32')
+    b = numpy.random.randn(20).astype('float32')
+    ref = numpy.zeros((20, ), 'float32')
+    for i in range(20):
+        ref[i] = (a[i] + b[i]) * i
+
+    tvm_a = tvm.nd.array(a)
+    tvm_b = tvm.nd.array(b)
+    tvm_d = tvm.nd.array(numpy.zeros((20, )).astype('float32'))
+
+    func(tvm_a, tvm_b, tvm_d)
+    tvm.testing.assert_allclose(tvm_d.asnumpy(), ref, 1e-5, 1e-5)
+
+def test_downstream():
+    @tvm.hybrid.script
+    def downstream(a):
+        b = output_tensor((20, ), 'float32')
+        for i in range(20):
+            b[i] = a[i] * i
+        return b
+
+    
+    a = tvm.placeholder((20, ), 'float32')
+    b = downstream(a)
+    c = tvm.compute((20, ), lambda x: b[x] + 1.0)
+
+    sch = tvm.create_schedule(c.op)
+    module = tvm.build(sch, [a, c])
+    assert module
+
+    a = numpy.random.randn(20).astype('float32')
+    ref = numpy.zeros((20, )).astype('float32')
+    for i in range(20):
+        ref[i] = (a[i] * i) + 1.0
+
+    tvm_a = tvm.nd.array(a)
+    tvm_c = tvm.nd.array(numpy.zeros((20, )).astype('float32'))
+    module(tvm_a, tvm_c)
+    tvm.testing.assert_allclose(tvm_c.asnumpy(), ref, 1e-5, 1e-5)
+
+def test_const_param():
+    @tvm.hybrid.script
+    def add_something(a, b):
+        c = output_tensor((11, ), 'int32')
+        for i in range(11):
+            c[i] = a[i] + b
+        return c
+
+    a = tvm.placeholder((11, ), dtype='int32', name='a')
+    b = tvm.const(11, 'int32')
+    c = add_something(a, b)
+    sch = tvm.create_schedule(c.op)
+    module = tvm.build(sch, [a, c], 'llvm')
+    assert(module)
+
+    np_a = numpy.arange(11).astype('int32')
+    np_b = 11
+    np_c = numpy.zeros((11, )).astype('int32')
+
+    nd_a = tvm.ndarray.array(np_a)
+    nd_c = tvm.ndarray.array(numpy.zeros((11, )).astype('int32'))
+    module(nd_a, nd_c)
+    ref = add_something(np_a, 11)
+
+    tvm.testing.assert_allclose(nd_c.asnumpy(), ref, 1e-5, 1e-5)
+
+def test_value_index():
+    @tvm.hybrid.script
+    def kernel_a(a):
+        b = output_tensor((16, ), 'int32')
+        c = output_tensor((4, 4), 'int32')
+        for i in range(16):
+            b[i] = a[i] + 2
+            c[i // 4, i % 4] = a[i] + 1
+        return b, c
+
+    @tvm.hybrid.script
+    def kernel_b(b, a):
+        c = output_tensor((4, 4), 'int32')
+        for i in range(4):
+            for j in range(4):
+                c[i, j] = a[i * 4 + j] * b[i, j]
+        return c
+
+    a = tvm.placeholder((16, ), 'int32')
+    b, c = kernel_a(a)
+    d = kernel_b(c, b)
+    sch = tvm.create_schedule(d.op)
+    module = tvm.build(sch, [a, d])
+    assert module
+
+    np_a = numpy.arange(16).astype('int32')
+    np_b, np_c = kernel_a(np_a)
+    ref = kernel_b(np_c, np_b)
+
+    res = tvm.ndarray.array(numpy.zeros((4, 4)).astype('int32'))
+    module(tvm.ndarray.array(np_a), res)
+    tvm.testing.assert_allclose(res.asnumpy(), ref)
+
+def test_func_call():
+    @tvm.hybrid.script
+    def foo(a, b):
+        for i in range(10):
+            a[i] = i + 1.0
+        for i in range(10):
+            b[i] = i + 1.0
+        c = outer_product(10, 10, a, b)
+        d = output_tensor(c.shape, c.dtype)
+        for i in range(10):
+            for j in range(10):
+                d[i, j] = c[i, j] + i * j
+        return d
+
+    a = tvm.placeholder((10, ), name='a')
+    b = tvm.placeholder((10, ), name='b')
+    run_and_check(foo, [a, b])
+
+def test_bool():
+    @tvm.hybrid.script
+    def foo(a):
+        b = output_tensor(a.shape, a.dtype)
+        b[0] = 1.2
+        for i in range(1, a.shape[0] - 1):
+            if a[i] * a[i - 1] < a[i] or a[i] * a[i - 1] < a[i - 1] or i * a[i] == a[i]:
+                b[i] = a[i]
+            else:
+                b[i] = 0.0
+        return b
+    a = tvm.placeholder((10, ), name='a')
+    run_and_check(foo, [a])
 
 if __name__ == "__main__":
     test_outer_product()
     test_fanout()
-    test_failure()
     test_looptype()
     test_if()
     test_bind()
     test_math_intrin()
     test_non_zero()
     test_allocate()
-
+    test_upstream()
+    test_downstream()
+    test_const_param()
+    test_value_index()
+    test_func_call()
+    test_bool()
+    # TODO:
+    # test_inplace()
diff --git a/tests/python/unittest/test_ir_builder.py b/tests/python/unittest/test_ir_builder.py
index 864455d01ad9..a257571752e4 100644
--- a/tests/python/unittest/test_ir_builder.py
+++ b/tests/python/unittest/test_ir_builder.py
@@ -84,7 +84,7 @@ def check_target(target):
         b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
         c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
         fadd(a, b, c)
-        np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
+        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
     check_target("llvm")
 
 def test_gpu():
@@ -125,7 +125,7 @@ def check_target(target):
         b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
         c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
         fadd(a, b, c)
-        np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
+        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
     check_target("opencl")
     check_target("cuda")
 
diff --git a/tests/python/unittest/test_lang_basic.py b/tests/python/unittest/test_lang_basic.py
index c9a04747b56d..079123d96ca0 100644
--- a/tests/python/unittest/test_lang_basic.py
+++ b/tests/python/unittest/test_lang_basic.py
@@ -8,7 +8,7 @@ def test_const():
 
 def test_make():
     x = tvm.const(1)
-    y = tvm.make.IntImm('int32', 1)
+    y = tvm.var("x")
     z = x + y
     assert isinstance(tvm.max(x, y), tvm.expr.Max)
     assert isinstance(tvm.min(x, y), tvm.expr.Min)
@@ -79,7 +79,7 @@ def test_dtype():
     x = tvm.var('x')
     assert x.dtype == 'int32'
     y = tvm.var('y')
-    assert (x > y).dtype == 'uint1'
+    assert (x > y).dtype == 'bool'
 
 
 def test_any():
diff --git a/tests/python/unittest/test_lang_buffer.py b/tests/python/unittest/test_lang_buffer.py
index a5a8f5d065a6..85c9fbeee53e 100644
--- a/tests/python/unittest/test_lang_buffer.py
+++ b/tests/python/unittest/test_lang_buffer.py
@@ -41,6 +41,26 @@ def test_buffer_access_ptr_offset():
     assert tvm.ir_pass.Equal(offset, tvm.call_extern('int32', "test_call", 200 + v))
     assert aptr.args[4].value == Buffer.READ | Buffer.WRITE
 
+def test_buffer_access_ptr_extent():
+    m = tvm.var('m')
+    n = tvm.var('n')
+    Ab = tvm.decl_buffer((m, n), tvm.float32)
+    aptr = Ab.access_ptr("rw")
+    assert tvm.ir_pass.Equal(aptr.args[3], m * n)
+    aptr = Ab.access_ptr("rw", offset=100)
+    assert tvm.ir_pass.Equal(aptr.args[3], m * n - 100)
+    Ab = tvm.decl_buffer((m, n), tvm.float32, strides=[n + 1 , 1])
+    aptr = Ab.access_ptr("rw", offset=100)
+    assert tvm.ir_pass.Equal(aptr.args[3], Ab.strides[0] * m - 100)
+
+def test_buffer_vload():
+    m = tvm.var('m')
+    n = tvm.var('n')
+    Ab = tvm.decl_buffer((m, n), tvm.float32, elem_offset=100)
+    load = Ab.vload([2, 3])
+    offset = tvm.ir_pass.Simplify(load.index)
+    assert tvm.ir_pass.Equal(offset, n * 2 + 103)
+
 def test_buffer_index_merge_mult_mod():
     m = tvm.var('m')
     n = tvm.var('n')
@@ -76,4 +96,6 @@ def assert_simplified_equal(index_simplified, index_direct):
     test_buffer()
     test_buffer_access_ptr()
     test_buffer_access_ptr_offset()
+    test_buffer_access_ptr_extent()
+    test_buffer_vload()
     test_buffer_index_merge_mult_mod()
diff --git a/tests/python/unittest/test_lang_constructor.py b/tests/python/unittest/test_lang_constructor.py
new file mode 100644
index 000000000000..caca08afa804
--- /dev/null
+++ b/tests/python/unittest/test_lang_constructor.py
@@ -0,0 +1,202 @@
+import tvm
+
+def test_expr_constructor():
+    x = tvm.expr.Var("xx", "float32")
+    assert isinstance(x, tvm.expr.Var)
+    assert x.name == "xx"
+
+    x = tvm.expr.Reduce(None, [1],
+                        [tvm.api._IterVar((0, 1), "x", 2)],
+                        None, 0)
+    assert isinstance(x, tvm.expr.Reduce)
+    assert x.combiner == None
+    assert x.value_index == 0
+
+    x = tvm.expr.FloatImm("float32", 1.0)
+    assert isinstance(x, tvm.expr.FloatImm)
+    assert x.value == 1.0
+    assert x.dtype == "float32"
+
+    x = tvm.expr.IntImm("int64", 2)
+    assert isinstance(x, tvm.expr.IntImm)
+    assert x.value == 2
+    assert x.dtype == "int64"
+
+    x = tvm.expr.UIntImm("uint16", 2)
+    assert isinstance(x, tvm.expr.UIntImm)
+    assert x.value == 2
+    assert x.dtype == "uint16"
+
+    x = tvm.expr.StringImm("xyza")
+    assert isinstance(x, tvm.expr.StringImm)
+    assert x.value == "xyza"
+
+    x = tvm.expr.Cast("float32", tvm.expr.IntImm("int32", 1))
+    assert isinstance(x, tvm.expr.Cast)
+    assert x.dtype == "float32"
+    assert x.value.value == 1
+
+    a = tvm.const(1.0, dtype="float32")
+    b = tvm.var("x", dtype="float32")
+
+    for cls in [tvm.expr.Add,
+                tvm.expr.Sub,
+                tvm.expr.Mul,
+                tvm.expr.Div,
+                tvm.expr.Mod,
+                tvm.expr.Min,
+                tvm.expr.Max,
+                tvm.expr.LT,
+                tvm.expr.LE,
+                tvm.expr.GT,
+                tvm.expr.GE]:
+        x = cls(a, b)
+        assert isinstance(x, cls)
+        assert x.a == a
+        assert x.b.same_as(b)
+
+
+    a = tvm.convert(tvm.var("x") > 1)
+    b = tvm.convert(tvm.var("x") == 1)
+
+    for cls in [tvm.expr.And,
+                tvm.expr.Or]:
+        x = cls(a, b)
+        assert isinstance(x, cls)
+        assert x.a == a
+        assert x.b.same_as(b)
+
+    x = tvm.expr.Not(a)
+    assert isinstance(x, tvm.expr.Not)
+    assert x.a == a
+
+    x = tvm.expr.Select(a, a, b)
+    assert isinstance(x, tvm.expr.Select)
+    assert x.true_value == a
+    assert x.false_value == b
+    assert x.condition == a
+
+    buffer_var = tvm.var("x", dtype="handle")
+    x = tvm.expr.Load("float32", buffer_var, 1, a)
+    assert isinstance(x, tvm.expr.Load)
+    assert x.dtype == "float32"
+    assert x.buffer_var == buffer_var
+    assert x.index.value == 1
+    assert x.predicate == a
+
+    x = tvm.expr.Ramp(1, 2, 10)
+    assert isinstance(x, tvm.expr.Ramp)
+    assert x.base.value == 1
+    assert x.stride.value == 2
+    assert x.lanes == 10
+
+    x = tvm.expr.Broadcast(a, 10)
+    assert isinstance(x, tvm.expr.Broadcast)
+    assert x.value == a
+    assert x.lanes == 10
+
+    x = tvm.expr.Shuffle([a], [0])
+    assert isinstance(x, tvm.expr.Shuffle)
+    assert x.vectors[0] == a
+    assert x.indices[0].value == 0
+
+    x = tvm.expr.Call("float32", "xyz", [a], tvm.expr.Call.Extern, None, 0)
+    assert isinstance(x, tvm.expr.Call)
+    assert x.dtype == "float32"
+    assert x.name == "xyz"
+    assert x.args[0] == a
+    assert x.call_type == tvm.expr.Call.Extern
+    assert x.func == None
+    assert x.value_index == 0
+
+    v = tvm.var("aa")
+    x = tvm.expr.Let(v, 1, v)
+    assert x.var == v
+    assert x.value.value == 1
+    assert x.body == v
+
+
+def test_stmt_constructor():
+    v = tvm.var("aa")
+    buffer_var = tvm.var("buf", dtype="handle")
+    nop = tvm.stmt.Evaluate(1)
+    x = tvm.stmt.LetStmt(v, 1, tvm.stmt.Evaluate(1))
+    assert isinstance(x, tvm.stmt.LetStmt)
+    assert x.var == v
+    assert x.value.value == 1
+    assert isinstance(x.body, tvm.stmt.Evaluate)
+
+    x = tvm.stmt.AttrStmt(v == 1, "xx", 1, tvm.stmt.Evaluate(1))
+    assert isinstance(x, tvm.stmt.AttrStmt)
+    assert x.value.value == 1
+
+    x = tvm.stmt.Block(tvm.stmt.Evaluate(11),
+                       nop)
+    assert isinstance(x, tvm.stmt.Block)
+    assert x.first.value.value == 11
+    assert x.rest == nop
+
+    x = tvm.stmt.AssertStmt(tvm.const(1, "uint1"),
+                            tvm.convert("hellow"),
+                            nop)
+    assert isinstance(x, tvm.stmt.AssertStmt)
+    assert x.body == nop
+
+    x = tvm.stmt.ProducerConsumer(None, True, nop)
+    assert isinstance(x, tvm.stmt.ProducerConsumer)
+    assert x.body == nop
+
+    x = tvm.stmt.For(tvm.var("x"), 0, 10, 0, 0, nop)
+    assert isinstance(x, tvm.stmt.For)
+    assert x.min.value == 0
+    assert x.extent.value == 10
+    assert x.body == nop
+
+    x = tvm.stmt.Store(buffer_var, 1, 10, tvm.const(1, "uint1"))
+    assert isinstance(x, tvm.stmt.Store)
+    assert x.buffer_var == buffer_var
+    assert x.index.value == 10
+    assert x.value.value == 1
+
+    tensor = tvm.placeholder((), dtype="float32")
+    x = tvm.stmt.Provide(tensor.op, 0, 10, [])
+    assert isinstance(x, tvm.stmt.Provide)
+    assert x.value_index == 0
+    assert x.value.value == 10
+
+    x = tvm.stmt.Allocate(buffer_var, "float32", [10],
+                          tvm.const(1, "uint1"), nop)
+    assert isinstance(x, tvm.stmt.Allocate)
+    assert x.dtype == "float32"
+    assert x.buffer_var == buffer_var
+    assert x.body == nop
+
+    x = tvm.stmt.AttrStmt(buffer_var, "xyz", 1, nop)
+    assert isinstance(x, tvm.stmt.AttrStmt)
+    assert x.node == buffer_var
+    assert x.attr_key == "xyz"
+    assert x.body == nop
+
+    x = tvm.stmt.Free(buffer_var)
+    assert isinstance(x, tvm.stmt.Free)
+    assert x.buffer_var == buffer_var
+
+    x = tvm.stmt.Realize(None, 0, "float", [], tvm.const(1, "uint1"), nop)
+    assert isinstance(x, tvm.stmt.Realize)
+    assert x.body == nop
+
+    x = tvm.stmt.IfThenElse(tvm.const(1, "uint1"),
+                            tvm.stmt.Evaluate(11),
+                            nop)
+    assert isinstance(x, tvm.stmt.IfThenElse)
+    assert x.then_case.value.value == 11
+    assert x.else_case == nop
+
+    x = tvm.stmt.Prefetch(None, 1, "float32", [])
+    assert isinstance(x, tvm.stmt.Prefetch)
+    assert x.value_index == 1
+
+
+if __name__ == "__main__":
+    test_expr_constructor()
+    test_stmt_constructor()
diff --git a/tests/python/unittest/test_lang_container.py b/tests/python/unittest/test_lang_container.py
index 615c5ac0a8d5..8683e56088a0 100644
--- a/tests/python/unittest/test_lang_container.py
+++ b/tests/python/unittest/test_lang_container.py
@@ -3,6 +3,9 @@
 def test_array():
     a = tvm.convert([1,2,3])
     assert len(a) == 3
+    assert a[-1].value == 3
+    a_slice = a[-3:-1]
+    assert (a_slice[0].value, a_slice[1].value) == (1, 2)
 
 def test_array_save_load_json():
     a = tvm.convert([1,2,3])
diff --git a/tests/python/unittest/test_lang_operator.py b/tests/python/unittest/test_lang_operator.py
new file mode 100644
index 000000000000..af7d9fd5544a
--- /dev/null
+++ b/tests/python/unittest/test_lang_operator.py
@@ -0,0 +1,73 @@
+import tvm
+
+def test_const_fold():
+    def check(f, *args):
+        x = f(*[tvm.const(x) for x in args])
+        y = f(*args)
+        if not isinstance(x, (tvm.expr.IntImm, tvm.expr.UIntImm)) or x.value != int(y):
+            raise ValueError("check error: %s vs %s " % (x, y))
+
+    check(lambda x, y: x + y, 3, 4)
+    check(lambda x, y: x * y, 3, 12)
+    check(lambda x, y: x * y - 10, 3, 12)
+    check(lambda x, y: x - y % 10, 3, 12)
+    check(lambda x, y: x // y + 10, 100, 12)
+    check(lambda x, y: x & y + 10, 112, 128)
+    check(lambda x, y: x > y, 112, 128)
+    check(lambda x, y: x < y, 112, 128)
+    check(lambda x, y: x <= y, 112, 128)
+    check(lambda x, y: x >= y, 112, 128)
+    check(lambda x, y: (x | y) ^ 10, 112, 128)
+
+
+def test_const_fold2():
+    x = tvm.var("x")
+    assert (x + 0).same_as(x)
+    assert (0 + x).same_as(x)
+    assert (x - 0).same_as(x)
+    assert (x % 1).value == 0
+    assert (x * 1).same_as(x)
+    assert (1 * x).same_as(x)
+    assert isinstance((1 / x), tvm.expr.Div)
+
+def test_const_fold3():
+    def check_throws(f):
+        try:
+            f()
+        except tvm.TVMError:
+            pass
+        else:
+            raise AssertionError("Should have raised an exception but didn't.")
+
+    # Test that using ints with logic operations is forbidden
+    x = tvm.var("x")
+    for val in [0, 1]:
+        for func in [tvm.all, tvm.any]:
+            check_throws(lambda: func(tvm.const(val, 'uint1'), x))
+            check_throws(lambda: func(x, tvm.const(val, 'uint1')))
+
+    # Test const folding when both arguments are const
+    for tvm_func, py_func in [(tvm.all, lambda a, b: a and b), (tvm.any, lambda a, b: a or b)]:
+        for v1 in [0, 1]:
+            for v2 in [0, 1]:
+                assert tvm.ir_pass.Equal(tvm_func(tvm.const(v1, 'uint1'), tvm.const(v2, 'uint1')),
+                                         tvm.const(py_func(v1, v2), 'uint1'))
+
+    x = tvm.var("x", 'uint1')
+    true = tvm.const(1, 'uint1')
+    false = tvm.const(0, 'uint1')
+
+    assert tvm.all(x, true).same_as(x)
+    assert tvm.all(true, x).same_as(x)
+    assert tvm.any(x, false).same_as(x)
+    assert tvm.any(false, x).same_as(x)
+
+    assert tvm.all(x, false).same_as(false)
+    assert tvm.all(false, x).same_as(false)
+    assert tvm.any(x, true).same_as(true)
+    assert tvm.any(true, x).same_as(true)
+
+if __name__ == "__main__":
+    test_const_fold()
+    test_const_fold2()
+    test_const_fold3()
diff --git a/tests/python/unittest/test_lang_reflection.py b/tests/python/unittest/test_lang_reflection.py
index fefb8771a812..3ec760f20c76 100644
--- a/tests/python/unittest/test_lang_reflection.py
+++ b/tests/python/unittest/test_lang_reflection.py
@@ -15,7 +15,7 @@ def test_make_smap():
     # save load json
     x = tvm.const(1)
     y = tvm.const(10)
-    z = x + y
+    z = tvm.expr.Add(x, y)
     smap = tvm.convert({"z": z, "x": x})
     json_str = tvm.save_json(tvm.convert([smap]))
     arr = tvm.load_json(json_str)
@@ -36,6 +36,34 @@ def test_make_node():
     assert AA.op == A.op
     assert AA.value_index == A.value_index
 
+
+def test_make_attrs():
+    try:
+        x = tvm.make.node("attrs.TestAttrs", unknown_key=1, name="xx")
+        assert False
+    except tvm.TVMError as e:
+        assert str(e).find("unknown_key") != -1
+
+    try:
+        x = tvm.make.node("attrs.TestAttrs", axis=100, name="xx")
+        assert False
+    except tvm.TVMError as e:
+        assert str(e).find("upper bound") != -1
+
+    x = tvm.make.node("attrs.TestAttrs", name="xx", padding=(3,4))
+    assert x.name == "xx"
+    assert x.padding[0].value == 3
+    assert x.padding[1].value == 4
+    assert x.axis == 10
+
+
+    dattr = tvm.make.node("DictAttrs", x=1, y=10, name="xyz", padding=(0,0))
+    assert dattr.x.value == 1
+    datrr = tvm.load_json(tvm.save_json(dattr))
+    assert dattr.name.value == "xyz"
+
+
+
 def test_make_sum():
     A = tvm.placeholder((2, 10), name='A')
     k = tvm.reduce_axis((0,10), "k")
@@ -45,7 +73,34 @@ def test_make_sum():
     assert B.op.body[0].combiner is not None
     assert BB.op.body[0].combiner is not None
 
+
+def test_env_func():
+    @tvm.register_func("test.env_func")
+    def test(x):
+        return x + 1
+
+    f = tvm.get_global_func("test.env_func")
+    x = tvm.get_env_func("test.env_func")
+    assert x.name == "test.env_func"
+    json_str = tvm.save_json([x])
+    y = tvm.load_json(json_str)[0]
+    assert y.name == x.name
+    assert y(1) == 2
+    assert y.func(1) == 2
+
+    x = tvm.make.node("attrs.TestAttrs", name="xx", padding=(3,4), func=y)
+    assert x.name == "xx"
+    assert x.padding[0].value == 3
+    assert x.padding[1].value == 4
+    assert x.axis == 10
+    x = tvm.load_json(tvm.save_json(x))
+    assert isinstance(x.func, tvm.container.EnvFunc)
+    assert x.func(10) == 11
+
+
 if __name__ == "__main__":
+    test_env_func()
+    test_make_attrs()
     test_make_node()
     test_make_smap()
     test_const_saveload_json()
diff --git a/tests/python/unittest/test_lang_schedule.py b/tests/python/unittest/test_lang_schedule.py
index 1eb42f3f0bca..a00785dea7af 100644
--- a/tests/python/unittest/test_lang_schedule.py
+++ b/tests/python/unittest/test_lang_schedule.py
@@ -1,3 +1,4 @@
+from nose.tools import raises
 import tvm
 import pickle as pkl
 
@@ -112,6 +113,13 @@ def test_vectorize():
     assert s[T].iter_var_attrs[xi].iter_type == UNROLL
     assert s[T].iter_var_attrs[yi].iter_type == VECTORIZE
 
+@raises(Exception)
+def test_vectorize_commreduce():
+    V = tvm.placeholder((128,), name='V')
+    ax = tvm.reduce_axis((0, 128), name='ax')
+    O = tvm.compute((1,), lambda _: tvm.sum(V[ax], axis=[ax]))
+    s = tvm.create_schedule(O.op)
+    s[O].vectorize(ax) # should throw here
 
 def test_pragma():
     m = 100
@@ -197,3 +205,4 @@ def intrin_func(ins, outs):
     test_split()
     test_fuse()
     test_vectorize()
+    test_vectorize_commreduce()
diff --git a/tests/python/unittest/test_lang_target.py b/tests/python/unittest/test_lang_target.py
index f7309fc30819..42e2c3fcb2e3 100644
--- a/tests/python/unittest/test_lang_target.py
+++ b/tests/python/unittest/test_lang_target.py
@@ -34,20 +34,21 @@ def test_target_dispatch():
     with tvm.target.create("metal"):
         assert mygeneric(1) == 3
 
-    assert tvm.target.current_target() == None
+    assert tvm.target.current_target() is None
 
 
 def test_target_string_parse():
-    target = tvm.target.create("cuda -libs=cublas,cudnn")
+    target = tvm.target.create("cuda -model=unknown -libs=cublas,cudnn")
 
     assert target.target_name == "cuda"
-    assert target.options == ['-libs=cublas,cudnn']
+    assert target.options == ['-model=unknown', '-libs=cublas,cudnn']
     assert target.keys == ['cuda', 'gpu']
     assert target.libs == ['cublas', 'cudnn']
-    assert str(target) == str(tvm.target.cuda("-libs=cublas,cudnn"))
-
+    assert str(target) == str(tvm.target.cuda(options="-libs=cublas,cudnn"))
 
     assert tvm.target.intel_graphics().device_name == "intel_graphics"
+    assert tvm.target.mali().device_name == "mali"
+    assert tvm.target.arm_cpu().device_name == "arm_cpu"
 
 if __name__ == "__main__":
     test_target_dispatch()
diff --git a/tests/python/unittest/test_lang_tensor.py b/tests/python/unittest/test_lang_tensor.py
index 1d8603dfc98b..50492ca41fca 100644
--- a/tests/python/unittest/test_lang_tensor.py
+++ b/tests/python/unittest/test_lang_tensor.py
@@ -1,4 +1,5 @@
 import tvm
+from topi.nn.pooling import pool
 
 def test_tensor():
     m = tvm.var('m')
@@ -84,6 +85,78 @@ def test_tensor_reduce():
     assert(isinstance(C_loaded, tvm.tensor.Tensor))
     assert(str(C_loaded) == str(C))
 
+def test_tensor_compute1():
+    m = 1024
+    factor = 16
+    dtype = 'float32'
+
+    def intrin_vadd(n):
+        x = tvm.placeholder((n,))
+        y = tvm.placeholder((n,))
+        z = tvm.compute(x.shape, lambda i: x[i] + y[i])
+
+        def intrin_func(ins, outs):
+            ib = tvm.ir_builder.create()
+            ib.emit(tvm.call_extern(outs[0].dtype, 'vadd', ins[0].access_ptr("r"), ins[1].access_ptr('r'), outs[0].access_ptr('wr')))
+            return ib.get()
+
+        with tvm.build_config(offset_factor=n):
+            return tvm.decl_tensor_intrin(z.op, intrin_func)
+
+    vadd = intrin_vadd(factor)
+
+    A = tvm.placeholder((m//factor, factor), name="A", dtype=dtype)
+    B = tvm.placeholder((m//factor, factor), name="B", dtype=dtype)
+    C = tvm.compute((m//factor, factor),
+          lambda i: vadd(A[i, 0:factor], B[i, 0:factor]))
+
+    s = tvm.create_schedule(C.op)
+    stmt = tvm.lower(s, [A, B, C], simple_mode=True)
+    assert isinstance(stmt.body.body, tvm.stmt.Evaluate)
+
+def test_tensor_compute2():
+    M = 2048
+    N = 1024
+    L = 1024
+    factor = 16
+    factor1 = 32
+    factor2 = 32
+    dtype = 'float32'
+
+    def intrin_gemm(m, n, l):
+        k = tvm.reduce_axis((0, l))
+        x = tvm.placeholder((m, l))
+        y = tvm.placeholder((n, l))
+        # in theory, no relation
+        z = tvm.compute((m, n), lambda i, j: tvm.sum(x[i][k] * y[j][k], axis=k))
+
+        def intrin_func(ins, outs):
+            x_ptr = ins[0].access_ptr("r")
+            y_ptr = ins[1].access_ptr("r")
+            z_ptr = outs[0].access_ptr("w")
+            body = tvm.call_packed(
+                "gemv", x_ptr, y_ptr, z_ptr, m, n, l)
+            reset = tvm.call_packed(
+                "fill_zero", z_ptr, m, n)
+            update = tvm.call_packed(
+                "gemv_add", x_ptr, y_ptr, z_ptr, m, n, l)
+            return body, reset, update
+
+        with tvm.build_config(offset_factor=n):
+            return tvm.decl_tensor_intrin(z.op, intrin_func)
+
+    vgemm = intrin_gemm(factor1, factor2, factor)
+
+    A = tvm.placeholder((M//factor1, L//factor, factor1, factor), name="A", dtype=dtype)
+    B = tvm.placeholder((N//factor2, L//factor, factor2, factor), name="B", dtype=dtype)
+    k = tvm.reduce_axis((0, L//factor), name='k')
+    C = tvm.compute((M//factor1, N//factor2, factor1, factor2),
+          lambda i, j: vgemm(A[i, k, 0:factor1, 0:factor], B[j, k, 0:factor2, 0:factor], reduce_axis=k))
+
+    s = tvm.create_schedule(C.op)
+    stmt = tvm.lower(s, [A, B, C], simple_mode=True)
+    assert isinstance(stmt.body.body.body.first, tvm.stmt.Evaluate)
+    assert isinstance(stmt.body.body.body.rest.body, tvm.stmt.Evaluate)
 
 def test_tensor_scan():
     m = tvm.var("m")
@@ -185,6 +258,34 @@ def test_tensor_inputs():
     assert tuple(y.op.input_tensors) == (x,)
 
 
+def test_tensor_pool():
+    def intrin_pool():
+        A = tvm.placeholder((64, 16, 16), name='A')
+        kh = tvm.reduce_axis((0, 3), name='kh')
+        kw = tvm.reduce_axis((0, 3), name='kw')
+        P = tvm.compute((64, 14, 14),
+                        lambda c, oh, ow: tvm.max(A[c, oh + kh, ow + kw],
+                                                  axis=[kh, kw]),
+                        name='p')
+
+        def intrin_func(ins, outs):
+            dinp = ins[0]
+            dout = outs[0]
+            return tvm.call_packed("op", dinp, dout)
+
+        with tvm.build_config(offset_factor=1):
+            return tvm.decl_tensor_intrin(P.op, intrin_func)
+
+    A = tvm.placeholder((1, 64, 16, 16), name='A')
+    P = pool(data=A, kernel=(3, 3), stride=(1, 1), padding=(0, 0, 0, 0),
+             pool_type='max')
+    s = tvm.create_schedule(P.op)
+    _, oh, _, _ = P.op.axis
+    intrin = intrin_pool()
+    s[P].tensorize(oh, intrin)
+    tvm.lower(s, [A, P])
+
+
 if __name__ == "__main__":
     test_rank_zero()
     test_tensor_inputs()
@@ -192,6 +293,8 @@ def test_tensor_inputs():
     test_conv1d()
     test_tensor_slice()
     test_tensor()
+    test_tensor_compute1()
+    test_tensor_compute2()
     test_tensor_reduce()
     test_tensor_scan()
     test_scan_multi_out()
@@ -199,3 +302,4 @@ def test_tensor_inputs():
     test_extern_multi_out()
     test_tuple_inputs()
     test_tuple_with_different_deps()
+    test_tensor_pool()
diff --git a/tests/python/unittest/test_lang_tensor_overload_op.py b/tests/python/unittest/test_lang_tensor_overload_op.py
index 14853e89188a..ee6eaf74a79c 100644
--- a/tests/python/unittest/test_lang_tensor_overload_op.py
+++ b/tests/python/unittest/test_lang_tensor_overload_op.py
@@ -66,7 +66,7 @@ def test_combination():
     c = tvm.nd.array(np.random.uniform(size=(n, m)).astype(C.dtype), ctx)
     d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
     foo(x, a, b, c, d)
-    np.testing.assert_allclose(d.asnumpy(), k + a.asnumpy() - b.asnumpy() * c.asnumpy() / x)
+    tvm.testing.assert_allclose(d.asnumpy(), k + a.asnumpy() - b.asnumpy() * c.asnumpy() / x)
 
 
 def verify_tensor_scalar_bop(shape, typ="add"):
@@ -111,7 +111,7 @@ def check_device(device):
         a_nd = tvm.nd.array(a_npy, ctx)
         b_nd = tvm.nd.array(np.empty(b_npy.shape).astype(B.dtype), ctx)
         foo(a_nd, b_nd, k_, *shape)
-        np.testing.assert_allclose(b_nd.asnumpy(), b_npy, rtol=1e-5)
+        tvm.testing.assert_allclose(b_nd.asnumpy(), b_npy, rtol=1e-5)
 
     for device in ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan']:
         check_device(device)
@@ -160,7 +160,7 @@ def check_device(device):
         out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), ctx)
         for _ in range(1):
             foo(lhs_nd, rhs_nd, out_nd)
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4)
 
     for device in ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan']:
         check_device(device)
@@ -175,10 +175,11 @@ def check_device(device):
         print("Running on target: %s" % device)
 
         k = 10.0
+        dilation = (1, 1)
         with tvm.target.create(device):
             A = tvm.placeholder((batch, in_channel, in_size, in_size), name='A')
             W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W')
-            B = topi.nn.conv2d(A, W, stride, padding)
+            B = topi.nn.conv2d(A, W, stride, padding, dilation)
             if typ == "add":
                 C = B + k
             elif typ == "sub":
@@ -213,7 +214,7 @@ def check_device(device):
         b_nd = tvm.nd.array(np.empty(b_npy.shape).astype(B.dtype), ctx)
         c_nd = tvm.nd.array(np.empty(c_npy.shape).astype(C.dtype), ctx)
         foo(a_nd, w_nd, b_nd, c_nd)
-        np.testing.assert_allclose(c_nd.asnumpy(), c_npy, rtol=1E-4, atol=1E-4)
+        tvm.testing.assert_allclose(c_nd.asnumpy(), c_npy, rtol=1E-4, atol=1E-4)
 
     for device in ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan']:
         check_device(device)
diff --git a/tests/python/unittest/test_module_load.py b/tests/python/unittest/test_module_load.py
index 1b239a357f66..8ee3ea5e06c0 100644
--- a/tests/python/unittest/test_module_load.py
+++ b/tests/python/unittest/test_module_load.py
@@ -109,11 +109,25 @@ def check_device(device):
             f2[name](a, b)
             np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
 
-    check_device("cuda")
-    check_device("vulkan")
-    check_device("opencl")
-    check_device("metal")
+    def check_stackvm(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+        temp = util.tempdir()
+        name = "myadd_%s" % device
+        f = tvm.build(s, [A, B], device, "stackvm", name=name)
+        path_dso = temp.relpath("dev_lib.stackvm")
+        #f.export_library(path_dso)
+        #f1 = tvm.module.load(path_dso)
+        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
+        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
+        f(a, b)
+        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
 
+    for device in ["cuda", "vulkan", "opencl", "metal"]:
+        check_device(device)
+        check_stackvm(device)
 
 def test_combine_module_llvm():
     """Test combine multiple module into one shared lib."""
diff --git a/tests/python/unittest/test_pass_attrs_hash_equal.py b/tests/python/unittest/test_pass_attrs_hash_equal.py
new file mode 100644
index 000000000000..2d6987aeb183
--- /dev/null
+++ b/tests/python/unittest/test_pass_attrs_hash_equal.py
@@ -0,0 +1,39 @@
+import tvm
+
+def test_attrs_equal():
+    x = tvm.make.node("attrs.TestAttrs", name="xx", padding=(3, 4))
+    y = tvm.make.node("attrs.TestAttrs", name="xx", padding=(3, 4))
+    z = tvm.make.node("attrs.TestAttrs", name="xx", padding=(3,4,1))
+    assert tvm.ir_pass.AttrsEqual(x, y)
+    assert not tvm.ir_pass.AttrsEqual(x, z)
+
+    dattr = tvm.make.node("DictAttrs", x=1, y=10, name="xyz", padding=(0,0))
+    assert not tvm.ir_pass.AttrsEqual(dattr, x)
+    dattr2 = tvm.make.node("DictAttrs", x=1, y=10, name="xyz", padding=(0,0))
+    assert tvm.ir_pass.AttrsEqual(dattr, dattr2)
+
+    assert tvm.ir_pass.AttrsEqual({"x": x}, {"x": y})
+    # array related checks
+    assert tvm.ir_pass.AttrsEqual({"x": [x, x]}, {"x": [y, x]})
+    assert not tvm.ir_pass.AttrsEqual({"x": [x, 1]}, {"x": [y, 2]})
+
+    n = tvm.var("n")
+    assert tvm.ir_pass.AttrsEqual({"x": n+1}, {"x": n+1})
+
+
+
+
+
+def test_attrs_hash():
+    fhash = tvm.ir_pass.AttrsHash
+    x = tvm.make.node("attrs.TestAttrs", name="xx", padding=(3, 4))
+    y = tvm.make.node("attrs.TestAttrs", name="xx", padding=(3, 4))
+    assert fhash({"x": x}) == fhash({"x": y})
+    assert fhash({"x": x}) != fhash({"x": [y, 1]})
+    assert fhash({"x": [x, 1]}) == fhash({"x": [y, 1]})
+    assert fhash({"x": [x, 2]}) == fhash({"x": [y, 2]})
+
+
+if __name__ == "__main__":
+    test_attrs_equal()
+    test_attrs_hash()
diff --git a/tests/python/unittest/test_pass_bound_checkers.py b/tests/python/unittest/test_pass_bound_checkers.py
new file mode 100644
index 000000000000..bb552f078f1a
--- /dev/null
+++ b/tests/python/unittest/test_pass_bound_checkers.py
@@ -0,0 +1,544 @@
+from nose.tools import raises
+import tvm
+import numpy as np
+def collect_visit(stmt, f):
+    ret = []
+    tvm.ir_pass.PostOrderVisit(stmt, lambda x: ret.append(f(x)))
+    return ret
+
+def lower(sch, args):
+    binds = {}
+    arg_list = []
+    for x in args:
+        if isinstance(x, tvm.tensor.Tensor):
+            buf = tvm.decl_buffer(x.shape, dtype=x.dtype, name=x.name)
+            assert x not in binds
+            binds[x] = buf
+            arg_list.append(buf)
+        else:
+            raise ValueError("args must be Tensor, Buffer or Var")
+    sch = sch.normalize()
+    bounds = tvm.schedule.InferBound(sch)
+    stmt = tvm.schedule.ScheduleOps(sch, bounds)
+    stmt = tvm.ir_pass.LoopPartition(stmt, True)
+    stmt = tvm.ir_pass.StorageFlatten(stmt, binds, 64, True)
+    stmt = tvm.ir_pass.CanonicalSimplify(stmt)
+    stmt = tvm.ir_pass.VectorizeLoop(stmt)
+    stmt = tvm.ir_pass.Simplify(stmt)
+    return stmt
+
+@raises(Exception)
+def test_out_of_bounds_llvm(index_a, index_b):
+    n = tvm.var("n")
+    A = tvm.placeholder ((n,), name='A')
+    B = tvm.placeholder ((n,), name='B')
+    C = tvm.compute(A.shape, lambda i: A[i + index_a] + B[i + index_b], name='C')
+    s = tvm.create_schedule (C.op)
+    tgt = "llvm"
+    tgt_host = "llvm"
+    stmt = tvm.lower (s, [A, B, C], simple_mode=True)
+    print (stmt)
+    fadd = tvm.build (s, [A, B, C], tgt, target_host=tgt_host, name="myadd")
+    ctx = tvm.context(tgt, 0)
+    a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
+    b = tvm.nd.array(np.random.uniform(size=1024).astype(B.dtype), ctx)
+    c = tvm.nd.array(np.zeros(1024, dtype=C.dtype), ctx)
+    fadd (a, b, c)
+
+def test_in_bounds_llvm():
+    n = tvm.var("n")
+    A = tvm.placeholder ((n,), name='A')
+    B = tvm.placeholder ((n,), name='B')
+    C = tvm.compute(A.shape, lambda i: A[i] + B[i], name='C')
+    s = tvm.create_schedule (C.op)
+    tgt = "llvm"
+    tgt_host = "llvm"
+    stmt = tvm.lower (s, [A, B, C], simple_mode=True)
+    print (stmt)
+    fadd = tvm.build (s, [A, B, C], tgt, target_host=tgt_host, name="myadd")
+    ctx = tvm.context(tgt, 0)
+    a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
+    b = tvm.nd.array(np.random.uniform(size=1024).astype(B.dtype), ctx)
+    c = tvm.nd.array(np.zeros(1024, dtype=C.dtype), ctx)
+    fadd (a, b, c)
+
+@raises(Exception)
+def test_out_of_bounds_vectorize_llvm(nn, index_a, index_b):
+    n = tvm.convert(nn)
+    a = tvm.placeholder((n), name='a')
+    b = tvm.placeholder((n), name='b')
+    c = tvm.compute((n,), lambda i: a[i + index_a] + b[i + index_b], name='c')
+    s = tvm.create_schedule(c.op)
+    xo, xi = s[c].split(c.op.axis[0], factor=8)
+    s[c].parallel(xo)
+    s[c].vectorize(xi)
+    tgt = "llvm"
+    tgt_host = "llvm"
+    stmt = tvm.lower (s, [a, b, c], simple_mode=True)
+    print (stmt)
+    f = tvm.build(s, [a, b, c], tgt, target_host=tgt_host, name="myaddvec")
+    ctx = tvm.cpu(0)
+    n = nn
+    a = tvm.nd.array(np.random.uniform(size=(n)).astype(a.dtype), ctx)
+    b = tvm.nd.array(np.random.uniform(size=(n)).astype(a.dtype), ctx)
+    c = tvm.nd.array(np.zeros(n, dtype=c.dtype), ctx)
+    f(a, b, c)
+
+def test_in_bounds_vectorize_llvm():
+    n = 512
+    lanes = 2
+    A = tvm.placeholder((n,), name='A', dtype="float32x%d" % lanes)
+    B = tvm.compute((n,), lambda i: A[i], name='B')
+    C = tvm.compute((n,), lambda i: B[i] + tvm.const(1, A.dtype), name='C')
+    s = tvm.create_schedule(C.op)
+    xo, xi = s[C].split(C.op.axis[0], nparts=2)
+    _, xi = s[C].split(xi, factor=2)
+    s[C].parallel(xo)
+    s[C].vectorize(xi)
+    s[B].compute_at(s[C], xo)
+    xo, xi = s[B].split(B.op.axis[0], factor=2)
+    s[B].vectorize(xi)
+    # build and invoke the kernel.
+    lowered_func = tvm.lower (s, [A, C], "llvm", simple_mode=False)
+    print (lowered_func.body)
+    f = tvm.build(s, [A, C], "llvm")
+    ctx = tvm.cpu(0)
+    # launch the kernel.
+    a = tvm.nd.empty((n,), A.dtype).copyfrom(
+        np.random.uniform(size=(n, lanes)))
+    c = tvm.nd.empty((n,), C.dtype, ctx)
+    f(a, c)
+    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)
+
+def test_in_bounds_loop_partition_basic_llvm():
+    n = tvm.var('n')
+    A = tvm.placeholder((n, ), name='A')
+    B = tvm.placeholder((n, ), name='B')
+
+    T = tvm.compute((n, ), lambda i: A[i]+B[i])
+    s = tvm.create_schedule(T.op)
+    xo, xi = s[T].split(T.op.axis[0], factor=4)
+    lowered_func = tvm.lower (s, [A, B, T], "llvm", simple_mode=False)
+    print (lowered_func.body)
+    ctx = tvm.cpu(0)
+
+    f = tvm.build(s, [A, B, T], "llvm")
+    a = tvm.nd.array(np.random.uniform(size=(32,)).astype(A.dtype), ctx)
+    b = tvm.nd.array(np.random.uniform(size=(32,)).astype(B.dtype), ctx)
+    t = tvm.nd.empty((32,), T.dtype, ctx)
+    f(a, b, t)
+
+@raises(Exception)
+def test_out_of_bounds_loop_partition_basic_llvm(index_a, index_b):
+    n = tvm.var('n')
+    A = tvm.placeholder((n, ), name='A')
+    B = tvm.placeholder((n, ), name='B')
+
+    T = tvm.compute((n, ), lambda i: A[i + index_a]+B[i + index_b])
+    s = tvm.create_schedule(T.op)
+    xo, xi = s[T].split(T.op.axis[0], factor=4)
+    lowered_func = tvm.lower (s, [A, B, T], "llvm", simple_mode=False)
+    print (lowered_func.body)
+    ctx = tvm.cpu(0)
+
+    f = tvm.build(s, [A, B, T], "llvm")
+    a = tvm.nd.array(np.random.uniform(size=(32,)).astype(A.dtype), ctx)
+    b = tvm.nd.array(np.random.uniform(size=(32,)).astype(B.dtype), ctx)
+    t = tvm.nd.empty((32,), T.dtype, ctx)
+    f(a, b, t)
+
+def test_in_bounds_const_loop_partition_ir():
+    def check_attr_stmt (x):
+        if isinstance(x, tvm.stmt.AttrStmt) and x.attr_key == "buffer_bound" and str(x.value) == str(n):
+            return True
+        return False
+
+    def check_branch_stmt (x):
+        if isinstance(x, tvm.stmt.IfThenElse):
+            return True
+        return False
+
+    def assert_bound_instrumentation(stmt, f, nums):
+        count = 0
+        for i in collect_visit(stmt, f):
+            if i is True:
+              count = count + 1
+        assert (count == nums)
+
+    def collect_branch_stmt (x):
+        if isinstance(x, tvm.stmt.IfThenElse):
+            branch_collector.append(x)
+
+    n = 21
+    A = tvm.placeholder((n, ), name='A')
+    B = tvm.placeholder((n, ), name='B')
+
+    T = tvm.compute((n, ), lambda i: A[i]+B[i])
+    s = tvm.create_schedule(T.op)
+    xo, xi = s[T].split(T.op.axis[0], factor=4)
+
+    bounds = tvm.schedule.InferBound(s)
+    stmt = lower (s, [A, B, T])
+    # num_attributes = num_buffers * num_splits = 2 * 3
+    # before instrumentation
+    assert_bound_instrumentation(stmt, check_attr_stmt, 2 * 3)
+    assert_bound_instrumentation(stmt, check_branch_stmt, 0)
+    stmt = tvm.ir_pass.InstrumentBoundCheckers(stmt)
+    # after instrumentation
+    assert_bound_instrumentation(stmt, check_attr_stmt, 2 * 3)
+    assert_bound_instrumentation(stmt, check_branch_stmt, 2)
+    print (stmt)
+    branch_collector = list()
+    collect_visit(stmt, collect_branch_stmt)
+    assert(len(branch_collector) ==  2)
+    print (branch_collector[0].condition)
+    print (branch_collector[1].condition)
+
+def test_in_bounds_const_loop_partition_llvm():
+    with tvm.build_config(instrument_bound_checkers=True, partition_const_loop=True):
+        n = 21
+        A = tvm.placeholder((n, ), name='A')
+        B = tvm.placeholder((n, ), name='B')
+
+        T = tvm.compute((n, ), lambda i: A[i]+B[i])
+        s = tvm.create_schedule(T.op)
+        xo, xi = s[T].split(T.op.axis[0], factor=4)
+        lowered_func = tvm.lower (s, [A, B, T], "llvm", simple_mode=False)
+        print (lowered_func.body)
+        ctx = tvm.cpu(0)
+
+        f = tvm.build(s, [A, B, T], "llvm")
+        a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx)
+        b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), ctx)
+        t = tvm.nd.empty((n,), T.dtype, ctx)
+        f(a, b, t)
+
+@raises(Exception)
+def test_out_of_bounds_const_loop_partition_llvm(index_a, index_b):
+    with tvm.build_config(instrument_bound_checkers=True, partition_const_loop=True):
+        n = 21
+        A = tvm.placeholder((n, ), name='A')
+        B = tvm.placeholder((n, ), name='B')
+
+        T = tvm.compute((n, ), lambda i: A[i + index_a]+B[i + index_b])
+        s = tvm.create_schedule(T.op)
+        xo, xi = s[T].split(T.op.axis[0], factor=4)
+        lowered_func = tvm.lower (s, [A, B, T], "llvm", simple_mode=False)
+        print (lowered_func.body)
+        ctx = tvm.cpu(0)
+
+        f = tvm.build(s, [A, B, T], "llvm")
+        a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx)
+        b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), ctx)
+        t = tvm.nd.empty((n,), T.dtype, ctx)
+        f(a, b, t)
+
+def test_in_bounds_conv_llvm(loop_tiling=False):
+    HSTR = WSTR = 1
+    in_channel = 128
+    kernel_height = kernel_width = 3
+    out_channel = 64
+    batch_size = 1
+    in_height = in_width = 64
+    out_height = out_width = in_height - kernel_height + 1
+    data = tvm.placeholder((batch_size, in_channel, in_height, in_width), name='data')
+    kernel = tvm.placeholder((kernel_height, kernel_width, in_channel,
+        out_channel), name='kernel')
+    ic = tvm.reduce_axis((0, in_channel), name='ic')
+    kh = tvm.reduce_axis((0, kernel_height), name='kh')
+    kw = tvm.reduce_axis((0, kernel_width), name='kw')
+    conv = tvm.compute((batch_size, out_channel, out_height, out_width),
+                       lambda n, oc, oh, ow: tvm.sum(data[n, ic, oh*HSTR + kh, ow*WSTR + kw] *
+                                                     kernel[kh, kw, ic, oc],
+                                                     axis=[ic, kh, kw]),
+                       name="conv2d")
+    s = tvm.create_schedule(conv.op)
+
+    n, oc, oh, ow = conv.op.axis
+    if loop_tiling:
+        oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16)
+    lowered_func = tvm.lower(s, [data, kernel, conv], simple_mode=True)
+    print (lowered_func.body)
+    ctx = tvm.cpu (0)
+
+    f = tvm.build(s, [data, kernel, conv], "llvm")
+    data_input = tvm.nd.array(np.random.uniform(
+          size=(batch_size, in_channel, in_height, in_width)).astype(tvm.float32), ctx)
+    kernel_input = tvm.nd.array(np.random.uniform(
+          size=(kernel_height, kernel_width, in_channel, out_channel)).astype(tvm.float32), ctx)
+    conv_out = tvm.nd.empty ((batch_size, out_channel, out_height, out_width), tvm.float32, ctx)
+    f(data_input, kernel_input, conv_out)
+
+@raises(Exception)
+def test_out_of_bounds_conv_llvm(data_offsets, kernel_offsets, loop_tiling=False):
+    HSTR = WSTR = 1
+    in_channel = 128
+    kernel_height = kernel_width = 3
+    out_channel = 64
+    batch_size = 1
+    in_height = in_width = 64
+    out_height = out_width = in_height - kernel_height + 1
+    data = tvm.placeholder((batch_size, in_channel, in_height, in_width), name='data')
+    kernel = tvm.placeholder((kernel_height, kernel_width, in_channel,
+        out_channel), name='kernel')
+    ic = tvm.reduce_axis((0, in_channel), name='ic')
+    kh = tvm.reduce_axis((0, kernel_height), name='kh')
+    kw = tvm.reduce_axis((0, kernel_width), name='kw')
+    conv = tvm.compute((batch_size, out_channel, out_height, out_width),
+                       lambda n, oc, oh, ow: tvm.sum(data[n + data_offsets[0],
+                                                          ic + data_offsets[1],
+                                                          oh*HSTR + kh + data_offsets[2],
+                                                          ow*WSTR + kw + data_offsets[3]]
+                                                          *
+                                                     kernel[kh + kernel_offsets[0],
+                                                     kw + kernel_offsets[1],
+                                                     ic + kernel_offsets[2],
+                                                     oc + kernel_offsets[3]],
+                                                     axis=[ic, kh, kw]),
+                       name="conv2d")
+    s = tvm.create_schedule(conv.op)
+
+    n, oc, oh, ow = conv.op.axis
+    if loop_tiling:
+        oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16)
+    lowered_func = tvm.lower(s, [data, kernel, conv], simple_mode=True)
+    print (lowered_func.body)
+    ctx = tvm.cpu (0)
+
+    f = tvm.build(s, [data, kernel, conv], "llvm")
+    data_input = tvm.nd.array(np.random.uniform(
+          size=(batch_size, in_channel, in_height, in_width)).astype(tvm.float32), ctx)
+    kernel_input = tvm.nd.array(np.random.uniform(
+          size=(kernel_height, kernel_width, in_channel, out_channel)).astype(tvm.float32), ctx)
+    conv_out = tvm.nd.empty ((batch_size, out_channel, out_height, out_width), tvm.float32, ctx)
+    f(data_input, kernel_input, conv_out)
+
+def test_in_bounds_tensors_with_same_shapes1D_llvm():
+    n = tvm.var('n')
+    k = tvm.var('k')
+    m = tvm.var('m')
+    A = tvm.placeholder((n, ), name='A')
+    B = tvm.placeholder((k, ), name='B')
+
+    T = tvm.compute((m, ), lambda i: A[i]*B[i])
+    s = tvm.create_schedule(T.op)
+    lowered_func = tvm.lower (s, [A, B, T], "llvm", simple_mode=False)
+    print (lowered_func.body)
+    ctx = tvm.cpu(0)
+
+    f = tvm.build(s, [A, B, T], "llvm")
+    a = tvm.nd.array(np.random.uniform(size=(32, )).astype(A.dtype), ctx)
+    b = tvm.nd.array(np.random.uniform(size=(32,)).astype(B.dtype), ctx)
+    t = tvm.nd.empty((32,), T.dtype, ctx)
+    f(a, b, t)
+
+@raises(Exception)
+def test_out_of_bounds_tensors_with_diff_shapes1D_llvm(a_shape, b_shape, c_shape):
+    n = tvm.var('n')
+    k = tvm.var('k')
+    m = tvm.var('m')
+    A = tvm.placeholder((n, ), name='A')
+    B = tvm.placeholder((k, ), name='B')
+
+    T = tvm.compute((m, ), lambda i: A[i]*B[i])
+    s = tvm.create_schedule(T.op)
+    lowered_func = tvm.lower (s, [A, B, T], "llvm", simple_mode=False)
+    print (lowered_func.body)
+    ctx = tvm.cpu(0)
+
+    f = tvm.build(s, [A, B, T], "llvm")
+    a = tvm.nd.array(np.random.uniform(size=(a_shape,)).astype(A.dtype), ctx)
+    b = tvm.nd.array(np.random.uniform(size=(b_shape,)).astype(B.dtype), ctx)
+    t = tvm.nd.empty((c_shape,), T.dtype, ctx)
+    f(a, b, t)
+
+def test_in_bounds_tensors_with_same_shapes2D_llvm():
+    n = tvm.var('n')
+    k = tvm.var('k')
+    m = tvm.var('m')
+    A = tvm.placeholder((n, n), name='A')
+    B = tvm.placeholder((k, k), name='B')
+
+    T = tvm.compute((m, m), lambda i, j: A[i][j]*B[i][j])
+    s = tvm.create_schedule(T.op)
+    lowered_func = tvm.lower (s, [A, B, T], "llvm", simple_mode=False)
+    print (lowered_func.body)
+    ctx = tvm.cpu(0)
+
+    f = tvm.build(s, [A, B, T], "llvm")
+    a = tvm.nd.array(np.random.uniform(size=(32, 32)).astype(A.dtype), ctx)
+    b = tvm.nd.array(np.random.uniform(size=(32, 32)).astype(B.dtype), ctx)
+    t = tvm.nd.empty((32, 32), T.dtype, ctx)
+    f(a, b, t)
+
+@raises(Exception)
+def test_out_of_bounds_tensors_with_diff_shapes2D_llvm(a_shape, b_shape, c_shape):
+    n = tvm.var('n')
+    k = tvm.var('k')
+    m = tvm.var('m')
+    A = tvm.placeholder((n, n), name='A')
+    B = tvm.placeholder((k, k), name='B')
+
+    T = tvm.compute((m, m), lambda i, j: A[i][j]*B[i][j])
+    s = tvm.create_schedule(T.op)
+    lowered_func = tvm.lower (s, [A, B, T], "llvm", simple_mode=False)
+    print (lowered_func.body)
+    ctx = tvm.cpu(0)
+
+    f = tvm.build(s, [A, B, T], "llvm")
+    a = tvm.nd.array(np.random.uniform(size=(a_shape[0],a_shape[1])).astype(A.dtype), ctx)
+    b = tvm.nd.array(np.random.uniform(size=(b_shape[0],b_shape[1])).astype(B.dtype), ctx)
+    t = tvm.nd.empty((c_shape[0],c_shape[1]), T.dtype, ctx)
+    f(a, b, t)
+
+def test_in_bounds_tensors_with_same_shapes3D_llvm():
+    n = tvm.var('n')
+    k = tvm.var('k')
+    m = tvm.var('m')
+    A = tvm.placeholder((n, n, n), name='A')
+    B = tvm.placeholder((k, k, k), name='B')
+
+    T = tvm.compute((m, m, m), lambda i, j, p: A[i][j][p]*B[i][j][p])
+    s = tvm.create_schedule(T.op)
+    lowered_func = tvm.lower (s, [A, B, T], "llvm", simple_mode=False)
+    print (lowered_func.body)
+    ctx = tvm.cpu(0)
+
+    f = tvm.build(s, [A, B, T], "llvm")
+    a = tvm.nd.array(np.random.uniform(size=(32,32,32)).astype(A.dtype), ctx)
+    b = tvm.nd.array(np.random.uniform(size=(32,32,32)).astype(B.dtype), ctx)
+    t = tvm.nd.empty((32, 32, 32), T.dtype, ctx)
+    f(a, b, t)
+
+@raises(Exception)
+def test_out_of_bounds_tensors_with_diff_shapes3D_llvm(a_shape, b_shape, c_shape):
+    n = tvm.var('n')
+    k = tvm.var('k')
+    m = tvm.var('m')
+    A = tvm.placeholder((n, n, n), name='A')
+    B = tvm.placeholder((k, k, k), name='B')
+
+    T = tvm.compute((m, m, m), lambda i, j, p: A[i][j][p]*B[i][j][p])
+    s = tvm.create_schedule(T.op)
+    lowered_func = tvm.lower (s, [A, B, T], "llvm", simple_mode=False)
+    print (lowered_func.body)
+    ctx = tvm.cpu(0)
+
+    f = tvm.build(s, [A, B, T], "llvm")
+    a = tvm.nd.array(np.random.uniform(size=(a_shape[0],a_shape[1], c_shape[2])).astype(A.dtype), ctx)
+    b = tvm.nd.array(np.random.uniform(size=(b_shape[0],b_shape[1], b_shape[2])).astype(B.dtype), ctx)
+    t = tvm.nd.empty((c_shape[0],c_shape[1],c_shape[2]), T.dtype, ctx)
+    f(a, b, t)
+
+@raises(Exception)
+def test_out_of_bounds_tensors_with_zero_shape_op_with_not_zero_shape_llvm():
+    if not tvm.module.enabled("llvm"):
+        return
+    n = 64
+    A = tvm.placeholder((n, ), name='A')
+    scale = tvm.placeholder((), name='scale')
+    k = tvm.reduce_axis((0, n), name="k")
+    C = tvm.compute((), lambda : tvm.sum(A[k + k + k] * scale, axis=k), name="C")
+    D = tvm.compute((), lambda : C + 1)
+    s = tvm.create_schedule(D.op)
+    stmt = tvm.lower (s, [A, scale, D], simple_mode=True)
+    print (stmt)
+    # build and invoke the kernel.
+    f = tvm.build(s, [A, scale, D], "llvm")
+    ctx = tvm.cpu(0)
+    # launch the kernel.
+    a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), ctx)
+    sc = tvm.nd.array(
+        np.random.randint(0, 2, size=()).astype(scale.dtype), ctx)
+    d = tvm.nd.empty((), D.dtype, ctx)
+    f(a, sc, d)
+    d_np = np.sum(a.asnumpy()) * sc.asnumpy() + 1
+    tvm.testing.assert_allclose(d.asnumpy(), d_np)
+
+if __name__ == "__main__":
+    with tvm.build_config(instrument_bound_checkers=True):
+        # zero scale
+        test_out_of_bounds_tensors_with_zero_shape_op_with_not_zero_shape_llvm()
+        # in bound
+        test_in_bounds_llvm()
+        # upper bound
+        test_out_of_bounds_llvm(1, 0)
+        test_out_of_bounds_llvm(0, 1)
+        test_out_of_bounds_llvm(1, 1)
+        test_out_of_bounds_llvm(10000, 0)
+        test_out_of_bounds_llvm(0, 10000)
+        test_out_of_bounds_llvm(10000, 10000)
+        # lower bound
+        test_out_of_bounds_llvm(-1, 0)
+        test_out_of_bounds_llvm(0, -1)
+        test_out_of_bounds_llvm(-1, -1)
+        test_out_of_bounds_llvm(-10000, 0)
+        test_out_of_bounds_llvm(0, -10000)
+        test_out_of_bounds_llvm(-10000, -10000)
+        # vectorize in bound
+        test_in_bounds_vectorize_llvm()
+        # vectorization upper bound
+        test_out_of_bounds_vectorize_llvm(1024, 1000, 0)
+        test_out_of_bounds_vectorize_llvm(1024, 0, 10000)
+        # vectorization lower bound
+        test_out_of_bounds_vectorize_llvm(1024, -1000, 0)
+        test_out_of_bounds_vectorize_llvm(1024, 0, -10000)
+        test_in_bounds_const_loop_partition_llvm()
+        test_out_of_bounds_const_loop_partition_llvm(1, 0)
+        test_out_of_bounds_const_loop_partition_llvm(0, 1)
+        test_out_of_bounds_const_loop_partition_llvm(-1, 0)
+        test_out_of_bounds_const_loop_partition_llvm(0, -1)
+        test_in_bounds_loop_partition_basic_llvm()
+        test_out_of_bounds_loop_partition_basic_llvm(32, 0)
+        test_out_of_bounds_loop_partition_basic_llvm(0, 32)
+        test_out_of_bounds_loop_partition_basic_llvm(-32, 0)
+        test_out_of_bounds_loop_partition_basic_llvm(0, -32)
+        # conv
+        test_in_bounds_conv_llvm()
+        test_out_of_bounds_conv_llvm([1, 0, 0, 0], [0, 0, 0, 0])
+        test_out_of_bounds_conv_llvm([0, 1, 0, 0], [0, 0, 0, 0])
+        test_out_of_bounds_conv_llvm([0, 0, 1, 0], [0, 0, 0, 0])
+        test_out_of_bounds_conv_llvm([0, 0, 0, 1], [0, 0, 0, 0])
+        test_out_of_bounds_conv_llvm([-1, 0, 0, 0], [0, 0, 0, 0])
+        test_out_of_bounds_conv_llvm([0, -1, 0, 0], [0, 0, 0, 0])
+        test_out_of_bounds_conv_llvm([0, 0, -1, 0], [0, 0, 0, 0])
+        test_out_of_bounds_conv_llvm([0, 0, 0, -1], [0, 0, 0, 0])
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [1, 0, 0, 0])
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 1, 0, 0])
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, 1, 0])
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, 0, 1])
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [-1, 0, 0, 0])
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, -1, 0, 0])
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, -1, 0])
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, 0, -1])
+        # loop tiling
+        test_in_bounds_conv_llvm(True)
+        test_out_of_bounds_conv_llvm([1, 0, 0, 0], [0, 0, 0, 0], True)
+        test_out_of_bounds_conv_llvm([0, 1, 0, 0], [0, 0, 0, 0], True)
+        test_out_of_bounds_conv_llvm([0, 0, 1, 0], [0, 0, 0, 0], True)
+        test_out_of_bounds_conv_llvm([0, 0, 0, 1], [0, 0, 0, 0], True)
+        test_out_of_bounds_conv_llvm([-1, 0, 0, 0], [0, 0, 0, 0], True)
+        test_out_of_bounds_conv_llvm([0, -1, 0, 0], [0, 0, 0, 0], True)
+        test_out_of_bounds_conv_llvm([0, 0, -1, 0], [0, 0, 0, 0], True)
+        test_out_of_bounds_conv_llvm([0, 0, 0, -1], [0, 0, 0, 0], True)
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [1, 0, 0, 0], True)
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 1, 0, 0], True)
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, 1, 0], True)
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, 0, 1], True)
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [-1, 0, 0, 0], True)
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, -1, 0, 0], True)
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, -1, 0], True)
+        test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, 0, -1], True)
+        # tensors with diff shapes basic operation such as mul
+        test_out_of_bounds_tensors_with_diff_shapes1D_llvm (32, 64, 64)
+        test_out_of_bounds_tensors_with_diff_shapes1D_llvm (64, 32, 64)
+        test_out_of_bounds_tensors_with_diff_shapes2D_llvm([64, 64], [32, 32], [64, 64])
+        test_out_of_bounds_tensors_with_diff_shapes2D_llvm([32, 32], [64, 64], [64, 64])
+        test_out_of_bounds_tensors_with_diff_shapes3D_llvm([64, 64, 64], [32, 32, 32], [64, 64, 64])
+        test_out_of_bounds_tensors_with_diff_shapes3D_llvm([32, 32, 32], [64, 64, 64], [64, 64, 64])
+        # check tensors with the same shapes
+        test_in_bounds_tensors_with_same_shapes1D_llvm()
+        test_in_bounds_tensors_with_same_shapes2D_llvm()
+        test_in_bounds_tensors_with_same_shapes3D_llvm()
+        # ir tests
+        test_in_bounds_const_loop_partition_ir()
diff --git a/tests/python/unittest/test_pass_decorate_device_scope.py b/tests/python/unittest/test_pass_decorate_device_scope.py
new file mode 100644
index 000000000000..1d9eb899a642
--- /dev/null
+++ b/tests/python/unittest/test_pass_decorate_device_scope.py
@@ -0,0 +1,26 @@
+import tvm
+
+def test_decorate_device():
+    m = tvm.var('m')
+    l = tvm.var('l')
+    A = tvm.placeholder((m, l), name='A')
+
+    A1 = tvm.compute((m, l), lambda i, j: A[i, j], name='A1')
+    A2 = tvm.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2')
+
+    s = tvm.create_schedule(A2.op)
+    xo, xi = s[A2].split(A2.op.axis[0], factor=8)
+    s[A1].compute_at(s[A2], xo)
+    s[A1].set_scope("shared")
+
+    bounds = tvm.schedule.InferBound(s)
+    stmt = tvm.schedule.ScheduleOps(s, bounds)
+    stmt1 = tvm.ir_pass.Simplify(stmt)
+    stmt2 = tvm.ir_pass.DecorateDeviceScope(stmt1)
+    assert isinstance(stmt2, tvm.stmt.AttrStmt)
+    assert stmt2.attr_key == "device_scope"
+    assert stmt1 == stmt2.body
+    
+if __name__ == "__main__":
+    test_decorate_device()
+
diff --git a/tests/python/unittest/test_pass_inject_vthread.py b/tests/python/unittest/test_pass_inject_vthread.py
index 502a55574df0..16f4c4652a3d 100644
--- a/tests/python/unittest/test_pass_inject_vthread.py
+++ b/tests/python/unittest/test_pass_inject_vthread.py
@@ -60,7 +60,26 @@ def get_vthread(name):
     assert stmt.body.body.body.body.body.body.extents[0].value == 2
     assert len(stmt.body.body.body.body.body.body.extents) == 3
 
+def test_vthread_if_then_else():
+    nthread = 2
+    tx = tvm.thread_axis("vthread")
+    ib = tvm.ir_builder.create()
+    A = ib.pointer("float32", name="A")
+    with ib.for_range(0, 100) as i:
+        ib.scope_attr(tx, "virtual_thread", nthread)
+        B = ib.allocate("float32", 128, name="B", scope="shared")
+        with ib.if_scope(i == 0):
+            B[i] = A[i * nthread + tx]
+        with ib.else_scope():
+            B[i] = A[i * nthread + tx] + 1
+        with ib.if_scope(i == 0):
+            B[i] = A[i * nthread + tx] + 2
+    stmt = ib.get()
+    stmt = tvm.ir_pass.InjectVirtualThread(stmt)
+    assert stmt.body.body.body.first.else_case != None
+    assert stmt.body.body.body.rest.else_case == None
 
 if __name__ == "__main__":
     test_vthread_extern()
     test_vthread()
+    test_vthread_if_then_else()
diff --git a/tests/python/unittest/test_pass_loop_partition.py b/tests/python/unittest/test_pass_loop_partition.py
index a1025e1f662c..85860ce824d0 100644
--- a/tests/python/unittest/test_pass_loop_partition.py
+++ b/tests/python/unittest/test_pass_loop_partition.py
@@ -177,6 +177,157 @@ def test_everything_during_deduction():
     stmt = tvm.ir_pass.Simplify(stmt)
     assert(isinstance(stmt.body.body, tvm.stmt.IfThenElse))
 
+def test_single_likely():
+    n = 60
+    A = tvm.placeholder((n, ), name='A')
+    B = tvm.placeholder((n, ), name='B')
+
+    T = tvm.compute((n, ), lambda i: A[i]+B[i])
+    s = tvm.create_schedule(T.op)
+    x = T.op.axis[0]
+    xo, xi = s[T].split(x, factor=16)
+
+    bounds = tvm.schedule.InferBound(s)
+    stmt = tvm.schedule.ScheduleOps(s, bounds)
+    stmt = tvm.ir_pass.LoopPartition(stmt, True)
+    stmt = tvm.ir_pass.Simplify(stmt)
+    assert(not any(collect_visit(stmt, lambda x: isinstance(x, tvm.stmt.IfThenElse))))
+
+def test_multi_likely():
+    n = 94
+    m = 62
+    A = tvm.placeholder((n, m), name='A')
+    B = tvm.placeholder((n, m), name='B')
+
+    T = tvm.compute((n, m), lambda i, j: A[i, j]+B[i, j])
+    s = tvm.create_schedule(T.op)
+    bounds = tvm.schedule.InferBound(s)
+    stmt = tvm.schedule.ScheduleOps(s, bounds)
+    x, y = T.op.axis
+    xo, xi = s[T].split(x, factor=16)
+    yo, yi = s[T].split(y, factor=16)
+    s[T].reorder(xo, yo, xi, yi)
+
+    bounds = tvm.schedule.InferBound(s)
+    stmt = tvm.schedule.ScheduleOps(s, bounds)
+    stmt = tvm.ir_pass.LoopPartition(stmt, True)
+    stmt = tvm.ir_pass.Simplify(stmt)
+    assert(not any(collect_visit(stmt, lambda x: isinstance(x, tvm.stmt.IfThenElse))))
+
+def test_oneD_pool():
+    m = tvm.var('m')
+    ib = tvm.ir_builder.create()
+    #data = tvm.placeholder((16,), name = 'data')
+    data = ib.pointer("float32", name="A")
+    out = ib.pointer("float32", name="A")
+    with ib.for_range(0, 16, 'ow') as ow:
+        with ib.for_range(0, 3, 'kw') as kw:
+            with ib.if_scope(ib.likely(ow > 0)):
+                with ib.if_scope(ib.likely(ow < 15)):
+                    out[ow] = tvm.max(out[ow], data[ow + kw - 1])
+    with ib.for_range(0, 16, 'ow') as ow:
+        with ib.for_range(0, 3, 'kw') as kw:
+            with ib.if_scope(ib.likely(ow < 1)):
+                with ib.if_scope(ib.likely(kw > 0)):
+                    out[ow] = tvm.max(out[ow], data[ow + kw - 1])
+    with ib.for_range(0, 16, 'ow') as ow:
+        with ib.for_range(0, 3, 'kw') as kw:
+            with ib.if_scope(ib.likely(ow > 14)):
+                with ib.if_scope(ib.likely(kw < 2)):
+                    out[ow] = tvm.max(out[ow], data[ow + kw - 1])
+
+    stmt = ib.get()
+    stmt = tvm.ir_pass.LoopPartition(stmt, True)
+    stmt = tvm.ir_pass.Simplify(stmt)
+    assert(not any(collect_visit(stmt, lambda x: isinstance(x, tvm.stmt.IfThenElse))))
+
+def test_cce_loop_1():
+  ib = tvm.ir_builder.create()
+  dtype = 'float16'
+  n = 514
+  m = 514
+  _A = tvm.placeholder((n*m,), name = 'A')
+  Ab = tvm.decl_buffer((n*m,), dtype, name="A")
+  A = ib.buffer_ptr(Ab)
+  _B = tvm.placeholder((n*m,), name = 'B')
+  Bb = tvm.decl_buffer((n*m,), dtype, name="B")
+  B = ib.buffer_ptr(Bb)
+  #for i in 0 to n-1:
+  with ib.for_range(0, 11, name="i") as i:
+      with ib.for_range(0, 160, name="j") as j:
+          with ib.if_scope(ib.likely(((i*160) + j) < 1600)):
+               A[(i+1)*m+j+1] = B[(i)*m+j+1] + B[(i+1)*m+j+1] + B[(i+2)*m+j+1]
+  stmt = ib.get()
+  stmt = tvm.ir_pass.LoopPartition(stmt, True)
+  stmt = tvm.ir_pass.Simplify(stmt)
+  assert(not any(collect_visit(stmt, lambda x: isinstance(x, tvm.stmt.IfThenElse))))
+
+def test_cce_loop_2():
+  ib = tvm.ir_builder.create()
+  len = 112
+  tile = 32
+  loop = (len + tile - 1) // tile
+  with ib.for_range(0, loop, 'i') as i:
+    head = i * tile
+    with ib.if_scope(ib.likely(head + tile > len)):
+      tail = len
+      ib.emit(tvm.call_extern('float32', "cce_intrisic", head, tail))
+    with ib.else_scope():
+      tail = head + tile
+      ib.emit(tvm.call_extern('float32', "cce_intrisic", head, tail))
+
+  stmt = ib.get()
+  stmt = tvm.ir_pass.LoopPartition(stmt, True)
+  stmt = tvm.ir_pass.Simplify(stmt)
+  assert(not any(collect_visit(stmt, lambda x: isinstance(x, tvm.stmt.IfThenElse))))
+
+
+def test_cce_loop_3():
+    ib = tvm.ir_builder.create()
+    loop1 = 4
+    loop2 = 9998
+    tile = 39991
+    with ib.for_range(0,loop2,'i') as i:
+        with ib.for_range(0,loop1,'j') as j:
+            head1 = i
+            head2 = j
+            with ib.if_scope(ib.likely(head1*loop1 + head2 < tile)):
+                ib.emit(tvm.call_extern('float16',"cce_intrisic",head1))
+
+    stmt = ib.get()
+    stmt = tvm.ir_pass.LoopPartition(stmt,True)
+    stmt = tvm.ir_pass.Simplify(stmt)
+    assert(not any(collect_visit(stmt, lambda x: isinstance(x, tvm.stmt.IfThenElse))))
+
+def test_conv_tiling():
+    HSTR = WSTR = 1
+    in_channel = 128
+    kernel_height = kernel_width = 3
+    out_channel = 64
+    batch_size = 1
+    in_height = in_width = 64
+    out_height = out_width = in_height - kernel_height + 1
+    data = tvm.placeholder((batch_size, in_channel, in_height, in_width), name='data')
+    kernel = tvm.placeholder((kernel_height, kernel_width, in_channel,
+        out_channel), name='kernel')
+    ic = tvm.reduce_axis((0, in_channel), name='ic')
+    kh = tvm.reduce_axis((0, kernel_height), name='kh')
+    kw = tvm.reduce_axis((0, kernel_width), name='kw')
+    conv = tvm.compute((batch_size, out_channel, out_height, out_width),
+                       lambda n, oc, oh, ow: tvm.sum(data[n, ic, oh*HSTR + kh, ow*WSTR + kw] *
+                                                     kernel[kh, kw, ic, oc],
+                                                     axis=[ic, kh, kw]),
+                       name="conv2d")
+    s = tvm.create_schedule(conv.op)
+
+    n, oc, oh, ow = conv.op.axis
+    oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16)
+    bounds = tvm.schedule.InferBound(s)
+    stmt = tvm.schedule.ScheduleOps(s, bounds)
+    stmt = tvm.ir_pass.LoopPartition(stmt, True)
+    stmt = tvm.ir_pass.Simplify(stmt)
+    assert(not any(collect_visit(stmt, lambda x: isinstance(x, tvm.stmt.IfThenElse))))
+
 if __name__ == "__main__":
     test_basic()
     test_const_loop()
@@ -187,3 +338,10 @@ def test_everything_during_deduction():
     test_select()
     test_thread_axis2()
     test_everything_during_deduction()
+    test_single_likely()
+    test_multi_likely()
+    test_oneD_pool()
+    test_cce_loop_1()
+    test_cce_loop_2()
+    test_cce_loop_3()
+    test_conv_tiling()
diff --git a/tests/python/unittest/test_pass_simplify.py b/tests/python/unittest/test_pass_simplify.py
index c38083822fe2..fce6eaed5a1f 100644
--- a/tests/python/unittest/test_pass_simplify.py
+++ b/tests/python/unittest/test_pass_simplify.py
@@ -53,7 +53,6 @@ def test_canonical():
     assert (tvm.ir_pass.Equal(ret1, ret2))
 
 if __name__ == "__main__":
-    test_modular()
     test_bound()
     test_basic()
     test_simplify()
diff --git a/tests/python/unittest/test_pass_split_host_device.py b/tests/python/unittest/test_pass_split_host_device.py
new file mode 100644
index 000000000000..24cc497944d7
--- /dev/null
+++ b/tests/python/unittest/test_pass_split_host_device.py
@@ -0,0 +1,17 @@
+from nose.tools import raises
+import tvm
+
+@raises(Exception)
+def test_loop_dependent_allocate():
+    N = tvm.var("N")
+    A = tvm.placeholder((2*N,), "float32", "A")
+    C = tvm.compute((N, ), lambda i: A[2*i] + A[i+1], name='C')
+    s = tvm.create_schedule(C.op)
+    AA = s.cache_read(A, "local", [C])
+    s[AA].compute_at(s[C], s[C].op.axis[0])
+    # this line should fail due to IRUseDefAnalysis sees an allocate statement
+    # referencing undefined variable
+    tvm.lower(s, [A,C])
+
+if __name__ == "__main__":
+    test_loop_dependent_allocate()
diff --git a/tests/python/unittest/test_pass_storage_flatten.py b/tests/python/unittest/test_pass_storage_flatten.py
index 4e2feed23eff..655df1da4e15 100644
--- a/tests/python/unittest/test_pass_storage_flatten.py
+++ b/tests/python/unittest/test_pass_storage_flatten.py
@@ -51,8 +51,41 @@ def test_flatten_storage_align():
     stmt = tvm.ir_pass.Simplify(stmt)
     assert(stmt.body.extents[0].value == 17 * 8)
 
+def test_flatten_double_buffer():
+    dtype = 'int64'
+    n = 100
+    m = 4
+    tx = tvm.thread_axis("threadIdx.x")
+    ib = tvm.ir_builder.create()
+    A = ib.pointer("float32", name="A")
+    C = ib.pointer("float32", name="C")
+    ib.scope_attr(tx, "thread_extent", 1)
+    with ib.for_range(0, n) as i:
+        B = ib.allocate("float32", m, name="B", scope="shared")
+        with ib.new_scope():
+            ib.scope_attr(B.asnode(), "double_buffer_scope", 1)
+            with ib.for_range(0, m) as j:
+                B[j] = A[i * 4 + j]
+        with ib.for_range(0, m) as j:
+            C[j] = B[j] + 1
+
+    stmt = ib.get()
+    stmt = tvm.ir_pass.StorageFlatten(stmt, {}, 64)
+    stmt = tvm.ir_pass.InjectDoubleBuffer(stmt, 2)
+    stmt = tvm.ir_pass.Simplify(stmt)
+    assert isinstance(stmt.body.body, tvm.stmt.Allocate)
+    assert stmt.body.body.extents[0].value == 2
+    f = tvm.ir_pass.MakeAPI(stmt, "db", [A.asnode(), C.asnode()], 2, True)
+    f = tvm.ir_pass.ThreadSync(f, "shared")
+    count = [0]
+    def count_sync(op):
+        if isinstance(op, tvm.expr.Call) and op.name == "tvm_storage_sync":
+            count[0] += 1
+    tvm.ir_pass.PostOrderVisit(f.body, count_sync)
+    assert count[0] == 4
 
 if __name__ == "__main__":
     test_flatten_storage_align()
     test_flatten2()
     test_flatten_prefetch()
+    test_flatten_double_buffer()
diff --git a/tests/python/unittest/test_pass_storage_rewrite.py b/tests/python/unittest/test_pass_storage_rewrite.py
index 2bb02998982f..3c07a1f26aff 100644
--- a/tests/python/unittest/test_pass_storage_rewrite.py
+++ b/tests/python/unittest/test_pass_storage_rewrite.py
@@ -28,15 +28,30 @@ def verify(n):
     tvm.ir_pass.PostOrderVisit(stmt, verify)
     assert num_alloc[0] == 1
 
+def register_mem(scope_tb, max_bits):
+    #Register mem
+    @tvm.register_func("tvm.info.mem.%s" % scope_tb)
+    def mem_info_inp_buffer():
+        return tvm.make.node("MemoryInfo",
+                        unit_bits= 16,
+                        max_simd_bits=32,
+                        max_num_bits=max_bits,
+                        head_address=None)
+
 def test_alloc_seq():
+    scope_tb = "local.L0A"
+    max_bits = 1024 * 1024 * 1024
+
+    register_mem(scope_tb, max_bits)
+
     ib = tvm.ir_builder.create()
     n = tvm.var("n")
     with ib.for_range(0, n, name="i") as i:
         with ib.for_range(0, 10, name="j") as j:
-            A = ib.allocate("float32", 200, name="A", scope="local.L0A")
+            A = ib.allocate("float32", 200, name="A", scope=scope_tb)
             A[j] = 1.2
         with ib.for_range(0, 10, name="j") as j:
-            A = ib.allocate("float32", 200, name="B", scope="local.L0A")
+            A = ib.allocate("float32", 200, name="B", scope=scope_tb)
             A[j] = 1.3
 
     body = ib.get()
@@ -233,16 +248,9 @@ def test_parallel_alloc():
 
     assert(isinstance(body.body.body.body.body, tvm.stmt.Allocate))
 
-def test_inplace_rule2():
+def test_inplace_rule2(scope_tb = "local_TB2", max_bits = 1024 * 1024 * 1024):
     #Test Buffer
-    scope_tb = "local_TB2"
-    @tvm.register_func("tvm.info.mem.%s" % scope_tb)
-    def mem_info_inp_buffer():
-        return tvm.make.node("MemoryInfo",
-                        unit_bits= 16,
-                        max_simd_bits=32,
-                        max_num_bits=1024*1024*1024,
-                        head_address=None)
+    register_mem(scope_tb, max_bits)
     m = 10
     A = tvm.placeholder((m,), name='A')
     C = tvm.placeholder((m,), name='C')
@@ -275,16 +283,23 @@ def verify(n):
     tvm.ir_pass.PostOrderVisit(stmt, verify)
     assert num_alloc[0] == 2
 
+def test_exceed_mem():
+    max_bits = 639
+    # The critical max_num_bits is between 639 and 640
+    loc = -1
+    try:
+        test_inplace_rule2("local_TEM", max_bits)
+    except Exception as e:
+        estr = str(e)
+        loc = estr.find('Allocation exceed bound of memory')
+        assert loc != -1
+
 def test_inplace_rule3():
     #Test Buffer
     scope_tb = "local_TB3"
-    @tvm.register_func("tvm.info.mem.%s" % scope_tb)
-    def mem_info_inp_buffer():
-        return tvm.make.node("MemoryInfo",
-                        unit_bits= 16,
-                        max_simd_bits=32,
-                        max_num_bits=1024*1024*1024,
-                        head_address=None)
+    max_bits=1024 * 1024 * 1024
+
+    register_mem(scope_tb, max_bits)
     m = 10
     B0 = tvm.placeholder((m,), name='B0')
     B1 = tvm.placeholder((m,), name='B1')
@@ -388,17 +403,22 @@ def verify(n):
     assert num_alloc[0] == 1
 
 def test_alloc_seq_type2():
+    scope_tb = "local.L0A2"
+    max_bits=1024 * 1024 * 1024
+
+    register_mem(scope_tb, max_bits)
+
     ib = tvm.ir_builder.create()
     n = tvm.var("n")
     with ib.for_range(0, n, name="i") as i:
         with ib.for_range(0, 10, name="j") as j:
-            A = ib.allocate("float32", 200, name="A", scope="local.L0A")
+            A = ib.allocate("float32", 200, name="A", scope=scope_tb)
             A[j] = 1.2
         with ib.for_range(0, 20, name="j") as j:
-            B = ib.allocate("int16", 400, name="B", scope="local.L0A")
+            B = ib.allocate("int16", 400, name="B", scope=scope_tb)
             B[j] = tvm.const(1, "int16")
         with ib.for_range(0, 10, name="j") as j:
-            C = ib.allocate("float32", 200, name="C", scope="local.L0A")
+            C = ib.allocate("float32", 200, name="C", scope=scope_tb)
             C[j] = 1.2
 
     body = ib.get()
@@ -465,6 +485,7 @@ def test_replace_dataflow():
     test_storage_combine()
     test_storage_share_gpu()
     test_inplace_rule2()
+    test_exceed_mem()
     test_inplace_rule3()
     test_alloc_seq_type()
     test_alloc_seq_type2()
diff --git a/tests/python/unittest/test_pass_unroll.py b/tests/python/unittest/test_pass_unroll.py
index dda3fdad166c..68467b0c04c6 100644
--- a/tests/python/unittest/test_pass_unroll.py
+++ b/tests/python/unittest/test_pass_unroll.py
@@ -35,6 +35,23 @@ def test_unroll_loop():
     assert isinstance(ret.rest, tvm.stmt.For)
     assert ret.rest.for_type != tvm.stmt.For.Unrolled
 
+def test_unroll_fake_loop():
+    ib = tvm.ir_builder.create()
+    dtype = 'int32'
+    n = tvm.var('n')
+    Ab = tvm.decl_buffer((n, ), dtype)
+    Aptr = ib.buffer_ptr(Ab)
+    # for i in 0 to n-1:
+    with ib.for_range(0, 1, name="i") as i:
+        Aptr[i*2] = 3
+        with ib.for_range(0, 10, name="j") as j:
+            Aptr[j + 1] = Aptr[i] + 1
+
+    stmt = ib.get()
+    ret = tvm.ir_pass.UnrollLoop(stmt, 8, 0, 1, True)
+    assert isinstance(ret.first, tvm.stmt.Store)
+
 
 if __name__ == "__main__":
     test_unroll_loop()
+    test_unroll_fake_loop()
\ No newline at end of file
diff --git a/tests/python/unittest/test_pass_verify_gpu_code.py b/tests/python/unittest/test_pass_verify_gpu_code.py
index 6fc0387cf144..e3884a727852 100644
--- a/tests/python/unittest/test_pass_verify_gpu_code.py
+++ b/tests/python/unittest/test_pass_verify_gpu_code.py
@@ -162,8 +162,32 @@ def test_multiple_kernels():
             tvm.build(s, [A, C], target)
         assert valid[0]
 
+def test_wrong_bind():
+    N = 1024
+
+    A = tvm.placeholder((N, N-1), name='A')
+    B = tvm.compute((N, N-1), lambda i, j: A[i, j])
+
+    s = tvm.create_schedule([B.op])
+
+    # bind a thread axis to two loop axes with different lengths
+    s[B].bind(s[B].op.axis[0], tvm.thread_axis("threadIdx.x"))
+    s[B].bind(s[B].op.axis[1], tvm.thread_axis("threadIdx.x"))
+
+    for target in ['opencl', 'cuda']:
+        if not tvm.context(target).exist:
+            continue
+
+        valid = [None]
+        with tvm.build_config(**{"add_lower_pass": [
+                (2, get_verify_pass(valid, max_threads_per_block=N*N))]}):
+            tvm.build(s, [A, B], target)
+        assert not valid[0]
+
+
 if __name__ == "__main__":
     test_local_memory()
     test_shared_memory()
     test_num_thread()
     test_multiple_kernels()
+    test_wrong_bind()
diff --git a/tests/python/unittest/test_runtime_graph_debug.py b/tests/python/unittest/test_runtime_graph_debug.py
new file mode 100644
index 000000000000..b9d8b689cb9e
--- /dev/null
+++ b/tests/python/unittest/test_runtime_graph_debug.py
@@ -0,0 +1,103 @@
+import os
+import tvm
+import numpy as np
+import json
+from tvm import rpc
+from tvm.contrib import util
+from tvm.contrib.debugger import debug_runtime as graph_runtime
+
+def test_graph_simple():
+    n = 4
+    A = tvm.placeholder((n,), name='A')
+    B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
+    s = tvm.create_schedule(B.op)
+
+    node0 = {"op": "null", "name": "x", "inputs": []}
+    node1 = {"op": "tvm_op", "name": "add",
+             "inputs": [[0, 0, 0]],
+             "attrs": {"func_name": "myadd",
+                       "flatten_data": "1",
+                       "num_inputs" : "1",
+                    "num_outputs" : "1"}}
+    nodes = [node0, node1]
+    arg_nodes = [0]
+    node_row_ptr = [0, 1, 2]
+    outputs = [[1, 0, 0]]
+    shape = (4,)
+    attrs = {
+        "shape" : ["list_shape", [shape, shape]],
+        "dltype" : ["list_str", ["float32", "float32"]],
+        "storage_id" : ["list_int", [0, 1]],
+    }
+    graph = {"nodes": nodes,
+             "arg_nodes": arg_nodes,
+             "node_row_ptr": node_row_ptr,
+             "heads": outputs,
+             "attrs": attrs}
+    graph = json.dumps(graph)
+
+    def check_verify():
+        if not tvm.module.enabled("llvm"):
+            print("Skip because llvm is not enabled")
+            return
+        mlib = tvm.build(s, [A, B], "llvm", name="myadd")
+        try:
+            mod = graph_runtime.create(graph, mlib, tvm.cpu(0))
+        except ValueError:
+            return
+
+        a = np.random.uniform(size=(n,)).astype(A.dtype)
+        mod.set_input(x=a)
+
+        #verify dumproot created
+        directory = mod._dump_path
+        assert(os.path.exists(directory))
+
+        #verify graph is there
+        GRAPH_DUMP_FILE_NAME = '_tvmdbg_graph_dump.json'
+        assert(len(os.listdir(directory)) == 1)
+
+        #verify the file name is proper
+        assert(os.path.exists(os.path.join(directory, GRAPH_DUMP_FILE_NAME)))
+
+        mod.run()
+        #Verify the tensors are dumped
+        assert(len(os.listdir(directory)) > 1)
+
+        #verify the output is correct
+        out = mod.get_output(0, tvm.nd.empty((n,)))
+        np.testing.assert_equal(out.asnumpy(), a + 1)
+
+        mod.exit()
+        #verify dump root delete after cleanup
+        assert(not os.path.exists(directory))
+
+    def check_remote():
+        if not tvm.module.enabled("llvm"):
+            print("Skip because llvm is not enabled")
+            return
+        mlib = tvm.build(s, [A, B], "llvm", name="myadd")
+        server = rpc.Server("localhost")
+        remote = rpc.connect(server.host, server.port)
+        temp = util.tempdir()
+        ctx = remote.cpu(0)
+        path_dso = temp.relpath("dev_lib.so")
+        mlib.export_library(path_dso)
+        remote.upload(path_dso)
+        mlib = remote.load_module("dev_lib.so")
+        try:
+            mod = graph_runtime.create(graph, mlib, remote.cpu(0))
+        except ValueError:
+            print("Skip because debug graph_runtime not enabled")
+            return
+        a = np.random.uniform(size=(n,)).astype(A.dtype)
+        mod.run(x=tvm.nd.array(a, ctx))
+        out = tvm.nd.empty((n,), ctx=ctx)
+        out = mod.get_output(0, out)
+        np.testing.assert_equal(out.asnumpy(), a + 1)
+
+    check_verify()
+    check_remote()
+
+if __name__ == "__main__":
+    test_graph_simple()
diff --git a/tests/python/unittest/test_runtime_heterogeneous.py b/tests/python/unittest/test_runtime_heterogeneous.py
new file mode 100644
index 000000000000..3272165f0b02
--- /dev/null
+++ b/tests/python/unittest/test_runtime_heterogeneous.py
@@ -0,0 +1,385 @@
+# pylint: disable=too-many-locals
+"""Unit tests for heterogeneous runtime"""
+import json
+import numpy as np
+
+import tvm
+from tvm.contrib import graph_runtime, util
+import topi
+
+def get_simplex_graph(host_dev_type, device_dev_type):
+    r""" Return the hand-crafted json object where only one copy node is
+    inserted. This node copies data from the target device to cpu.
+    The network is constructed as following:
+                 A    B
+                  \  /
+             elemwise_add  (gpu)
+                     \
+                     copy      C
+                       \      /
+                     elemwise_sub  (cpu)
+
+    Parameters
+    ----------
+    host_dev_type : int
+        The device type of the host processor, e.g. cpu.
+    device_dev_type : int
+        The device type of the device processor, e.g. gpu, opencl, etc.
+
+    Returns
+    -------
+    json : json
+        A json encoded object.
+    """
+    # Construct each node in the graph.
+    var_a = {"op": "null", "name": "A", "inputs": []}
+    var_b = {"op": "null", "name": "B", "inputs": []}
+    elemwise_add = {
+        "op": "tvm_op", "name": "elemwise_add",
+        "attrs": {
+            "flatten_data": "1",
+            "func_name": "elemwise_add",
+            "num_inputs": "2",
+            "num_outputs": "1"
+        },
+        "inputs": [[0, 0, 0], [1, 0, 0]]
+    }
+    copy = {
+        "op": "tvm_op",
+        "name": "__copy_add_to_sub",
+        "attrs": {
+            "flatten_data": "0",
+            "func_name": "__copy",
+            "num_inputs": "1",
+            "num_outputs": "1"
+        },
+        "inputs": [[2, 0, 0]]
+    }
+    var_c = {"op": "null", "name": "C", "inputs": []}
+    elemwise_sub = {
+        "op": "tvm_op", "name": "elemwise_sub",
+        "attrs": {
+            "flatten_data": "0",
+            "func_name": "elemwise_sub",
+            "num_inputs": "2",
+            "num_outputs": "1"
+        },
+        "inputs": [[3, 0, 0], [4, 0, 0]]
+    }
+
+    # Group the nodes.
+    nodes = [var_a, var_b, elemwise_add, copy, var_c, elemwise_sub]
+    arg_nodes = [0, 1, 4]
+    node_row_ptr = [0, 1, 2, 3, 4, 5, 6]
+    heads = [[5, 0, 0]]
+    shape = (4,)
+    attrs = {
+        "storage_id": ["list_int", [3, 4, 0, 1, 5, 2]],
+        "shape": ["list_shape", [shape, shape, shape, shape, shape, shape]],
+        "device_index": ["list_int", [device_dev_type, device_dev_type,
+                                      device_dev_type, host_dev_type,
+                                      host_dev_type, host_dev_type]],
+        "dtype": ["list_int", [0, 0, 0, 0, 0, 0]],
+        "dltype": ["list_str", ["float32", "float32", "float32",
+                                "float32", "float32", "float32"]]
+    }
+
+    # Construct the graph.
+    graph = {"nodes": nodes,
+             "arg_nodes": arg_nodes,
+             "node_row_ptr": node_row_ptr,
+             "heads": heads,
+             "attrs": attrs}
+    return json.dumps(graph)
+
+
+def test_simplex_data_transferring():
+    r"""
+    Test the heterogeneous execution of a simple network where data
+    transferring is from the target device to the host processor at runtime.
+    The host processor is always assumed to be cpu, and the device varies.
+    """
+    host = "cpu"
+    target_host = "llvm"
+    host_ctx = tvm.context(host)
+    if not tvm.module.enabled(target_host):
+        print("Skip test because llvm is not enabled.")
+        return
+
+    def check_device(device, target_device):
+        if not tvm.module.enabled(target_device):
+            print("Skip test because {} is not enabled.".format(target_device))
+            return
+
+        device_ctx = tvm.context(device)
+        graph = get_simplex_graph(host_ctx.device_type, device_ctx.device_type)
+        shape = (4,)
+
+        # Create module for add whose target is the device.
+        tensor_a = tvm.placeholder(shape, name="A")
+        tensor_b = tvm.placeholder(shape, name="B")
+        elemwise_add = tvm.compute(shape, lambda *i: tensor_a(*i)
+                                   + tensor_b(*i), name="elemwise_add")
+        target = topi.cpp.TEST_create_target(device)
+        schedule_add = topi.cpp.cuda.schedule_injective(target, [elemwise_add])
+        lower_add = tvm.lower(schedule_add, [tensor_a, tensor_b, elemwise_add],
+                              name="elemwise_add")
+
+        # Insert copy. Neither compute nor schedule is required for the copy
+        # node. The compute will be performed at runtime which is just data
+        # copy from the input to the output.
+        tensor_copy = tvm.placeholder(shape, name="__copy")
+
+        # Create module for sub whose target is the host.
+        tensor_c = tvm.placeholder(shape, name="C")
+        elemwise_sub = tvm.compute(shape, lambda *i: tensor_copy(*i)
+                                   - tensor_c(*i), name="elemwise_sub")
+        schedule_sub = tvm.create_schedule(elemwise_sub.op)
+        lower_sub = tvm.lower(schedule_sub, [tensor_copy, tensor_c,
+                                             elemwise_sub],
+                              name="elemwise_sub")
+
+        target_flist = {target_device: [lower_add], target_host: [lower_sub]}
+        mhost = tvm.build(target_flist, target_host=target_host)
+        ctx = [host_ctx, device_ctx]
+        mod = graph_runtime.create(graph, mhost, ctx)
+        params = {}
+        params["A"] = tensor_a = np.random.uniform(
+            size=shape).astype(tensor_a.dtype)
+        params["B"] = tensor_b = np.random.uniform(
+            size=shape).astype(tensor_b.dtype)
+        params["C"] = tensor_c = np.random.uniform(
+            size=shape).astype(tensor_c.dtype)
+        mod.set_input(**params)
+        mod.run()
+        out = mod.get_output(0, tvm.nd.empty(shape))
+        np.testing.assert_equal(
+            out.asnumpy(), (tensor_a + tensor_b) - tensor_c)
+
+    dev_tar = {"cuda": "cuda", "opencl": "opencl"}
+    for device, target in dev_tar.items():
+        check_device(device, target)
+
+
+def get_duplex_graph(host_dev_type, device_dev_type):
+    r""" Return the hand-crafted json object where two copy nodes are inserted.
+    Data transferring happens back-and-forth between the target device and CPU.
+    The network is constructed as following:
+                 A    B
+                  \  /
+             elemwise_add  (gpu)
+                     \
+                     copy        C
+                       \        /
+                      elemwise_sub  (cpu)
+                         \
+                         copy          D
+                           \          /
+                           elemwise_add  (gpu)
+
+    Parameters
+    ----------
+    host_dev_type : int
+        The device type of the host processor, e.g. cpu.
+    device_dev_type : int
+        The device type of the device processor, e.g. gpu, opencl, etc.
+
+    Returns
+    -------
+    json : json
+        A json encoded object.
+    """
+    # Construct each node in the graph.
+    var_a = {"op": "null", "name": "A", "inputs": []}
+    var_b = {"op": "null", "name": "B", "inputs": []}
+    elemwise_add0 = {
+        "op": "tvm_op", "name": "elemwise_add0",
+        "attrs": {
+            "flatten_data": "1",
+            "func_name": "elemwise_add0",
+            "num_inputs": "2",
+            "num_outputs": "1"
+        },
+        "inputs": [[0, 0, 0], [1, 0, 0]]
+    }
+    copy_add_sub = {
+        "op": "tvm_op",
+        "name": "__copy_add_to_sub",
+        "attrs": {
+            "flatten_data": "0",
+            "func_name": "__copy",
+            "num_inputs": "1",
+            "num_outputs": "1"
+        },
+        "inputs": [[2, 0, 0]]
+    }
+    var_c = {"op": "null", "name": "C", "inputs": []}
+    elemwise_sub = {
+        "op": "tvm_op", "name": "elemwise_sub",
+        "attrs": {
+            "flatten_data": "0",
+            "func_name": "elemwise_sub",
+            "num_inputs": "2",
+            "num_outputs": "1"
+        },
+        "inputs": [[3, 0, 0], [4, 0, 0]]
+    }
+    copy_sub_add = {
+        "op": "tvm_op",
+        "name": "__copy_sub_to_add",
+        "attrs": {
+            "flatten_data": "0",
+            "func_name": "__copy",
+            "num_inputs": "1",
+            "num_outputs": "1"
+        },
+        "inputs": [[5, 0, 0]]
+    }
+    var_d = {"op": "null", "name": "D", "inputs": []}
+    elemwise_add1 = {
+        "op": "tvm_op", "name": "elemwise_add1",
+        "attrs": {
+            "flatten_data": "0",
+            "func_name": "elemwise_add1",
+            "num_inputs": "2",
+            "num_outputs": "1"
+        },
+        "inputs": [[6, 0, 0], [7, 0, 0]]
+    }
+
+    # Group the nodes.
+    nodes = [var_a, var_b, elemwise_add0, copy_add_sub, var_c, elemwise_sub,
+             copy_sub_add, var_d, elemwise_add1]
+    arg_nodes = [0, 1, 4, 7]
+    node_row_ptr = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+    heads = [[8, 0, 0]]
+    shape = (4,)
+    attrs = {
+        "storage_id": ["list_int", [4, 5, 0, 1, 6, 2, 0, 7, 3]],
+        "shape": ["list_shape", [shape, shape, shape, shape, shape, shape,
+                                 shape, shape, shape]],
+        "device_index": ["list_int", [device_dev_type, device_dev_type,
+                                      device_dev_type,
+                                      host_dev_type, host_dev_type, host_dev_type,
+                                      device_dev_type, device_dev_type,
+                                      device_dev_type]],
+        "dtype": ["list_int", [0, 0, 0, 0, 0, 0, 0, 0, 0]],
+        "dltype": ["list_str", ["float32", "float32", "float32",
+                                "float32", "float32", "float32",
+                                "float32", "float32", "float32"]]
+    }
+
+    # Construct the graph.
+    graph = {"nodes": nodes,
+             "arg_nodes": arg_nodes,
+             "node_row_ptr": node_row_ptr,
+             "heads": heads,
+             "attrs": attrs}
+    return json.dumps(graph)
+
+
+def test_duplex_data_transferring():
+    r"""
+    Test the heterogeneous execution of a simple network where data
+    transferring occurs back-and-forth between the target device and host
+    processor.
+    The host processor is always assumed to be cpu, and the target device
+    varies.
+    """
+    host = "cpu"
+    target_host = "llvm"
+    host_ctx = tvm.context(host)
+    if not tvm.module.enabled(target_host):
+        print("Skip test because llvm is not enabled.")
+        return
+
+    def check_device(device, target_device):
+        if not tvm.module.enabled(target_device):
+            print("Skip test because {} is not enabled.".format(target_device))
+            return
+
+        device_ctx = tvm.context(device)
+        graph = get_duplex_graph(host_ctx.device_type, device_ctx.device_type)
+        shape = (4,)
+
+        # Insert copy nodes for data transferring between add and sub nodes.
+        # Transfers data from gpu to cpu.
+        copy_add_sub = tvm.placeholder(shape, name="__copy0")
+        # Transfers data from cpu to gpu.
+        copy_sub_add = tvm.placeholder(shape, name="__copy1")
+
+        # Create a module containing adds on the device.
+        tensor_a = tvm.placeholder(shape, name="A")
+        tensor_b = tvm.placeholder(shape, name="B")
+        tensor_d = tvm.placeholder(shape, name="D")
+        elemwise_add0 = tvm.compute(shape, lambda *i: tensor_a(*i)
+                                    + tensor_b(*i), name="elemwise_add0")
+        elemwise_add1 = tvm.compute(shape, lambda *i: copy_sub_add(*i)
+                                    + tensor_d(*i), name="elemwise_add1")
+        target = topi.cpp.TEST_create_target(device)
+        add_schedule0 = topi.cpp.cuda.schedule_injective(
+            target, [elemwise_add0])
+        lower_add0 = tvm.lower(
+            add_schedule0, [tensor_a, tensor_b, elemwise_add0],
+            name="elemwise_add0")
+        add_schedule1 = topi.cpp.cuda.schedule_injective(
+            target, [elemwise_add1])
+        lower_add1 = tvm.lower(
+            add_schedule1, [tensor_d, copy_sub_add, elemwise_add1],
+            name="elemwise_add1")
+        # Create module for sub whose target is the host.
+        tensor_c = tvm.placeholder(shape, name="C")
+        elemwise_sub = tvm.compute(shape, lambda *i: copy_add_sub(*i)
+                                   - tensor_c(*i), name="elemwise_sub")
+        sub_schedule = tvm.create_schedule(elemwise_sub.op)
+        lower_sub = tvm.lower(sub_schedule, [copy_add_sub, tensor_c,
+                                             elemwise_sub],
+                              name="elemwise_sub")
+
+        target_flist = {target_device: [lower_add0, lower_add1], target_host:
+                        [lower_sub]}
+        mhost = tvm.build(target_flist, target_host=target_host)
+        ctx = [host_ctx, device_ctx]
+        params = {}
+        params["A"] = tensor_a = np.random.uniform(
+            size=shape).astype(tensor_a.dtype)
+        params["B"] = tensor_b = np.random.uniform(
+            size=shape).astype(tensor_b.dtype)
+        params["C"] = tensor_c = np.random.uniform(
+            size=shape).astype(tensor_c.dtype)
+        params["D"] = tensor_d = np.random.uniform(
+            size=shape).astype(tensor_d.dtype)
+
+        def check_verify():
+            mod = graph_runtime.create(graph, mhost, ctx)
+            mod.set_input(**params)
+            mod.run()
+            out = mod.get_output(0, tvm.nd.empty(shape))
+            np.testing.assert_equal(
+                out.asnumpy(), tensor_a + tensor_b - tensor_c + tensor_d)
+
+        def check_load_module():
+            temp = util.tempdir()
+            path_lib = temp.relpath("deploy.so")
+            mhost.export_library(path_lib)
+            with open(temp.relpath("deploy.json"), "w") as out_file:
+                out_file.write(graph)
+            loaded_lib = tvm.module.load(path_lib)
+            loaded_graph = open(temp.relpath("deploy.json")).read()
+            mod = graph_runtime.create(loaded_graph, loaded_lib, ctx)
+            mod.set_input(**params)
+            mod.run()
+            out = mod.get_output(0, tvm.nd.empty(shape))
+            np.testing.assert_equal(
+                out.asnumpy(), tensor_a + tensor_b - tensor_c + tensor_d)
+
+        check_verify()
+        check_load_module()
+
+    dev_tar = {"cuda": "cuda", "opencl": "opencl"}
+    for device, target in dev_tar.items():
+        check_device(device, target)
+
+if __name__ == "__main__":
+    test_simplex_data_transferring()
+    test_duplex_data_transferring()
diff --git a/tests/python/unittest/test_runtime_ndarray.py b/tests/python/unittest/test_runtime_ndarray.py
index 9f33e2aabfd8..935f8f38a804 100644
--- a/tests/python/unittest/test_runtime_ndarray.py
+++ b/tests/python/unittest/test_runtime_ndarray.py
@@ -35,5 +35,26 @@ def test_nd_create():
         ctx.sync()
 
 
+def test_fp16_conversion():
+    n = 100
+
+    for (src, dst) in [('float32', 'float16'), ('float16', 'float32')]:
+        A = tvm.placeholder((n,), dtype=src)
+        B = tvm.compute((n,), lambda i: A[i].astype(dst))
+
+        s = tvm.create_schedule([B.op])
+        func = tvm.build(s, [A, B], 'llvm')
+
+        x_tvm = tvm.nd.array(100 * np.random.randn(n).astype(src) - 50)
+        y_tvm = tvm.nd.array(100 * np.random.randn(n).astype(dst) - 50)
+
+        func(x_tvm, y_tvm)
+
+        expected = x_tvm.asnumpy().astype(dst)
+        real = y_tvm.asnumpy()
+
+        tvm.testing.assert_allclose(expected, real)
+
 if __name__ == "__main__":
     test_nd_create()
+    test_fp16_conversion()
diff --git a/tests/python/unittest/test_runtime_packed_func.py b/tests/python/unittest/test_runtime_packed_func.py
index 279172555d2a..2d7d0197640b 100644
--- a/tests/python/unittest/test_runtime_packed_func.py
+++ b/tests/python/unittest/test_runtime_packed_func.py
@@ -70,6 +70,16 @@ def myfunc(ss):
     tvm.convert(myfunc)(x)
 
 
+def test_ctx():
+    def test_ctx_func(ctx):
+        assert tvm.gpu(7) == ctx
+        return tvm.cpu(0)
+    x = test_ctx_func(tvm.gpu(7))
+    assert x == tvm.cpu(0)
+    x = tvm.opencl(10)
+    x = tvm._api_internal._context_test(x, x.device_type, x.device_id)
+    assert x == tvm.opencl(10)
+
 if __name__ == "__main__":
     test_empty_array()
     test_get_global()
@@ -77,3 +87,4 @@ def myfunc(ss):
     test_convert()
     test_return_func()
     test_byte_array()
+    test_ctx()
diff --git a/tests/python/unittest/test_runtime_rpc.py b/tests/python/unittest/test_runtime_rpc.py
index e7c0cc1bbabd..eb7458555979 100644
--- a/tests/python/unittest/test_runtime_rpc.py
+++ b/tests/python/unittest/test_runtime_rpc.py
@@ -1,10 +1,13 @@
 import tvm
 import os
 import logging
-import numpy as np
 import time
+import multiprocessing
+
+import numpy as np
 from tvm import rpc
 from tvm.contrib import util
+from tvm.rpc.tracker import Tracker
 
 
 def test_bigendian_rpc():
@@ -28,7 +31,7 @@ def verify_rpc(remote, target, shape, dtype):
         remote.upload(path_dso)
         f = remote.load_module("dev_lib.o")
         f(a, b)
-        np.testing.assert_allclose(a.asnumpy() + 1, b.asnumpy())
+        tvm.testing.assert_allclose(a.asnumpy() + 1, b.asnumpy())
 
     print("Test RPC connection to PowerPC...")
     remote = rpc.connect(host, port)
@@ -175,6 +178,7 @@ def test_rpc_return_func():
     @tvm.register_func("rpc.test.remote_func")
     def addone(x):
         return lambda y: x+y
+
     server = rpc.Server("localhost", key="x1")
     client = rpc.connect(server.host, server.port, key="x1")
     f1 = client.get_function("rpc.test.remote_func")
@@ -182,6 +186,46 @@ def addone(x):
     assert fadd(12) == 22
 
 
+def test_rpc_return_ndarray():
+    # Use closure to check the ref counter correctness
+    nd = tvm.nd.array(np.zeros(10).astype("float32"))
+    @tvm.register_func("rpc.test.remote_return_nd")
+    def my_module(name):
+        if name == "get_arr":
+            return lambda : nd
+        elif name == "ref_count":
+            return lambda : tvm._api_internal._ndarray_use_count(nd)
+        elif name == "get_elem":
+            return lambda idx: nd.asnumpy()[idx]
+        elif name == "get_arr_elem":
+            return lambda arr, idx: arr.asnumpy()[idx]
+
+    # start server
+    server = rpc.Server("localhost", key="x1")
+    client = rpc.connect(server.host, server.port, key="x1")
+    m = client.get_function("rpc.test.remote_return_nd")
+    get_arr = m("get_arr")
+    ref_count = m("ref_count")
+    get_elem = m("get_elem")
+    get_arr_elem = m("get_arr_elem")
+    # array test
+    def run_arr_test():
+        arr = get_arr()
+        assert ref_count() == 2
+        arr2 = get_arr()
+        assert ref_count() == 3
+        assert arr.context == client.cpu(0)
+        arr.copyfrom(np.ones(10).astype(arr.dtype))
+        assert arr2.asnumpy()[0] == 1.0
+        assert get_elem(0) == 1.0
+        assert get_arr_elem(arr2, 0) == 1.0
+
+    assert ref_count() == 1
+    run_arr_test()
+    # check recycle correctness
+    assert ref_count() == 1
+
+
 def test_local_func():
     @tvm.register_func("rpc.test.remote_func2")
     def addone(x):
@@ -196,13 +240,89 @@ def addone(x):
     rev = client.download("dat.bin")
     assert rev == blob
 
+def test_rpc_tracker_register():
+    # test registration
+    tracker = Tracker('localhost', port=9000, port_end=10000)
+    device_key = 'test_device'
+    server = rpc.Server('localhost', port=9000, port_end=10000,
+                        key=device_key,
+                        tracker_addr=(tracker.host, tracker.port))
+    time.sleep(1)
+    client = rpc.connect_tracker(tracker.host, tracker.port)
+
+    summary = client.summary()
+    assert summary['queue_info'][device_key]['free'] == 1
+
+    remote = client.request(device_key)
+    summary = client.summary()
+    assert summary['queue_info'][device_key]['free'] == 0
+
+    del remote
+    time.sleep(1)
+
+    summary = client.summary()
+    assert summary['queue_info'][device_key]['free'] == 1
+
+    server.terminate()
+    time.sleep(1)
+
+    summary = client.summary()
+    assert summary['queue_info'][device_key]['free'] == 0
+
+    tracker.terminate()
+
+def test_rpc_tracker_request():
+    # test concurrent request
+    tracker = Tracker('localhost', port=9000, port_end=10000)
+    device_key = 'test_device'
+    server = rpc.Server('localhost', port=9000, port_end=10000,
+                        key=device_key,
+                        tracker_addr=(tracker.host, tracker.port))
+    client = rpc.connect_tracker(tracker.host, tracker.port)
+
+    def target(host, port, device_key, timeout):
+        client = rpc.connect_tracker(host, port)
+        remote = client.request(device_key, session_timeout=timeout)
+        while True:
+            pass
+        remote.cpu()
+
+    proc1 = multiprocessing.Process(target=target,
+                                    args=(tracker.host, tracker.port, device_key, 4))
+    proc2 = multiprocessing.Process(target=target,
+                                    args=(tracker.host, tracker.port, device_key, 200))
+    proc1.start()
+    time.sleep(0.5)
+    proc2.start()
+    time.sleep(0.5)
+
+    summary = client.summary()
+    assert summary['queue_info'][device_key]['free'] == 0
+    assert summary['queue_info'][device_key]['pending'] == 1
+
+    proc1.terminate()
+    proc1.join()
+    time.sleep(0.5)
+
+    summary = client.summary()
+    assert summary['queue_info'][device_key]['free'] == 0
+    assert summary['queue_info'][device_key]['pending'] == 0
+
+    proc2.terminate()
+    proc2.join()
+    server.terminate()
+    tracker.terminate()
+
 
 if __name__ == "__main__":
     logging.basicConfig(level=logging.INFO)
+    test_rpc_return_ndarray()
+    test_rpc_return_func()
     test_bigendian_rpc()
     test_rpc_remote_module()
-    test_rpc_return_func()
     test_rpc_file_exchange()
     test_rpc_array()
     test_rpc_simple()
     test_local_func()
+    test_rpc_tracker_register()
+    test_rpc_tracker_request()
diff --git a/tests/python/unittest/test_schedule_schedule_ops.py b/tests/python/unittest/test_schedule_schedule_ops.py
index 8e6f4090d403..e59a73529d24 100644
--- a/tests/python/unittest/test_schedule_schedule_ops.py
+++ b/tests/python/unittest/test_schedule_schedule_ops.py
@@ -12,6 +12,7 @@ def test_schedule0():
     assert isinstance(bounds, tvm.container.Map)
     stmt = tvm.schedule.ScheduleOps(s, bounds)
 
+
 def test_schedule1():
     m = tvm.var('m')
     l = tvm.var('l')
@@ -53,10 +54,13 @@ def test_schedule_scan():
     assert tuple(res.shape) == (m, n)
     s = tvm.create_schedule(res.op)
     s = s.normalize()
+    ir = tvm.lower(s, [s_state], simple_mode=True)
+    assert not hasattr(ir.body.body.body.body.rest.body.body.rest.body, "condition")
     bounds = tvm.schedule.InferBound(s)
     assert(bounds[res.op.scan_axis].min.value == 1)
     stmt = tvm.schedule.ScheduleOps(s, bounds)
 
+
 def test_inline_multi_reduce():
     def argmax_comp(x, y):
         idx = tvm.select((x[1] >= y[1]), x[0], y[0])
@@ -80,7 +84,6 @@ def argmax_init(idx_typ, val_typ):
     stmt = tvm.schedule.ScheduleOps(s, bounds)
 
 
-
 def test_auto_inline():
     m = tvm.var('m')
     n = tvm.var('n')
@@ -96,6 +99,7 @@ def test_auto_inline():
     bounds = tvm.schedule.InferBound(s)
     stmt = tvm.schedule.ScheduleOps(s, bounds)
 
+
 def test_schedule_const_bound():
     n = 128
     A = tvm.placeholder((n,), name='A')
@@ -146,6 +150,7 @@ def test_scan_inline1():
     s[s_x1].compute_inline()
     stmt = tvm.lower(s, [x, res1, res2])
 
+
 def test_scan_inline2():
     m = tvm.var("m")
     n = tvm.var("n")
@@ -183,6 +188,7 @@ def test_schedule_cache():
     bounds = tvm.schedule.InferBound(s)
     stmt = tvm.schedule.ScheduleOps(s, bounds)
 
+
 def test_schedule_middle_cache():
     m = tvm.var('m')
     n = tvm.var('n')
@@ -202,7 +208,6 @@ def test_schedule_middle_cache():
     stmt = tvm.schedule.ScheduleOps(s, bounds)
 
 
-
 def test_schedule_cache_relayout1():
     m = tvm.var('m')
     n = tvm.var('n')
@@ -249,6 +254,7 @@ def test_schedule_cache_relayout3():
     bounds = tvm.schedule.InferBound(s)
     stmt = tvm.schedule.ScheduleOps(s, bounds)
 
+
 def test_schedule_cache_relayout4():
     def _compute(*indice):
         return A(*indice) + 1, B(*indice) / 2
@@ -276,7 +282,145 @@ def test_schedule_bound_condition():
    stmt = tvm.ir_pass.Simplify(stmt)
    assert (isinstance(stmt.body.body.first.body.body.then_case, tvm.stmt.IfThenElse))
 
+
+def intrin_gemv(m, n):
+    w = tvm.placeholder((m, n), name='w')
+    x = tvm.placeholder((n,), name='x')
+    k = tvm.reduce_axis((0, n), name='k')
+    z = tvm.compute((m,), lambda i:
+                    tvm.sum(w[i, k] * x[k], axis=k), name='z')
+    Wb = tvm.decl_buffer(w.shape, w.dtype,
+                         name="W",
+                         offset_factor=16,
+                         strides=[tvm.var('ldw'), 1])
+    def intrin_func(ins, outs):
+        ww, xx = ins
+        zz = outs[0]
+        ww_ptr = ww.access_ptr("r")
+        xx_ptr = xx.access_ptr("r")
+        zz_ptr = zz.access_ptr("w")
+        body = tvm.call_packed(
+            "gemm", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
+        reset = tvm.call_packed(
+            "fill_zero", zz_ptr, n)
+        update = tvm.call_packed(
+            "gemv_add", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
+        return body, reset, update
+
+    with tvm.build_config(data_alignment=16,
+                          offset_factor=16):
+        return tvm.decl_tensor_intrin(z.op, intrin_func,
+                                      binds={w: Wb})
+
+
+def test_schedule_tensor_compute1():
+    # basic: split, reorder, tile
+    M, N, L = 2048, 1024, 512
+    factor, rfactor = 16, 16
+    A = tvm.placeholder((N//factor, L//rfactor, factor, rfactor), name='A')
+    B = tvm.placeholder((M, L//rfactor, rfactor), name='B')
+    k = tvm.reduce_axis((0, L//rfactor), name='k')
+
+    gemv = intrin_gemv(factor, rfactor)
+    C = tvm.compute((N, M//factor, factor),
+        lambda i, j: gemv(A[i, k, 0:factor, 0:factor], B[j, k, 0:rfactor], reduce_axis=k),
+        name='C')
+
+    s = tvm.create_schedule(C.op)
+    ai, aj, ax = s[C].op.axis
+    aio, aii = s[C].split(ai, 16)
+    s[C].reorder(aio, aj, aii)
+    aioo, ajo, aioi, aji = s[C].tile(aio, aj, 16, 4)
+
+    s = s.normalize()
+    bounds = tvm.schedule.InferBound(s)
+    stmt = tvm.schedule.ScheduleOps(s, bounds)
+
+
+def intrin_vadd(n, cache_read=False, cache_write=False):
+    scope_ubuf = 'local'
+    dtype = 'float32'
+    x = tvm.placeholder((n,), dtype=dtype, name='vx')
+    y = tvm.placeholder((n,), dtype=dtype, name='vy')
+    z = tvm.compute(x.shape, lambda i: x[i] + y[i], name='z')
+    s = tvm.create_schedule(z.op)
+
+    def create_buffer(t):
+        return tvm.decl_buffer(t.shape, t.dtype,
+                               name='W'+t.name,
+                               scope=scope_ubuf,
+                               offset_factor=16)
+
+    binds = {}
+    if cache_read:
+        binds[x] = create_buffer(x)
+        binds[y] = create_buffer(y)
+    if cache_write:
+        binds[z] = create_buffer(z)
+
+    def intrin_func(ins, outs):
+        ib = tvm.ir_builder.create()
+        ib.emit(tvm.call_extern(outs[0].dtype, 'vadd', ins[0].access_ptr("r"), ins[1].access_ptr('r'), outs[0].access_ptr('wr')))
+        return ib.get()
+
+    with tvm.build_config(offset_factor=16):
+        return tvm.decl_tensor_intrin(z.op, intrin_func, binds=binds)
+
+
+def test_schedule_tensor_compute2():
+    # cache_read, cache_write
+    M = 1024
+    factor = 16
+    dtype = 'float32'
+    scope_ubuf = 'local'
+
+    A = tvm.placeholder((M//factor, factor), name="A", dtype=dtype)
+    B = tvm.placeholder((M//factor, factor), name="B", dtype=dtype)
+
+    vadd = intrin_vadd(factor, True, True)
+    C = tvm.compute((M//factor, factor),
+        lambda i: vadd(A[i, 0:factor], B[i, 0:factor]), name='C')
+
+    s = tvm.create_schedule(C.op)
+    AL = s.cache_read(A, scope_ubuf, C)
+    BL = s.cache_read(B, scope_ubuf, C)
+    CL = s.cache_write(C, scope_ubuf)
+    s = s.normalize()
+    bounds = tvm.schedule.InferBound(s)
+    stmt = tvm.schedule.ScheduleOps(s, bounds)
+
+
+def test_schedule_tensor_compute3():
+    # compute_at
+    M = 1024
+    factor = 16
+    dtype = 'float32'
+    A = tvm.placeholder((M//factor, factor), name="A", dtype=dtype)
+    B = tvm.placeholder((M//factor, factor), name="B", dtype=dtype)
+    Bi = tvm.compute((M//factor, factor), lambda i, j: B[i, j] + 5, name="Bi")
+
+    vadd = intrin_vadd(factor)
+    C = tvm.compute((M//factor, factor),
+        lambda i: vadd(A[i, 0:factor], Bi[i, 0:factor]), name='C')
+    s = tvm.create_schedule(C.op)
+    s[Bi].compute_at(s[C], C.op.axis[0])
+    s = s.normalize()
+    bounds = tvm.schedule.InferBound(s)
+    stmt = tvm.schedule.ScheduleOps(s, bounds)
+
+
+def test_loop_dep_reduce():
+    X = tvm.placeholder(shape=(10,), name="x")
+    def f(n):
+        rv = tvm.reduce_axis((0, n))
+        return tvm.sum(X[rv], axis=rv)
+    Y = tvm.compute(X.shape, f, name="y")
+    s = tvm.create_schedule([Y.op])
+    f = tvm.build(s, [X, Y])
+
+
 if __name__ == "__main__":
+    test_loop_dep_reduce()
     test_schedule_middle_cache()
     test_inline_multi_reduce()
     test_schedule_cache_relayout4()
@@ -294,3 +438,6 @@ def test_schedule_bound_condition():
     test_schedule2()
     test_schedule_cache()
     test_schedule_bound_condition()
+    test_schedule_tensor_compute1()
+    test_schedule_tensor_compute2()
+    test_schedule_tensor_compute3()
diff --git a/tests/python/unittest/test_testing.py b/tests/python/unittest/test_testing.py
new file mode 100644
index 000000000000..852bf2ce7e11
--- /dev/null
+++ b/tests/python/unittest/test_testing.py
@@ -0,0 +1,95 @@
+import numpy as np
+import tvm
+from tvm.testing import check_numerical_grads
+
+def test_check_numerical_grads():
+    # Functions and their derivatives
+    functions = [
+        lambda x: (x*x*x, 3*x*x),
+        lambda x: (x*x, 2*x),
+        lambda x: (np.abs(x), np.sign(x)),
+        lambda x: (np.log(np.abs(x)), 1/x),
+        lambda x: (np.sqrt(np.abs(x)), np.sign(x)/(2*np.sqrt(np.abs(x)))),
+        lambda x: (1/x, -1/(x*x)),
+        lambda x: (np.sign(np.sin(1/x)), np.zeros_like(x)),
+        lambda x: (x*np.sin(1/x), np.sin(1/x) - np.cos(1/x)/x),
+        lambda x: (np.sin(1/x), - np.cos(1/x)/(x*x)),
+    ]
+
+    # Avoid values too close to 0 since singularities of our functions are there
+    min_x = 0.5
+
+    for func in functions:
+        x_input = np.random.uniform(min_x, 10, size=(3, 4))
+
+        # We need a function returning a scalar, so sum the results
+        func_forw = lambda x: np.sum(func(x)[0])
+        grads = [func(x_input)[1]]
+
+        check_numerical_grads(func_forw, [x_input], grads)
+
+    # Check functions with multiple arguments
+    for f1 in functions:
+        for f2 in functions:
+            x_input = np.random.uniform(min_x, 10, size=(3, 4))
+            y_input = np.random.uniform(min_x, 10, size=(3, 4))
+
+            func_forw = lambda x, y: np.sum(f1(x)[0] + f2(y)[0])
+            grads = [f1(x_input)[1], f2(y_input)[1]]
+
+            check_numerical_grads(func_forw, [x_input, y_input], grads)
+
+            # Same thing but with keyword arguments
+            func_forw = lambda x, y: np.sum(f1(x)[0] + f2(y)[0])
+            grads = {'x': f1(x_input)[1], 'y': f2(y_input)[1]}
+
+            check_numerical_grads(func_forw, {'x': x_input, 'y': y_input}, grads)
+
+    def _noise1(x, atol=1e-2, rtol=0.1):
+        # We go in random direction using twice the original tolerance to be sure this
+        # results in an error
+        sqrt_n = np.sqrt(float(np.prod(x.shape)))
+        tol = 2*(np.linalg.norm(x)*rtol + atol*sqrt_n)
+        noise = np.random.normal(size=x.shape)
+        noise = tol * noise / np.linalg.norm(noise)
+        return x + noise
+
+    def _noise2(x, atol=1e-2, rtol=0.1):
+        # This noise affects just a single component
+        sqrt_n = np.sqrt(float(np.prod(x.shape)))
+        tol = 2*(np.linalg.norm(x)*rtol + atol*sqrt_n)
+        n = np.random.randint(np.prod(x.shape))
+        noise = np.zeros_like(x)
+        noise.reshape(-1)[n] = tol
+        return x + noise
+
+    # Add noise to gradients and check that the function throws
+    for f1 in functions:
+        for f2 in functions:
+            x_input = np.random.uniform(min_x, 10, size=(3, 4))
+            y_input = np.random.uniform(min_x, 10, size=(3, 4))
+
+            func_forw = lambda x, y: np.sum(f1(x)[0] + f2(y)[0])
+            grads = [_noise1(f1(x_input)[1]), _noise1(f2(y_input)[1])]
+
+            try:
+                check_numerical_grads(func_forw, [x_input, y_input], grads)
+            except AssertionError as e:
+                pass
+            else:
+                raise AssertionError("check_numerical_grads didn't raise an exception")
+
+            func_forw = lambda x, y: np.sum(f1(x)[0] + f2(y)[0])
+            grads = {'x': _noise2(f1(x_input)[1]), 'y': _noise2(f2(y_input)[1])}
+
+            try:
+                check_numerical_grads(func_forw, {'x': x_input, 'y': y_input}, grads)
+            except AssertionError as e:
+                pass
+            else:
+                raise AssertionError("check_numerical_grads didn't raise an exception")
+
+
+if __name__ == "__main__":
+    test_check_numerical_grads()
+
diff --git a/tests/scripts/task_golang.sh b/tests/scripts/task_golang.sh
new file mode 100755
index 000000000000..363ee05bcbec
--- /dev/null
+++ b/tests/scripts/task_golang.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+set -e
+
+export LD_LIBRARY_PATH=lib:$LD_LIBRARY_PATH
+
+tvm_root="$(git rev-parse --show-toplevel)"
+export PYTHONPATH="$tvm_root/python":"$tvm_root/nnvm/python":"$tvm_root/topi/python"
+
+# Golang tests
+make -C golang tests
diff --git a/tests/scripts/task_python_docs.sh b/tests/scripts/task_python_docs.sh
index 4ff7c490935e..2dfa68415f98 100755
--- a/tests/scripts/task_python_docs.sh
+++ b/tests/scripts/task_python_docs.sh
@@ -1,6 +1,11 @@
 #!/bin/bash
 mkdir -p docs/_build/html
 rm -rf docs/_build/html/jsdoc
+rm -rf docs/_build/html/javadoc
+
+# remove stale tutorials and always build from scratch.
+rm -rf docs/tutorials
+
 # C++ doc
 make doc
 
@@ -8,6 +13,10 @@ make doc
 jsdoc web/tvm_runtime.js web/README.md || exit -1
 mv out docs/_build/html/jsdoc || exit -1
 
+# Java doc
+make javadoc || exit -1
+mv jvm/core/target/site/apidocs docs/_build/html/javadoc || exit -1
+
 rm -rf python/tvm/*.pyc python/tvm/*/*.pyc python/tvm/*/*/*.pyc
 
 cd docs
diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh
index 8104bf079502..d11dcd5da71a 100755
--- a/tests/scripts/task_python_integration.sh
+++ b/tests/scripts/task_python_integration.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-export PYTHONPATH=python:apps/extension/python
+export PYTHONPATH=python:topi/python:apps/extension/python
 export LD_LIBRARY_PATH=build:${LD_LIBRARY_PATH}
 
 rm -rf python/tvm/*.pyc python/tvm/*/*.pyc python/tvm/*/*/*.pyc
@@ -10,6 +10,7 @@ make cython3 || exit -1
 
 # Test extern package package
 cd apps/extension
+rm -rf lib
 make || exit -1
 cd ../..
 python -m nose -v apps/extension/tests || exit -1
@@ -18,6 +19,8 @@ TVM_FFI=cython python -m nose -v tests/python/integration || exit -1
 TVM_FFI=ctypes python3 -m nose -v tests/python/integration || exit -1
 TVM_FFI=cython python -m nose -v tests/python/contrib || exit -1
 TVM_FFI=ctypes python3 -m nose -v tests/python/contrib || exit -1
+TVM_FFI=cython python -m nose -v tests/python/relay || exit -1
+TVM_FFI=ctypes python3 -m nose -v tests/python/relay || exit -1
 
 # Do not enabke OpenGL
 # TVM_FFI=cython python -m nose -v tests/webgl || exit -1
diff --git a/tests/scripts/task_python_nnvm.sh b/tests/scripts/task_python_nnvm.sh
index 2fc41980fb3d..cf6039d58416 100755
--- a/tests/scripts/task_python_nnvm.sh
+++ b/tests/scripts/task_python_nnvm.sh
@@ -1,6 +1,12 @@
 #!/bin/bash
 
 export PYTHONPATH=nnvm/python:python:topi/python
+# to avoid openblas threading error
+export OMP_NUM_THREADS=1
+
+# Rebuild cython
+make cython || exit -1
+make cython3 || exit -1
 
 echo "Running unittest..."
 python -m nose -v nnvm/tests/python/unittest || exit -1
diff --git a/tests/scripts/task_python_topi.sh b/tests/scripts/task_python_topi.sh
index 13a324d79b1f..6842ddaae13a 100755
--- a/tests/scripts/task_python_topi.sh
+++ b/tests/scripts/task_python_topi.sh
@@ -1,4 +1,8 @@
 export PYTHONPATH=python:topi/python
 
+# Rebuild cython
+make cython || exit -1
+make cython3 || exit -1
+
 python -m nose -v topi/tests/python || exit -1
 python3 -m nose -v topi/tests/python || exit -1
diff --git a/tests/scripts/task_rust.sh b/tests/scripts/task_rust.sh
new file mode 100755
index 000000000000..8ef9a1a1556f
--- /dev/null
+++ b/tests/scripts/task_rust.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+set -e
+
+export LD_LIBRARY_PATH=lib:$LD_LIBRARY_PATH
+
+tvm_root="$(git rev-parse --show-toplevel)"
+export PYTHONPATH="$tvm_root/python":"$tvm_root/nnvm/python":"$tvm_root/topi/python"
+
+cd rust
+cargo fmt -- --check
+
+# run basic tests
+python3 tests/build_model.py
+cargo test --tests
+
+# run TVM module test
+cd tests/test_tvm_basic
+cargo run
+cd -
+
+# run NNVM graph test
+cd tests/test_nnvm
+cargo run
+cd -
diff --git a/tests/verilog/integration/test_codegen_verilog.py b/tests/verilog/integration/test_codegen_verilog.py
index 26c0a9e36c9d..7ce264797012 100644
--- a/tests/verilog/integration/test_codegen_verilog.py
+++ b/tests/verilog/integration/test_codegen_verilog.py
@@ -60,7 +60,7 @@ def check_target(device, host="stackvm"):
         c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
         f(a, b, c)
         print("Check correctness...")
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), a.asnumpy() + b.asnumpy())
     check_target("verilog")
 
diff --git a/tests/webgl/test_local_gemm.py b/tests/webgl/test_local_gemm.py
index 0dd1c0fc7376..e3b9c862a5f9 100644
--- a/tests/webgl/test_local_gemm.py
+++ b/tests/webgl/test_local_gemm.py
@@ -35,7 +35,7 @@ def test_local_gemm():
     c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
     f(a, b, c)
 
-    np.testing.assert_allclose(c.asnumpy(), np.dot(a_np, b_np.T))
+    tvm.testing.assert_allclose(c.asnumpy(), np.dot(a_np, b_np.T))
 
 if __name__ == "__main__":
     test_local_gemm()
diff --git a/tests/webgl/test_local_multi_stage.py b/tests/webgl/test_local_multi_stage.py
index 47fa5c76c7aa..1791241d68ee 100644
--- a/tests/webgl/test_local_multi_stage.py
+++ b/tests/webgl/test_local_multi_stage.py
@@ -24,7 +24,7 @@ def test_local_multi_stage():
     c = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), ctx)
     f(a, c)
 
-    np.testing.assert_allclose(c.asnumpy(), (a.asnumpy() + 1) * 2)
+    tvm.testing.assert_allclose(c.asnumpy(), (a.asnumpy() + 1) * 2)
 
 if __name__ == "__main__":
     test_local_multi_stage()
diff --git a/tests/webgl/test_local_save_load.py b/tests/webgl/test_local_save_load.py
index 5ed058a7461c..bcf9f0a8d5bf 100644
--- a/tests/webgl/test_local_save_load.py
+++ b/tests/webgl/test_local_save_load.py
@@ -30,7 +30,7 @@ def test_local_save_load():
     f.export_library(path_so)
     f1 = tvm.module.load(path_so)
     f1(a, b, c)
-    np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
+    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
 
 if __name__ == "__main__":
     test_local_save_load()
diff --git a/tests/webgl/test_local_topi_conv2d_nchw.py b/tests/webgl/test_local_topi_conv2d_nchw.py
index 106534505694..598446456b4e 100644
--- a/tests/webgl/test_local_topi_conv2d_nchw.py
+++ b/tests/webgl/test_local_topi_conv2d_nchw.py
@@ -49,8 +49,8 @@ def check_device(device):
             func2 = tvm.build(s2, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding))
             func1(a, w, b)
             func2(a, w, c)
-            np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
-            np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+            tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+            tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
 
     for device in ['opengl']:
         check_device(device)
diff --git a/tests/webgl/test_local_topi_dense.py b/tests/webgl/test_local_topi_dense.py
index f2e7dfc1331c..75f6dac5d1f8 100644
--- a/tests/webgl/test_local_topi_dense.py
+++ b/tests/webgl/test_local_topi_dense.py
@@ -45,7 +45,7 @@ def check_device(device):
         d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx)
         f = tvm.build(s, [A, B, C, D], device, name="dense")
         f(a, b, c, d)
-        np.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5)
+        tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5)
 
     for device in ['opengl']:
         check_device(device)
diff --git a/tests/webgl/test_local_topi_pooling.py b/tests/webgl/test_local_topi_pooling.py
index 813fcd227e2f..35e893b94e6e 100644
--- a/tests/webgl/test_local_topi_pooling.py
+++ b/tests/webgl/test_local_topi_pooling.py
@@ -60,7 +60,7 @@ def check_device(device):
 
         f = tvm.build(s, [A, B], device)
         f(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['opengl']:
         check_device(device)
@@ -98,7 +98,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         f = tvm.build(s, [A, B], device)
         f(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['opengl']:
         check_device(device)
diff --git a/tests/webgl/test_local_topi_softmax.py b/tests/webgl/test_local_topi_softmax.py
index 34f8bfb8d8f5..45c0c18098ed 100644
--- a/tests/webgl/test_local_topi_softmax.py
+++ b/tests/webgl/test_local_topi_softmax.py
@@ -32,7 +32,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         foo = tvm.build(s, [A, B], device, name="softmax")
         foo(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ["opengl"]:
         check_device(device)
@@ -63,7 +63,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         foo = tvm.build(s, [A, B], device, name="log_softmax")
         foo(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ["opengl"]:
         check_device(device)
diff --git a/tests/webgl/test_remote_save_load.py b/tests/webgl/test_remote_save_load.py
index b1db6ce741c5..f14b2f2a2eae 100644
--- a/tests/webgl/test_remote_save_load.py
+++ b/tests/webgl/test_remote_save_load.py
@@ -73,7 +73,7 @@ def try_remote_save_load():
     b = tvm.nd.array(np.zeros(16, dtype=A.dtype), ctx)
     c = tvm.nd.array(np.zeros(16, dtype=C.dtype), ctx)
     fhost(a, b, c)
-    np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
+    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
 
 if __name__ == "__main__":
     try_remote_save_load()
diff --git a/topi/include/topi/detail/constant_utils.h b/topi/include/topi/detail/constant_utils.h
index 343334562349..7ff137418c48 100644
--- a/topi/include/topi/detail/constant_utils.h
+++ b/topi/include/topi/detail/constant_utils.h
@@ -59,6 +59,7 @@ inline int64_t GetConstInt(Expr expr) {
  */
 inline std::vector<int> GetConstIntValues(Array<Expr> exprs, const std::string& var_name) {
   std::vector<int> result;
+  if (!exprs.defined()) return result;
   for (auto expr : exprs) {
     CHECK(IsConstInt(expr)) << "All elements of " << var_name << " must be constant integers";
     result.push_back(GetConstInt(expr));
@@ -77,6 +78,7 @@ inline std::vector<int> GetConstIntValues(Array<Expr> exprs, const std::string&
  */
 inline std::vector<int64_t> GetConstInt64Values(Array<Expr> exprs, const std::string& var_name) {
   std::vector<int64_t> result;
+  if (!exprs.defined()) return result;
   for (auto expr : exprs) {
     CHECK(IsConstInt(expr)) << "All elements of " << var_name << " must be constant integers";
     result.push_back(GetConstInt(expr));
diff --git a/topi/include/topi/detail/fuse.h b/topi/include/topi/detail/fuse.h
index 9ee7fbd1cffd..85ca0f9efacb 100644
--- a/topi/include/topi/detail/fuse.h
+++ b/topi/include/topi/detail/fuse.h
@@ -14,22 +14,16 @@ using namespace tvm;
 
 /*!
  * \brief Fuse all of the given args
- * 
+ *
  * \param stage The stage in which to apply the fuse
  * \param args The iteration variables to be fused
  *
  * \return The fused iteration variable
  */
 inline IterVar Fuse(Stage stage, const Array<IterVar>& args) {
-  CHECK_GE(args.size(), 1) << "Fuse requires at least 1 arg";
-
-  auto fused = args[0];
-  for (size_t i = 1; i < args.size(); ++i) {
-    IterVar out;
-    stage.fuse(fused, args[i], &out);
-    fused = out;
-  }
-  return fused;
+  IterVar res;
+  stage.fuse(args, &res);
+  return res;
 }
 
 }  // namespace detail
diff --git a/topi/include/topi/elemwise.h b/topi/include/topi/elemwise.h
index 88c77f0afc52..02bc51515159 100644
--- a/topi/include/topi/elemwise.h
+++ b/topi/include/topi/elemwise.h
@@ -163,7 +163,7 @@ inline Tensor full(const Array<Expr>& shape,
                    const Expr fill_value,
                    std::string name = "tensor",
                    std::string tag = kElementWise) {
-  Expr ev = lossless_cast(dtype, fill_value);
+  Expr ev = cast(dtype, fill_value);
   if (!ev.defined()) {
     LOG(ERROR) << "Can't cast fill_value to " << dtype;
   }
@@ -173,7 +173,7 @@ inline Tensor full(const Array<Expr>& shape,
 }
 
 /*!
-* \brief Creates an operation that construct a tensor with same shape as input tensor, 
+* \brief Creates an operation that construct a tensor with same shape as input tensor,
 * then fill a tensor with fill_value
 *
 * \param x The input tensor
@@ -187,10 +187,7 @@ inline Tensor full_like(const Tensor& x,
                         const Expr fill_value,
                         std::string name = "tensor",
                         std::string tag = kElementWise) {
-  Expr ev = lossless_cast(x->dtype, fill_value);
-  if (!ev.defined()) {
-    LOG(ERROR) << "Can't cast fill_value to " << x->dtype;
-  }
+  Expr ev = cast(x->dtype, fill_value);
   return compute(x->shape, [&](const Array<Var>& i) {
       return ev;
   }, name, tag);
diff --git a/topi/include/topi/image/resize.h b/topi/include/topi/image/resize.h
index b6bd51ef0fd2..2ffe4f453ba2 100644
--- a/topi/include/topi/image/resize.h
+++ b/topi/include/topi/image/resize.h
@@ -12,6 +12,7 @@
 #include <algorithm>
 
 #include "topi/tags.h"
+#include "topi/elemwise.h"
 #include "topi/detail/ravel_unravel.h"
 #include "topi/detail/constant_utils.h"
 #include "tvm/tvm.h"
@@ -288,7 +289,7 @@ inline Tensor resize_bilinear_nchw(const Tensor& input,
 * \return A Tensor resized to given shape
 */
 inline Tensor resize_bilinear(const Tensor& input,
-                              const Array<Expr>& shape,
+                              const Array<tvm::Expr>& shape,
                               std::string layout = "NCHW",
                               bool align_corners = false,
                               std::string name = "tensor",
diff --git a/topi/include/topi/nn.h b/topi/include/topi/nn.h
index 53b899796e37..9d3e675d8ef7 100644
--- a/topi/include/topi/nn.h
+++ b/topi/include/topi/nn.h
@@ -64,7 +64,7 @@ inline tvm::Tensor relu(const tvm::Tensor& t,
 * \param name The name of the operation
 * \param tag The tag to mark the operation
 *
-* \return A Tensor whose op member is the relu operation
+* \return A Tensor whose op member is the leaky relu operation
 */
 inline tvm::Tensor leaky_relu(const tvm::Tensor& t,
                               double alpha = 0.1,
@@ -90,7 +90,7 @@ inline tvm::Tensor leaky_relu(const tvm::Tensor& t,
  * \param name The name of the operation
  * \param tag The tag to mark the operation
  *
- * \return A Tensor whose op member is the relu operation
+ * \return A Tensor whose op member is the parametric relu operation
  */
 inline tvm::Tensor prelu(const tvm::Tensor &x,
                          const tvm::Tensor &slope,
@@ -200,37 +200,6 @@ inline tvm::Tensor pad(const tvm::Tensor& t,
   return tvm::compute(output_shape, l, name, tag);
 }
 
-/*!
- * \brief Creates an operation that calculates a matrix multiplication
- *  (row-major notation):
- *      A(i, k) * B(k, j), if trans_a == trans_b
- *          the usual transposed combinations, otherwise
- *
- * \param A The matrix A
- * \param B The matrix B
- * \param trans_a Is A's layout transposed?
- * \param trans_b Is B's layout transposed?
- * \param name The name of the operation
- * \param tag The tag to mark the operation
- *
- * \return A Tensor whose op member is the matmul operation
- */
-inline tvm::Tensor matmul(const tvm::Tensor& A,
-                           const tvm::Tensor& B,
-                           bool trans_a = false,
-                           bool trans_b = false,
-                           std::string name = "tensor",
-                           std::string tag = kMatMul) {
-  tvm::Array<tvm::Expr> output_shape{A->shape[trans_a ? 1 : 0],
-                                     B->shape[trans_b ? 0 : 1]};
-  auto k = tvm::reduce_axis(tvm::Range{0, A->shape[trans_a ? 0 : 1]}, "k");
-  auto l = [&](tvm::Var i, tvm::Var j) {
-    return tvm::sum((trans_a ? A[k][i] : A[i][k]) * (trans_b ? B[j][k] : B[k][j]),
-                    {k});
-  };
-  return tvm::compute(output_shape, l, name, tag);
-}
-
 /*!
  * \brief Creates an operation that performs a 2-D convolution with an
  * NCHW-layout
@@ -265,7 +234,7 @@ inline tvm::Tensor conv2d_nchw(const tvm::Tensor& I,
   auto pW = I->shape[3];
   tvm::Array<tvm::Expr> output_shape{
       I->shape[0],                                            // B
-      W->shape[1],                                            // O
+      W->shape[0],                                            // O
       (I->shape[2] - W->shape[2] + 2 * pad_h) / stride_h + 1,  // H
       (I->shape[3] - W->shape[3] + 2 * pad_w) / stride_w + 1   // W
   };
@@ -479,6 +448,7 @@ inline tvm::Tensor group_conv2d_ngchw(const tvm::Tensor& I,
 }
 
 using FLayoutIndicesTransform = std::function<Array<Expr>(const Array<Var>& indices)>;
+
 /*!
  * \brief Transform the layout according to the mapping function \p to_src_indices.
  * \param src the source input.
diff --git a/topi/include/topi/nn/l2_normalize.h b/topi/include/topi/nn/l2_normalize.h
index cda1f3b5c813..6d98a75ec157 100644
--- a/topi/include/topi/nn/l2_normalize.h
+++ b/topi/include/topi/nn/l2_normalize.h
@@ -27,7 +27,7 @@ using namespace tvm;
 */
 inline Tensor l2_normalize(const Tensor& data,
                            float eps,
-                           const Array<Expr>& axis,
+                           const Array<Integer>& axis,
                            std::string name = "tensor",
                            std::string tag = "l2_normalize") {
   CHECK_EQ(data->shape.size(), 4) << "L2 normalization requires 4-D input";
diff --git a/topi/include/topi/nn/pooling.h b/topi/include/topi/nn/pooling.h
index 26d61d42991d..795d04a31a46 100644
--- a/topi/include/topi/nn/pooling.h
+++ b/topi/include/topi/nn/pooling.h
@@ -94,10 +94,10 @@ inline Tensor pool_impl(const Tensor& x,
   out_shape.Set(height_axis, out_height);
   out_shape.Set(width_axis, out_width);
 
-  const int64_t *padding_h0 = HalideIR::Internal::as_const_int(pad_top);
-  const int64_t *padding_w0 = HalideIR::Internal::as_const_int(pad_left);
-  const int64_t *padding_h1 = HalideIR::Internal::as_const_int(pad_bottom);
-  const int64_t *padding_w1 = HalideIR::Internal::as_const_int(pad_right);
+  const int64_t *padding_h0 = as_const_int(pad_top);
+  const int64_t *padding_w0 = as_const_int(pad_left);
+  const int64_t *padding_h1 = as_const_int(pad_bottom);
+  const int64_t *padding_w1 = as_const_int(pad_right);
   const bool do_pad = ((padding_h0 && *padding_h0) || (padding_w0 && *padding_w0)) ||
                       ((padding_h1 && *padding_h1) || (padding_w1 && *padding_w1));
 
@@ -112,18 +112,18 @@ inline Tensor pool_impl(const Tensor& x,
     }, "tensor", "pool_max");
   } else if (pool_type == kAvgPool) {
     auto temp = do_pad ? pad(x, pad_before, pad_after, 0, "pad_temp") : x;
-    auto tsum = tvm::compute(out_shape, [&](const Array<Var>& output) {
+    auto tavg = [&](const Array<Var>& output, Expr divide_factor) {
       Array<Expr> indices;
       for (const Var& var : output) indices.push_back(var);
       indices.Set(height_axis, output[height_axis] * stride_height + dheight);
       indices.Set(width_axis, output[width_axis] * stride_width + dwidth);
-      return tvm::sum(temp(indices), { dheight, dwidth });
-    }, "tensor", "pool_avg");
+      return tvm::sum(temp(indices) / divide_factor, { dheight, dwidth });
+    };
 
     return tvm::compute(out_shape,
     [&](const Array<Var>& output) {
       if (count_include_pad) {
-        return tsum(output) / (kernel_height * kernel_width);
+        return tavg(output, kernel_height * kernel_width);
       } else {
         Expr h_start = output[height_axis] * stride_height - pad_top;
         Expr w_start = output[width_axis] * stride_width - pad_left;
@@ -133,9 +133,9 @@ inline Tensor pool_impl(const Tensor& x,
         w_start = ir::Max::make(w_start, make_const(Int(32), 0));
         Expr divide_factor = ir::Max::make((h_end - h_start) * (w_end - w_start),
                                            make_const(Int(32), 1));
-        return tsum(output) / divide_factor;
+        return tavg(output, divide_factor);
       }
-    }, "tensor", kElementWise);
+    }, "tensor", "pool_avg");
   } else {
     LOG(ERROR) << "Unrecognized pool_type: " << pool_type;
     return x;
@@ -192,7 +192,7 @@ inline bool find_height_width(const std::string& layout,
 *        Since pooling does not care about the factor size of dimensions
 *        other than `H` and `W`, one can pass `NCHWc` as well.
 * \param  count_include_pad Whether include padding in the calculation when pool_type is 'avg'
-*        
+*
 *
 * \return The output tensor in the same layout
 */
diff --git a/topi/include/topi/nn/softmax.h b/topi/include/topi/nn/softmax.h
index d17f93046e72..8ee747ccd07c 100644
--- a/topi/include/topi/nn/softmax.h
+++ b/topi/include/topi/nn/softmax.h
@@ -40,7 +40,7 @@ inline Tensor softmax(const Tensor &x,
 
   auto k1 = tvm::reduce_axis(Range(0, input_shape[axis]), "k1");
   auto k2 = tvm::reduce_axis(Range(0, input_shape[axis]), "k2");
-  auto reduced_shape = MakeReduceTargetShape({axis}, x, false);
+  auto reduced_shape = MakeReduceTargetShape({axis}, x, false, false);
 
   auto insert_reduce_index = [axis, ndim](const Array<Var> &indices,
                                           const IterVar &reduce_index) {
diff --git a/topi/include/topi/reduction.h b/topi/include/topi/reduction.h
index f14187471faf..f26d14951fd4 100644
--- a/topi/include/topi/reduction.h
+++ b/topi/include/topi/reduction.h
@@ -8,10 +8,10 @@
 
 #include <algorithm>
 #include <string>
-#include <set>
 #include <vector>
 #include <iterator>
 
+#include "topi/broadcast.h"
 #include "topi/elemwise.h"
 #include "topi/tags.h"
 #include "topi/transform.h"
@@ -19,6 +19,7 @@
 #include "topi/detail/constant_utils.h"
 #include "tvm/tvm.h"
 
+
 namespace topi {
 using namespace tvm;
 
@@ -33,30 +34,34 @@ using FCommReduce = std::function<
 * \brief Convert a reduction axis which could be empty or have negative
 * elements into a real axis with valid dimension indices.
 *
+* \param ndim Number of dimensions in the target.
+* \param axis The axis parameter.
+*
 * \return A non-empty sorted array of valid dimension indices, with no duplicates.
 * If the input axis is empty, the result will be an axis including all dimensions.
 * If any input element is negative, it will be treated as an offset from the
 * last dimension (same as python indexing rules).
 */
-inline std::vector<int> GetRealAxis(int ndim, const std::vector<int>& axis) {
+inline std::vector<int> GetRealAxis(int ndim, const Array<Integer>& axis) {
   std::vector<int> real_axis;
-  if (axis.size() == 0) {
+  if (!axis.defined() || axis.size() == 0) {
     for (int i = 0; i < ndim; ++i) {
       real_axis.push_back(i);
     }
   } else {
     // Use a set so duplicates are removed and the dims are sorted
-    std::set<int> dims;
-    for (auto ele : axis) {
-      if (ele < 0) {
-        ele += ndim;
-      }
-      if (ele >= ndim) {
-        LOG(ERROR) << ele << " exceeds the maximum dimension " << ndim;
+    for (auto elem : axis) {
+      int64_t val = elem->value;
+      if (val < 0) {
+        val += ndim;
       }
-      dims.emplace(ele);
+      CHECK_LE(val, ndim) << " exceeds the maximum dimension " << ndim;
+      CHECK_GE(val, 0);
+      real_axis.push_back(static_cast<int>(val));
     }
-    std::copy(dims.begin(), dims.end(), std::back_inserter(real_axis));
+    std::sort(real_axis.begin(), real_axis.end());
+    real_axis.resize(
+        std::unique(real_axis.begin(), real_axis.end()) - real_axis.begin());
   }
   return real_axis;
 }
@@ -75,7 +80,8 @@ inline Array<IterVar> MakeReduceAxes(const std::vector<int>& real_axis, const Te
 /*! \brief Calculate the target shape for a reduce op */
 inline Array<Expr> MakeReduceTargetShape(const std::vector<int>& real_axis,
                                          const Tensor& data,
-                                         bool keepdims) {
+                                         bool keepdims,
+                                         bool atleast1d) {
   auto ndim = data->shape.size();
   Array<Expr> target_shape;
   if (keepdims) {
@@ -94,9 +100,9 @@ inline Array<Expr> MakeReduceTargetShape(const std::vector<int>& real_axis,
         target_shape.push_back(data->shape[i]);
       }
     }
-    if (target_shape.size() == 0) {
-      target_shape.push_back(1);
-    }
+  }
+  if (target_shape.size() == 0 && atleast1d) {
+    target_shape.push_back(1);
   }
   return target_shape;
 }
@@ -154,18 +160,19 @@ inline Tensor DoCommReduce(const Tensor& data,
  * \param keepdims If this is set to true, the axes which are reduced are
  * left in the result as dimensions with size one. This enables the result
  * to broadcast correctly against the input array.
+ * \param atleast1d Whether the output need to be atleast1d.
  *
  * \return The result tensor.
  */
 inline Tensor CommReduce(const Tensor& data,
-                         const Array<Expr>& axis,
+                         const Array<Integer>& axis,
                          FReduce func,
-                         bool keepdims = false) {
+                         bool keepdims,
+                         bool atleast1d) {
   auto ndim = data->shape.size();
   CHECK_NE(ndim, 0) << "Cannot reduce a 0 dim Tensor";
-  auto axis_val = detail::GetConstIntValues(axis, "axis");
-  auto real_axis = GetRealAxis(static_cast<int>(ndim), axis_val);
-  auto target_shape = MakeReduceTargetShape(real_axis, data, keepdims);
+  auto real_axis = GetRealAxis(static_cast<int>(ndim), axis);
+  auto target_shape = MakeReduceTargetShape(real_axis, data, keepdims, atleast1d);
   return DoCommReduce(data, func, target_shape, real_axis,
       keepdims ? std::vector<int>() : real_axis);
 }
@@ -179,19 +186,20 @@ inline Tensor CommReduce(const Tensor& data,
 * \param keepdims If this is set to true, the axes which are reduced are
 * left in the result as dimensions with size one. This enables the result
 * to broadcast correctly against the input array.
+* \param atleast1d Whether the output need to be atleast1d.
 *
 * \return The result tensor.
 */
 inline Tensor CommReduceIdx(const Tensor& data,
-                            const Array<Expr>& axis,
+                            const Array<Integer>& axis,
                             FCommReduce func,
-                            bool keepdims = false) {
+                            bool keepdims,
+                            bool atleast1d) {
   auto ndim = data->shape.size();
   CHECK_NE(ndim, 0) << "Cannot reduce a 0 dim Tensor";
-  auto axis_val = detail::GetConstIntValues(axis, "axis");
-  auto real_axis = GetRealAxis(static_cast<int>(ndim), axis_val);
+  auto real_axis = GetRealAxis(static_cast<int>(ndim), axis);
   auto reduce_axes = MakeReduceAxes(real_axis, data);
-  auto target_shape = MakeReduceTargetShape(real_axis, data, keepdims);
+  auto target_shape = MakeReduceTargetShape(real_axis, data, keepdims, atleast1d);
 
   auto compute = [ndim, keepdims, &real_axis, &reduce_axes, &func, &data]
   (const Array<Var>& indices) {
@@ -253,7 +261,7 @@ using FIdentity = std::function<Array<Expr>(std::vector<Type> types)>;
 inline FCommReduce MakeCommReducer(FCombine fcombine,
                                    FIdentity fidentity,
                                    std::string name = "reduce") {
-  return [fcombine, fidentity, &name]
+  return [fcombine, fidentity, name]
   (Array<Expr> exprs, const Array<IterVar>& axis, Expr* condition) {
     Array<Var> lhs, rhs;
     std::vector<Type> dtypes;
@@ -288,6 +296,11 @@ inline Expr MaxOp(Expr source, Array<IterVar> axis) {
   return tvm::max(source, axis);  // NOLINT(*)
 }
 
+/*! \brief Wrap tvm::prod to ensure we get the correct overload */
+inline Expr ProdOp(Expr source, Array<IterVar> axis) {
+  return tvm::prod(source, axis);  // NOLINT(*)
+}
+
 /*!
 * \brief Creates an operation that sums array elements over a given axis
 *
@@ -297,11 +310,15 @@ inline Expr MaxOp(Expr source, Array<IterVar> axis) {
 * \param keepdims If this is set to true, the axes which are reduced are
 * left in the result as dimensions with size one. This enables the result
 * to broadcast correctly against the input array.
+* \param atleast1d Whether the output need to be atleast1d.
 *
 * \return A Tensor whose op member is the sum operation
 */
-inline Tensor sum(const Tensor& data, Array<Expr> axis, bool keepdims = false) {
-  return CommReduce(data, axis, tvm::sum, keepdims);
+inline Tensor sum(const Tensor& data,
+                  const Array<Integer>& axis,
+                  bool keepdims = false,
+                  bool atleast1d = false) {
+  return CommReduce(data, axis, tvm::sum, keepdims, atleast1d);
 }
 
 inline Tensor collapse_sum(const Tensor& data, Array<Expr> target_shape) {
@@ -342,11 +359,15 @@ inline Tensor collapse_sum(const Tensor& data, Array<Expr> target_shape) {
 * \param keepdims If this is set to true, the axes which are reduced are
 * left in the result as dimensions with size one. This enables the result
 * to broadcast correctly against the input array.
+* \param atleast1d Whether the output need to be atleast1d.
 *
 * \return A Tensor whose op member is the min operation
 */
-inline Tensor min(const Tensor& data, Array<Expr> axis, bool keepdims = false) {
-  return CommReduce(data, axis, MinOp, keepdims);
+inline Tensor min(const Tensor& data,
+                  const Array<Integer>& axis,
+                  bool keepdims = false,
+                  bool atleast1d = false) {
+  return CommReduce(data, axis, MinOp, keepdims, atleast1d);
 }
 
 /*!
@@ -359,11 +380,15 @@ inline Tensor min(const Tensor& data, Array<Expr> axis, bool keepdims = false) {
 * \param keepdims If this is set to true, the axes which are reduced are
 * left in the result as dimensions with size one. This enables the result
 * to broadcast correctly against the input array.
+* \param atleast1d Whether the output need to be atleast1d.
 *
 * \return A Tensor whose op member is the max operation
 */
-inline Tensor max(const Tensor& data, Array<Expr> axis, bool keepdims = false) {  // NOLINT(*)
-  return CommReduce(data, axis, MaxOp, keepdims);
+inline Tensor max(const Tensor& data,
+                  const Array<Integer>& axis,
+                  bool keepdims = false,
+                  bool atleast1d = false) {
+  return CommReduce(data, axis, MaxOp, keepdims, atleast1d);
 }
 
 /*!
@@ -376,10 +401,14 @@ inline Tensor max(const Tensor& data, Array<Expr> axis, bool keepdims = false) {
 * \param keepdims If this is set to true, the axes which are reduced are
 * left in the result as dimensions with size one. This enables the result
 * to broadcast correctly against the input array.
+* \param atleast1d Whether the output need to be atleast1d.
 *
 * \return A Tensor whose op member is the argmin operation
 */
-inline Tensor argmin(const Tensor& data, Array<Expr> axis, bool keepdims = false) {
+inline Tensor argmin(const Tensor& data,
+                     const Array<Integer>& axis,
+                     bool keepdims = false,
+                     bool atleast1d = false) {
   auto fcombine = [](Array<Var> lhs, Array<Var> rhs) {
     Array<Expr> result;
     result.push_back(tvm::select(lhs[1] <= rhs[1], lhs[0], rhs[0]));  // idx
@@ -393,7 +422,7 @@ inline Tensor argmin(const Tensor& data, Array<Expr> axis, bool keepdims = false
     return result;
   };
   auto func = MakeCommReducer(fcombine, fidentity, "argmin");
-  return CommReduceIdx(data, axis, func, keepdims);
+  return CommReduceIdx(data, axis, func, keepdims, atleast1d);
 }
 
 /*!
@@ -406,10 +435,14 @@ inline Tensor argmin(const Tensor& data, Array<Expr> axis, bool keepdims = false
 * \param keepdims If this is set to true, the axes which are reduced are
 * left in the result as dimensions with size one. This enables the result
 * to broadcast correctly against the input array.
+* \param atleast1d Whether the output need to be atleast1d.
 *
 * \return A Tensor whose op member is the argmax operation
 */
-inline Tensor argmax(const Tensor& data, Array<Expr> axis, bool keepdims = false) {
+inline Tensor argmax(const Tensor& data,
+                     const Array<Integer>& axis,
+                     bool keepdims = false,
+                     bool atleast1d = false) {
   auto fcombine = [](Array<Var> lhs, Array<Var> rhs) {
     Array<Expr> result;
     result.push_back(tvm::select(lhs[1] >= rhs[1], lhs[0], rhs[0]));  // idx
@@ -423,7 +456,27 @@ inline Tensor argmax(const Tensor& data, Array<Expr> axis, bool keepdims = false
     return result;
   };
   auto func = MakeCommReducer(fcombine, fidentity, "argmax");
-  return CommReduceIdx(data, axis, func, keepdims);
+  return CommReduceIdx(data, axis, func, keepdims, atleast1d);
+}
+
+/*!
+* \brief Creates product operation over given axis.
+*
+* \param data The input tensor
+* \param axis The axis to do product over. If axis is empty, the
+* operation will do the product over all elements of the array.
+* \param keepdims If this is set to true, the axes which are reduced are
+* left in the result as dimensions with size one. This enables the result
+* to broadcast correctly against the input array.
+* \param atleast1d Whether the output need to be atleast1d.
+*
+* \return A Tensor whose op member is the prod operation
+*/
+inline Tensor prod(const Tensor& data,
+                   const Array<Integer>& axis,
+                   bool keepdims = false,
+                   bool atleast1d = false) {
+  return CommReduce(data, axis, ProdOp, keepdims, atleast1d);
 }
 
 }  // namespace topi
diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h
index 09af612b957b..157c19e55249 100644
--- a/topi/include/topi/transform.h
+++ b/topi/include/topi/transform.h
@@ -10,6 +10,7 @@
 #include <vector>
 #include <iterator>
 #include <algorithm>
+#include <limits>
 
 #include "topi/tags.h"
 #include "topi/detail/ravel_unravel.h"
@@ -37,11 +38,18 @@ inline Tensor expand_dims(const Tensor& x,
                           int num_newaxis = 1,
                           std::string name = "tensor",
                           std::string tag = kBroadcast) {
+  int ndim = static_cast<int>(x->shape.size());
+  CHECK(-ndim - 1 <= axis && axis <= ndim)
+    << "expand_dims only accepts `axis` in [-data.ndim - 1, data.ndim]"
+    << ", but got axis = " << axis
+    << ", and data.ndim = " << ndim;
+  CHECK(num_newaxis >= 0)
+    << "expand_dims only accepts `num_newaxis >= 0`"
+    << ", but got num_newaxis = " << num_newaxis;
   if (axis < 0) {
     // Calculate offset from last dimension
-    axis = static_cast<int>(x->shape.size()) + axis + 1;
+    axis = ndim + axis + 1;
   }
-
   Array<Expr> new_shape;
   for (size_t i = 0; i < static_cast<size_t>(axis); ++i) {
     new_shape.push_back(x->shape[i]);
@@ -78,42 +86,45 @@ inline Tensor expand_dims(const Tensor& x,
 * \return A Tensor whose op member is the transpose operation
 */
 inline Tensor transpose(const Tensor& x,
-                        Array<Expr> axes,
+                        Array<Integer> axes,
                         std::string name = "tensor",
                         std::string tag = kInjective) {
-  if (axes.size() == 0) {
-    axes = Array<Expr>();
+  if (!axes.defined() || axes.size() == 0) {
+    axes = Array<Integer>();
     for (int i = static_cast<int>(x->shape.size()) - 1; i >= 0; --i) {
       axes.push_back(i);
     }
   }
 
-  auto axes_val = GetConstIntValues(axes, "axes");
-  for (size_t i = 0; i < axes_val.size(); ++i) {
-    int axis = axes_val[i];
-    if (axes_val[i] < 0) {
-      axes_val[i] = static_cast<int>(x->shape.size()) + axes_val[i];
+  Array<Expr> new_shape;
+  for (size_t i = 0; i < axes.size(); ++i) {
+    int axis = static_cast<int>(axes[i]->value);
+    int new_axis = axis;
+    if (axis < 0) {
+      new_axis = static_cast<int>(x->shape.size()) + axis;
+      axes.Set(i, new_axis);
     }
-    CHECK((0 <= axes_val[i]) && (axes_val[i] < static_cast<int>(x->shape.size())))
+    CHECK((new_axis >= 0) && (new_axis < static_cast<int>(x->shape.size())))
       << "axis=" << axis << " is invalid for the "
       << static_cast<int>(x->shape.size()) << "-dimensional input tensor";
 
-    CHECK(1 == std::count(std::begin(axes_val), std::end(axes_val), axes_val[i]))
-      << "repeated axis in transpose";
+    for (size_t j = 0; j < axes.size(); ++j) {
+      if (i !=j) {
+        CHECK(new_axis != static_cast<int>(axes[j]->value)) << "repeated axis in transpose";
+      }
+    }
+    new_shape.push_back(x->shape[new_axis]);
   }
 
-  Array<Expr> new_shape;
-  for (size_t i = 0; i < axes_val.size(); ++i) {
-    new_shape.push_back(x->shape[axes_val[i]]);
-  }
   return compute(
     new_shape, [&](const Array<Var>& indices) {
       std::vector<Expr> idx;
-      for (size_t i = 0; i < axes_val.size(); ++i) {
+      for (size_t i = 0; i < axes.size(); ++i) {
         idx.push_back(1);
       }
-      for (size_t i = 0; i < axes_val.size(); ++i) {
-        idx[axes_val[i]] = indices[i];
+      for (size_t i = 0; i < axes.size(); ++i) {
+        int axis = static_cast<int>(axes[i]->value);
+        idx[axis] = indices[i];
       }
       return x(idx);
     }, name, tag);
@@ -188,30 +199,34 @@ inline Tensor reshape(const Tensor& x,
 * \param x The input tensor
 * \param axis Indices of the dimensions to remove. If this is empty,
 * all entries with a constant size of 1 will be removed.
+ * \param atleast1d Whether the output need to be atleast1d.
 * \param name The name of the operation
 * \param tag The tag to mark the operation
 *
 * \return A Tensor whose op member is the squeeze operation
 */
 inline Tensor squeeze(const Tensor& x,
-                      Array<Expr> axis,
+                      Array<Integer> axis,
+                      bool atleast1d = false,
                       std::string name = "tensor",
                       std::string tag = kInjective) {
-  auto axis_val = GetConstIntValues(axis, "axis");
   auto ndim = x->shape.size();
-  if (axis_val.size() == 0) {
+  std::vector<int> axis_val;
+  if (!axis.defined() || axis.size() == 0) {
     for (size_t i = 0; i < ndim; ++i) {
       if (IsConstInt(x->shape[i]) && GetConstInt(x->shape[i]) == 1) {
         axis_val.push_back(static_cast<int>(i));
       }
     }
   } else {
-    for (size_t i = 0; i < axis_val.size(); ++i) {
-      if (axis_val[i] < 0) {
-        axis_val[i] += static_cast<int>(x->shape.size());
+    for (size_t i = 0; i < axis.size(); ++i) {
+      int64_t val = axis[i]->value;
+      if (val < 0) {
+        val += static_cast<int>(x->shape.size());
       }
-      CHECK_EQ(GetConstInt(x->shape[axis_val[i]]), 1) <<
-        "Dimension " << axis[i] << " must have size 1";
+      CHECK_EQ(GetConstInt(x->shape[val]), 1) <<
+          "Dimension " << val << " must have size 1";
+      axis_val.push_back(val);
     }
   }
 
@@ -223,7 +238,7 @@ inline Tensor squeeze(const Tensor& x,
       out_shape.push_back(x->shape[i]);
     }
   }
-  if (out_shape.size() == 0) {
+  if (out_shape.size() == 0 && atleast1d) {
     out_shape.push_back(1);
   }
 
@@ -257,8 +272,13 @@ inline Tensor concatenate(const Array<Tensor>& inputs,
                           int axis = 0,
                           std::string name = "tensor",
                           std::string tag = kInjective) {
+  int ndim = static_cast<int>(inputs[0]->shape.size());
+  CHECK(-ndim <= axis && axis < ndim)
+    << "concatenate only accepts `axis` in [-ndim, ndim)"
+    << ", but got axis = " << axis
+    << ", and ndim = " << ndim;
   if (axis < 0) {
-    axis += static_cast<int>(inputs[0]->shape.size());
+    axis += ndim;
   }
   CHECK_LT(axis, inputs[0]->shape.size()) <<
     "axis out of bounds";
@@ -315,7 +335,7 @@ inline Tensor concatenate(const Array<Tensor>& inputs,
 * \return A Tensor whose op member is the split operation
 */
 inline Array<Tensor> split(const Tensor& x,
-                           Array<Expr> split_indices,
+                           Array<Integer> split_indices,
                            int axis,
                            std::string name = "tensor",
                            std::string tag = kInjective) {
@@ -325,14 +345,15 @@ inline Array<Tensor> split(const Tensor& x,
   CHECK_LT(axis, x->shape.size()) << "axis out of bounds";
 
   auto src_axis_size = static_cast<int>(GetConstInt(x->shape[axis]));
-
-  auto split_indices_val = GetConstIntValues(split_indices, "split_indices");
-  CHECK(std::is_sorted(split_indices_val.begin(), split_indices_val.end())) <<
-    "split_indices must be sorted";
-
   std::vector<int> begin_ids;
   begin_ids.push_back(0);
-  std::copy(split_indices_val.begin(), split_indices_val.end(), std::back_inserter(begin_ids));
+
+  for (Integer idx : split_indices) {
+    int val = static_cast<int>(idx->value);
+    CHECK_GT(val, begin_ids.back())
+        << "split_indices must be sorted";
+    begin_ids.push_back(val);
+  }
 
   Array< Array<Expr> > out_shapes;
   for (size_t i = 0; i < begin_ids.size(); ++i) {
@@ -391,31 +412,51 @@ inline Array<Tensor> split(const Tensor& x,
 * \return A Tensor whose op member is the split operation
 */
 inline Tensor strided_slice(const Tensor& x,
-                            const Array<Expr>& begin,
-                            const Array<Expr>& end,
-                            const Array<Expr>& strides,
+                            const Array<Integer>& begin,
+                            const Array<Integer>& end,
+                            const Array<Integer>& strides,
                             std::string name = "tensor",
                             std::string tag = kInjective) {
   size_t src_tensor_dim = static_cast<size_t>(x->shape.size());
-  std::vector<int64_t> begin_vec = GetConstInt64Values(begin, "begin");
-  std::vector<int64_t> end_vec = GetConstInt64Values(end, "end");
-  std::vector<int64_t> stride_vec = GetConstInt64Values(strides, "strides");
-  // in case user has not provided begin indices for all the axes,
-  // then inflate it with default value = 0
-  for (size_t i = begin_vec.size(); i < src_tensor_dim; ++i) {
-    begin_vec.push_back(0);
+  // Setup the ranges.
+  // NOTE: this code duplicates the shape inference logic relay.op
+  // Consider to refactor in the future.
+  std::vector<int64_t> stride_vec;
+  for (Integer i : strides) {
+    CHECK(i.defined());
+    stride_vec.push_back(i->value);
   }
-  // in case user has not provided end indices for all the axes,
-  // then inflate it with default value = input_tensor.shape[axis]
-  for (size_t i = end_vec.size(); i < src_tensor_dim; ++i) {
-    end_vec.push_back(GetConstInt(x->shape[i]));
-  }
-  // in case user has not provided stride values,
-  // then inflate it with default value = 1
   for (size_t i = stride_vec.size(); i < src_tensor_dim; ++i) {
     stride_vec.push_back(1);
   }
+  const int64_t max_range = std::numeric_limits<int64_t>::max();
+
+  std::vector<int64_t> begin_vec;
+  for (size_t i = 0; i < begin.size(); ++i) {
+    if (!begin[i].defined()) {
+      // value=None
+      begin_vec.push_back(stride_vec[i] > 0 ? 0 : max_range);
+    } else {
+      begin_vec.push_back(begin[i]->value);
+    }
+  }
+  for (size_t i = begin_vec.size(); i < src_tensor_dim; ++i) {
+    begin_vec.push_back(stride_vec[i] > 0 ? 0 : max_range);
+  }
 
+  std::vector<int64_t> end_vec;
+  for (size_t i = 0; i < end.size(); ++i) {
+    // allow end to be None
+    if (!end[i].defined()) {
+      end_vec.push_back(stride_vec[i] < 0 ? 0 : max_range);
+    } else {
+      end_vec.push_back(end[i]->value);
+    }
+  }
+  for (size_t i = end_vec.size(); i < src_tensor_dim; ++i) {
+    end_vec.push_back(stride_vec[i] < 0 ? 0 : max_range);
+  }
+  // Compute
   Array<Expr> out_shape;
   Array<Expr> begin_expr;
   Array<Expr> strides_expr;
@@ -471,10 +512,15 @@ inline Tensor strided_slice(const Tensor& x,
 * \return A Tensor whose op member is the split operation
 */
 inline Array<Tensor> split_sections(const Tensor& x,
-                           int num_sections,
-                           int axis,
-                           std::string name = "tensor",
-                           std::string tag = kInjective) {
+                                    int num_sections,
+                                    int axis,
+                                    std::string name = "tensor",
+                                    std::string tag = kInjective) {
+  if (axis < 0) {
+    axis += static_cast<int>(x->shape.size());
+  }
+  CHECK_LT(axis, x->shape.size()) << "axis out of bounds";
+
   auto src_axis_size = static_cast<int>(GetConstInt(x->shape[axis]));
 
   CHECK_GT(num_sections, 0) << "Slice count must be > 0";
@@ -482,7 +528,7 @@ inline Array<Tensor> split_sections(const Tensor& x,
     << "num_sections must be an integer factor of the size of axis " << axis
     << " (" << src_axis_size << ")";
 
-  Array<Expr> split_indices;
+  Array<Integer> split_indices;
   auto seg_size = src_axis_size / num_sections;
   for (int i = 0; i < num_sections; ++i) {
     // region at index 0 is added by split()
@@ -622,6 +668,206 @@ inline Tensor where(const Tensor& condition,
   return out;
 }
 
+/*!
+* \brief Gather elements from a n-dimension array.
+*
+* \param data The source array.
+* \param indices The indices of the values to extract.
+* \param name The name of the operation.
+* \param tag The tag to mark the operation.
+*
+* \return A Tensor whose op member is the gather_nd operation
+*/
+inline Tensor gather_nd(const Tensor& data,
+                        const Tensor& indices,
+                        std::string name = "tensor",
+                        std::string tag = kInjective) {
+  size_t ndim_d = data->shape.size();
+  size_t ndim_i = indices->shape.size();
+  CHECK_GT(ndim_i, 1) << "indices tensor must have at least 2 dimensions";
+  size_t indices_dim0 = static_cast<size_t>(GetConstInt(indices->shape[0]));
+  CHECK_LE(indices_dim0, ndim_d) << "dim 0 of indices tensor must be no more "
+                                 << "than dimensions of data tensor";
+  Array<Expr> out_shape;
+  for (size_t i = 1; i < ndim_i; ++i) {
+    out_shape.push_back(indices->shape[i]);
+  }
+  for (size_t i = indices_dim0; i < ndim_d; ++i) {
+    out_shape.push_back(data->shape[i]);
+  }
+  if (out_shape.size() == 0) {
+    out_shape.push_back(make_const(Int(32), 1));
+  }
+  return compute(
+        out_shape, [&](const Array<Var>& out_index) {
+          Array<Expr> indices_position;
+          indices_position.push_back(0);
+          for (size_t i = 0; i < ndim_i - 1; ++i) {
+            indices_position.push_back(out_index[i]);
+          }
+          Array<Expr> real_indices;
+          for (size_t i = 0; i < indices_dim0; ++i) {
+            indices_position.Set(0, make_const(Int(32), i));
+            if (indices->dtype.is_int()) {
+              real_indices.push_back(indices(indices_position));
+            } else {
+              real_indices.push_back(
+                  tvm::cast(tvm::Int(32), indices(indices_position)));
+            }
+          }
+          for (size_t i = ndim_i - 1; i < out_index.size(); ++i) {
+            real_indices.push_back(out_index[i]);
+          }
+          return data(real_indices);
+        }, name, tag);
+}
+
+/*!
+ * \brief Creates an operation that calculates a matrix multiplication
+ *  (row-major notation):
+ *      A(i, k) * B(k, j), if trans_a == trans_b
+ *          the usual transposed combinations, otherwise
+ *
+ * \param A The matrix A
+ * \param B The matrix B
+ * \param trans_a Is A's layout transposed?
+ * \param trans_b Is B's layout transposed?
+ * \param name The name of the operation
+ * \param tag The tag to mark the operation
+ *
+ * \return A Tensor whose op member is the matmul operation
+ */
+inline tvm::Tensor matmul(const tvm::Tensor& A,
+                           const tvm::Tensor& B,
+                           bool trans_a = false,
+                           bool trans_b = false,
+                           std::string name = "tensor",
+                           std::string tag = kMatMul) {
+  tvm::Array<tvm::Expr> output_shape{A->shape[trans_a ? 1 : 0],
+                                     B->shape[trans_b ? 0 : 1]};
+  auto k = tvm::reduce_axis(tvm::Range{0, A->shape[trans_a ? 0 : 1]}, "k");
+  auto l = [&](tvm::Var i, tvm::Var j) {
+    return tvm::sum((trans_a ? A[k][i] : A[i][k]) * (trans_b ? B[j][k] : B[k][j]),
+                    {k});
+  };
+  return tvm::compute(output_shape, l, name, tag);
+}
+
+/*!
+ * \brief A generalization of matrix multiplication to tensors.
+ *
+ * \param A The tensor A
+ * \param B The tensor B
+ * \param axes The number of the dimensions to reduce over
+ * \param name The name of the operation
+ * \param tag The tag to mark the operation
+ *
+ * \return A Tensor computing the result
+ */
+inline Tensor tensordot(const Tensor& A,
+                        const tvm::Tensor& B,
+                        int axes = 2,
+                        std::string name = "tensor",
+                        std::string tag = kMatMul) {
+  CHECK_GE(A->shape.size(), axes);
+  CHECK_GE(B->shape.size(), axes);
+
+  Array<Expr> output_shape(A->shape.begin(), A->shape.end() + (-axes));
+  for (auto it = B->shape.begin() + axes; it != B->shape.end(); ++it)
+    output_shape.push_back(*it);
+
+  Array<IterVar> iter_vars;
+  for (int i = 0; i < axes; ++i)
+    iter_vars.push_back(reduce_axis(Range(0, B->shape[i]), "k" + std::to_string(i)));
+
+  auto func =
+    [&A, &B, &iter_vars, axes]
+    (const Array<Var>& input_indices) {
+      Array<Expr> A_indices(
+          input_indices.begin(),
+          input_indices.begin() + (A->shape.size() - axes));
+      for (auto& v : iter_vars)
+        A_indices.push_back(v);
+
+      Array<Expr> B_indices;
+      for (auto& v : iter_vars)
+        B_indices.push_back(v);
+
+      auto it = input_indices.begin() + (A->shape.size() - axes);
+      for (; it != input_indices.end(); ++it)
+        B_indices.push_back(*it);
+
+      // Some passes don't like reductions with empty axis, so avoid it here
+      if (iter_vars.empty())
+        return A(A_indices) * B(B_indices);
+      else
+        return sum(A(A_indices) * B(B_indices), iter_vars);
+    };
+
+  return compute(output_shape, func, name, tag);
+}
+
+/*!
+ * \brief A generalization of matrix multiplication to tensors.
+ *
+ * \param A The tensor A
+ * \param B The tensor B
+ * \param A_axes The indices of the dimensions of tensor A to reduce over
+ * \param B_axes The indices of the dimensions of tensor B to reduce over
+ * \param name The name of the operation
+ * \param tag The tag to mark the operation
+ *
+ * \return A Tensor computing the result
+ */
+inline Tensor tensordot(const Tensor& A,
+                        const tvm::Tensor& B,
+                        Array<Expr> A_axes,
+                        Array<Expr> B_axes,
+                        std::string name = "tensor",
+                        std::string tag = kMatMul) {
+  CHECK_EQ(A_axes.size(), B_axes.size());
+
+  auto A_axes_val = GetConstIntValues(A_axes, "A_axes");
+  auto B_axes_val = GetConstIntValues(B_axes, "B_axes");
+
+  Array<Expr> output_shape;
+  for (unsigned i = 0; i < A->shape.size(); ++i)
+    if (std::find(A_axes_val.begin(), A_axes_val.end(), i) == A_axes_val.end())
+      output_shape.push_back(A->shape[i]);
+  for (unsigned i = 0; i < B->shape.size(); ++i)
+    if (std::find(B_axes_val.begin(), B_axes_val.end(), i) == B_axes_val.end())
+      output_shape.push_back(B->shape[i]);
+
+  Array<IterVar> iter_vars;
+    for (unsigned i = 0; i < B_axes_val.size(); ++i)
+      iter_vars.push_back(reduce_axis(Range(0, B->shape[B_axes_val[i]]), "k" + std::to_string(i)));
+
+  auto func =
+    [&A, &B, &iter_vars, A_axes_val, B_axes_val]
+    (const Array<Var>& input_indices) {
+      int idx_input = 0;
+      Array<Expr> A_indices;
+      for (unsigned i = 0; i < A->shape.size(); ++i) {
+        auto axes_pos = std::find(A_axes_val.begin(), A_axes_val.end(), i);
+        if (axes_pos == A_axes_val.end())
+          A_indices.push_back(input_indices[idx_input++]);
+        else
+          A_indices.push_back(iter_vars[axes_pos - A_axes_val.begin()]);
+      }
+
+      Array<Expr> B_indices;
+      for (unsigned i = 0; i < B->shape.size(); ++i) {
+        auto axes_pos = std::find(B_axes_val.begin(), B_axes_val.end(), i);
+        if (axes_pos == B_axes_val.end())
+          B_indices.push_back(input_indices[idx_input++]);
+        else
+          B_indices.push_back(iter_vars[axes_pos - B_axes_val.begin()]);
+      }
+      return sum(A(A_indices) * B(B_indices), iter_vars);
+    };
+  return compute(output_shape, func, name, tag);
+}
+
 
 }  // namespace topi
 #endif  // TOPI_TRANSFORM_H_
diff --git a/topi/include/topi/vision/yolo/region.h b/topi/include/topi/vision/yolo/region.h
index 88553fc29b8a..7d303f445ac4 100644
--- a/topi/include/topi/vision/yolo/region.h
+++ b/topi/include/topi/vision/yolo/region.h
@@ -53,7 +53,7 @@ inline Tensor region(const Tensor &data,
                                      input_shape[2],
                                      input_shape[3]};
   auto data_block = reshape(data, intermediate_shape);
-  Array <Expr> split_indices;
+  Array <Integer> split_indices;
   for (int i = 1; i < split_size; ++i) {
     split_indices.push_back(i);
   }
diff --git a/topi/include/topi/vision/yolo/yolo.h b/topi/include/topi/vision/yolo/yolo.h
deleted file mode 100644
index d2e24c01b253..000000000000
--- a/topi/include/topi/vision/yolo/yolo.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*!
- *  Copyright (c) 2018 by Contributors
- * \brief YOLO op constructions
- * \file vision/yolo/yolo.h
- */
-#ifndef TOPI_VISION_YOLO_YOLO_H_
-#define TOPI_VISION_YOLO_YOLO_H_
-
-#include <algorithm>
-#include <string>
-
-#include "topi/detail/constant_utils.h"
-#include "topi/tags.h"
-#include "topi/transform.h"
-#include "tvm/tvm.h"
-
-
-namespace topi {
-namespace vision {
-namespace yolo {
-using namespace tvm;
-using namespace nn;
-
-/*!
-* \brief yolo operation
-*
-* \param data The input tensor.
-* \param num Darknet layer parameter n
-* \param classes number of classes in the yolo model
-* \param name The name of the operation
-* \param tag The tag to mark the operation
-*
-* \return A Tensor whose op member is the yolo operation
-*/
-inline Tensor yolo(const Tensor &data,
-                   int num,
-                   int classes,
-                   std::string name = "tensor",
-                   std::string tag = "yolo_output") {
-  auto input_shape = data->shape;
-  int split_size = classes + 5;
-  Array <Expr> intermediate_shape = {input_shape[0],
-                                     num,
-                                     split_size,
-                                     input_shape[2],
-                                     input_shape[3]};
-  auto data_block = reshape(data, intermediate_shape);
-  Array <Expr> split_indices = {2, 4};
-  Array <Tensor> split_res = split(data_block, split_indices, 2);
-  split_res.Set(0, sigmoid(split_res[0]));
-  split_res.Set(2, sigmoid(split_res[2]));
-  Tensor out = concatenate(split_res, 2);
-  return reshape(out, input_shape);
-}
-}  // namespace yolo
-}  // namespace vision
-}  // namespace topi
-#endif  // TOPI_VISION_YOLO_YOLO_H_
diff --git a/topi/python/topi/__init__.py b/topi/python/topi/__init__.py
index 3ef59913e07b..2eb460d151ae 100644
--- a/topi/python/topi/__init__.py
+++ b/topi/python/topi/__init__.py
@@ -32,6 +32,7 @@
 from . import rocm
 from . import vision
 from . import image
+from . import sparse
 from . import hls
 # not import testing by default
 # because testing can have extra deps that are not necessary
diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py
index 48bb4fb022c7..b64d2b92a334 100644
--- a/topi/python/topi/arm_cpu/conv2d.py
+++ b/topi/python/topi/arm_cpu/conv2d.py
@@ -2,6 +2,8 @@
 """Conv2D schedule for ARM CPU"""
 from __future__ import absolute_import as _abs
 
+import warnings
+
 import numpy as np
 
 import tvm
@@ -9,35 +11,68 @@
 
 from ..generic import schedule_conv2d_nchw, schedule_conv2d_winograd_without_weight_transform
 from ..util import traverse_inline, get_const_tuple, const_matrix
-from ..nn import pad, conv2d, conv2d_alter_layout, conv2d_winograd_without_weight_transform
+from ..nn import dilate, pad, conv2d, conv2d_alter_layout, \
+                 conv2d_winograd_without_weight_transform, depthwise_conv2d_nchw
 from ..nn.util import get_const_int, get_pad_tuple
 
-def _conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype):
-    """convert argument to workload"""
-    if len(kernel.shape) == 4:
-        raw_kernel = kernel
-    else:  # the input kernel is transformed by alter_op_layout
-        shape = get_const_tuple(kernel.shape)
-        raw_kernel = tvm.placeholder((shape[0] * shape[4], shape[1], shape[2], shape[3]),
-                                     dtype=kernel.dtype)
-    return ('conv2d', ) + autotvm.task.args_to_workload(
-        [data, raw_kernel, strides, padding, layout, out_dtype])
-
-@conv2d.register('arm_cpu')
-@autotvm.task.dispatcher
-def conv2d_arm_cpu(data, kernel, strides, padding, layout, out_dtype):
-    """TOPI compute callback. Mark this function as a dispatcher, so
-    this template can assign config according to workload"""
-    return _conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype)
-
-@conv2d_arm_cpu.register(['direct'])
-def decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype):
-    """spatial packing template"""
-    return _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, num_tile=2)
-
-@autotvm.task.register_topi_schedule(schedule_conv2d_nchw, 'arm_cpu', ['direct', 'winograd'])
+@autotvm.register_topi_compute(conv2d, 'arm_cpu', ['direct'])
+def conv2d_arm_cpu(cfg, data, kernel, strides, padding, dilation, layout, out_dtype):
+    """TOPI compute callback for conv2d
+
+    Parameters
+    ----------
+    cfg: ConfigEntity
+        The config for this template
+
+    data : tvm.Tensor
+        4-D with shape [batch, in_channel, in_height, in_width]
+
+    kernel : tvm.Tensor
+        4-D with shape [num_filter, in_channel, filter_height, filter_width] or
+        pre-packed 5-D with shape [num_filter_chunk, in_channel, filter_height,
+        filter_width, num_filter_block]
+
+    strides : list of two ints
+        [stride_height, stride_width]
+
+    padding : list of two ints
+        [pad_height, pad_width]
+
+    dilation : list of two ints
+        [dilation_height, dilation_width]
+
+    layout : str
+        layout of data
+
+    out_dtype: str
+        The output type. This is used for mixed precision.
+
+    Returns
+    -------
+    output : tvm.Tensor
+        4-D with shape [batch, out_channel, out_height, out_width]
+    """
+    return _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation, layout, out_dtype,
+                              num_tile=2)
+
+@autotvm.register_topi_schedule(schedule_conv2d_nchw, 'arm_cpu', ['direct', 'winograd'])
 def schedule_conv2d_nchw_arm_cpu(cfg, outs):
-    """TOPI schedule callback"""
+    """TOPI schedule callback for conv2d
+
+    Parameters
+    ----------
+    cfg: ConfigEntity
+        The config for this template
+
+    outs: Array of Tensor
+        The computation graph description of conv2d
+        in the format of an array of tensors.
+
+    Returns
+    -------
+    s: Schedule
+        The computation schedule for conv2d.
+    """
     s = tvm.create_schedule([x.op for x in outs])
 
     def _callback(op):
@@ -68,11 +103,17 @@ def _callback(op):
     return s
 
 
-def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, num_tile):
+def _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, num_tile):
     assert layout == "NCHW", "Only support NCHW"
+    # create workload according to raw arguments
     out_dtype = out_dtype or data.dtype
-
     N, CI, IH, IW = get_const_tuple(data.shape)
+
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+
     if len(kernel.shape) == 4:
         pre_packed = False
         CO, _, KH, KW = get_const_tuple(kernel.shape)
@@ -81,11 +122,13 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, n
         CO, _, KH, KW, VC = get_const_tuple(kernel.shape)
         CO = CO * VC
 
-    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (KH, KW))
+    dilated_kernel_h = (KH - 1) * dilation_h + 1
+    dilated_kernel_w = (KW - 1) * dilation_w + 1
+    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(
+        padding, (dilated_kernel_h, dilated_kernel_w))
     HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
-
-    OH = (IH + pad_top + pad_bottom - KH) // HSTR + 1
-    OW = (IW + pad_left + pad_right - KW) // WSTR + 1
+    OH = (IH + pad_top + pad_bottom - dilated_kernel_h) // HSTR + 1
+    OW = (IW + pad_left + pad_right - dilated_kernel_w) // WSTR + 1
     data_pad = pad(data, [0, 0, pad_top, pad_left], [0, 0, pad_bottom, pad_right])
 
     # ==================== define configuration space ====================
@@ -111,20 +154,37 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, n
 
     cfg.define_annotate("ann_reduce", [kh, kw], policy='try_unroll')
     cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec')
+
+    # fallback support
+    if cfg.is_fallback:
+        if num_tile == 2:     # arm cpu
+            ref_log = autotvm.tophub.load_reference_log('arm_cpu', 'rk3399', 'conv2d', 'direct')
+            cfg.fallback_with_reference_log(ref_log)
+        elif num_tile == 3:  # mali gpu
+            ref_log = autotvm.tophub.load_reference_log('mali', 'rk3399', 'conv2d', 'direct')
+            cfg.fallback_with_reference_log(ref_log)
     # ====================================================================
 
     VC = cfg["tile_co"].size[-1]
     VH = cfg["tile_oh"].size[-1]
     VW = cfg["tile_ow"].size[-1]
 
-    dvshape = (N, OH // VH, OW // VW, CI, VH*HSTR + KH-1, VW*WSTR + KW-1)
     kvshape = (CO // VC, CI, KH, KW, VC)
     ovshape = (N, CO // VC, OH // VH, OW // VW, VH, VW, VC)
     oshape = (N, CO, OH, OW)
 
-    data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw:
-                           data_pad[n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw],
-                           name='data_vec')
+    if dilation_h != 1 or dilation_w != 1:
+        # undilate input data
+        dvshape = (N, OH // VH, OW // VW, CI, KH, KW, VH, VW)
+        data_vec = tvm.compute(dvshape, lambda n, h, w, ci, kh, kw, vh, vw:
+                               data_pad[n][ci][(h*VH+vh)*HSTR+kh*dilation_h]
+                               [(w*VW+vw)*WSTR+kw*dilation_w],
+                               name='data_vec_undilated')
+    else:
+        dvshape = (N, OH // VH, OW // VW, CI, VH*HSTR + KH-1, VW*WSTR + KW-1)
+        data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw:
+                               data_pad[n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw],
+                               name='data_vec')
 
     if pre_packed:
         kernel_vec = kernel
@@ -137,16 +197,20 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, n
     kh = tvm.reduce_axis((0, KH), name='kh')
     kw = tvm.reduce_axis((0, KW), name='kw')
 
-    conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
-        tvm.sum(data_vec[n, h, w, ci, vh*HSTR+kh, vw*WSTR+kw].astype(out_dtype) *
-                kernel_vec[co, ci, kh, kw, vc].astype(out_dtype),
-                axis=[ci, kh, kw]), name='conv')
+    if dilation_h != 1 or dilation_w != 1:
+        conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
+            tvm.sum(data_vec[n, h, w, ci, kh, kw, vh, vw].astype(out_dtype) *
+                    kernel_vec[co, ci, kh, kw, vc].astype(out_dtype),
+                    axis=[ci, kh, kw]), name='conv')
+    else:
+        conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
+            tvm.sum(data_vec[n, h, w, ci, vh*HSTR+kh, vw*WSTR+kw].astype(out_dtype) *
+                    kernel_vec[co, ci, kh, kw, vc].astype(out_dtype),
+                    axis=[ci, kh, kw]), name='conv')
 
     output = tvm.compute(oshape, lambda n, co, h, w:
                          conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC],
-                         name='output_unpack', tag='spatial_conv2d_output',
-                         attrs={'workload': _conv_arg_to_workload(data, kernel, strides, padding,
-                                                                  layout, out_dtype)})
+                         name='output_unpack', tag='spatial_conv2d_output')
     return output
 
 def _schedule_spatial_pack(cfg, s, data_vec, kernel_vec,
@@ -188,7 +252,10 @@ def _schedule_spatial_pack(cfg, s, data_vec, kernel_vec,
     # mark parallel
     s[last].parallel(co)
 
-    _, h, _, _, _, _ = s[data_vec].op.axis
+    if data_vec.op.name == 'data_vec_undilated':
+        _, h, _, _, _, _, _, _ = s[data_vec].op.axis
+    else:
+        _, h, _, _, _, _ = s[data_vec].op.axis
     s[data_vec].parallel(h)
 
     if kernel_vec.op.name == 'kernel_vec':
@@ -206,17 +273,28 @@ def _schedule_spatial_pack(cfg, s, data_vec, kernel_vec,
     return s
 
 
-@conv2d_arm_cpu.register('winograd')
-def decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype):
+@autotvm.register_topi_compute(conv2d, 'arm_cpu', ['winograd'])
+def conv2d_arm_cpu_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype):
+    """ TOPI compute callback. Use winograd template """
     tile_size = 4
-    return _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size)
+    return _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout,
+                          out_dtype, tile_size)
 
-def _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size):
+def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size):
     N, CI, IH, IW = get_const_tuple(data.shape)
+
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+
     if len(kernel.shape) == 4:
+        if dilation_h != 1 or dilation_w != 1:
+            kernel = dilate(kernel, (1, 1, dilation_h, dilation_w))
         pre_computed = False
         CO, _, KH, KW = get_const_tuple(kernel.shape)
     else:
+        assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation"
         pre_computed = True
         H_CAT, W_CAT, CO, CI, VC = get_const_tuple(kernel.shape)
         CO *= VC
@@ -332,11 +410,9 @@ def _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_
     # unpack output
     output = tvm.compute((N, K, H, W), lambda n, k, h, w:
                          Y[k][n * nH * nW + (h//m) * nW + w//m][h % m][w % m],
-                         name='output', tag='winograd_conv2d_output',
-                         attrs={'workload': _winograd_conv_arg_to_workload(
-                             data, kernel, strides, padding, layout, out_dtype, tile_size)})
+                         name='output', tag='winograd_conv2d_output')
 
-    # we have to manually assign effective GFLOP for winogard
+    # we have to manually assign effective GFLOP for winograd
     cfg.add_flop(2 * N * K * H * W * KH * KW * C)
     return output
 
@@ -358,30 +434,29 @@ def _schedule_winograd(cfg, s, output, last):
         kernel, G = U.op.input_tensors
         s[G].compute_inline()
         eps, nu, k, c, kk, = s[U].op.axis
-        r_kh, r_kw = s[U].op.reduce_axis
-        s[U].reorder(k, c, eps, nu, r_kh, r_kw, kk)
-        s[U].unroll(eps)
-        s[U].unroll(nu)
-        s[U].unroll(r_kh)
-        s[U].unroll(r_kw)
-        s[U].vectorize(kk)
         if autotvm.GLOBAL_SCOPE.in_tuning:
             # kernel transformation will be pre-computed during compilation, so we skip
             # this part to make tuning records correct
-            s[U].pragma(k, 'debug_skip_region')
+            s[U].pragma(eps, 'debug_skip_region')
         else:
+            r_kh, r_kw = s[U].op.reduce_axis
+            s[U].reorder(k, c, eps, nu, r_kh, r_kw, kk)
+            for axis in [eps, nu, r_kh, r_kw]:
+                s[U].unroll(axis)
+            s[U].vectorize(kk)
             s[U].parallel(k)
 
+        if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
+            s[kernel].compute_inline()
+
     # transform image
     DD = s.cache_read(d, 'global', [V])
     s[B].compute_inline()
     eps, nu, b, c, bb = s[V].op.axis
     r_eps, r_nu = s[V].op.reduce_axis
     s[V].reorder(b, c, eps, nu, r_eps, r_nu, bb)
-    s[V].unroll(eps)
-    s[V].unroll(nu)
-    s[V].unroll(r_eps)
-    s[V].unroll(r_nu)
+    for axis in [eps, nu, r_eps, r_nu]:
+        s[V].unroll(axis)
     s[DD].compute_at(s[V], c)
     s[V].vectorize(bb)
     s[V].parallel(b)
@@ -405,10 +480,8 @@ def _schedule_winograd(cfg, s, output, last):
     s[A].compute_inline()
     k, b, vh, vw = s[Y].op.axis
     r_eps, r_nu = s[Y].op.reduce_axis
-    s[Y].unroll(vh)
-    s[Y].unroll(vw)
-    s[Y].unroll(r_eps)
-    s[Y].unroll(r_nu)
+    for axis in [vh, vw, r_eps, r_nu]:
+        s[Y].unroll(axis)
 
     # output
     n, co, h, w = s[last].op.axis
@@ -426,39 +499,16 @@ def _schedule_winograd(cfg, s, output, last):
         s[output].compute_inline()
 
 
-def _winograd_conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype, tile_size):
-    """convert argument to workload"""
-    K = 3
-    shape = get_const_tuple(kernel.shape)
-    alpha = tile_size + K - 1
-    if len(kernel.shape) == 4:
-        assert shape[2:] == (K, K)
-        CO, CI = shape[:2]
-    else:
-        assert shape[:2] == (alpha, alpha)
-        CO, CI, VCO = shape[2:]
-        CO *= VCO
-
-    raw_kernel = tvm.placeholder((CO, CI, K, K), dtype=kernel.dtype)
-    return ('conv2d', ) + autotvm.task.args_to_workload(
-        [data, raw_kernel, strides, padding, layout, out_dtype])
-
-
-@conv2d_winograd_without_weight_transform.register(['arm_cpu'])
-@autotvm.task.dispatcher
-def winograd_ww_config_dispatcher_(data, kernel, strides, padding, layout, out_dtype, tile_size):
-    return _winograd_conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype,
-                                          tile_size)
-
-
-@winograd_ww_config_dispatcher_.register(['winograd'])
-def decl_winograd_ww(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size):
-    return _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype,
+##### REGISTER TOPI COMPUTE / SCHEDULE FOR WINOGRAD WITH WEIGHT TRANSFORM #####
+@autotvm.register_topi_compute(conv2d_winograd_without_weight_transform, 'arm_cpu', ['winograd'])
+def conv2d_winograd_ww(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size):
+    """TOPI compute callback"""
+    return _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype,\
                           tile_size)
 
 
-@autotvm.task.register_topi_schedule(schedule_conv2d_winograd_without_weight_transform,
-                                     'arm_cpu', ['winograd'])
+@autotvm.register_topi_schedule(schedule_conv2d_winograd_without_weight_transform,
+                                'arm_cpu', ['winograd'])
 def schedule_conv2d_winograd_without_weight_transform_(cfg, outs):
     """TOPI schedule callback"""
     s = tvm.create_schedule([x.op for x in outs])
@@ -472,16 +522,16 @@ def _callback(op):
     return s
 
 
-@conv2d_alter_layout.register(["arm_cpu", "mali"])
-def _alter_conv2d_layout(attrs, inputs, tinfos):
+##### REGISTER ALTER OP LAYOUT #####
+@conv2d_alter_layout.register(["arm_cpu"])
+def _alter_conv2d_layout_arm(attrs, inputs, tinfos):
     """Alter op layout for pre-computing kernel transformation"""
     import nnvm.symbol as sym
     copy_inputs = [s for s in inputs]
 
     new_attrs = {k: attrs[k] for k in attrs.keys()}
 
-    assert attrs.get_int_tuple("dilation") == (1, 1), "Does not support dilation " \
-                                                      "when alter_op_layout is enabled"
+    dilation = attrs.get_int_tuple("dilation")
     strides = attrs.get_int_tuple("strides")
     padding = attrs.get_int_tuple("padding")
     groups = attrs.get_int('groups')
@@ -489,29 +539,94 @@ def _alter_conv2d_layout(attrs, inputs, tinfos):
     out_dtype = attrs["out_dtype"]
     out_dtype = tinfos[0].dtype if out_dtype == "same" else out_dtype
 
+    if layout != 'NCHW':
+        return None
+    if dilation != (1, 1):
+        warnings.warn("Does not support weight pre-transform for dilated convolution.")
+        return None
+
+    data, kernel = tinfos[0:2]
+    N, CI, H, W = get_const_tuple(data.shape)
+    CO, _, KH, KW = get_const_tuple(kernel.shape)
+
     if groups == 1:
         # query config of this workload
-        workload = _conv_arg_to_workload(tinfos[0], tinfos[1], strides, padding,
-                                         layout, out_dtype)
-        cfg = autotvm.task.DispatchContext.current.query(tvm.target.current_target(), workload)
+        workload = autotvm.task.args_to_workload(
+            [data, kernel, strides, padding, dilation, layout, out_dtype], conv2d)
+        target = tvm.target.current_target()
+        dispatch_ctx = autotvm.DispatchContext.current
+        cfg = dispatch_ctx.query(target, workload)
+
+        if cfg.is_fallback:  # if is fallback, clear query cache and return None
+            autotvm.task.clear_fallback_cache(target, workload)
+            return None
+
+        if cfg.template_key == 'direct':  # pack weight tensor
+            VC = cfg['tile_co'].size[-1]
+            new_attrs['kernel_layout'] = 'OIHW%do' % VC
+
+            # Store the same config for the altered operator (workload)
+            new_data = data
+            new_kernel = tvm.placeholder((CO // VC, CI, KH, KW, VC), dtype=kernel.dtype)
+            new_workload = autotvm.task.args_to_workload(
+                [new_data, new_kernel, strides, padding, dilation, 'NCHW', out_dtype], conv2d)
+            dispatch_ctx.update(target, new_workload, cfg)
 
-        if cfg.template_key == 'direct':  # packing weight tensor
-            new_attrs['kernel_layout'] = 'OIHW%do' % (cfg['tile_co'].size[-1])
             return sym.conv2d(*copy_inputs, **new_attrs)
         else:  # pre-compute weight transformation in winograd
-            tile_size = 4
+            if "-device=arm_cpu" in target.options:
+                tile_size = 4
+                VC = cfg['tile_k'].size[-1]
+            else:
+                from ..mali.conv2d import _pick_tile_size
+                tile_size = _pick_tile_size(tinfos[0], tinfos[1])
+                VC = cfg['tile_bna'].val
 
             weight = sym.contrib.conv2d_winograd_weight_transform(copy_inputs[1],
                                                                   tile_size=tile_size)
-            CO, CI, KH, KW = get_const_tuple(tinfos[1].shape)
-            VC = cfg['tile_k'].size[-1]
             weight = sym.reshape(weight,
                                  shape=(KH + tile_size - 1, KW + tile_size - 1, CO // VC, VC, CI))
             weight = sym.transpose(weight, axes=[0, 1, 2, 4, 3])
 
             copy_inputs[1] = weight
             new_attrs['tile_size'] = tile_size
+
+            # Store the same config for the altered operator (workload)
+            new_data = data
+            new_weight = tvm.placeholder((KH + tile_size - 1, KH + tile_size -1, CO // VC, CI, VC),
+                                         kernel.dtype)
+            new_workload = autotvm.task.args_to_workload(
+                [new_data, new_weight, strides, padding, dilation,
+                 new_attrs['layout'], out_dtype, tile_size],
+                conv2d_winograd_without_weight_transform)
+            dispatch_ctx.update(target, new_workload, cfg)
+
             return sym.contrib.conv2d_winograd_without_weight_transform(*copy_inputs, **new_attrs)
+    else:
+        workload = autotvm.task.args_to_workload(
+            [data, kernel, strides, padding, dilation, out_dtype], depthwise_conv2d_nchw)
+        target = tvm.target.current_target()
+        dispatch_ctx = autotvm.DispatchContext.current
+        cfg = dispatch_ctx.query(target, workload)
+
+        if cfg.is_fallback:  # if is fallback, clear query cache and return None
+            autotvm.task.clear_fallback_cache(tvm.target.current_target(), workload)
+            return None
+
+        if cfg.template_key == 'direct':
+            VC = cfg['tile_co'].size[-1]
+            new_attrs['kernel_layout'] = 'OIHW%do' % (cfg['tile_co'].size[-1])
+
+            # Store the same config for the altered operator (workload)
+            new_data = data
+            CO, M, KH, KW = get_const_tuple(kernel.shape)
+            new_kernel = tvm.placeholder((CO // VC, M, KH, KW, VC), dtype=kernel.dtype)
+            new_workload = autotvm.task.args_to_workload(
+                [new_data, new_kernel, strides, padding, dilation, out_dtype],
+                depthwise_conv2d_nchw)
+            dispatch_ctx.update(target, new_workload, cfg)
 
-    # do nothing for depthwise convolution
-    return None
+            return sym.conv2d(*copy_inputs, **new_attrs)
+        else:
+            # add more schedule templates
+            return None
diff --git a/topi/python/topi/arm_cpu/depthwise_conv2d.py b/topi/python/topi/arm_cpu/depthwise_conv2d.py
index 8aafc436319f..9706559cea69 100644
--- a/topi/python/topi/arm_cpu/depthwise_conv2d.py
+++ b/topi/python/topi/arm_cpu/depthwise_conv2d.py
@@ -5,99 +5,305 @@
 from tvm import autotvm
 
 from ..generic import schedule_depthwise_conv2d_nchw
-from ..nn import depthwise_conv2d_nchw
-from ..util import traverse_inline
+from ..nn import depthwise_conv2d_nchw, pad
+from ..util import traverse_inline, get_const_tuple, get_const_int
+from ..nn.util import get_pad_tuple
 
-# register original implementation of depthwise_conv2d_nchw since we don't need to change this part
-autotvm.task.register_topi_compute(depthwise_conv2d_nchw, 'arm_cpu', 'direct',
-                                   depthwise_conv2d_nchw.fdefault)
+@autotvm.register_topi_compute(depthwise_conv2d_nchw, ['arm_cpu', 'cpu'], ['direct'])
+def depthwise_conv2d_arm_cpu(cfg, data, kernel, strides, padding, dilation, out_dtype):
+    """TOPI compute callback for depthwise_conv2d nchw
 
-# register customized schedule for arm cpu.
-@autotvm.task.register_topi_schedule(schedule_depthwise_conv2d_nchw, 'arm_cpu', 'direct')
-def schedule_depthwise_conv2d_nchw_(cfg, outs):
+    Parameters
+    ----------
+    cfg: ConfigEntity
+        The config for this template
+
+    data : tvm.Tensor
+        4-D with shape [batch, in_channel, in_height, in_width]
+
+    kernel : tvm.Tensor
+        4-D with shape [num_filter, multiplier, filter_height, filter_width] or
+        pre-packed 5-D with shape [num_filter_chunk, multiplier, filter_height,
+        filter_width, num_filter_block]
+
+    strides : list of two ints
+        [stride_height, stride_width]
+
+    padding : list of two ints
+        [pad_height, pad_width]
+
+    dilation : list of two ints
+        [dilation_height, dilation_width]
+
+    out_dtype: str
+        The output type. This is used for mixed precision.
+
+    Returns
+    -------
+    output : tvm.Tensor
+        4-D with shape [batch, out_channel, out_height, out_width]
+    """
+
+    return _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype, num_tile=2)
+
+
+def _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype, num_tile):
+    out_dtype = out_dtype or data.dtype
+
+    N, C, IH, IW = get_const_tuple(data.shape)
+
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+
+    if len(kernel.shape) == 4:
+        pre_packed = False
+        C, M, KH, KW = get_const_tuple(kernel.shape)
+    else:  # kernel tensor is pre packed
+        pre_packed = True
+        C, M, KH, KW, VC = get_const_tuple(kernel.shape)
+        C = C * VC
+
+    dilated_kernel_h = (KH - 1) * dilation_h + 1
+    dilated_kernel_w = (KW - 1) * dilation_w + 1
+
+    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
+        padding, (dilated_kernel_h, dilated_kernel_w))
+    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
+    OH = (IH + pad_top + pad_down - dilated_kernel_h) // HSTR + 1
+    OW = (IW + pad_left + pad_right - dilated_kernel_w) // WSTR + 1
+    # pack data
+    HPAD = pad_top + pad_down
+    WPAD = pad_left + pad_right
+    DOPAD = (HPAD != 0 or WPAD != 0)
+    if DOPAD:
+        data_pad = pad(data, (0, 0, pad_top, pad_left), (0, 0, pad_down, pad_right),
+                       name="data_pad")
+    else:
+        data_pad = data
+
+    # fallback support
+    # Currently, Mali schedule doesn't use it like conv2d.
+    if cfg.is_fallback:
+        ref_log = autotvm.tophub.load_reference_log('arm_cpu', 'rk3399', 'depthwise_conv2d_nchw',
+                                                    'direct')
+        cfg.fallback_with_reference_log(ref_log)
+
+    # ==================== define configuration space ====================
+    n, c, oh, ow = cfg.axis(N), cfg.axis(C), cfg.axis(OH), cfg.axis(OW)
+    kh, kw = cfg.reduce_axis(KH), cfg.reduce_axis(KW)
+
+    # Currently, Mali schedule doesn't use it like conv2d.
+    # Leave num_tile for possible future use of Mali schedule
+    if num_tile == 2:     # for arm cpu
+        co, vc = cfg.define_split('tile_co', c, num_outputs=2)
+        oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2)
+        ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2)
+    else:
+        raise RuntimeError("Invalid num_tile")
+
+    cfg.define_reorder("reorder_0",
+                       [n, co, oh, ow, kh, kw, vh, vw, vc],
+                       policy='candidate', candidate=[
+                           [n, co, oh, ow, kh, kw, vh, vw, vc],
+                           [n, co, oh, ow, kh, kw, vc, vh, vw]])
+
+    cfg.define_reorder("reorder_1",
+                       [n, co, oh, ow, vh, vw, vc],
+                       policy='candidate', candidate=[
+                           [n, co, oh, ow, vh, vw, vc],
+                           [n, co, oh, ow, vc, vh, vw],
+                           [n, co, oh, ow, vh, vc, vw]])
+
+    cfg.define_annotate("ann_reduce", [kh, kw], policy='try_unroll')
+    cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec')
+    # ====================================================================
+
+    VC = cfg["tile_co"].size[-1]
+    VH = cfg["tile_oh"].size[-1]
+    VW = cfg["tile_ow"].size[-1]
+
+    kvshape = (C // VC, M, KH, KW, VC)
+    ovshape = (N, C * M // VC, OH // VH, OW // VW, VH, VW, VC)
+    oshape = (N, C * M, OH, OW)
+
+    if dilation_h != 1 or dilation_w != 1:
+        # undilate input data
+        dvshape = (N, OH // VH, OW // VW, C, KH, KW, VH, VW)
+        data_vec = tvm.compute(dvshape, lambda n, h, w, c, kh, kw, vh, vw:
+                               data_pad[n][c][(h * VH + vh) * HSTR + kh * dilation_h]
+                               [(w*VW+vw)*WSTR+kw*dilation_w],
+                               name='data_vec_undilated')
+    else:
+        dvshape = (N, OH // VH, OW // VW, C, VH*HSTR + KH-1, VW*WSTR + KW-1)
+        data_vec = tvm.compute(dvshape, lambda n, h, w, c, vh, vw:
+                               data_pad[n][c][h * VH * HSTR + vh][w * VW * WSTR + vw],
+                               name='data_vec')
+
+    if pre_packed:
+        kernel_vec = kernel
+    else:
+        kernel_vec = tvm.compute(kvshape, lambda co, m, kh, kw, vc:
+                                 kernel[co*VC+vc][m][kh][kw],
+                                 name='kernel_vec')
+
+    kh = tvm.reduce_axis((0, KH), name='kh')
+    kw = tvm.reduce_axis((0, KW), name='kw')
+
+    if dilation_h != 1 or dilation_w != 1:
+        conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
+                          tvm.sum(data_vec[n, h, w, (co * VC + vc) // M, kh, kw, vh, vw]
+                                  .astype(out_dtype) *
+                                  kernel_vec[co // M, co % M, kh, kw, vc].astype(out_dtype),
+                                  axis=[kh, kw]), name='conv')
+    else:
+        conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
+                           tvm.sum(data_vec[n, h, w, (co * VC + vc) // M, vh * HSTR + kh,
+                                            vw * WSTR + kw].astype(out_dtype) *
+                                   kernel_vec[co // M, co % M, kh, kw, vc].astype(out_dtype),
+                                   axis=[kh, kw]), name='conv')
+
+    output = tvm.compute(oshape, lambda n, co, h, w:
+                         conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC],
+                         name='output_unpack', tag='spatial_depthwise_conv_nchw_output')
+    return output
+
+
+# register customized schedule for arm cpu / x86 cpu.
+@autotvm.task.register_topi_schedule(schedule_depthwise_conv2d_nchw, ['arm_cpu', 'cpu'], 'direct')
+def schedule_depthwise_conv2d_nchw_arm(cfg, outs):
     """Schedule depthwise conv2d
 
     Parameters
     ----------
     cfg: ConfigEntity
-        The configuration of this tempalte
+        The configuration of this template
     outs: Array of Tensor
         The computation graph description of depthwise convolution2d
         in the format of an array of tensors.
+
+    Returns
+    -------
+    s: Schedule
+        The computation schedule for depthwise_conv2d nchw.
     """
-    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
     s = tvm.create_schedule([x.op for x in outs])
 
-    def _schedule(cfg, s, data, data_pad, kernel, output):
-        A, B, C = data, kernel, output
-        s[data_pad].compute_inline()
-
-        # define tile
-        n, c, h, w = s[output].op.axis
-        cfg.define_split('tile_c', c, num_outputs=2)
-        cfg.define_split('tile_h', h, num_outputs=2)
-        cfg.define_split('tile_w', w, num_outputs=2)
-
-        # park data to vector form  [n, c, h, w] -> [n, C, h, w, VC]
-        A0 = s.cache_read(data_pad, "global", C)
-        _, c, h, w = s[A0].op.axis
-        c, vc = cfg['tile_c'].apply(s, A0, c)
-        s[A0].reorder(c, h, w, vc)
-        A1 = s.cache_write(A0, 'global')
-        s[A0].compute_inline()
-
-        # park kernel to vector form  [co, ci, kh, kw] -> [CO, ci, kh, kw, VC]
-        B0 = s.cache_read(B, "global", C)
-        c, m, h, w = s[B0].op.axis
-        c, vc, = cfg['tile_c'].apply(s, B0, c)
-        s[B0].reorder(c, m, h, w, vc)
-        B1 = s.cache_write(B0, 'global')
-        s[B0].compute_inline()
-
-        _, c, h, w = s[C].op.axis
-        c, vc, = cfg['tile_c'].apply(s, C, c)
-        s[C].reorder(c, h, w, vc)
-
-        # depthwise conv
-        C0 = s.cache_write(C, 'global')
-        _, c, h, w, vc = s[C0].op.axis
-        dh, dw = s[C0].op.reduce_axis
-        oh, ih = cfg['tile_h'].apply(s, C0, h)
-        ow, iw = cfg['tile_w'].apply(s, C0, w)
-        s[C0].reorder(c, oh, ow, dh, dw, ih, iw, vc)
-        s[A1].compute_at(s[C0], oh)
-
-        # try unroll and vectorization
-        cfg.define_annotate('ann', [ih, iw, vc], policy='try_unroll_vec')
-        cfg['ann'].apply(s, C0, [ih, iw, vc],
-                         axis_lens=[cfg['tile_h'].size[-1],
-                                    cfg['tile_w'].size[-1],
-                                    cfg['tile_c'].size[-1]],
-                         max_unroll=16,
-                         cfg=cfg)
-
-        # mark parallel
-        n, c, h, w = s[C].op.axis
-        s[C].parallel(c)
-
-        n, c, h, w, vc = s[C0].op.axis
-        s[C0].parallel(c)
-
-        c, m, h, w, vc = s[B1].op.axis
-        s[B1].parallel(c)
-
-        return s
-
     def _callback(op):
-        if op.tag == 'depthwise_conv2d_nchw':
+        if 'spatial_depthwise_conv_nchw_output' in op.tag:
             output = op.output(0)
-            kernel = op.input_tensors[1]
-            data = op.input_tensors[0]
-            data_pad = None
-            if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
-                data_pad = data
-                data = data_pad.op.input_tensors[0]
-            _schedule(cfg, s, data, data_pad, kernel, output)
+            conv = op.input_tensors[0]
+
+            data_vec = conv.op.input_tensors[0]
 
+            kernel_vec = conv.op.input_tensors[1]
+            if kernel_vec.op.name == 'kernel_vec':
+                kernel = kernel_vec.op.input_tensors[0]
+            else:
+                kernel = kernel_vec
+            if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
+                s[kernel].compute_inline()
+
+            _schedule_spatial_pack(cfg, s, data_vec, kernel_vec, conv, output, outs[0])
     traverse_inline(s, outs[0].op, _callback)
     return s
+
+
+def _schedule_spatial_pack(cfg, s, data_vec, kernel_vec,
+                           conv, output, last):
+    """schedule implementation"""
+    n, co, oh, ow, vh, vw, vc = s[conv].op.axis
+    kh, kw = s[conv].op.reduce_axis
+
+    if data_vec.op.name == 'data_vec_undilated':
+        _, dv_oh, dv_ow, dv_c, _, _, dv_vh, dv_vw = s[data_vec].op.axis
+    else:
+        _, dv_oh, dv_ow, dv_c, dv_vh, dv_vw = s[data_vec].op.axis
+
+    data_pad = data_vec.op.input_tensors[0]
+    if data_pad.op.name == "data_pad":
+        assert isinstance(data_pad.op, tvm.tensor.ComputeOp)
+        has_padding = True
+    else:
+        assert isinstance(data_pad.op, tvm.tensor.PlaceholderOp)
+        has_padding = False
+
+    cfg.define_knob('data_pad_inline', [0, 1, 2, 3, 4])
+
+    if cfg['data_pad_inline'].val == 1 and has_padding:
+        s[data_pad].compute_inline()
+    if cfg['data_pad_inline'].val == 2 and has_padding:
+        s[data_pad].vectorize(list(s[data_pad].op.axis)[-1])
+    if cfg['data_pad_inline'].val == 3 and has_padding:
+        s[data_pad].vectorize(list(s[data_pad].op.axis)[-1])
+        s[data_pad].compute_at(s[data_vec], dv_oh)
+    if cfg['data_pad_inline'].val == 4 and has_padding:
+        s[data_pad].vectorize(list(s[data_pad].op.axis)[-1])
+        s[data_pad].compute_at(s[data_vec], dv_ow)
+
+    cfg.define_knob('data_vec_inline', [0, 1, 2, 3])
+    if cfg['data_vec_inline'].val == 1:
+        s[data_vec].compute_at(s[conv], oh)
+    if cfg['data_vec_inline'].val == 2:
+        s[data_vec].compute_at(s[conv], ow)
+    if cfg['data_vec_inline'].val == 3:
+        s[data_vec].compute_at(s[conv], co)
+
+    # schedule conv
+    cfg["reorder_0"].apply(s, conv, [n, co, oh, ow, kh, kw, vh, vw, vc])
+    cfg["ann_reduce"].apply(s, conv, [kh, kw],
+                            axis_lens=[get_const_int(kh.dom.extent),
+                                       get_const_int(kw.dom.extent)],
+                            max_unroll=16,
+                            cfg=cfg)
+    cfg["ann_spatial"].apply(s, conv, [vh, vw, vc],
+                             axis_lens=[cfg['tile_oh'].size[-1],
+                                        cfg['tile_ow'].size[-1],
+                                        cfg['tile_co'].size[-1]],
+                             max_unroll=16,
+                             cfg=cfg)
+
+    # schedule fusion
+    n, co, h, w = s[last].op.axis
+    co, vc = cfg['tile_co'].apply(s, last, co)
+    oh, vh = cfg['tile_oh'].apply(s, last, h)
+    ow, vw = cfg['tile_ow'].apply(s, last, w)
+    cfg["reorder_1"].apply(s, last, [n, co, oh, ow, vh, vw, vc])
+    if last != output:
+        s[output].compute_inline()
+        cfg["ann_spatial"].apply(s, last, [vh, vw, vc],
+                                 axis_lens=[cfg['tile_oh'].size[-1],
+                                            cfg['tile_ow'].size[-1],
+                                            cfg['tile_co'].size[-1]],
+                                 max_unroll=16,
+                                 cfg=cfg)
+    else:
+        s[last].vectorize(vw)
+    cfg.define_knob('conv_inline', [0, 1, 2, 3])
+    if cfg['conv_inline'].val == 1:
+        s[conv].compute_at(s[last], ow)
+    if cfg['conv_inline'].val == 2:
+        s[conv].compute_at(s[last], oh)
+    if cfg['conv_inline'].val == 3:
+        s[conv].compute_at(s[last], co)
+
+    # mark parallel
+    s[last].parallel(co)
+
+    if data_vec.op.name == 'data_vec_undilated':
+        _, h, _, _, _, _, _, _ = s[data_vec].op.axis
+    else:
+        _, h, _, _, _, _ = s[data_vec].op.axis
+    s[data_vec].parallel(h)
+
+    if kernel_vec.op.name == 'kernel_vec':
+        co, _, _, _, _ = s[kernel_vec].op.axis
+        if autotvm.GLOBAL_SCOPE.in_tuning:
+            # kernel packing will be pre-computed during compliation, so we skip
+            # this part to make tuning records correct
+            s[kernel_vec].pragma(co, 'debug_skip_region')
+        else:
+            s[kernel_vec].parallel(co)
+
+    return s
diff --git a/topi/python/topi/cpp.py b/topi/python/topi/cpp.py
index 85f203387805..3321b5b68289 100644
--- a/topi/python/topi/cpp.py
+++ b/topi/python/topi/cpp.py
@@ -15,7 +15,7 @@ def _get_lib_names():
 
 def _load_lib():
     """Load libary by searching possible path."""
-    curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+    curr_path = os.path.dirname(os.path.realpath(os.path.expanduser(__file__)))
     lib_search = curr_path
     lib_path = libinfo.find_lib_path(_get_lib_names(), lib_search, optional=True)
     if lib_path is None:
diff --git a/topi/python/topi/cuda/__init__.py b/topi/python/topi/cuda/__init__.py
index b8740f811ff7..28d2eb258bea 100644
--- a/topi/python/topi/cuda/__init__.py
+++ b/topi/python/topi/cuda/__init__.py
@@ -2,18 +2,16 @@
 """CUDA specific declaration and schedules."""
 from __future__ import absolute_import as _abs
 
-from .conv2d import conv2d_cuda
-from .conv2d_nchw import schedule_conv2d_nchw
+from . import conv2d, depthwise_conv2d, conv2d_transpose_nchw, group_conv2d_nchw
 from .conv2d_hwcn import schedule_conv2d_hwcn
-from .depthwise_conv2d import schedule_depthwise_conv2d_nchw, schedule_depthwise_conv2d_nhwc
 from .depthwise_conv2d import schedule_depthwise_conv2d_backward_input_nhwc
 from .depthwise_conv2d import schedule_depthwise_conv2d_backward_weight_nhwc
+from .group_conv2d_nchw import schedule_conv2d_nchw_cuda
 from .reduction import schedule_reduce
 from .softmax import schedule_softmax
 from .injective import schedule_injective, schedule_elemwise, schedule_broadcast
 from .dense import dense_cuda, schedule_dense
 from .pooling import schedule_pool, schedule_global_pool
-from .conv2d_transpose_nchw import schedule_conv2d_transpose_nchw
 from .extern import schedule_extern
 from .nn import schedule_lrn, schedule_l2_normalize
 from .vision import *
diff --git a/topi/python/topi/cuda/conv2d.py b/topi/python/topi/cuda/conv2d.py
index 3c494cdeb0fa..400c8f6bade1 100644
--- a/topi/python/topi/cuda/conv2d.py
+++ b/topi/python/topi/cuda/conv2d.py
@@ -1,71 +1,80 @@
-# pylint: disable=invalid-name, no-member, too-many-locals, too-many-statements, too-many-arguments, too-many-branches, line-too-long
+# pylint: disable=invalid-name
 """Compute definition for conv2d with cuda backend"""
 import tvm
+from tvm import autotvm
 from tvm.contrib import cudnn
-import topi
-from ..nn.conv2d import conv2d
-from ..util import get_const_int
 
-@conv2d.register("cuda")
-def conv2d_cuda(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'):
+from .. import nn, generic
+from ..util import get_const_tuple, traverse_inline
+
+from .conv2d_direct import schedule_direct_cuda
+from .conv2d_winograd import winograd_cuda, schedule_winograd_cuda
+from .conv2d_int8 import conv2d_NCHWc_int8, schedule_conv2d_NCHWc_int8
+
+
+@autotvm.register_topi_compute(nn.conv2d, ['cuda', 'gpu'], ['direct', 'winograd', 'int8'])
+def conv2d_cuda(cfg, data, kernel, strides, padding, dilation, layout='NCHW', out_dtype='float32'):
     """Conv2D operator for cuda backend.
 
     Parameters
     ----------
-    input : tvm.Tensor
-        4-D with shape [batch, in_channel, in_height, in_width]
+    cfg: ConfigEntity
+        The config for this template
+
+    data : tvm.Tensor
+        4-D with shape [batch, in_channel, in_height, in_width] or
+        5-D with shape [batch, ic_chunk, in_height, in_width, ic_block]
 
-    filter : tvm.Tensor
-        4-D with shape [num_filter, in_channel, filter_height, filter_width]
+    kernel : tvm.Tensor
+        4-D with shape [num_filter, in_channel, filter_height, filter_width] or
+        6-D with shape [num_filter_chunk, in_channel_chunk, filter_height,
+        filter_width, num_filter_block, in_channel_block]
 
-    stride : int or a list/tuple of two ints
+    strides : int or a list/tuple of two ints
         stride size, or [stride_height, stride_width]
 
     padding : int or a list/tuple of two ints
         padding size, or [pad_height, pad_width]
 
+    dilation: int or a list/tuple of two ints
+        dilation size, or [dilation_height, dilation_width]
+
     layout : str
         layout of data
 
+    out_dtype: str
+        The output type. This is used for mixed precision.
+
     Returns
     -------
     output : tvm.Tensor
         4-D with shape [batch, out_channel, out_height, out_width]
     """
-    assert isinstance(stride, int) or len(stride) == 2
-    if isinstance(stride, int):
-        stride_h = stride_w = stride
-    else:
-        stride_h, stride_w = stride
-    if isinstance(padding, int):
-        pad_h = pad_w = padding
-    else:
-        pad_h, pad_w = padding
-    # handle dilation
-    dilation_h = dilation_w = 1
-    kernel_tvm = kernel
-    kernel_cudnn = kernel
-    if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
-        kernel_before_dilation = kernel.op.input_tensors[0]
-        kernel_cudnn = kernel_before_dilation
-        if layout == 'NCHW':
-            dilation_h = (get_const_int(kernel.shape[2]) + get_const_int(kernel_before_dilation.shape[2]) - 1) \
-                // get_const_int(kernel_before_dilation.shape[2])
-            dilation_w = (get_const_int(kernel.shape[3]) + get_const_int(kernel_before_dilation.shape[3]) - 1) \
-                // get_const_int(kernel_before_dilation.shape[2])
-        elif layout == 'NHWC':
-            dilation_h = (get_const_int(kernel.shape[1]) + get_const_int(kernel_before_dilation.shape[1]) - 1) \
-                // get_const_int(kernel_before_dilation.shape[1])
-            dilation_w = (get_const_int(kernel.shape[2]) + get_const_int(kernel_before_dilation.shape[2]) - 1) \
-                // get_const_int(kernel_before_dilation.shape[2])
     target = tvm.target.current_target()
+
     if "cudnn" in target.libs:
-        assert layout != 'HWCN', "HWCN layout not supported with CUDNN."
-        tensor_format = 0 # CUDNN_TENSOR_NCHW
-        if layout == 'NHWC':
+        if layout == 'NCHW':
+            tensor_format = 0 # CUDNN_TENSOR_NCHW
+            N, _, H, W = get_const_tuple(data.shape)
+        elif layout == 'NHWC':
             tensor_format = 1 # CUDNN_TENSOR_NHWC
+            N, H, W, _ = get_const_tuple(data.shape)
+        else:
+            raise ValueError("Unsupported layout %s in cudnn" % layout)
+        CO, CI, KH, KW = get_const_tuple(kernel.shape)
+
+        # handle dilation
+        stride_h, stride_w = (strides, strides) if isinstance(strides, int) else strides
+        pad_h, pad_w = (padding, padding) if isinstance(padding, int) else padding
+        dilation_h, dilation_w = (dilation, dilation) if isinstance(dilation, int) else dilation
+
+        OH = (H + 2 * pad_h - KH) // stride_h + 1
+        OW = (W + 2 * pad_w - KW) // stride_w + 1
+        cfg.add_flop(2 * N * OH * OW * CO * CI * ((KH - 1) * dilation_h + 1) *\
+                    ((KW - 1) * dilation_w + 1))
+
         return cudnn.conv2d_forward(data,
-                                    kernel_cudnn,
+                                    kernel,
                                     stride_h,
                                     stride_w,
                                     pad_h,
@@ -74,10 +83,55 @@ def conv2d_cuda(data, kernel, stride, padding, layout='NCHW', out_dtype='float32
                                     dilation_w,
                                     conv_mode=1,
                                     tensor_format=tensor_format,
-                                    algo=-1) # let CUDNN choose the best algo
-    elif layout == 'NCHW':
-        return topi.nn.conv2d_nchw(data, kernel_tvm, stride, padding, out_dtype)
+                                    algo=-1)  # let CUDNN choose the best algo
+
+    if cfg.template_key == 'winograd':
+        return winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dtype,
+                             pre_computed=False)
+    if cfg.template_key == 'int8':
+        return conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, dilation, layout, out_dtype)
+
+    if layout == 'NCHW':
+        return nn.conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype)
     elif layout == 'HWCN':
-        return topi.nn.conv2d_hwcn(data, kernel_tvm, stride, padding, out_dtype)
+        return nn.conv2d_hwcn(data, kernel, strides, padding, dilation, out_dtype)
     else:
         raise ValueError("not support this layout {} yet".format(layout))
+
+
+@autotvm.register_topi_schedule(generic.schedule_conv2d_nchw, ["cuda", "gpu"],
+                                ["direct", 'winograd', "int8"])
+def schedule_conv2d_nchw_cuda(cfg, outs):
+    """TOPI schedule callback of conv2d for cuda gpu
+
+    Parameters
+    ----------
+    cfg: ConfigEntity
+        The config for this template
+
+    outs: Array of Tensor
+        The computation graph description of conv2d
+        in the format of an array of tensors.
+
+    Returns
+    -------
+    s: Schedule
+        The computation schedule for conv2d.
+    """
+    target = tvm.target.current_target()
+    if 'cudnn' in target.libs:
+        return generic.schedule_extern(outs)
+
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    s = tvm.create_schedule([x.op for x in outs])
+
+    def _callback(op):
+        if op.tag == 'conv2d_nchw':
+            schedule_direct_cuda(cfg, s, op.output(0))
+        if op.tag == 'conv2d_nchw_winograd':
+            schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=False)
+        if op.tag == "conv2d_NCHWc_int8":
+            schedule_conv2d_NCHWc_int8(cfg, s, op.output(0))
+
+    traverse_inline(s, outs[0].op, _callback)
+    return s
diff --git a/topi/python/topi/cuda/conv2d_direct.py b/topi/python/topi/cuda/conv2d_direct.py
new file mode 100644
index 000000000000..9b315a6b0fc1
--- /dev/null
+++ b/topi/python/topi/cuda/conv2d_direct.py
@@ -0,0 +1,101 @@
+# pylint: disable=invalid-name
+"""The templates for cuda conv2d operators"""
+import tvm
+from tvm import autotvm
+from ..util import get_const_tuple
+
+def schedule_direct_cuda(cfg, s, conv):
+    """schedule optimized for batch size = 1"""
+
+    ##### space definition begin #####
+    n, f, y, x = s[conv].op.axis
+    rc, ry, rx = s[conv].op.reduce_axis
+    cfg.define_split("tile_f", f, num_outputs=4)
+    cfg.define_split("tile_y", y, num_outputs=4)
+    cfg.define_split("tile_x", x, num_outputs=4)
+    cfg.define_split("tile_rc", rc, num_outputs=2)
+    cfg.define_split("tile_ry", ry, num_outputs=2)
+    cfg.define_split("tile_rx", rx, num_outputs=2)
+    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
+
+    target = tvm.target.current_target()
+    if target.target_name in ['nvptx', 'rocm']:
+        cfg.define_knob("unroll_explicit", [1])
+    else:
+        cfg.define_knob("unroll_explicit", [0, 1])
+
+    # fallback support
+    if cfg.is_fallback:
+        ref_log = autotvm.tophub.load_reference_log(
+            target.target_name, target.model, 'conv2d', 'direct')
+        cfg.fallback_with_reference_log(ref_log)
+    ##### space definition end #####
+
+    pad_data, kernel = s[conv].op.input_tensors
+
+    s[pad_data].compute_inline()
+    if isinstance(kernel.op, tvm.tensor.ComputeOp) and 'dilate' in kernel.op.tag:
+        s[kernel].compute_inline()
+
+    if conv.op in s.outputs:
+        output = conv
+        OL = s.cache_write(conv, 'local')
+    else:
+        output = s.outputs[0].output(0)
+        s[conv].set_scope('local')
+        OL = conv
+
+    # create cache stage
+    AA = s.cache_read(pad_data, 'shared', [OL])
+    WW = s.cache_read(kernel, 'shared', [OL])
+
+    # tile and bind spatial axes
+    n, f, y, x = s[output].op.axis
+    kernel_scope, n = s[output].split(n, nparts=1)
+
+    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
+    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
+    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
+
+    bf = s[output].fuse(n, bf)
+    s[output].bind(bf, tvm.thread_axis("blockIdx.z"))
+    s[output].bind(by, tvm.thread_axis("blockIdx.y"))
+    s[output].bind(bx, tvm.thread_axis("blockIdx.x"))
+    s[output].bind(vf, tvm.thread_axis("vthread"))
+    s[output].bind(vy, tvm.thread_axis("vthread"))
+    s[output].bind(vx, tvm.thread_axis("vthread"))
+    s[output].bind(tf, tvm.thread_axis("threadIdx.z"))
+    s[output].bind(ty, tvm.thread_axis("threadIdx.y"))
+    s[output].bind(tx, tvm.thread_axis("threadIdx.x"))
+    s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
+    s[OL].compute_at(s[output], tx)
+
+    # tile reduction axes
+    n, f, y, x = s[OL].op.axis
+    rc, ry, rx = s[OL].op.reduce_axis
+    rco, rci = cfg['tile_rc'].apply(s, OL, rc)
+    ryo, ryi = cfg['tile_rx'].apply(s, OL, ry)
+    rxo, rxi = cfg['tile_ry'].apply(s, OL, rx)
+    s[OL].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x)
+
+    s[AA].compute_at(s[OL], rxo)
+    s[WW].compute_at(s[OL], rxo)
+
+    # cooperative fetching
+    for load in [AA, WW]:
+        n, f, y, x = s[load].op.axis
+        fused = s[load].fuse(n, f, y, x)
+        tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2])
+        ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2])
+        tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2])
+        s[load].bind(tz, tvm.thread_axis("threadIdx.z"))
+        s[load].bind(ty, tvm.thread_axis("threadIdx.y"))
+        s[load].bind(tx, tvm.thread_axis("threadIdx.x"))
+
+    # unroll
+    s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
+    s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)
+
+    N, CO, OH, OW = get_const_tuple(output.shape)
+    _, KH, KW, CI = get_const_tuple(kernel.shape)
+    cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW)
diff --git a/topi/python/topi/cuda/conv2d_int8.py b/topi/python/topi/cuda/conv2d_int8.py
new file mode 100644
index 000000000000..637c5de35513
--- /dev/null
+++ b/topi/python/topi/cuda/conv2d_int8.py
@@ -0,0 +1,276 @@
+# pylint: disable=invalid-name
+"""Int8 conv2d in NCHWc layout"""
+import tvm
+from tvm import autotvm
+
+from .injective import _schedule_injective
+from .tensor_intrin import dp4a
+from ..nn.pad import pad
+from ..nn.util import get_pad_tuple
+from ..util import get_const_tuple
+
+
+def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, dilation, layout, out_dtype):
+    """Convolution operator in NCHW[x]c layout for int8.
+
+    Parameters
+    ----------
+    cfg: ConfigEntity
+        The config for this template
+
+    data : tvm.Tensor
+        4-D with shape [batch, in_channel, in_height, in_width] or
+        5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block]
+
+    kernel : tvm.Tensor
+        4-D with shape [num_filter, in_channel, filter_height, filter_width] or
+        6-D with shape [num_filter_chunk, in_channel_chunk, filter_height,
+        filter_width, num_filter_block, in_channel_block]
+
+    stride : int or a list/tuple of two ints
+        stride size, or [stride_height, stride_width]
+
+    padding: int or a list/tuple of two ints
+        padding size, or [pad_height, pad_width]
+
+    dilation: int or a list/tuple of two ints
+        dilation size, or [dilation_height, dilation_width]
+
+    layout : str
+        layout of data
+
+    out_dtype : str
+        The output type. This is used for mixed precision.
+
+    Returns
+    -------
+    output : tvm.Tensor
+        5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block]
+    """
+    assert layout in ["NCHW", "NCHW4c"]
+    ic_block_factor = 4
+    oc_block_factor = 4
+
+    pre_computed = len(kernel.shape) == 6
+    if not pre_computed:
+        batch, channels, height, width = get_const_tuple(data.shape)
+        assert channels % ic_block_factor == 0, \
+            "Number of input channels should be multiple of {}".format(
+                ic_block_factor)
+        packed_data = tvm.compute((batch, channels // ic_block_factor, height, width,
+                                   ic_block_factor),
+                                  lambda n, c, h, w, vc: data[n, c*ic_block_factor + vc, h, w],
+                                  name="packed_data")
+
+        out_channels, in_channels, kernel_h, kernel_w = get_const_tuple(
+            kernel.shape)
+        assert out_channels % 4 == 0, \
+            "Number of output channels should be multiple of {}".format(
+                oc_block_factor)
+        packed_kernel = tvm.compute(
+            (out_channels // oc_block_factor, in_channels // ic_block_factor, kernel_h, kernel_w,
+             oc_block_factor, ic_block_factor),
+            lambda oc_chunk, ic_chunk, kh, kw, oc_block, ic_block:
+            kernel[oc_chunk * oc_block_factor + oc_block,
+                   ic_chunk * ic_block_factor + ic_block, kh, kw],
+            name="packed_kernel")
+
+    else:
+        packed_data = data
+        packed_kernel = kernel
+
+    batch, ic_chunk, in_height, in_width, ic_block = get_const_tuple(
+        packed_data.shape)
+    oc_chunk, ic_chunk, kernel_h, kernel_w, oc_block, ic_block = get_const_tuple(
+        packed_kernel.shape)
+
+    if isinstance(stride, int):
+        stride_h = stride_w = stride
+    else:
+        stride_h, stride_w = stride
+
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+
+    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
+        padding, (kernel_h, kernel_w))
+    # compute graph
+    pad_before = [0, 0, pad_top, pad_left, 0]
+    pad_after = [0, 0, pad_down, pad_right, 0]
+    pad_data = pad(packed_data, pad_before, pad_after, name="pad_data")
+
+    # compute the output shape
+    out_height = (in_height - (kernel_h - 1) * dilation_h - 1 + pad_top + pad_down) // stride_h + 1
+    out_width = (in_width - (kernel_w - 1) * dilation_w - 1 + pad_left + pad_right) // stride_w + 1
+
+    oshape = (batch, oc_chunk, out_height, out_width, oc_block)
+
+    icc = tvm.reduce_axis((0, ic_chunk), name='ic_chunk')
+    icb = tvm.reduce_axis((0, ic_block), name='ic_block')
+    kh = tvm.reduce_axis((0, kernel_h), name='kh')
+    kw = tvm.reduce_axis((0, kernel_w), name='kw')
+
+    conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
+                       tvm.sum(pad_data[n, icc, oh*stride_h+kh*dilation_h, \
+                               ow*stride_w+kw*dilation_w, icb]
+                               .astype('int32') *
+                               packed_kernel[oc_chunk, icc,
+                                             kh, kw, oc_block, icb]
+                               .astype('int32'),
+                               axis=[icc, kh, kw, icb]))
+
+    output = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
+                         conv[n, oc_chunk, oh, ow, oc_block].astype(out_dtype),
+                         tag="conv2d_NCHWc_int8")
+
+    # num flop
+    num_flop = batch * oc_chunk * oc_block * out_height * out_width * \
+        ic_chunk * ic_block * kernel_h * kernel_w * 2
+    cfg.add_flop(num_flop)
+
+    return output
+
+
+_dp4a = dp4a('shared', 'shared', 'local')
+
+
+def schedule_conv2d_NCHWc_int8(cfg, s, output):
+    """Schedule conv2d int8 NCHWc template"""
+    conv = output.op.input_tensors[0]
+    packed_data, packed_kernel = conv.op.input_tensors
+
+    if isinstance(packed_data.op, tvm.tensor.ComputeOp) and "pad" in packed_data.op.tag:
+        pad_data = packed_data
+        packed_data = pad_data.op.input_tensors[0]
+    else:
+        pad_data = packed_data
+
+    if autotvm.GLOBAL_SCOPE.in_tuning:
+        # skip this part during tuning to make recrods accurate
+        # this part will be pre-computed during NNVM's pre-compute optimization pass
+        s[packed_data].pragma(s[packed_data].op.axis[0], "debug_skip_region")
+        s[packed_kernel].pragma(s[packed_kernel].op.axis[0], "debug_skip_region")
+    else:
+        if isinstance(packed_kernel.op, tvm.tensor.ComputeOp) and\
+                       packed_kernel.name == 'packed_kernel':
+            # data and kernel are not pre-computed, schedule layout transform here
+            _schedule_injective(packed_data.op, s)
+            _schedule_injective(packed_kernel.op, s)
+
+    if pad_data != packed_data:
+        s[pad_data].compute_inline()
+
+    # create cache stage
+    AA = s.cache_read(pad_data, 'shared', [conv])
+    WW = s.cache_read(packed_kernel, 'shared', [conv])
+
+    s[conv].set_scope('local')
+
+    # handle bias
+    if output.op not in s.outputs:
+        s[output].compute_inline()
+        output = s.outputs[0].output(0)
+
+    # tile and bind spatial axes
+    n, f, y, x, c = s[output].op.axis
+    cfg.define_split("tile_n", cfg.axis(n), num_outputs=4)
+    cfg.define_split("tile_f", cfg.axis(f), num_outputs=4)
+    cfg.define_split("tile_y", cfg.axis(y), num_outputs=4)
+    cfg.define_split("tile_x", cfg.axis(x), num_outputs=4)
+
+    # this is the scope to attach global config inside this kernel
+    kernel_scope, n = s[output].split(n, nparts=1)
+
+    bn, vn, tn, ni = cfg["tile_n"].apply(s, output, n)
+    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
+    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
+    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
+
+    s[output].reorder(bn, bf, by, bx, vn, vf, vy, vx, tn, tf, ty, tx, ni, fi, yi, xi)
+    s[output].bind(bn, tvm.thread_axis("blockIdx.z"))
+    s[output].bind(bf, tvm.thread_axis("blockIdx.y"))
+    s[output].bind(s[output].fuse(by, bx), tvm.thread_axis("blockIdx.x"))
+    s[output].bind(vn, tvm.thread_axis("vthread"))
+    s[output].bind(vf, tvm.thread_axis("vthread"))
+    s[output].bind(vy, tvm.thread_axis("vthread"))
+    s[output].bind(vx, tvm.thread_axis("vthread"))
+
+    cfg.define_knob("fuse_yx", [0, 1]) # fuse ty,tx or tn,tf
+    if cfg["fuse_yx"].val:
+        s[output].bind(tn, tvm.thread_axis("threadIdx.z"))
+        s[output].bind(tf, tvm.thread_axis("threadIdx.y"))
+        tyx = s[output].fuse(ty, tx)
+        s[output].bind(tyx, tvm.thread_axis("threadIdx.x"))
+        s[conv].compute_at(s[output], tyx)
+
+        # number of threads
+        n_tz = cfg["tile_n"].size[2]
+        n_ty = cfg["tile_f"].size[2]
+        n_tx = cfg["tile_y"].size[2] * cfg["tile_x"].size[2]
+    else:
+        s[output].bind(s[output].fuse(tn, tf), tvm.thread_axis("threadIdx.z"))
+        s[output].bind(ty, tvm.thread_axis("threadIdx.y"))
+        s[output].bind(tx, tvm.thread_axis("threadIdx.x"))
+        s[conv].compute_at(s[output], tx)
+
+        # number of threads
+        n_tz = cfg["tile_n"].size[2] * cfg["tile_f"].size[2]
+        n_ty = cfg["tile_y"].size[2]
+        n_tx = cfg["tile_x"].size[2]
+
+    # tile and bind reduction axes
+    n, f, y, x, c = s[conv].op.axis
+
+    rc, ry, rx, rc_block = s[conv].op.reduce_axis
+    cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=2)
+    cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=2)
+    cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=2)
+    rco, rci = cfg['tile_rc'].apply(s, conv, rc)
+    ryo, ryi = cfg['tile_ry'].apply(s, conv, ry)
+    rxo, rxi = cfg['tile_rx'].apply(s, conv, rx)
+
+    s[conv].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x, c, rc_block)
+
+    cfg.define_reorder("reorder_inner", [rco, ryo, rxo], policy="all")
+    cfg["reorder_inner"].apply(s, conv, [rco, ryo, rxo])
+    cfg["reorder_inner"].apply(s, conv, [rci, ryi, rxi])
+
+    _, rc_block = s[conv].split(rc_block, factor=4)
+    s[conv].tensorize(rc_block, _dp4a)
+
+    cache_loc = [rco, ryo, rxo][cfg["reorder_inner"].perm[-1]]
+    s[AA].compute_at(s[conv], cache_loc)
+    s[WW].compute_at(s[conv], cache_loc)
+
+    # cooperative fetching
+    for load in [AA, WW]:
+        c = s[load].op.axis[-1]
+        c_outer, c = s[load].split(c, factor=4)
+        s[load].vectorize(c)
+        fused = s[load].op.axis[:-1] + [c_outer]
+        fused = s[load].fuse(*fused)
+
+        fused, tx = s[load].split(fused, factor=n_tx)
+        fused, ty = s[load].split(fused, factor=n_ty)
+        fused, tz = s[load].split(fused, factor=n_tz)
+        s[load].bind(tz, tvm.thread_axis("threadIdx.z"))
+        s[load].bind(ty, tvm.thread_axis("threadIdx.y"))
+        s[load].bind(tx, tvm.thread_axis("threadIdx.x"))
+
+    # double buffer
+    cfg.define_knob('AA_double_buffer', [0, 1])
+    cfg.define_knob('WW_double_buffer', [0, 1])
+    if cfg['AA_double_buffer'].val:
+        s[AA].double_buffer()
+    if cfg['WW_double_buffer'].val:
+        s[WW].double_buffer()
+
+    # unroll
+    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
+    s[output].pragma(kernel_scope, 'auto_unroll_max_step',
+                     cfg['auto_unroll_max_step'].val)
+    s[output].pragma(kernel_scope, 'unroll_explicit', False)
+
+    return s
diff --git a/topi/python/topi/cuda/conv2d_nchw.py b/topi/python/topi/cuda/conv2d_nchw.py
deleted file mode 100644
index 4f7539d224eb..000000000000
--- a/topi/python/topi/cuda/conv2d_nchw.py
+++ /dev/null
@@ -1,544 +0,0 @@
-#pylint: disable=invalid-name, no-member, too-many-locals, too-many-statements, too-many-arguments, too-many-branches, line-too-long
-"""Schedule for conv2d_nchw with auto fusion"""
-import tvm
-import topi
-from .. import util
-from .. import tag
-from .. import generic
-
-def conv2d_224_3_64(s, temp, temp_R, temp_S, Filter_S, Out, Out_L, flag):
-    """Schedule conv2d for specific feature_in_out_filter pattern"""
-    # scheduler params
-    ofactor = 16
-    hfactor = 2
-    if flag >= 96:
-        hfactor = 4
-    max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads)
-    ow_size = util.get_const_int(Out.shape[3])
-    num_thread = min(max_threads, ow_size * hfactor)
-    vthread = ofactor
-    block_x = tvm.thread_axis("blockIdx.x")
-    thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x")
-    thread_xz = tvm.thread_axis((0, vthread), "vthread", name="vx")
-
-    i, oc, h, w = s[Out].op.axis
-    if ow_size * hfactor == num_thread:
-        ooc, ioc = s[Out].split(oc, factor=vthread)
-        oh, ih = s[Out].split(h, factor=hfactor)
-        s[Out].reorder(ooc, oh, ioc, ih, w)
-        oc = s[Out].fuse(ooc, oh)
-        ow, _ = s[Out].split(w, nparts=ow_size)
-        w = s[Out].fuse(ow, ih)
-        s[Out].bind(w, thread_x)
-        s[Out].bind(ioc, thread_xz)
-        s[Out].bind(oc, block_x)
-    else:
-        ow, w = s[Out].split(w, factor=num_thread)
-        s[Out].bind(w, thread_x)
-        s[Out].bind(ow, block_x)
-
-    s[Out_L].compute_at(s[Out], w)
-
-    # schedule Out_L local write
-    i, oc, h, w = s[Out_L].op.axis
-    ic, dh, dw = s[Out_L].op.reduce_axis
-    s[Out_L].reorder(i, oc, h, w, ic, dh, dw)
-    s[temp_S].compute_at(s[Out_L], ic)
-    s[Filter_S].compute_at(s[Out_L], w)
-
-    num_thread1 = max_threads
-    thread_xx = tvm.thread_axis((0, num_thread1), "threadIdx.x")
-    block_xx = tvm.thread_axis("blockIdx.x")
-
-    i = s[temp].fuse(*s[temp].op.axis)
-    bx, tx = s[temp].split(i, factor=num_thread1)
-    s[temp].bind(tx, thread_xx)
-    s[temp].bind(bx, block_xx)
-
-    i = s[temp_R].fuse(*s[temp_R].op.axis)
-    bx, tx = s[temp_R].split(i, factor=num_thread1)
-    s[temp_R].bind(tx, thread_xx)
-    s[temp_R].bind(bx, block_xx)
-
-    #schedule temp_S shared mem load
-    i, ic, h, ow, iw = s[temp_S].op.axis
-    h = s[temp_S].fuse(h, ow)
-    _, tx = s[temp_S].split(h, factor=num_thread)
-    s[temp_S].bind(tx, thread_x)
-    if num_thread < max_threads:
-        s[temp_S].vectorize(iw)
-
-    #schedule Filter_S shared mem load
-    i, oc, h, w = s[Filter_S].op.axis
-    fuse_index = s[Filter_S].fuse(w, h)
-    w = s[Filter_S].fuse(fuse_index, oc)
-    tx, _ = s[Filter_S].split(w, nparts=num_thread)
-    s[Filter_S].bind(tx, thread_x)
-
-def conv2d_56_64_128(s, temp, temp_R, temp_S, Filter_S, Out, Out_L, flag):
-    """Schedule conv2d for specific feature_in_out_filter pattern"""
-    if util.get_const_int(Filter_S.shape[0]) == util.get_const_int(Filter_S.shape[1]):
-        mark = util.get_const_int(Out.shape[2]) * util.get_const_int(Out.shape[3])
-        num_thread_x = 0
-        if mark % 8 == 0 and mark % 7 == 0:
-            num_thread_x = 8
-            vthread_x = 7
-        elif mark % 4 == 0 and mark % 7 == 0:
-            num_thread_x = 4
-            vthread_x = 7
-        else:
-            for i in range(5, mark):
-                if mark % i == 0 and num_thread_x == 0:
-                    vthread_x = i
-                    mark = mark // i
-                if mark % i == 0 and vthread_x > 0:
-                    num_thread_x = i
-                    break
-        if mark < 5 or num_thread_x * vthread_x > 128:
-            num_thread_x = 8
-            vthread_x = 8
-        num_thread_y = 8
-        vthread_y = 2
-        ifactor = 8
-
-        block_x = tvm.thread_axis("blockIdx.x")
-        block_y = tvm.thread_axis("blockIdx.y")
-        thread_x = tvm.thread_axis((0, num_thread_x), "threadIdx.x")
-        thread_y = tvm.thread_axis((0, num_thread_y), "threadIdx.y")
-        thread_xz = tvm.thread_axis((0, vthread_x), "vthread", name="vx")
-        thread_yz = tvm.thread_axis((0, vthread_y), "vthread", name="vy")
-
-        i, oc, h, w = s[Out].op.axis
-        factor = util.get_const_int(Out.shape[3])
-        ooc, ioc = s[Out].split(oc, factor=num_thread_y*vthread_y)
-        oioc, iioc = s[Out].split(ioc, nparts=vthread_y)
-        s[Out].bind(iioc, thread_y)
-        s[Out].bind(oioc, thread_yz)
-        s[Out].bind(ooc, block_y)
-        if factor < num_thread_x*vthread_x:
-            oh, ih = s[Out].split(h, factor=num_thread_x*vthread_x//factor)
-            w = s[Out].fuse(ih, w)
-            ow, iw = s[Out].split(w, nparts=vthread_x)
-            s[Out].reorder(i, ooc, oh, oioc, ow, iioc, iw)
-            s[Out].bind(iw, thread_x)
-            s[Out].bind(ow, thread_xz)
-            s[Out].bind(oh, block_x)
-            s[Out_L].compute_at(s[Out], iw)
-        else:
-            ow, iw = s[Out].split(w, factor=num_thread_x)
-            oh, ih = s[Out].split(h, factor=vthread_x)
-            s[Out].reorder(i, ooc, oh, ow, oioc, ih, iioc, iw)
-            oh = s[Out].fuse(oh, ow)
-            s[Out].bind(iw, thread_x)
-            s[Out].bind(ih, thread_xz)
-            s[Out].bind(oh, block_x)
-            s[Out_L].compute_at(s[Out], iw)
-
-        # schedule Out_L local write
-        i, oc, h, w = s[Out_L].op.axis
-        ic, dh, dw = s[Out_L].op.reduce_axis
-        oic, iic = s[Out_L].split(ic, factor=ifactor)
-        s[Out_L].reorder(oic, dh, dw, iic, h, w)
-
-        s[temp_S].compute_at(s[Out_L], oic)
-        s[Filter_S].compute_at(s[Out_L], dw)
-
-        num_thread = tvm.target.current_target(allow_none=False).max_num_threads
-        thread_xx = tvm.thread_axis((0, num_thread), "threadIdx.x")
-        block_xx = tvm.thread_axis("blockIdx.x")
-
-        i = s[temp].fuse(*s[temp].op.axis)
-        bx, tx = s[temp].split(i, factor=num_thread)
-        s[temp].bind(tx, thread_xx)
-        s[temp].bind(bx, block_xx)
-
-        i = s[temp_R].fuse(*s[temp_R].op.axis)
-        bx, tx = s[temp_R].split(i, factor=num_thread)
-        s[temp_R].bind(tx, thread_xx)
-        s[temp_R].bind(bx, block_xx)
-
-        #schedule temp_S shared mem load
-        i, oic, h, w, iic = s[temp_S].op.axis
-        oic = s[temp_S].fuse(oic, h, w)
-        ooic, ioic = s[temp_S].split(oic, factor=num_thread_x)
-        _, iooic = s[temp_S].split(ooic, factor=num_thread_y)
-        s[temp_S].bind(ioic, thread_x)
-        s[temp_S].bind(iooic, thread_y)
-        s[temp_S].vectorize(iic)
-
-        i, oc, h, w = s[Filter_S].op.axis
-        _, ioc = s[Filter_S].split(oc, factor=num_thread_y)
-        _, ii = s[Filter_S].split(i, factor=num_thread_x)
-        s[Filter_S].bind(ioc, thread_y)
-        s[Filter_S].bind(ii, thread_x)
-    else:
-        # scheduler params
-        vthread = 2
-        opart2 = 4
-        ofactor = 64
-        wfactor = 28
-        ifactor = 8
-        if flag > 256:
-            wfactor = 14
-        num_thread_x = max(1, ofactor//(opart2*2))
-        num_thread_y = max(1, (wfactor + vthread-1) // vthread)
-        block_x = tvm.thread_axis("blockIdx.x")
-        block_y = tvm.thread_axis("blockIdx.y")
-        block_z = tvm.thread_axis("blockIdx.z")
-        thread_x = tvm.thread_axis((0, num_thread_x), "threadIdx.x")
-        thread_y = tvm.thread_axis((0, num_thread_y), "threadIdx.y")
-        thread_xz = tvm.thread_axis((0, vthread), "vthread", name="vx")
-        thread_yz = tvm.thread_axis((0, vthread), "vthread", name="vy")
-
-        i, oc, h, w = s[Out].op.axis
-        ooc, ioc = s[Out].split(oc, factor=ofactor)
-        ow, iw = s[Out].split(w, factor=wfactor)
-        ow = s[Out].fuse(ow, h)
-        oioc, iioc = s[Out].split(ioc, nparts=vthread)
-        oiw, iiw = s[Out].split(iw, nparts=vthread)
-        oiioc, iiioc = s[Out].split(iioc, nparts=opart2)
-        s[Out].reorder(i, ooc, ow, oioc, oiw, oiioc, iiw, iiioc)
-        s[Out].bind(iiioc, thread_x)
-        s[Out].bind(iiw, thread_y)
-        s[Out].bind(oiioc, thread_xz)
-        s[Out].bind(oiw, thread_yz)
-        s[Out].bind(oioc, block_x)
-        s[Out].bind(ow, block_y)
-        s[Out].bind(ooc, block_z)
-
-        s[Out_L].compute_at(s[Out], iiioc)
-
-        # schedule Out_L local write
-        i, oc, h, w = s[Out_L].op.axis
-        ic, dh, dw = s[Out_L].op.reduce_axis
-        oic, iic = s[Out_L].split(ic, factor=ifactor)
-        s[Out_L].reorder(oic, dh, dw, iic, h, w)
-        max_num_thread = tvm.target.current_target(allow_none=False).max_num_threads
-        if util.get_const_int(Filter_S.shape[1]) == 128:
-            oic = s[Out_L].fuse(dh, oic)
-            s[temp_S].compute_at(s[Out_L], oic)
-            s[Filter_S].compute_at(s[Out_L], oic)
-            num_thread = max_num_thread
-        else:
-            s[temp_S].compute_at(s[Out_L], oic)
-            s[Filter_S].compute_at(s[Out_L], dw)
-            num_thread = 456
-            if max_num_thread < num_thread:
-                num_thread = max_num_thread
-
-        thread_xx = tvm.thread_axis((0, num_thread), "threadIdx.x")
-        block_xx = tvm.thread_axis("blockIdx.x")
-
-        i = s[temp].fuse(*s[temp].op.axis)
-        bx, tx = s[temp].split(i, factor=num_thread)
-        s[temp].bind(tx, thread_xx)
-        s[temp].bind(bx, block_xx)
-
-        i = s[temp_R].fuse(*s[temp_R].op.axis)
-        bx, tx = s[temp_R].split(i, factor=num_thread)
-        s[temp_R].bind(tx, thread_xx)
-        s[temp_R].bind(bx, block_xx)
-
-        #schedule temp_S shared mem load
-        i, oic, h, w, iic = s[temp_S].op.axis
-        oic = s[temp_S].fuse(oic, h, w)
-        ooic, ioic = s[temp_S].split(oic, factor=num_thread_x)
-        _, iooic = s[temp_S].split(ooic, factor=num_thread_y)
-        s[temp_S].bind(ioic, thread_x)
-        s[temp_S].bind(iooic, thread_y)
-        s[temp_S].vectorize(iic)
-
-        #schedule Filter_S shared mem load
-        i, oc, h, w = s[Filter_S].op.axis
-        _, ioc = s[Filter_S].split(oc, factor=num_thread_x)
-        _, ii = s[Filter_S].split(i, factor=num_thread_y)
-        s[Filter_S].bind(ioc, thread_x)
-        s[Filter_S].bind(ii, thread_y)
-
-def conv2d_14_256_256(s, temp, temp_R, temp_S, Filter, Filter_S, Out, Out_L):
-    """Schedule conv2d for specific feature_in_out_filter pattern"""
-    max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads)
-    if util.get_const_int(Filter.shape[0]) + util.get_const_int(Filter.shape[1]) <= 768:
-        # scheduler params
-        vthread_x = util.get_const_int(Out.shape[3])
-        num_thread_x = 64
-        ofactor = 8
-        if util.get_const_int(Filter.shape[3]) == 1 and vthread_x * 5 <= max_threads:
-            ofactor = 64
-        block_x = tvm.thread_axis("blockIdx.x")
-        thread_x = tvm.thread_axis((0, num_thread_x), "threadIdx.x")
-        thread_xz = tvm.thread_axis((0, vthread_x), "vthread", name="vx")
-
-        i, oc, h, w = s[Out].op.axis
-        ooc, ioc = s[Out].split(oc, factor=num_thread_x)
-        s[Out].reorder(i, ooc, h, w, ioc)
-        ooc = s[Out].fuse(h, ooc)
-        s[Out].bind(ioc, thread_x)
-        s[Out].bind(w, thread_xz)
-        s[Out].bind(ooc, block_x)
-
-        s[Out_L].compute_at(s[Out], ioc)
-
-        # schedule Out_L local write
-        i, oc, h, w = s[Out_L].op.axis
-        ic, dh, dw = s[Out_L].op.reduce_axis
-        oic, iic = s[Out_L].split(ic, ofactor)
-        s[Out_L].reorder(oic, dh, dw, iic, h, w)
-
-        s[temp_S].compute_at(s[Out_L], oic)
-        s[Filter_S].compute_at(s[Out_L], oic)
-
-        #schedule temp_S shared mem load
-        i, ic, h, w = s[temp_S].op.axis
-        s[temp_S].reorder(i, ic, w, h)
-        ic = s[temp_S].fuse(w, ic)
-        _, iic = s[temp_S].split(ic, factor=num_thread_x)
-        s[temp_S].bind(iic, thread_x)
-
-        #schedule Filter_S shared mem load
-        i, oc, h, w = s[Filter_S].op.axis
-        _, ii = s[Filter_S].split(i, factor=num_thread_x)
-        s[Filter_S].bind(ii, thread_x)
-        s[Filter_S].storage_align(s[Filter_S].op.axis[0], 2, 1)
-
-    else:
-        # scheduler params
-        vthread_x = min(8, util.get_const_int(Out.shape[2]))
-        num_thread_x = 16
-        num_thread_y = min(max_threads // num_thread_x, util.get_const_int(Out.shape[3]))
-        ofactor = 8
-        block_x = tvm.thread_axis("blockIdx.x")
-        thread_x = tvm.thread_axis((0, num_thread_x), "threadIdx.x")
-        thread_y = tvm.thread_axis((0, num_thread_y), "threadIdx.y")
-        thread_xz = tvm.thread_axis((0, vthread_x), "vthread", name="vx")
-
-        i, oc, h, w = s[Out].op.axis
-        ow, iw = s[Out].split(w, factor=num_thread_y)
-        oh, ih = s[Out].split(h, factor=vthread_x)
-        ooc, ioc = s[Out].split(oc, factor=num_thread_x)
-        s[Out].reorder(i, ooc, oh, ih, ow, iw, ioc)
-        s[Out].bind(ioc, thread_x)
-        s[Out].bind(iw, thread_y)
-        s[Out].bind(ih, thread_xz)
-        s[Out].bind(ooc, block_x)
-
-        s[Out_L].compute_at(s[Out], ioc)
-
-        # schedule Out_L local write
-        i, oc, h, w = s[Out_L].op.axis
-        ic, dh, dw = s[Out_L].op.reduce_axis
-        oic, iic = s[Out_L].split(ic, ofactor)
-        s[Out_L].reorder(oic, dh, dw, iic, h, w)
-
-        s[temp_S].compute_at(s[Out_L], oic)
-        s[Filter_S].compute_at(s[Out_L], oic)
-
-        num_thread = max_threads
-        thread_xx = tvm.thread_axis((0, num_thread), "threadIdx.x")
-        block_xx = tvm.thread_axis("blockIdx.x")
-
-        i = s[temp].fuse(*s[temp].op.axis)
-        bx, tx = s[temp].split(i, factor=num_thread)
-        s[temp].bind(tx, thread_xx)
-        s[temp].bind(bx, block_xx)
-
-        i = s[temp_R].fuse(*s[temp_R].op.axis)
-        bx, tx = s[temp_R].split(i, factor=num_thread)
-        s[temp_R].bind(tx, thread_xx)
-        s[temp_R].bind(bx, block_xx)
-
-        #schedule temp_S shared mem load
-        i, h, w, oc, ic = s[temp_S].op.axis
-        icc = s[temp_S].fuse(oc, w, h)
-        oic, iic = s[temp_S].split(icc, factor=num_thread_x)
-        _, ioic = s[temp_S].split(oic, factor=num_thread_y)
-        s[temp_S].bind(iic, thread_x)
-        s[temp_S].bind(ioic, thread_y)
-        s[temp_S].vectorize(ic)
-
-        #schedule Filter_S shared mem load
-        i, oc, h, w = s[Filter_S].op.axis
-        _, ii = s[Filter_S].split(i, factor=num_thread_x)
-        h = s[Filter_S].fuse(h, w)
-        _, ih = s[Filter_S].split(h, factor=num_thread_y)
-        s[Filter_S].bind(ii, thread_x)
-        s[Filter_S].bind(ih, thread_y)
-        s[Filter_S].storage_align(s[Filter_S].op.axis[0], 2, 1)
-
-def conv2d_56_64_64(s, Filter, temp_S, Filter_S, Out, Out_L):
-    """Schedule conv2d for specific feature_in_out_filter pattern"""
-    # scheduler params
-    num_thread = 8
-    vthread = 2
-    opart2 = 4
-    ofactor = 64
-    wfactor = 56
-    ifactor = 8
-    if util.get_const_int(Filter.shape[0]) == 64:
-        opart2 = 8
-        ifactor = 16
-    if util.get_const_int(Out.shape[2]) == 224:
-        num_thread = 4
-        wfactor = 112
-        ifactor = 4
-    sfactor = max(1, ofactor // (opart2*vthread))
-    spart = max(1, (wfactor + vthread-1) // vthread)
-
-    block_x = tvm.thread_axis("blockIdx.x")
-    block_y = tvm.thread_axis("blockIdx.y")
-    block_z = tvm.thread_axis("blockIdx.z")
-    thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x")
-    thread_y = tvm.thread_axis((0, wfactor // vthread), "threadIdx.y")
-    thread_xz = tvm.thread_axis((0, opart2), "vthread", name="vx")
-    thread_yz = tvm.thread_axis((0, vthread), "vthread", name="vy")
-
-    i, oc, h, w = s[Out].op.axis
-    ooc, ioc = s[Out].split(oc, factor=ofactor)
-    ow, iw = s[Out].split(w, factor=wfactor)
-    ow = s[Out].fuse(ow, h)
-    oioc, iioc = s[Out].split(ioc, nparts=vthread)
-    oiw, iiw = s[Out].split(iw, nparts=vthread)
-    oiioc, iiioc = s[Out].split(iioc, nparts=opart2)
-    s[Out].reorder(i, ooc, ow, oioc, oiw, oiioc, iiw, iiioc)
-    s[Out].bind(iiioc, thread_x)
-    s[Out].bind(iiw, thread_y)
-    s[Out].bind(oiioc, thread_xz)
-    s[Out].bind(oiw, thread_yz)
-    s[Out].bind(oioc, block_x)
-    s[Out].bind(ow, block_y)
-    s[Out].bind(ooc, block_z)
-
-    s[Out_L].compute_at(s[Out], iiioc)
-
-    # schedule Out_L local write
-    i, oc, h, w = s[Out_L].op.axis
-    ic, dh, dw = s[Out_L].op.reduce_axis
-    oic, iic = s[Out_L].split(ic, factor=ifactor)
-    s[Out_L].reorder(oic, dh, dw, iic, h, w)
-    fuse_index = s[Out_L].fuse(dw, dh)
-    fuse_index = s[Out_L].fuse(fuse_index, oic)
-    dw = fuse_index
-
-    s[temp_S].compute_at(s[Out_L], dw)
-    s[Filter_S].compute_at(s[Out_L], dw)
-
-    #schedule temp_S shared mem load
-    i, ic, h, w = s[temp_S].op.axis
-    _, iic = s[temp_S].split(ic, factor=sfactor)
-    _, iw = s[temp_S].split(w, factor=spart)
-    s[temp_S].bind(iic, thread_x)
-    s[temp_S].bind(iw, thread_y)
-
-    #schedule Filter_S shared mem load
-    i, oc, h, w = s[Filter_S].op.axis
-    _, ioc = s[Filter_S].split(oc, factor=sfactor)
-    _, ii = s[Filter_S].split(i, factor=spart)
-    s[Filter_S].bind(ioc, thread_x)
-    s[Filter_S].bind(ii, thread_y)
-
-def schedule_conv2d_small_batch(outs):
-    """Create schedule for tensors or return error if batch size is larger than 1"""
-    s = tvm.create_schedule([x.op for x in outs])
-
-    def schedule(temp, Filter, Output):
-        """Schedule conv2d_nchw"""
-
-        flag = util.get_const_int(Filter.shape[0])+util.get_const_int(Filter.shape[1])
-
-        if flag > 768:
-            temp_G = s.cache_read(temp, "global", [Output])
-            s[temp_G].compute_inline()
-            i, ic, h, w = s[temp_G].op.axis
-            oic, iic = s[temp_G].split(ic, factor=4)
-            s[temp_G].reorder(i, h, w, oic, iic)
-            temp_R = s.cache_write(temp_G, "global")
-            temp_S = s.cache_read(temp_R, "shared", [temp_G])
-        elif 128 < flag < 512:
-            temp_G = s.cache_read(temp, "global", [Output])
-            s[temp_G].compute_inline()
-            i, ic, h, w = s[temp_G].op.axis
-            oic, iic = s[temp_G].split(ic, factor=4)
-            s[temp_G].reorder(i, oic, h, w, iic)
-            temp_R = s.cache_write(temp_G, "global")
-            temp_S = s.cache_read(temp_R, "shared", [temp_G])
-        elif util.get_const_int(Filter.shape[3]) == 7 or (util.get_const_int(Output.shape[2] == 224) and flag < 128):
-            temp_G = s.cache_read(temp, "global", [Output])
-            s[temp_G].compute_inline()
-            i, ic, h, w = s[temp_G].op.axis
-            s[temp_G].split(w, factor=4)
-            temp_R = s.cache_write(temp_G, "global")
-            temp_S = s.cache_read(temp_R, "shared", [temp_G])
-        else:
-            s[temp].compute_inline()
-            temp_S = s.cache_read(temp, "shared", [Output])
-            temp_R = temp_S
-
-        Filter_S = s.cache_read(Filter, "shared", [Output])
-
-        if Output.op in s.outputs:
-            Out = Output
-            Out_L = s.cache_write(Out, "local")
-        else:
-            Out = outs[0].op.output(0)
-            s[Output].set_scope("local")
-            Out_L = Output
-
-        if util.get_const_int(Filter.shape[3]) == 7 or (util.get_const_int(Output.shape[2] == 224) and flag < 128):
-            conv2d_224_3_64(s, temp, temp_R, temp_S, Filter_S, Out, Out_L, flag)
-        elif 128 < flag < 512:
-            conv2d_56_64_128(s, temp, temp_R, temp_S, Filter_S, Out, Out_L, flag)
-        elif flag >= 512:
-            conv2d_14_256_256(s, temp, temp_R, temp_S, Filter, Filter_S, Out, Out_L)
-        else:
-            conv2d_56_64_64(s, Filter, temp_S, Filter_S, Out, Out_L)
-
-    scheduled_ops = []
-
-    def traverse(OP):
-        """Traverse operators from computation graph"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_broadcast(OP.tag):
-            if OP not in s.outputs:
-                s[OP].compute_inline()
-            for tensor in OP.input_tensors:
-                if tensor.op.input_tensors and tensor.op not in scheduled_ops:
-                    traverse(tensor.op)
-        # schedule conv2d
-        if 'conv2d_nchw' in OP.tag:
-            temp = OP.input_tensors[0]
-            Filter = OP.input_tensors[1]
-            if isinstance(Filter.op, tvm.tensor.ComputeOp) and 'dilate' in Filter.op.tag:
-                s[Filter].compute_inline()
-            Output = OP.output(0)
-            schedule(temp, Filter, Output)
-
-        scheduled_ops.append(OP)
-
-    traverse(outs[0].op)
-    return s
-
-
-@generic.schedule_conv2d_nchw.register(["cuda", "gpu"])
-def schedule_conv2d_nchw(outs):
-    """Schedule for conv2d_nchw.
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of conv2d_nchw
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    s: Schedule
-        The computation schedule for conv2d_nchw.
-    """
-    target = tvm.target.current_target()
-    if target.target_name == "cuda" and "cudnn" in target.libs:
-        return topi.generic.schedule_extern(outs)
-
-    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
-    batch_size = util.get_const_int(outs[0].op.output(0).shape[0])
-    if batch_size > 1:
-        raise RuntimeError("Batch size: %d is too large for this schedule" % batch_size)
-    return  schedule_conv2d_small_batch(outs)
diff --git a/topi/python/topi/cuda/conv2d_transpose_nchw.py b/topi/python/topi/cuda/conv2d_transpose_nchw.py
index 4454bc54d3eb..e2e011e14d23 100644
--- a/topi/python/topi/cuda/conv2d_transpose_nchw.py
+++ b/topi/python/topi/cuda/conv2d_transpose_nchw.py
@@ -1,121 +1,199 @@
-#pylint: disable=invalid-name, line-too-long
-"""Schedule for conv2d_transpose_nchw with auto fusion"""
-import tvm
-from .. import util
-from .. import tag
-from .. import generic
-from .conv2d_nchw import conv2d_224_3_64, conv2d_56_64_128, conv2d_14_256_256, conv2d_56_64_64
+# pylint: disable=invalid-name
+"""Conv2d transpose template for cuda backend"""
 
+import tvm
+from tvm import autotvm
 
-def schedule_conv2d_transpose_small_batch(outs):
-    """Create schedule for tensors or return error if batch size is larger than 1"""
-    s = tvm.create_schedule([x.op for x in outs])
+from .. import nn, generic
+from ..util import equal_const_int, get_const_tuple, traverse_inline
 
-    def schedule(temp, Filter, Output):
-        """Schedule conv2d_transpose_nchw"""
-        block_h = util.get_const_int(Output.shape[3])
-        block_w = util.get_const_int(temp.shape[1])
-        if block_h % 48 == 0:
-            block_h = 48
-        elif block_h % 32 == 0:
-            block_h = 32
-        if block_w % 48 == 0:
-            block_w = 48
-        elif block_w % 32 == 0:
-            block_w = 32
-
-        flag = util.get_const_int(Filter.shape[0])+util.get_const_int(Filter.shape[1])
-
-        if flag > 768:
-            temp_G = s.cache_read(temp, "global", [Output])
-            s[temp_G].compute_inline()
-            i, ic, h, w = s[temp_G].op.axis
-            oic, iic = s[temp_G].split(ic, factor=4)
-            s[temp_G].reorder(i, h, w, oic, iic)
-            temp_R = s.cache_write(temp_G, "global")
-            temp_S = s.cache_read(temp_R, "shared", [temp_G])
-        elif 128 < flag < 512:
-            temp_G = s.cache_read(temp, "global", [Output])
-            s[temp_G].compute_inline()
-            i, ic, h, w = s[temp_G].op.axis
-            oic, iic = s[temp_G].split(ic, factor=4)
-            s[temp_G].reorder(i, oic, h, w, iic)
-            temp_R = s.cache_write(temp_G, "global")
-            temp_S = s.cache_read(temp_R, "shared", [temp_G])
-        elif util.get_const_int(Filter.shape[3]) == 7 or (util.get_const_int(Output.shape[2] == 224) and flag < 128):
-            temp_G = s.cache_read(temp, "global", [Output])
-            s[temp_G].compute_inline()
-            i, ic, h, w = s[temp_G].op.axis
-            s[temp_G].split(w, factor=4)
-            temp_R = s.cache_write(temp_G, "global")
-            temp_S = s.cache_read(temp_R, "shared", [temp_G])
-        else:
-            s[temp].compute_inline()
-            temp_S = s.cache_read(temp, "shared", [Output])
-            temp_R = temp_S
-
-        Filter_S = s.cache_read(Filter, "shared", [Output])
-
-        if Output.op in s.outputs:
-            Out = Output
-            Out_L = s.cache_write(Out, "local")
-        else:
-            Out = outs[0].op.output(0)
-            s[Output].set_scope("local")
-            Out_L = Output
-
-        if util.get_const_int(Filter.shape[3]) == 7 or (util.get_const_int(Output.shape[2] == 224) and flag < 128):
-            conv2d_224_3_64(s, temp, temp_R, temp_S, Filter_S, Out, Out_L, flag)
-        elif 128 < flag < 512:
-            conv2d_56_64_128(s, temp, temp_R, temp_S, Filter_S, Out, Out_L, flag)
-        elif flag >= 512:
-            conv2d_14_256_256(s, temp, temp_R, temp_S, Filter, Filter_S, Out, Out_L)
-        else:
-            conv2d_56_64_64(s, Filter, temp_S, Filter_S, Out, Out_L)
-
-    scheduled_ops = []
-
-    def traverse(OP):
-        """Internal travserse function"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_injective(OP.tag):
-            if OP not in s.outputs:
-                s[OP].compute_inline()
-            for tensor in OP.input_tensors:
-                if tensor.op.input_tensors and tensor.op not in scheduled_ops:
-                    traverse(tensor.op)
-        # schedule conv2d_transpose_nchw
-        if 'conv2d_transpose_nchw' in OP.tag:
-            temp = OP.input_tensors[0]
-            DilatedInput = temp.op.input_tensors[0]
-            s[DilatedInput].compute_inline()
-            Filter = OP.input_tensors[1]
-            Output = OP.output(0)
-            schedule(temp, Filter, Output)
-
-        scheduled_ops.append(OP)
-
-    traverse(outs[0].op)
-    return s
+@autotvm.task.register_topi_compute(nn.conv2d_transpose_nchw, ['cuda', 'gpu'], "direct")
+def conv2d_transpose_nchw_cuda(cfg, Input, Filter, strides, padding, out_dtype):
+    """Transposed 2D convolution nchw forward operator.
 
+    Parameters
+    ----------
+    cfg: ConfigEntity
+        The config for this template
+    Input : tvm.Tensor
+        4-D with shape [batch, in_channel, in_height, in_width]
+    Filter : tvm.Tensor
+        4-D with shape [in_channel, num_filter, filter_height, filter_width]
+    strides : tuple of two ints
+        The spatial stride along height and width
+    padding : int or str
+        Padding size, or ['VALID', 'SAME']
+    out_dtype: str
+        The output type. This is used in mixed precision
 
-@generic.schedule_conv2d_transpose_nchw.register(["cuda", "gpu"])
-def schedule_conv2d_transpose_nchw(outs):
-    """Schedule for conv2d_transpose_nchw.
+    Returns
+    -------
+    Output : tvm.Tensor
+        4-D with shape [batch, out_channel, out_height, out_width]
+    """
+    batch, in_c, in_h, in_w = get_const_tuple(Input.shape)
+    _, out_c, filter_h, filter_w = get_const_tuple(Filter.shape)
+    stride_h, stride_w = strides
+
+    # attach stride info to config, this is used in schedule space definition
+    cfg.stride = strides
+
+    # padding stage
+    fpad_top, fpad_left, fpad_bottom, fpad_right = nn.get_pad_tuple(padding, (filter_h, filter_w))
+    bpad_top = filter_h - 1 - fpad_top
+    bpad_bottom = filter_h - 1 - fpad_bottom
+    bpad_left = filter_w - 1 - fpad_left
+    bpad_right = filter_w - 1 - fpad_right
+
+    # padding stage
+    FirstPad = nn.pad(Input,
+                      [0, 0, (bpad_top + stride_h - 1) // stride_h,
+                       (bpad_left + stride_w - 1) // stride_w],
+                      [0, 0, (bpad_bottom + stride_h - 1) // stride_h,
+                       (bpad_right + stride_w - 1) // stride_w], name='FirstPad')
+
+    # remove extra padding introduced by dilatation
+    border_h = (stride_h - bpad_top % stride_h) % stride_h
+    border_w = (stride_w - bpad_left % stride_w) % stride_w
+
+    # dilation stage
+    data = FirstPad
+    strides = [1, 1, stride_h, stride_w]
+    n = len(data.shape)
+
+    def _dilate(*indices):
+        not_zero = []
+        index_tuple = []
+        for i in range(n):
+            if not equal_const_int(strides[i], 1):
+                index_tuple.append(indices[i] // strides[i])
+                not_zero.append((indices[i] % strides[i]).equal(0))
+            else:
+                index_tuple.append(indices[i])
+        if not_zero:
+            not_zero = tvm.all(*not_zero)
+            return tvm.select(not_zero, data(*index_tuple), tvm.const(0.0, data.dtype))
+        return data(*index_tuple)
+
+    # convolution stage
+    out_h = (in_h - 1) * stride_h - fpad_top - fpad_bottom + filter_h
+    out_w = (in_w - 1) * stride_w - fpad_left - fpad_right + filter_w
+    dc = tvm.reduce_axis((0, in_c), name='dc')
+    dh = tvm.reduce_axis((0, filter_h), name='dh')
+    dw = tvm.reduce_axis((0, filter_w), name='dw')
+
+    Output = tvm.compute(
+        (batch, out_c, out_h, out_w),
+        lambda b, c, h, w: tvm.sum(
+            _dilate(b, dc, h + dh + border_h, w + dw + border_w).astype(out_dtype) *
+            Filter[dc, c, filter_h - 1 - dh, filter_w - 1 - dw].astype(out_dtype),
+            axis=[dc, dh, dw]), tag="conv2d_transpose_nchw")
+
+    return Output
+
+@autotvm.task.register_topi_schedule(generic.schedule_conv2d_transpose_nchw,
+                                     ['cuda', 'gpu'], 'direct')
+def schedule_conv2d_transpose_nchw_cuda(cfg, outs):
+    """TOPI Schedule callback for conv2d transpose operator.
 
     Parameters
     ----------
+    cfg: ConfigEntity
+        The parameters for this template
+
     outs: Array of Tensor
-        The computation graph description of conv2d_transpose_nchw
+        The computation graph description of conv2d transpose
         in the format of an array of tensors.
 
     Returns
     -------
     s: Schedule
-        The computation schedule for conv2d_transpose_nchw.
+        The computation schedule for conv2d transpose.
     """
     outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
-    batch_size = util.get_const_int(outs[0].op.output(0).shape[0])
-    if batch_size > 1:
-        raise RuntimeError("Batch size: %d is too large for this schedule" % batch_size)
-    return schedule_conv2d_transpose_small_batch(outs)
+    s = tvm.create_schedule([x.op for x in outs])
+
+    def _callback(op):
+        if op.tag == 'conv2d_transpose_nchw':
+            pad_data = op.input_tensors[0]
+            kernel = op.input_tensors[1]
+            conv = op.output(0)
+
+            ##### space definition begin #####
+            n, f, y, x = s[conv].op.axis
+            rc = s[conv].op.reduce_axis[0]
+            cfg.define_split("tile_f", cfg.axis(f), num_outputs=4)
+            cfg.define_split("tile_y", cfg.axis(y), num_outputs=4)
+            cfg.define_split("tile_x", cfg.axis(x), num_outputs=4)
+            cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=3)
+            cfg.define_knob("auto_unroll_max_step", [64, 512, 1500])
+
+            target = tvm.target.current_target()
+            if target.target_name in ['nvptx', 'rocm']:
+                cfg.define_knob("unroll_explicit", [1])
+            else:
+                cfg.define_knob("unroll_explicit", [0, 1])
+            ##### space definition end #####
+
+            if isinstance(kernel.op, tvm.tensor.ComputeOp) and 'dilate' in kernel.op.tag:
+                s[kernel].compute_inline()
+
+            if conv.op in s.outputs:
+                output = conv
+                OL = s.cache_write(conv, 'local')
+            else:
+                output = s.outputs[0].output(0)
+                s[conv].set_scope('local')
+                OL = conv
+
+            # create cache stage
+            s[pad_data].set_scope('shared')
+            AA = pad_data
+            WW = s.cache_read(kernel, 'shared', [OL])
+
+            # tile and bind spatial axes
+            n, f, y, x = s[output].op.axis
+            kernel_scope, n = s[output].split(n, nparts=1)
+            bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
+            by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
+            bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
+
+            bf = s[output].fuse(n, bf)
+            s[output].bind(bf, tvm.thread_axis("blockIdx.z"))
+            s[output].bind(by, tvm.thread_axis("blockIdx.y"))
+            s[output].bind(bx, tvm.thread_axis("blockIdx.x"))
+            s[output].bind(vf, tvm.thread_axis("vthread"))
+            s[output].bind(vy, tvm.thread_axis("vthread"))
+            s[output].bind(vx, tvm.thread_axis("vthread"))
+            s[output].bind(tf, tvm.thread_axis("threadIdx.z"))
+            s[output].bind(ty, tvm.thread_axis("threadIdx.y"))
+            s[output].bind(tx, tvm.thread_axis("threadIdx.x"))
+            s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
+            s[OL].compute_at(s[output], tx)
+
+            # tile reduction axes
+            n, f, y, x = s[OL].op.axis
+            rc, ry, rx = s[OL].op.reduce_axis
+            rco, rcm, rci = cfg['tile_rc'].apply(s, OL, rc)
+            s[OL].reorder(rco, rcm, ry, rx, rci, n, f, y, x)
+
+            s[AA].compute_at(s[OL], rcm)
+            s[WW].compute_at(s[OL], rcm)
+
+            # cooperative fetching
+            for load in [AA, WW]:
+                n, f, y, x = s[load].op.axis
+                fused = s[load].fuse(n, f, y, x)
+                tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2])
+                ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2])
+                tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2])
+                s[load].bind(tz, tvm.thread_axis("threadIdx.z"))
+                s[load].bind(ty, tvm.thread_axis("threadIdx.y"))
+                s[load].bind(tx, tvm.thread_axis("threadIdx.x"))
+
+            s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
+            s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)
+
+    traverse_inline(s, outs[0].op, _callback)
+
+    return s
diff --git a/topi/python/topi/cuda/conv2d_winograd.py b/topi/python/topi/cuda/conv2d_winograd.py
new file mode 100644
index 000000000000..d32a87ba6b9d
--- /dev/null
+++ b/topi/python/topi/cuda/conv2d_winograd.py
@@ -0,0 +1,446 @@
+# pylint: disable=invalid-name,unused-variable,unused-argument
+"""Winograd template for cuda backend"""
+
+import numpy as np
+
+import tvm
+from tvm import autotvm
+
+from .. import nn
+from ..nn import conv2d, group_conv2d_nchw, conv2d_winograd_without_weight_transform
+from ..util import get_const_int, get_const_tuple, const_matrix, traverse_inline
+from ..generic import schedule_conv2d_winograd_without_weight_transform
+
+
+def _infer_tile_size(data, kernel):
+    N, CI, H, W = get_const_tuple(data.shape)
+
+    if H % 8 == 0:
+        return 4
+    return 2
+
+def winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, pre_computed):
+    """Compute declaration for winograd"""
+    assert layout == 'NCHW'
+
+    tile_size = _infer_tile_size(data, kernel)
+
+    N, CI, H, W = get_const_tuple(data.shape)
+
+    if not pre_computed: # kernel tensor is raw tensor, do strict check
+        if isinstance(dilation, int):
+            dilation_h = dilation_w = dilation
+        else:
+            dilation_h, dilation_w = dilation
+        if dilation_h != 1 or dilation_w != 1:
+            kernel = dilate(kernel, (1, 1, dilation_h, dilation_w))
+
+        CO, CI, KH, KW = get_const_tuple(kernel.shape)
+        HPAD, WPAD, _, _ = nn.get_pad_tuple(padding, kernel)
+        HSTR, WSTR = (strides, strides) if isinstance(strides, int) else strides
+        assert HSTR == 1 and WSTR == 1 and HPAD == 1 and WPAD == 1 and KH == 3 and KW == 3
+    else:                   # kernel tensor is pre-transfomred. this op is created by
+                            # alter op layout, do not check
+        # dilation is not supported
+        HSTR = WSTR = 1
+        HPAD = WPAD = 1
+        KH = KW = 3
+        _, _, CI, CO = get_const_tuple(kernel.shape)
+
+    data_pad = nn.pad(data, (0, 0, HPAD, WPAD), (0, 0, HPAD, WPAD), name="data_pad")
+
+    if tile_size == 4:
+        G_data = np.array([
+            [1 / 4.0, 0, 0],
+            [-1 / 6.0, -1 / 6.0, -1 / 6.0],
+            [-1 / 6.0, 1 / 6.0, -1 / 6.0],
+            [1 / 24.0, 1 / 12.0, 1 / 6.0],
+            [1 / 24.0, -1 / 12.0, 1 / 6.0],
+            [0, 0, 1]], dtype=np.float32)
+
+        B_data = np.array([
+            [4, 0, 0, 0, 0, 0],
+            [0, -4, 4, -2, 2, 4],
+            [-5, -4, -4, -1, -1, 0],
+            [0, 1, -1, 2, -2, -5],
+            [1, 1, 1, 1, 1, 0],
+            [0, 0, 0, 0, 0, 1]], out_dtype)
+
+        A_data = np.array([
+            [1, 0, 0, 0],
+            [1, 1, 1, 1],
+            [1, -1, 1, -1],
+            [1, 2, 4, 8],
+            [1, -2, 4, -8],
+            [0, 0, 0, 1]], out_dtype)
+    elif tile_size == 2:
+        G_data = np.array([
+            [1, 0, 0],
+            [1.0/2, 1.0/2, 1.0/2],
+            [1.0/2, -1.0/2, 1.0/2],
+            [0, 0, 1]], np.float32)
+
+        B_data = np.array([
+            [1, 0, 0, 0],
+            [0, 1, -1, 1],
+            [-1, 1, 1, 0],
+            [0, 0, 0, -1]], out_dtype)
+
+        A_data = np.array([
+            [1, 0],
+            [1, 1],
+            [1, -1],
+            [0, -1]], out_dtype)
+    else:
+        raise ValueError("Unsupported tile size for winograd: " + str(tile_size))
+
+    m = A_data.shape[1]
+    r = 3
+    alpha = m + r - 1
+    H = (H + 2 * HPAD - KH) // HSTR + 1
+    W = (W + 2 * WPAD - KW) // WSTR + 1
+    nH, nW = (H + m-1) // m, (W + m-1) // m
+    P = N * nH * nW
+
+    # transform kernel
+    if not pre_computed:
+        G = const_matrix(G_data, 'G')
+        r_kh = tvm.reduce_axis((0, KH), name='r_kh')
+        r_kw = tvm.reduce_axis((0, KW), name='r_kw')
+        kernel_pack = tvm.compute((alpha, alpha, CI, CO), lambda eps, nu, ci, co:
+                                  tvm.sum(kernel[co][ci][r_kh][r_kw] *
+                                          G[eps][r_kh] * G[nu][r_kw],
+                                          axis=[r_kh, r_kw]), name='kernel_pack')
+    else:
+        kernel_pack = kernel
+
+    # pack input tile
+    input_tile = tvm.compute((CI, P, alpha, alpha), lambda c, p, eps, nu:
+                             data_pad[p // (nH * nW)][c][p // nW % nH * m + eps]
+                             [p % nW * m + nu], name='d')
+
+    # transform data
+    B = const_matrix(B_data)
+    r_a = tvm.reduce_axis((0, alpha), 'r_a')
+    r_b = tvm.reduce_axis((0, alpha), 'r_a')
+    data_pack = tvm.compute((alpha, alpha, CI, P), lambda eps, nu, ci, p:
+                            tvm.sum(input_tile[ci][p][r_a][r_b] * B[r_a][eps] * B[r_b][nu],
+                                    axis=[r_a, r_b]), name='data_pack')
+
+    # do batch gemm
+    ci = tvm.reduce_axis((0, CI), name='ci')
+    bgemm = tvm.compute((alpha, alpha, CO, P), lambda eps, nu, co, p:
+                        tvm.sum(kernel_pack[eps][nu][ci][co] *
+                                data_pack[eps][nu][ci][p],
+                                axis=[ci]), name='bgemm')
+
+    # inverse transform
+    A = const_matrix(A_data)
+    r_a = tvm.reduce_axis((0, alpha), 'r_a')
+    r_b = tvm.reduce_axis((0, alpha), 'r_a')
+    inverse = tvm.compute((CO, P, m, m), lambda co, p, vh, vw:
+                          tvm.sum(bgemm[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw],
+                                  axis=[r_a, r_b]), name='inverse')
+
+    # output
+    output = tvm.compute((N, CO, H, W), lambda n, co, h, w:
+                         inverse[co][n * nH * nW + (h // m) * nW + w // m][h % m][w % m],
+                         name='output', tag='conv2d_nchw_winograd')
+    cfg.add_flop(2 * N * CO * H * W * CI * KH * KW)
+
+    return output
+
+
+def schedule_winograd_cuda(cfg, s, output, pre_computed):
+    """Schedule winograd template"""
+    # get stages
+    inverse = s[output].op.input_tensors[0]
+    bgemm, A = s[inverse].op.input_tensors
+    kernel_pack, data_pack = s[bgemm].op.input_tensors
+    input_tile, B = s[data_pack].op.input_tensors
+    pad_data = s[input_tile].op.input_tensors[0]
+
+    # data transform
+    s[B].compute_inline()
+
+    data_l = s.cache_write(data_pack, 'local')
+    eps, nu, c, p = s[data_l].op.axis
+    r_a, r_b = s[data_l].op.reduce_axis
+    for axis in [eps, nu, r_a, r_b]:
+        s[data_l].unroll(axis)
+
+    eps, nu, c, p = s[data_pack].op.axis
+    p, pi = s[data_pack].split(p, 1)
+    fused = s[data_pack].fuse(c, p)
+    bb, tt = s[data_pack].split(fused, 128)
+    s[data_pack].reorder(bb, tt, pi, eps, nu)
+    s[data_pack].bind(bb, tvm.thread_axis("blockIdx.x"))
+    s[data_pack].bind(tt, tvm.thread_axis("threadIdx.x"))
+
+    s[data_l].compute_at(s[data_pack], pi)
+    s[input_tile].compute_at(s[data_pack], pi)
+    s[pad_data].compute_inline()
+
+    # transform kernel
+    if not pre_computed:
+        kernel, G = s[kernel_pack].op.input_tensors
+        eps, nu, ci, co = s[kernel_pack].op.axis
+        if autotvm.GLOBAL_SCOPE.in_tuning:
+            # skip this part during tuning to make recrods accurate
+            # this part will be pre-computed during NNVM's pre-compute optimization pass
+            s[G].pragma(s[G].op.axis[0], 'debug_skip_region')
+            s[kernel_pack].pragma(eps, 'debug_skip_region')
+        else:
+            s[G].compute_inline()
+            r_a, r_b = s[kernel_pack].op.reduce_axis
+            for axis in [eps, nu, r_a, r_b]:
+                s[kernel_pack].unroll(axis)
+
+            fused = s[kernel_pack].fuse(ci, co)
+            bb, tt = s[kernel_pack].split(fused, 128)
+            s[kernel_pack].reorder(bb, tt, eps, nu, r_a, r_b)
+            s[kernel_pack].bind(bb, tvm.thread_axis("blockIdx.x"))
+            s[kernel_pack].bind(tt, tvm.thread_axis("threadIdx.x"))
+    else:
+        kernel = kernel_pack
+
+    if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
+        s[kernel].compute_inline()
+
+    ##### space definition begin #####
+    b1, b2, y, x = s[bgemm].op.axis
+    rc = s[bgemm].op.reduce_axis[0]
+    alpha = get_const_int(b1.dom.extent)
+
+    cfg.define_split("tile_b", cfg.axis(alpha * alpha), num_outputs=4,
+                     filter=lambda x: x.size[-3:] == [1, 1, 1])
+    cfg.define_split("tile_y", y, num_outputs=4)
+    cfg.define_split("tile_x", x, num_outputs=4)
+    cfg.define_split("tile_rc", rc, num_outputs=2)
+    cfg.define_knob("auto_unroll_max_step", [0, 128, 1500])
+    target = tvm.target.current_target()
+    if target.target_name in ['nvptx', 'rocm']:
+        cfg.define_knob("unroll_explicit", [1])
+    else:
+        cfg.define_knob("unroll_explicit", [0, 1])
+    ##### space definition end #####
+
+    # batch gemm
+    C = bgemm
+    A0, B0 = kernel_pack, data_pack
+
+    OL = s.cache_write(C, 'local')
+    AA = s.cache_read(A0, 'shared', [OL])
+    BB = s.cache_read(B0, 'shared', [OL])
+
+    b = s[bgemm].fuse(b1, b2)
+
+    # tile and bind spatial axes
+    bgemm_scope, b = s[bgemm].split(b, nparts=1)
+    bz, vz, tz, zi = cfg["tile_b"].apply(s, C, b)
+    by, vy, ty, yi = cfg["tile_y"].apply(s, C, y)
+    bx, vx, tx, xi = cfg["tile_x"].apply(s, C, x)
+    s[C].bind(bz, tvm.thread_axis("blockIdx.z"))
+    s[C].bind(by, tvm.thread_axis("blockIdx.y"))
+    s[C].bind(bx, tvm.thread_axis("blockIdx.x"))
+    s[C].bind(vz, tvm.thread_axis("vthread"))
+    s[C].bind(vy, tvm.thread_axis("vthread"))
+    s[C].bind(vx, tvm.thread_axis("vthread"))
+    s[C].bind(tz, tvm.thread_axis("threadIdx.z"))
+    s[C].bind(ty, tvm.thread_axis("threadIdx.y"))
+    s[C].bind(tx, tvm.thread_axis("threadIdx.x"))
+    s[C].reorder(bgemm_scope, bz, by, bx, vz, vy, vx, tz, ty, tx, zi, yi, xi)
+
+    # tile reduction axes
+    s[OL].compute_at(s[C], tx)
+    b1, b2, y, x = s[OL].op.axis
+    b = s[OL].fuse(b1, b2)
+    rc, = s[OL].op.reduce_axis
+    rco, rci = cfg['tile_rc'].apply(s, OL, rc)
+    s[OL].reorder(rco, rci, b, y, x)
+
+    s[AA].compute_at(s[OL], rco)
+    s[BB].compute_at(s[OL], rco)
+
+    # cooperative fetching
+    for load in [AA, BB]:
+        fused = s[load].fuse(*list(s[load].op.axis))
+        fused, tx = s[load].split(fused, cfg["tile_x"].size[2])
+        fused, ty = s[load].split(fused, cfg["tile_y"].size[2])
+        fused, tz = s[load].split(fused, cfg["tile_b"].size[2])
+        s[load].bind(tz, tvm.thread_axis("threadIdx.z"))
+        s[load].bind(ty, tvm.thread_axis("threadIdx.y"))
+        s[load].bind(tx, tvm.thread_axis("threadIdx.x"))
+
+    s[C].pragma(bgemm_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
+    s[C].pragma(bgemm_scope, 'unroll_explicit', cfg['unroll_explicit'].val)
+
+    # schedule inverse, output and fusion
+    if output.op in s.outputs:
+        OL = None
+    else:
+        OL = output
+        s[OL].set_scope('local')
+        output = s.outputs[0]
+
+    m = alpha - 3 + 1
+    n, co, h, w = s[output].op.axis
+    ho, wo, hi, wi = s[output].tile(h, w, m, m)
+    inverse_scope, n = s[output].split(n, nparts=1)
+
+    fused = s[output].fuse(n, co, ho, wo)
+    bb, tt = s[output].split(fused, 128)
+
+    s[output].bind(bb, tvm.thread_axis("blockIdx.x"))
+    s[output].bind(tt, tvm.thread_axis("threadIdx.x"))
+
+    if OL is not None:
+        s[OL].compute_at(s[output], tt)
+
+    s[A].compute_inline()
+    co, p, vh, vw = s[inverse].op.axis
+    r_a, r_b = s[inverse].op.reduce_axis
+    for axis in [vh, vw, r_a, r_b]:
+        s[inverse].unroll(axis)
+    s[inverse].compute_at(s[output], tt)
+
+    return s
+
+##### REGISTER TOPI COMPUTE / SCHEDULE FOR WINOGRAD WITH WEIGHT TRANSFORM #####
+@autotvm.register_topi_compute(conv2d_winograd_without_weight_transform,
+                               ['cuda', 'gpu'], ['winograd'])
+def conv2d_winograd_ww(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size):
+    return winograd_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dtype,
+                         pre_computed=True)
+
+
+@autotvm.register_topi_schedule(schedule_conv2d_winograd_without_weight_transform,
+                                ['cuda', 'gpu'], ['winograd'])
+def schedule_conv2d_winograd_without_weight_transform_cuda(cfg, outs):
+    """TOPI schedule callback"""
+    s = tvm.create_schedule([x.op for x in outs])
+
+    def _callback(op):
+        if 'conv2d_nchw_winograd' in op.tag:
+            schedule_winograd_cuda(cfg, s, op.output(0), pre_computed=True)
+
+    traverse_inline(s, outs[0].op, _callback)
+    return s
+
+
+##### REGISTER ALTER OP LAYOUT #####
+@nn.conv2d_alter_layout.register(["cuda", "gpu"])
+def _alter_conv2d_layout(attrs, inputs, tinfos):
+    """Alter op layout for pre-computing kernel transformation"""
+    if 'cudnn' in tvm.target.current_target().libs or 'miopen' in tvm.target.current_target().libs:
+        return None
+
+    import nnvm.symbol as sym
+    copy_inputs = [s for s in inputs]
+
+    new_attrs = {k: attrs[k] for k in attrs.keys()}
+
+    strides = attrs.get_int_tuple("strides")
+    padding = attrs.get_int_tuple("padding")
+    dilation = attrs.get_int_tuple("dilation")
+    groups = attrs.get_int('groups')
+    layout = attrs["layout"]
+    out_dtype = attrs["out_dtype"]
+    out_dtype = tinfos[0].dtype if out_dtype == "same" else out_dtype
+
+    data, kernel = tinfos[0:2]
+    N, CI, H, W = get_const_tuple(data.shape)
+    CO, _, KH, KW = get_const_tuple(kernel.shape)
+
+    dispatch_ctx = autotvm.DispatchContext.current
+    target = tvm.target.current_target()
+
+    if groups == 1:
+        # query config of this workload
+        workload = autotvm.task.args_to_workload(
+            [tinfos[0], tinfos[1], strides, padding, dilation, layout, out_dtype], conv2d)
+        cfg = autotvm.DispatchContext.current.query(target, workload)
+
+        if cfg.is_fallback:  # if is fallback, clear query cache and return None
+            autotvm.task.clear_fallback_cache(target, workload)
+            return None
+
+        if cfg.template_key == 'direct':
+            return None
+
+        if cfg.template_key == 'int8':
+            assert 'cuda' in target.keys
+            new_layout = 'NCHW4c'
+            new_attrs['layout'] = new_layout
+            new_attrs['out_layout'] = new_layout
+            new_attrs['kernel_layout'] = 'OIHW4o4i'
+            ic_block_factor = oc_block_factor = 4
+
+            # Store the same config for the altered operator (workload)
+            new_data = tvm.placeholder((N, CI // ic_block_factor, H, W, ic_block_factor),
+                                       dtype=data.dtype)
+            new_kernel = tvm.placeholder((CO // oc_block_factor, CI // ic_block_factor, KH, KW,\
+                                         oc_block_factor, ic_block_factor), dtype=kernel.dtype)
+            new_workload = autotvm.task.args_to_workload(
+                [new_data, new_kernel, strides, padding, dilation, new_layout, out_dtype],
+                conv2d
+            )
+            dispatch_ctx.update(target, new_workload, cfg)
+            return sym.conv2d(*copy_inputs, **new_attrs)
+
+        if attrs.get_int_tuple("dilation") != (1, 1):
+            warnings.warn("Does not support weight pre-transform for dilated convolution.")
+            return None
+
+        # pre-compute weight transformation in winograd
+        tile_size = _infer_tile_size(tinfos[0], tinfos[1])
+
+        weight = sym.contrib.conv2d_winograd_weight_transform(copy_inputs[1],
+                                                              tile_size=tile_size)
+        weight = sym.transpose(weight, axes=[0, 1, 3, 2])
+        copy_inputs[1] = weight
+        new_attrs['tile_size'] = tile_size
+
+        # Store the same config for the altered operator (workload)
+        new_data = data
+        new_weight = tvm.placeholder((KH + tile_size - 1, KW + tile_size - 1, CI, CO),
+                                     dtype=kernel.dtype)
+        new_workload = autotvm.task.args_to_workload(
+            [new_data, new_weight, strides, padding, dilation, layout, out_dtype, tile_size],
+            conv2d_winograd_without_weight_transform
+        )
+        dispatch_ctx.update(target, new_workload, cfg)
+        return sym.contrib.conv2d_winograd_without_weight_transform(*copy_inputs, **new_attrs)
+    elif groups != CI:
+        workload = autotvm.task.args_to_workload(
+            [tinfos[0], tinfos[1], strides, padding, dilation, groups, out_dtype],
+            group_conv2d_nchw)
+        cfg = autotvm.DispatchContext.current.query(target, workload)
+
+        if cfg.is_fallback:  # if is fallback, clear query cache and return None
+            autotvm.task.clear_fallback_cache(target, workload)
+            return None
+
+        if cfg.template_key == 'int8':
+            assert 'cuda' in target.keys
+            new_layout = 'NCHW4c'
+            new_attrs['layout'] = new_layout
+            new_attrs['out_layout'] = new_layout
+            new_attrs['kernel_layout'] = 'OIHW4o4i'
+            ic_block_factor = oc_block_factor = 4
+
+            # Store the same config for the altered operator (workload)
+            new_data = tvm.placeholder((N, CI // ic_block_factor, H, W, ic_block_factor),
+                                       dtype=data.dtype)
+            new_kernel = tvm.placeholder((CO // oc_block_factor, CI // ic_block_factor // groups,\
+                                         KH, KW, oc_block_factor, ic_block_factor),
+                                         dtype=kernel.dtype)
+            new_workload = autotvm.task.args_to_workload(
+                [new_data, new_kernel, strides, padding, dilation, groups, out_dtype],
+                group_conv2d_nchw
+            )
+            dispatch_ctx.update(target, new_workload, cfg)
+            return sym.conv2d(*copy_inputs, **new_attrs)
+
+    # do nothing for depthwise convolution
+    return None
diff --git a/topi/python/topi/cuda/depthwise_conv2d.py b/topi/python/topi/cuda/depthwise_conv2d.py
index 94fa5c7e79ca..0214ed78b4e7 100644
--- a/topi/python/topi/cuda/depthwise_conv2d.py
+++ b/topi/python/topi/cuda/depthwise_conv2d.py
@@ -1,12 +1,17 @@
 # pylint: disable=invalid-name
 """Schedule for depthwise_conv2d with auto fusion"""
 import tvm
-from ..util import get_const_tuple
+from tvm import autotvm
+from ..util import traverse_inline
 from .. import tag
-from .. import generic
+from .. import generic, nn
 
-@generic.schedule_depthwise_conv2d_nchw.register(["cuda", "gpu"])
-def schedule_depthwise_conv2d_nchw(outs):
+# register original implementation of depthwise_conv2d_nchw since we don't need to change this part
+autotvm.register_topi_compute(nn.depthwise_conv2d_nchw, ['cuda', 'gpu'], 'direct',
+                              nn.depthwise_conv2d_nchw.fdefault)
+
+@autotvm.register_topi_schedule(generic.schedule_depthwise_conv2d_nchw, ['cuda', 'gpu'], 'direct')
+def schedule_depthwise_conv2d_nchw_cuda(cfg, outs):
     """Schedule for depthwise_conv2d nchw forward.
 
     Parameters
@@ -22,108 +27,92 @@ def schedule_depthwise_conv2d_nchw(outs):
     """
     outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
     s = tvm.create_schedule([x.op for x in outs])
-    def _schedule(PaddedInput, Filter, DepthwiseConv2d):
-        in_shape = get_const_tuple(PaddedInput.shape)
-        out_shape = get_const_tuple(DepthwiseConv2d.shape)
-        in_height = in_shape[2]
-        in_width = in_shape[3]
-        out_height = out_shape[2]
-        out_width = out_shape[3]
-        channel_multiplier = get_const_tuple(Filter.shape)[1]
-        s[PaddedInput].compute_inline()
-        IS = s.cache_read(PaddedInput, "shared", [DepthwiseConv2d])
-        FS = s.cache_read(Filter, "shared", [DepthwiseConv2d])
-        IL = s.cache_read(IS, "local", [DepthwiseConv2d])
-        FL = s.cache_read(FS, "local", [DepthwiseConv2d])
-        if DepthwiseConv2d.op in s.outputs:
-            Output = DepthwiseConv2d
-            CL = s.cache_write(DepthwiseConv2d, "local")
-        else:
-            Output = outs[0].op.output(0)
-            s[DepthwiseConv2d].set_scope("local")
-        # schedule parameters
-        num_thread_y = 8
-        num_thread_x = 8
-        num_vthread_y = 1
-        num_vthread_x = 1
-        blocking_h = out_height
-        blocking_w = out_width
-        if out_height % 32 == 0 or in_height >= 108:
-            blocking_h = 32
-        if out_width % 32 == 0:
-            blocking_w = 32
-            num_thread_x = 16
-            num_vthread_x = 2
-        elif in_width >= 108:
-            blocking_w = 32
-        block_y = tvm.thread_axis("blockIdx.y")
-        block_x = tvm.thread_axis("blockIdx.x")
-        thread_y = tvm.thread_axis((0, num_thread_y), "threadIdx.y")
-        thread_x = tvm.thread_axis((0, num_thread_x), "threadIdx.x")
-        thread_vy = tvm.thread_axis((0, num_vthread_y), "vthread", name="vy")
-        thread_vx = tvm.thread_axis((0, num_vthread_x), "vthread", name="vx")
-        # split and bind
-        by, byi = s[Output].split(Output.op.axis[1], factor=channel_multiplier)
-        s[Output].reorder(Output.op.axis[2], Output.op.axis[3], byi)
-        by = s[Output].fuse(Output.op.axis[0], by)
-        s[Output].bind(by, block_y)
-        bx1, x1i = s[Output].split(Output.op.axis[2], factor=blocking_h)
-        tvy, vyi = s[Output].split(x1i, nparts=num_vthread_y)
-        ty, yi = s[Output].split(vyi, nparts=num_thread_y)
-        bx2, x2i = s[Output].split(Output.op.axis[3], factor=blocking_w)
-        tvx, vxi = s[Output].split(x2i, nparts=num_vthread_x)
-        tx, xi = s[Output].split(vxi, nparts=num_thread_x)
-        s[Output].reorder(bx1, bx2, tvy, tvx, ty, tx, yi, xi)
-        bx = s[Output].fuse(bx1, bx2)
-        s[Output].bind(bx, block_x)
-        s[Output].bind(tvy, thread_vy)
-        s[Output].bind(tvx, thread_vx)
-        s[Output].bind(ty, thread_y)
-        s[Output].bind(tx, thread_x)
-        # local memory load
-        s[IL].compute_at(s[Output], tx)
-        s[FL].compute_at(s[Output], tx)
-        if DepthwiseConv2d.op in s.outputs:
-            s[CL].compute_at(s[Output], tx)
-        else:
-            s[DepthwiseConv2d].compute_at(s[Output], tx)
-        # input's shared memory load
-        s[IS].compute_at(s[Output], bx)
-        ty, yi = s[IS].split(IS.op.axis[2], nparts=num_thread_y)
-        tx, xi = s[IS].split(IS.op.axis[3], nparts=num_thread_x)
-        s[IS].bind(ty, thread_y)
-        s[IS].bind(tx, thread_x)
-        # filter's shared memory load
-        s[FS].compute_at(s[Output], bx)
-        s[FS].reorder(FS.op.axis[2], FS.op.axis[3], FS.op.axis[1])
-        ty, yi = s[FS].split(FS.op.axis[2], nparts=num_thread_y)
-        tx, xi = s[FS].split(FS.op.axis[3], nparts=num_thread_x)
-        s[FS].bind(ty, thread_y)
-        s[FS].bind(tx, thread_x)
 
-    scheduled_ops = []
-
-    def traverse(OP):
-        """Internal travserse function"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_broadcast(OP.tag):
-            if OP not in s.outputs:
-                s[OP].compute_inline()
-            for tensor in OP.input_tensors:
-                if tensor.op.input_tensors and tensor.op not in scheduled_ops:
-                    traverse(tensor.op)
-        # schedule depthwise_conv2d
-        if OP.tag == 'depthwise_conv2d_nchw':
-            PaddedInput = OP.input_tensors[0]
-            Filter = OP.input_tensors[1]
-            if isinstance(Filter.op, tvm.tensor.ComputeOp) and 'dilate' in Filter.op.tag:
-                s[Filter].compute_inline()
-            DepthwiseConv2d = OP.output(0)
-            _schedule(PaddedInput, Filter, DepthwiseConv2d)
-
-        scheduled_ops.append(OP)
-
-    traverse(outs[0].op)
+    def _callback(op):
+        if op.tag == 'depthwise_conv2d_nchw':
+            pad_data = op.input_tensors[0]
+            kernel = op.input_tensors[1]
+            conv = op.output(0)
+
+            ##### space definition begin #####
+            n, f, y, x = s[conv].op.axis
+            cfg.define_split("tile_f", f, num_outputs=4)
+            cfg.define_split("tile_y", y, num_outputs=4)
+            cfg.define_split("tile_x", x, num_outputs=4)
+            cfg.define_knob("auto_unroll_max_step", [0, 256, 1500])
+
+            target = tvm.target.current_target()
+            if target.target_name in ['nvptx', 'rocm']:
+                cfg.define_knob("unroll_explicit", [1])
+            else:
+                cfg.define_knob("unroll_explicit", [0, 1])
+
+            # fallback support
+            if cfg.is_fallback:
+                ref_log = autotvm.tophub.load_reference_log(
+                    target.target_name, target.model, 'depthwise_conv2d_nchw', 'direct')
+                cfg.fallback_with_reference_log(ref_log)
+                # TODO(lmzheng): A bug here, set unroll_explicit to False as workaround
+                cfg['unroll_explicit'].val = 0
+            ##### space definition end #####
+
+            s[pad_data].compute_inline()
+            if isinstance(kernel.op, tvm.tensor.ComputeOp) and 'dilate' in kernel.op.tag:
+                s[kernel].compute_inline()
+
+            if conv.op in s.outputs:
+                output = conv
+                OL = s.cache_write(conv, 'local')
+            else:
+                output = s.outputs[0].output(0)
+                s[conv].set_scope('local')
+                OL = conv
+
+            # create cache stage
+            AA = s.cache_read(pad_data, 'shared', [OL])
+            WW = s.cache_read(kernel, 'shared', [OL])
+            AL = s.cache_read(AA, 'local', [OL])
+            WL = s.cache_read(WW, 'local', [OL])
+
+            # tile and bind spatial axes
+            n, f, y, x = s[output].op.axis
+            bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
+            by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
+            bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
+
+            kernel_scope, n = s[output].split(n, nparts=1)
+            bf = s[output].fuse(n, bf)
+            s[output].bind(bf, tvm.thread_axis("blockIdx.z"))
+            s[output].bind(by, tvm.thread_axis("blockIdx.y"))
+            s[output].bind(bx, tvm.thread_axis("blockIdx.x"))
+            s[output].bind(vf, tvm.thread_axis("vthread"))
+            s[output].bind(vy, tvm.thread_axis("vthread"))
+            s[output].bind(vx, tvm.thread_axis("vthread"))
+            s[output].bind(tf, tvm.thread_axis("threadIdx.z"))
+            s[output].bind(ty, tvm.thread_axis("threadIdx.y"))
+            s[output].bind(tx, tvm.thread_axis("threadIdx.x"))
+            s[output].reorder(bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
+            s[OL].compute_at(s[output], tx)
+
+            # cooperative fetching
+            s[AA].compute_at(s[output], bx)
+            s[WW].compute_at(s[output], bx)
+            s[AL].compute_at(s[output], tx)
+            s[WL].compute_at(s[output], tx)
+
+            for load in [AA, WW]:
+                fused = s[load].fuse(*list(s[load].op.axis))
+                fused, tx = s[load].split(fused, cfg["tile_x"].size[2])
+                fused, ty = s[load].split(fused, cfg["tile_y"].size[2])
+                fused, tz = s[load].split(fused, cfg["tile_f"].size[2])
+                s[load].bind(tz, tvm.thread_axis("threadIdx.z"))
+                s[load].bind(ty, tvm.thread_axis("threadIdx.y"))
+                s[load].bind(tx, tvm.thread_axis("threadIdx.x"))
+
+            s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
+            s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)
+
+    traverse_inline(s, outs[0].op, _callback)
     return s
 
 @generic.schedule_depthwise_conv2d_nhwc.register(["cuda", "gpu"])
@@ -143,8 +132,8 @@ def schedule_depthwise_conv2d_nhwc(outs):
     """
     outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
     s = tvm.create_schedule([x.op for x in outs])
-    def _schedule(temp, Filter, DepthwiseConv2d):
 
+    def _schedule(temp, Filter, DepthwiseConv2d):
         s[temp].compute_inline()
         FS = s.cache_read(Filter, "shared", [DepthwiseConv2d])
         if DepthwiseConv2d.op in s.outputs:
diff --git a/topi/python/topi/cuda/group_conv2d_nchw.py b/topi/python/topi/cuda/group_conv2d_nchw.py
new file mode 100644
index 000000000000..739691131284
--- /dev/null
+++ b/topi/python/topi/cuda/group_conv2d_nchw.py
@@ -0,0 +1,308 @@
+# pylint: disable=invalid-name
+"""The template for cuda group_conv2d_nchw"""
+import tvm
+from tvm import autotvm
+
+from .injective import _schedule_injective
+from .tensor_intrin import dp4a
+from ..nn.pad import pad
+from ..nn.util import get_pad_tuple
+from ..util import traverse_inline, get_const_tuple, get_const_int
+from .. import nn, generic
+
+
+@autotvm.register_topi_compute(nn.group_conv2d_nchw, ['cuda', 'gpu'], ['direct', 'int8'])
+def group_conv2d_nchw_cuda(cfg, data, kernel, stride, padding, dilation, groups,
+                           out_dtype='float32'):
+    """Group convolution operator in NCHW layout.
+
+    Parameters
+    ----------
+    data : tvm.Tensor
+        4-D with shape [batch, in_channel, in_height, in_width] or
+        5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block]
+
+    kernel : tvm.Tensor
+        4-D with shape [num_filter, in_channel // groups, filter_height, filter_width] or
+        6-D with shape [num_filter_chunk, in_channel_chunk // groups, filter_height,
+        filter_width, num_filter_block, in_channel_block]
+
+    stride : int or a list/tuple of two ints
+        Stride size, or [stride_height, stride_width]
+
+    padding : int or str
+        Padding size, or ['VALID', 'SAME']
+
+    dilation : int or a list/tuple of two ints
+        dilation size, or [dilation_height, dilation_width]
+
+    groups : int
+        number of groups
+
+    out_dtype : str
+        The output type. This is used for mixed precision.
+
+    Returns
+    -------
+    Output : tvm.Tensor
+        5-D with shape [batch, out_channel, out_height, out_width, out_channel_block]
+    """
+    ic_block_factor = 4
+    oc_block_factor = 4
+
+    pre_computed = len(kernel.shape) == 6
+    if not pre_computed:
+        batch, channels, height, width = get_const_tuple(data.shape)
+        out_channels, in_channels, kernel_h, kernel_w = get_const_tuple(
+            kernel.shape)
+
+        assert channels % groups == 0, "input channels must divide group size"
+        assert out_channels % groups == 0, "output channels must divide group size"
+        assert channels % ic_block_factor == 0, \
+            "Number of input channels per group must divide {}".format(ic_block_factor)
+        assert out_channels % 4 == 0, \
+            "Number of output channels per group must divide {}".format(oc_block_factor)
+
+        packed_data = tvm.compute((batch, channels // ic_block_factor, height, width,
+                                   ic_block_factor),
+                                  lambda n, c, h, w, vc: data[n, c*ic_block_factor + vc, h, w],
+                                  name="packed_data")
+        packed_kernel = tvm.compute(
+            (out_channels // oc_block_factor, in_channels // ic_block_factor, kernel_h, kernel_w,
+             oc_block_factor, ic_block_factor),
+            lambda oc_chunk, ic_chunk, kh, kw, oc_block, ic_block:
+            kernel[oc_chunk * oc_block_factor + oc_block,
+                   ic_chunk * ic_block_factor + ic_block, kh, kw],
+            name="packed_kernel")
+    else:
+        packed_data = data
+        packed_kernel = kernel
+
+    batch, ic_chunk, in_height, in_width, _ = get_const_tuple(
+        packed_data.shape)
+    oc_chunk, _, kernel_h, kernel_w, oc_block, ic_block = get_const_tuple(
+        packed_kernel.shape)
+
+    if isinstance(stride, int):
+        stride_h = stride_w = stride
+    else:
+        stride_h, stride_w = stride
+
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+
+    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
+        padding, (kernel_h, kernel_w))
+    # compute graph
+    pad_before = [0, 0, pad_top, pad_left, 0]
+    pad_after = [0, 0, pad_down, pad_right, 0]
+    pad_data = pad(packed_data, pad_before, pad_after, name="pad_data")
+
+    # compute the output shape
+    out_height = (in_height - (kernel_h - 1) * dilation_h -
+                  1 + pad_top + pad_down) // stride_h + 1
+    out_width = (in_width - (kernel_w - 1) * dilation_w -
+                 1 + pad_left + pad_right) // stride_w + 1
+
+    oshape = (batch, oc_chunk, out_height, out_width, oc_block)
+
+    icc = tvm.reduce_axis((0, ic_chunk // groups), name='ic_chunk')
+    icb = tvm.reduce_axis((0, ic_block_factor), name='ic_block')
+    kh = tvm.reduce_axis((0, kernel_h), name='kh')
+    kw = tvm.reduce_axis((0, kernel_w), name='kw')
+
+    conv = tvm.compute(oshape, lambda n, occ, oh, ow, ocb:
+                       tvm.sum(pad_data[n, occ//(oc_chunk//groups)*(ic_chunk//groups)+icc,
+                                        oh*stride_h+kh*dilation_h, ow*stride_w+kw*dilation_w, icb]
+                               .astype('int32') *
+                               packed_kernel[occ, icc,
+                                             kh, kw, ocb, icb]
+                               .astype('int32'),
+                               axis=[icc, kh, kw, icb]))
+
+    output = tvm.compute(oshape, lambda *index: conv(*index).astype(out_dtype),
+                         tag='group_conv2d_NCHWc_int8')
+    num_flop = batch * oc_chunk * oc_block * out_height * out_width * \
+        ic_chunk * ic_block * kernel_h * kernel_w * 2 // groups
+    cfg.add_flop(num_flop)
+
+    return output
+
+
+_dp4a = dp4a('shared', 'shared', 'local')
+
+
+def schedule_group_conv2d_NCHWc_int8(cfg, s, output):
+    """Schedule group conv2d int8 NCHWc template"""
+    workload = output.op.attrs["workload"]
+    groups = get_const_int(workload[6])
+
+    conv = output.op.input_tensors[0]
+    packed_data, packed_kernel = conv.op.input_tensors
+
+    if isinstance(packed_data.op, tvm.tensor.ComputeOp) and "pad" in packed_data.op.tag:
+        pad_data = packed_data
+        packed_data = pad_data.op.input_tensors[0]
+    else:
+        pad_data = packed_data
+
+    if autotvm.GLOBAL_SCOPE.in_tuning:
+        # skip this part during tuning to make records accurate
+        # this part will be pre-computed during NNVM's pre-compute optimization pass
+        s[packed_data].pragma(s[packed_data].op.axis[0], "debug_skip_region")
+        s[packed_kernel].pragma(
+            s[packed_kernel].op.axis[0], "debug_skip_region")
+    else:
+        if isinstance(packed_kernel.op, tvm.tensor.ComputeOp) and\
+                packed_kernel.name == 'packed_kernel':
+            # data and kernel are not pre-computed, schedule layout transform here
+            _schedule_injective(packed_data.op, s)
+            _schedule_injective(packed_kernel.op, s)
+
+    if pad_data != packed_data:
+        s[pad_data].compute_inline()
+
+    # create cache stage
+    AA = s.cache_read(pad_data, 'shared', [conv])
+    WW = s.cache_read(packed_kernel, 'shared', [conv])
+
+    s[conv].set_scope('local')
+
+    # handle bias
+    if output.op not in s.outputs:
+        s[output].compute_inline()
+        output = s.outputs[0].output(0)
+
+    oc_chunk = get_const_int(output.shape[1])
+    # tile and bind spatial axes
+    n, f, y, x, c = s[output].op.axis
+    cfg.define_split("tile_n", n, num_outputs=4)
+    cfg.define_split("tile_g", cfg.axis(groups), num_outputs=2)
+    cfg.define_split("tile_f", cfg.axis(oc_chunk // groups), num_outputs=4)
+    cfg.define_split("tile_y", y, num_outputs=4)
+    cfg.define_split("tile_x", x, num_outputs=4)
+
+    # this is the scope to attach global config inside this kernel
+    kernel_scope, n = s[output].split(n, nparts=1)
+
+    g, f = s[output].split(f, nparts=groups)
+    s[output].bind(n, tvm.thread_axis('blockIdx.z'))
+    bn, vn, tn, ni = cfg["tile_n"].apply(s, output, n)
+    bg, vg = cfg["tile_g"].apply(s, output, g)
+    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
+    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
+    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
+
+    s[output].reorder(bn, bg, bf, by, bx, vn, vg, vf, vy,
+                      vx, tn, tf, ty, tx, ni, fi, yi, xi)
+    s[output].bind(bn, tvm.thread_axis("blockIdx.z"))
+    s[output].bind(s[output].fuse(bg, bf), tvm.thread_axis("blockIdx.y"))
+    s[output].bind(s[output].fuse(by, bx), tvm.thread_axis("blockIdx.x"))
+    s[output].bind(vn, tvm.thread_axis("vthread"))
+    s[output].bind(vg, tvm.thread_axis("vthread"))
+    s[output].bind(vf, tvm.thread_axis("vthread"))
+    s[output].bind(vy, tvm.thread_axis("vthread"))
+    s[output].bind(vx, tvm.thread_axis("vthread"))
+    cfg.define_knob("fuse_yx", [0, 1])  # fuse ty,tx or tn,tf
+    if cfg["fuse_yx"].val:
+        s[output].bind(tn, tvm.thread_axis("threadIdx.z"))
+        s[output].bind(tf, tvm.thread_axis("threadIdx.y"))
+        tyx = s[output].fuse(ty, tx)
+        s[output].bind(tyx, tvm.thread_axis("threadIdx.x"))
+        s[conv].compute_at(s[output], tyx)
+
+        # number of threads
+        n_tz = cfg["tile_n"].size[2]
+        n_ty = cfg["tile_f"].size[2]
+        n_tx = cfg["tile_y"].size[2] * cfg["tile_x"].size[2]
+    else:
+        s[output].bind(tn, tvm.thread_axis("threadIdx.z"))
+        s[output].bind(s[output].fuse(tn, tf), tvm.thread_axis("threadIdx.z"))
+        s[output].bind(ty, tvm.thread_axis("threadIdx.y"))
+        s[output].bind(tx, tvm.thread_axis("threadIdx.x"))
+        s[conv].compute_at(s[output], tx)
+
+        # number of threads
+        n_tz = cfg["tile_n"].size[2] * cfg["tile_f"].size[2]
+        n_ty = cfg["tile_y"].size[2]
+        n_tx = cfg["tile_x"].size[2]
+
+    # tile and bind reduction axes
+    n, f, y, x, c = s[conv].op.axis
+    rc, ry, rx, rc_block = s[conv].op.reduce_axis
+    cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=2)
+    cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=2)
+    cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=2)
+    rco, rci = cfg['tile_rc'].apply(s, conv, rc)
+    ryo, ryi = cfg['tile_ry'].apply(s, conv, ry)
+    rxo, rxi = cfg['tile_rx'].apply(s, conv, rx)
+
+    s[conv].reorder(rco, ryo, rxo, rci, ryi, rxi, n, f, y, x, c, rc_block)
+    _, rc_block = s[conv].split(rc_block, factor=4)
+    s[conv].tensorize(rc_block, _dp4a)
+
+    s[AA].compute_at(s[conv], rxo)
+    s[WW].compute_at(s[conv], rxo)
+
+    # cooperative fetching
+    for load in [AA, WW]:
+        c = s[load].op.axis[-1]
+        c_outer, c = s[load].split(c, factor=4)
+        s[load].vectorize(c)
+        fused = s[load].op.axis[:-1] + [c_outer]
+        fused = s[load].fuse(*fused)
+
+        fused, tx = s[load].split(fused, factor=n_tx)
+        fused, ty = s[load].split(fused, factor=n_ty)
+        fused, tz = s[load].split(fused, factor=n_tz)
+        s[load].bind(tz, tvm.thread_axis("threadIdx.z"))
+        s[load].bind(ty, tvm.thread_axis("threadIdx.y"))
+        s[load].bind(tx, tvm.thread_axis("threadIdx.x"))
+
+    # double buffer
+    cfg.define_knob('AA_double_buffer', [0, 1])
+    cfg.define_knob('WW_double_buffer', [0, 1])
+    if cfg['AA_double_buffer'].val:
+        s[AA].double_buffer()
+    if cfg['WW_double_buffer'].val:
+        s[WW].double_buffer()
+
+    # unroll
+    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
+    s[output].pragma(kernel_scope, 'auto_unroll_max_step',
+                     cfg['auto_unroll_max_step'].val)
+    s[output].pragma(kernel_scope, 'unroll_explicit', False)
+
+    return s
+
+
+@autotvm.register_topi_schedule(generic.schedule_group_conv2d_nchw,
+                                ["cuda", "gpu"], ["direct", "int8"])
+def schedule_conv2d_nchw_cuda(cfg, outs):
+    """TOPI schedule callback of group conv2d for cuda gpu
+
+    Parameters
+    ----------
+    cfg: ConfigEntity
+        The config for this template
+
+    outs: Array of Tensor
+        The computation graph description of conv2d
+        in the format of an array of tensors.
+
+    Returns
+    -------
+    s: Schedule
+        The computation schedule for group conv2d.
+    """
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    s = tvm.create_schedule([x.op for x in outs])
+
+    def _callback(op):
+        if op.tag == "group_conv2d_NCHWc_int8":
+            schedule_group_conv2d_NCHWc_int8(cfg, s, op.output(0))
+
+    traverse_inline(s, outs[0].op, _callback)
+    return s
diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py
index 4d4e402de5c2..361208bf1cfb 100644
--- a/topi/python/topi/cuda/nms.py
+++ b/topi/python/topi/cuda/nms.py
@@ -7,19 +7,155 @@
 from topi.vision import nms
 
 
-def sort_ir(data, index, output, axis, is_descend):
-    """Low level IR to do sorting on the GPU, same usage as tvm.contrib.sort.argsort on the CPU.
+def sort_pre_ir(index, sizes_out, axis_mul_before, axis_mul_after):
+    """Low level IR routing subfunction 1/4 for computing segments' staring locatons.
+
+    Parameters
+    ----------
+    index : Buffer
+        Buffer of number of valid output boxes.
+
+    sizes_out : Buffer
+        Output buffer of start locations of each sorting segment.
+
+    axis_mul_before : int
+        The multiplication result of axis dimensions before axis.
+
+    axis_mul_after : int
+        The multiplication result of axis dimensions after axis.
+
+    Returns
+    -------
+    stmt : Stmt
+        The result IR statement.
+    """
+    max_threads = int(
+        tvm.target.current_target(allow_none=False).max_num_threads)
+    tx = tvm.thread_axis("threadIdx.x")
+    bx = tvm.thread_axis("blockIdx.x")
+    ib = tvm.ir_builder.create()
+    p_index = ib.buffer_ptr(index)
+    dshape = sizes_out.shape
+    sizes = ib.buffer_ptr(sizes_out)
+    nthread_tx = max_threads
+    nthread_bx = dshape[0] // max_threads + 1
+    ib.scope_attr(tx, "thread_extent", nthread_tx)
+    ib.scope_attr(bx, "thread_extent", nthread_bx)
+    tid = bx * max_threads + tx
+
+    with ib.if_scope(tid < axis_mul_before * axis_mul_after):
+        sizes[tid] = p_index[tid]
+
+    # scan
+    with ib.if_scope(tid < 1):
+        with ib.for_range(0, axis_mul_before * axis_mul_after - 1, name="k") as k:
+            sizes[k + 1] += sizes[k]
+    body = ib.get()
+    return body
+
+
+def sort_pre_ir_data(data, index, sizes_in, data_out, index_out, \
+                     axis, axis_mul_before, axis_mul_after):
+    """Low level IR routing subfunction 2/4 for flattening data and indices into segmented format.
 
     Parameters
     ----------
     data: Buffer
-        2D Buffer of input boxes' score with shape [batch_size, num_anchors].
+        Buffer of output boxes with class and score.
 
     index : Buffer
-        Buffer of number of valid number of boxes.
+        Buffer of number of valid output boxes.
 
-    output : Buffer
-        Output buffer of indicies of sorted tensor.
+    sizes_in : Buffer
+        Buffer of start locations of each sorting segment.
+
+    data_out : Buffer
+        Buffer of flattened segmented data.
+
+    index_out : Buffer
+        Buffer of flattened segmented indices.
+
+    axis : int
+        The axis used for sorting.
+
+    axis_mul_before : int
+        The multiplication result of axis dimensions before axis.
+
+    axis_mul_after : int
+        The multiplication result of axis dimensions after axis.
+
+    Returns
+    -------
+    stmt : Stmt
+        The result IR statement.
+    """
+    ib = tvm.ir_builder.create()
+    sizes = ib.buffer_ptr(sizes_in)
+    p_index = ib.buffer_ptr(index)
+    p_data = ib.buffer_ptr(data)
+    data_new = ib.buffer_ptr(data_out)
+    index_new = ib.buffer_ptr(index_out)
+    max_threads = int(
+        tvm.target.current_target(allow_none=False).max_num_threads)
+    tx = tvm.thread_axis("threadIdx.x")
+    bx = tvm.thread_axis("blockIdx.x")
+    dshape = tvm.max(sizes_in.shape[0], p_index[0])
+    nthread_tx = max_threads
+    nthread_bx = dshape // max_threads + 1
+    ib.scope_attr(tx, "thread_extent", nthread_tx)
+    ib.scope_attr(bx, "thread_extent", nthread_bx)
+    tid = bx * max_threads + tx
+    with ib.if_scope(axis_mul_before * axis_mul_after > 1):
+        with ib.if_scope(tid < axis_mul_before * axis_mul_after):
+            i = tid / axis_mul_after
+            j = tid % axis_mul_after
+            current_sort_num = p_index[tid]
+            base_idx = i * data.shape[axis] * axis_mul_after + j
+            with ib.for_range(0, current_sort_num, name="k") as k:
+                full_idx = base_idx + k * axis_mul_after
+                with ib.if_scope(tid == 0):
+                    start = 0
+                with ib.else_scope():
+                    start = sizes[tid-1]
+                index_new[start + k] = k
+                data_new[start + k] = p_data[full_idx]
+    with ib.else_scope():
+        with ib.if_scope(tid == 0):
+            with ib.for_range(0, p_index[0], name="k") as k:
+                index_new[k] = k
+
+    body = ib.get()
+    return body
+
+def sort_oet_ir(data, index, new_data, new_index, loc, out_index, axis_mul_before, \
+                axis_mul_after, axis, is_descend):
+    """Low level IR routing subfunction 3/4 for Odd-Even-Transposition sorting.
+
+    Parameters
+    ----------
+    data: Buffer
+        Buffer of output boxes with class and score.
+
+    index : Buffer
+        Buffer of number of valid output boxes.
+
+    new_data : Buffer
+        Buffer of flattened segmented data.
+
+    new_index : Buffer
+        Buffer of flattened segmented indices.
+
+    loc : Buffer
+        Buffer of start locations of each sorting segment.
+
+    out_index : Buffer
+        Output buffer of output box indexes sorted by score in a flattened segmented format.
+
+    axis_mul_before : int
+        The multiplication result of axis dimensions before axis.
+
+    axis_mul_after : int
+        The multiplication result of axis dimensions after axis.
 
     axis : int
         The axis used for sorting.
@@ -32,15 +168,197 @@ def sort_ir(data, index, output, axis, is_descend):
     stmt : Stmt
         The result IR statement.
     """
-
     max_threads = int(
         tvm.target.current_target(allow_none=False).max_num_threads)
     tx = tvm.thread_axis("threadIdx.x")
     bx = tvm.thread_axis("blockIdx.x")
     ib = tvm.ir_builder.create()
+    dshape = loc.shape
+    fshape = data.shape[axis] * dshape[0]
+    temp_data = ib.allocate(
+        "float32", dshape, name="temp_data", scope="local")
     p_data = ib.buffer_ptr(data)
     p_index = ib.buffer_ptr(index)
+    data_new = ib.buffer_ptr(new_data)
+    index_new = ib.buffer_ptr(new_index)
+    index_out = ib.buffer_ptr(out_index)
+    sizes = ib.buffer_ptr(loc)
+    nthread_tx = max_threads
+    nthread_bx = fshape // max_threads + 1
+    ib.scope_attr(tx, "thread_extent", nthread_tx)
+    ib.scope_attr(bx, "thread_extent", nthread_bx)
+    tid = bx * max_threads + tx
+
+    with ib.if_scope(axis_mul_before * axis_mul_after > 1):
+        with ib.if_scope(tid < axis_mul_before * axis_mul_after):
+            with ib.if_scope(tid == 0):
+                start = 0
+            with ib.else_scope():
+                start = sizes[tid-1]
+            # OddEvenTransposeSort
+            with ib.for_range(0, p_index[tid], name="k") as k:
+                with ib.for_range(0, p_index[tid] - 1, name="i") as i:
+                    with ib.if_scope(i % 2 == k % 2):
+                        with ib.if_scope(((data_new[i+start] < data_new[i+start+1]) == is_descend)):
+                            temp_data[tid] = data_new[i+start]
+                            data_new[i+start] = data_new[i+start+1]
+                            data_new[i+start+1] = temp_data[tid]
+                            index_out[tid] = index_new[i+start]
+                            index_new[i+start] = index_new[i+start+1]
+                            index_new[i+start+1] = index_out[tid]
+        with ib.if_scope(tid < 1):
+            with ib.for_range(0, sizes[dshape[0] - 1], name="i") as i:
+                index_out[i] = index_new[i]
+    with ib.else_scope():
+        with ib.for_range(0, fshape, name="k", for_type="unroll") as k:
+            with ib.if_scope(tvm.all(k % 2 == tid % 2, tid < fshape)):
+                with ib.if_scope(k % 2 == 0):
+                    with ib.if_scope(tvm.all(tid + 1 < fshape, (p_data[tid] < p_data[tid+1]) \
+                                             == is_descend)):
+                        data_new[tid] = p_data[tid+1]
+                        index_out[tid] = index_new[tid+1]
+                    with ib.else_scope():
+                        data_new[tid] = p_data[tid]
+                        index_out[tid] = index_new[tid]
+                with ib.else_scope():
+                    with ib.if_scope(tvm.all(tid + 1 < fshape, (data_new[tid] < data_new[tid+1]) \
+                                             == is_descend)):
+                        p_data[tid] = data_new[tid+1]
+                        index_new[tid] = index_out[tid+1]
+                    with ib.else_scope():
+                        p_data[tid] = data_new[tid]
+                        index_new[tid] = index_out[tid]
+            with ib.if_scope(tvm.all(k % 2 != tid % 2, tid < fshape)):
+                with ib.if_scope(k % 2 == 0):
+                    with ib.if_scope(tvm.all(tid > 0, (p_data[tid-1] < p_data[tid]) == is_descend)):
+                        data_new[tid] = p_data[tid-1]
+                        index_out[tid] = index_new[tid-1]
+                    with ib.else_scope():
+                        data_new[tid] = p_data[tid]
+                        index_out[tid] = index_new[tid]
+                with ib.else_scope():
+                    with ib.if_scope(tvm.all(tid > 0, (data_new[tid-1] < data_new[tid]) \
+                                             == is_descend)):
+                        p_data[tid] = data_new[tid-1]
+                        index_new[tid] = index_out[tid-1]
+                    with ib.else_scope():
+                        p_data[tid] = data_new[tid]
+                        index_new[tid] = index_out[tid]
+        with ib.if_scope(fshape % 2 == 1):
+            with ib.if_scope(tid < 1):
+                with ib.for_range(0, fshape, name="k") as k:
+                    index_out[tid] = index_new[tid]
+    body = ib.get()
+    return body
+
+
+def sort_ir_out(data, index, new_index, loc, output, axis_mul_before, axis_mul_after, axis):
+    """Low level IR routing subfunction 4/4 for writing sorted indices to output format.
+
+    Parameters
+    ----------
+    data: Buffer
+        Buffer of output boxes with class and score.
+
+    index : Buffer
+        Buffer of number of valid output boxes.
+
+    new_index : Buffer
+        Buffer of sorted indices in a flatten format.
+
+    loc : Buffer
+        Buffer of start locations of each sorting segment.
+
+    output : Buffer
+        Output buffer of output box indexes sorted by score.
+
+    axis_mul_before : int
+        The multiplication result of axis dimensions before axis.
+
+    axis_mul_after : int
+        The multiplication result of axis dimensions after axis.
+
+    axis : int
+        The axis used for sorting.
+
+    is_descend : bool
+        If the sorted data is in descending order.
+
+    Returns
+    -------
+    stmt : Stmt
+        The result IR statement.
+    """
+    max_threads = int(
+        tvm.target.current_target(allow_none=False).max_num_threads)
+    tx = tvm.thread_axis("threadIdx.x")
+    bx = tvm.thread_axis("blockIdx.x")
+    ib = tvm.ir_builder.create()
+    dshape = tvm.max(loc.shape[0], data.shape[axis])
+    p_index = ib.buffer_ptr(index)
+    index_new = ib.buffer_ptr(new_index)
+    sizes = ib.buffer_ptr(loc)
     p_out = ib.buffer_ptr(output)
+    nthread_tx = max_threads
+    nthread_bx = dshape // max_threads + 1
+    ib.scope_attr(tx, "thread_extent", nthread_tx)
+    ib.scope_attr(bx, "thread_extent", nthread_bx)
+    tid = bx * max_threads + tx
+
+    with ib.if_scope(axis_mul_before * axis_mul_after > 1):
+        with ib.if_scope(tid < axis_mul_before * axis_mul_after):
+            i = tid / axis_mul_after
+            j = tid % axis_mul_after
+            base_idx = i * data.shape[axis] * axis_mul_after + j
+            with ib.for_range(0, data.shape[axis], name="k") as k:
+                with ib.if_scope(tid == 0):
+                    start = 0
+                with ib.else_scope():
+                    start = sizes[tid-1]
+                p_out[base_idx + k * axis_mul_after] = tvm.select(
+                    k < p_index[tid], index_new[k+start], k)
+    with ib.else_scope():
+        with ib.if_scope(tid < data.shape[axis]):
+            p_out[tid] = tvm.select(tid < p_index[0], index_new[tid], tid)
+
+    body = ib.get()
+    return body
+
+
+def sort_gpu(data, data_buf, index, index_buf, output_buf, axis, is_descend):
+    """Function to generate low level IR to do sorting on the GPU, use it by calling sort_gpu.
+
+    Parameters
+    ----------
+    data: tvm.Tensor
+        3-D tensor with shape [batch_size, num_anchors, 6].
+        The last dimension should be in format of
+        [class_id, score, box_left, box_top, box_right, box_bottom].
+
+    data_buf: Buffer
+        2D Buffer of input boxes' score with shape [batch_size, num_anchors].
+
+    index : tvm.Tensor
+        1-D tensor for valid number of boxes.
+
+    index_buf : Buffer
+        Buffer of number of valid number of boxes.
+
+    output_buf : Buffer
+        Output buffer of indicies of sorted tensor.
+
+    axis : int
+        The axis used for sorting.
+
+    is_descend : bool
+        If the sorted data is in descending order.
+
+    Returns
+    -------
+    out : tvm.Tensor
+        3-D tensor with shape [batch_size, num_anchors].
+    """
+
     ndim = len(data.shape)
     assert data.dtype == "float32", "Currently only supports input dtype to be float32"
     assert axis < ndim, "Axis out of boundary for input ndim %d" % ndim
@@ -55,89 +373,60 @@ def sort_ir(data, index, output, axis, is_descend):
         elif i > axis:
             axis_mul_after *= data.shape[i]
 
-    dshape = 0
-    for i in range(0, len(index.shape)):
-        dshape += index.shape[i]
-    dshape = tvm.select(dshape > axis_mul_before*axis_mul_after, dshape,
-                        axis_mul_before*axis_mul_after)
-
-    sizes_temp = ib.allocate(
-        "int32", dshape, name="sizes_temp", scope="global")
-    sizes = ib.allocate("int32", dshape, name="sizes", scope="global")
-    temp_index = ib.allocate("int32", dshape, name="temp_index", scope="local")
-    temp_data = ib.allocate("float32", dshape, name="temp_data", scope="local")
-    data_new = ib.allocate("float32", dshape, name="data_new", scope="global")
-    index_new = ib.allocate("int32", dshape, name="index_new", scope="global")
-    nthread_tx = max_threads
-    nthread_bx = dshape // max_threads + 1
-    ib.scope_attr(tx, "thread_extent", nthread_tx)
-    ib.scope_attr(bx, "thread_extent", nthread_bx)
-    tid = bx * max_threads + tx
-
-    with ib.if_scope(tid < axis_mul_before * axis_mul_after):
-        sizes[tid] = p_index[tid]
-        sizes_temp[tid] = p_index[tid]
-
-    with ib.if_scope(tid < axis_mul_before * axis_mul_after):
-        with ib.for_range(0, tvm.floor(tvm.sqrt((axis_mul_before * axis_mul_after) \
-             .astype("float32"))) + 1, name="k") as k:
-            with ib.if_scope(tid - (tvm.const(1, "int32") << k) >= 0):
-                with ib.if_scope(k % 2 == 0):
-                    sizes[tid] += sizes_temp[tid - (
-                        tvm.const(1, "int32") << k)]
-                    sizes_temp[tid] = sizes[tid]
-                with ib.else_scope():
-                    sizes_temp[tid] += sizes[tid - (
-                        tvm.const(1, "int32") << k)]
-                    sizes[tid] = sizes_temp[tid]
-
-    with ib.if_scope(tid < axis_mul_before * axis_mul_after):
-        i = tid / axis_mul_after
-        j = tid % axis_mul_after
-        current_sort_num = p_index[tid]
-        base_idx = i * data.shape[axis] * axis_mul_after + j
-        with ib.for_range(0, current_sort_num, name="k") as k:
-            full_idx = base_idx + k * axis_mul_after
-            with ib.if_scope(tid == 0):
-                start = 0
-            with ib.else_scope():
-                start = sizes[tid-1]
-            index_new[start + k] = k
-            data_new[start + k] = p_data[full_idx]
-
-    with ib.if_scope(tid < axis_mul_before * axis_mul_after):
-        with ib.if_scope(tid == 0):
-            start = 0
-        with ib.else_scope():
-            start = sizes[tid-1]
-        # OddEvenTransposeSort
-        with ib.for_range(0, p_index[tid], name="k") as k:
-            with ib.for_range(0, p_index[tid] - 1, name="i") as i:
-                with ib.if_scope(i % 2 == (k & 1)):
-                    with ib.if_scope(((data_new[i+start] < data_new[i+start+1]) ^
-                                      is_descend) == False):
-                        temp_data[tid] = data_new[i+start]
-                        data_new[i+start] = data_new[i+start+1]
-                        data_new[i+start+1] = temp_data[tid]
-                        temp_index[tid] = index_new[i+start]
-                        index_new[i+start] = index_new[i+start+1]
-                        index_new[i+start+1] = temp_index[tid]
-
-    with ib.if_scope(tid < axis_mul_before * axis_mul_after):
-        i = tid / axis_mul_after
-        j = tid % axis_mul_after
-        current_sort_num = p_index[tid]
-        base_idx = i * data.shape[axis] * axis_mul_after + j
-        with ib.for_range(0, data.shape[axis], name="k") as k:
-            with ib.if_scope(tid == 0):
-                start = 0
-            with ib.else_scope():
-                start = sizes[tid-1]
-            p_out[base_idx + k * axis_mul_after] = tvm.select(
-                k < current_sort_num,
-                index_new[k+start], k)
-    body = ib.get()
-    return body
+    dshape = axis_mul_before*axis_mul_after
+    fshape = data.shape[axis] * dshape
+
+    loc_buf = api.decl_buffer(dshape, index.dtype, "sizes", data_alignment=8)
+    new_index_buf = api.decl_buffer(
+        fshape, index.dtype, "index_new", data_alignment=8)
+    out_index_buf = api.decl_buffer(
+        fshape, index.dtype, "index_out", data_alignment=8)
+    new_data_buf = api.decl_buffer(
+        dshape, data.dtype, "data_new", data_alignment=8)
+
+    loc = \
+        tvm.extern([(dshape,)],
+                   [index],
+                   lambda ins, outs: sort_pre_ir(
+                       ins[0], outs[0], axis_mul_before, axis_mul_after),
+                   dtype=[index.dtype],
+                   in_buffers=index_buf,
+                   out_buffers=[loc_buf],
+                   tag="sorting_prepare")
+
+    data_new, index_new = \
+        tvm.extern([(dshape,), (fshape,)],
+                   [data, index, loc],
+                   lambda ins, outs: sort_pre_ir_data(
+                       ins[0], ins[1], ins[2], outs[0], outs[1], axis,
+                       axis_mul_before, axis_mul_after),
+                   dtype=[data.dtype, index.dtype],
+                   in_buffers=[data_buf, index_buf, loc_buf],
+                   out_buffers=[new_data_buf, new_index_buf],
+                   tag="sorting_data")
+
+    index_out = \
+        tvm.extern([(fshape,)],
+                   [data, index, data_new, index_new, loc],
+                   lambda ins, outs: sort_oet_ir(
+                       ins[0], ins[1], ins[2], ins[3], ins[4], outs[0],
+                       axis_mul_before, axis_mul_after, axis, is_descend),
+                   dtype=[index.dtype],
+                   in_buffers=[data_buf, index_buf,
+                               new_data_buf, new_index_buf, loc_buf],
+                   out_buffers=[out_index_buf],
+                   tag="sorting_oet")
+    out = \
+        tvm.extern([data.shape],
+                   [data, index, index_out, loc],
+                   lambda ins, outs: sort_ir_out(
+                       ins[0], ins[1], ins[2], ins[3], outs[0],
+                       axis_mul_before, axis_mul_after, axis),
+                   dtype=[index.dtype],
+                   in_buffers=[data_buf, index_buf, out_index_buf, loc_buf],
+                   out_buffers=output_buf,
+                   tag="sorting_output")
+    return out
 
 
 def nms_ir(data, sort_result, valid_count, out, nms_threshold, force_suppress, nms_topk):
@@ -333,15 +622,8 @@ def nms_gpu(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk
     sort_tensor_buf = api.decl_buffer(score_shape, sort_tensor_dtype,
                                       "sort_tensor_buf", data_alignment=8)
 
-    sort_tensor = \
-        tvm.extern(score_shape,
-                   [score_tensor, valid_count],
-                   lambda ins, outs: sort_ir(
-                       ins[0], ins[1], outs[0], score_axis, True),
-                   dtype=sort_tensor_dtype,
-                   in_buffers=[score_tensor_buf, valid_count_buf],
-                   out_buffers=sort_tensor_buf,
-                   name="nms_sort")
+    sort_tensor = sort_gpu(score_tensor, score_tensor_buf, valid_count,
+                           valid_count_buf, sort_tensor_buf, score_axis, True)
     out = \
         tvm.extern(data.shape,
                    [data, sort_tensor, valid_count],
diff --git a/topi/python/topi/cuda/pooling.py b/topi/python/topi/cuda/pooling.py
index 637f664fbd36..6b36e9a8743f 100644
--- a/topi/python/topi/cuda/pooling.py
+++ b/topi/python/topi/cuda/pooling.py
@@ -1,4 +1,4 @@
-# pylint: disable=invalid-name, unused-variable
+# pylint: disable=invalid-name, unused-variable, unused-argument
 """Schedule for pooling operators"""
 import tvm
 from .. import tag
@@ -70,7 +70,7 @@ def traverse(OP):
 
 
 @generic.schedule_pool.register(["cuda", "gpu"])
-def schedule_pool(outs):
+def schedule_pool(outs, layout):
     """Schedule for pool.
 
     Parameters
@@ -79,6 +79,9 @@ def schedule_pool(outs):
         The computation graph description of pool
         in the format of an array of tensors.
 
+    layout: str
+        Data layout.
+
     Returns
     -------
     s: Schedule
diff --git a/topi/python/topi/cuda/reduction.py b/topi/python/topi/cuda/reduction.py
index 52bacd3d1ae3..4c5d1a507660 100644
--- a/topi/python/topi/cuda/reduction.py
+++ b/topi/python/topi/cuda/reduction.py
@@ -63,10 +63,12 @@ def _schedule_reduce(op, sch, is_idx_reduce=False):
             sch[temp_val_input].compute_at(sch[real_output], outer_in)
     else:
         if is_idx_reduce:
+            spatial_axis = sch[real_output].fuse(*(sch[real_output].op.axis))
+            sch[real_output].bind(spatial_axis, tvm.thread_axis("blockIdx.x"))
             sch[temp_idx_input].compute_at(sch[real_output],
-                                           sch[real_output].op.axis[0])
+                                           spatial_axis)
             sch[temp_val_input].compute_at(sch[real_output],
-                                           sch[real_output].op.axis[0])
+                                           spatial_axis)
     sch[real_output].set_store_predicate(thread_x.equal(0))
     return sch
 
@@ -107,7 +109,10 @@ def traverse_before_reduce(operator):
     def traverse_after_reduce(operator):
         """Internal travserse function"""
         if tag.is_broadcast(operator.tag):
-            raise RuntimeError("Not yet support ewise after reduce")
+            if operator not in scheduled_ops:
+                _schedule_injective(operator, sch)
+            for tensor in operator.input_tensors:
+                traverse_after_reduce(tensor.op)
         elif operator.tag == 'comm_reduce':
             _schedule_reduce(operator, sch, is_idx_reduce=False)
             for tensor in operator.input_tensors:
diff --git a/topi/python/topi/cuda/ssd/multibox.py b/topi/python/topi/cuda/ssd/multibox.py
index c22e7a513d7d..3c013c4d1605 100644
--- a/topi/python/topi/cuda/ssd/multibox.py
+++ b/topi/python/topi/cuda/ssd/multibox.py
@@ -1,4 +1,4 @@
-# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements
+# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements, too-many-function-args
 """SSD multibox operators"""
 from __future__ import absolute_import as _abs
 import math
@@ -13,6 +13,7 @@
 from topi.vision.ssd import multibox_transform_loc
 from ..nms import nms
 
+
 def multibox_prior_ir(data, out, sizes, ratios, steps, offsets):
     """Low level IR routing for multibox_prior operator.
 
@@ -41,7 +42,8 @@ def multibox_prior_ir(data, out, sizes, ratios, steps, offsets):
     stmt : Stmt
         The result IR statement.
     """
-    max_threads = int(math.sqrt(tvm.target.current_target(allow_none=False).max_num_threads))
+    max_threads = int(math.sqrt(
+        tvm.target.current_target(allow_none=False).max_num_threads))
     tx = tvm.thread_axis("threadIdx.x")
     ty = tvm.thread_axis("threadIdx.y")
     bx = tvm.thread_axis("blockIdx.x")
@@ -76,7 +78,8 @@ def multibox_prior_ir(data, out, sizes, ratios, steps, offsets):
 
             for k in range(num_sizes + num_ratios - 1):
                 w = tvm.select(k < num_sizes,
-                               size_ratio_concat[k] * in_height / in_width / 2.0,
+                               size_ratio_concat[
+                                   k] * in_height / in_width / 2.0,
                                size_ratio_concat[0] * in_height / in_width *
                                math.sqrt(size_ratio_concat[k + 1]) / 2.0)
                 h = tvm.select(k < num_sizes, size_ratio_concat[k] / 2.0,
@@ -93,7 +96,7 @@ def multibox_prior_ir(data, out, sizes, ratios, steps, offsets):
 
 
 @multibox_prior.register(["cuda", "gpu"])
-def multibox_prior_gpu(data, sizes=(1,), ratios=(1,), steps=(-1, -1), \
+def multibox_prior_gpu(data, sizes=(1,), ratios=(1,), steps=(-1, -1),
                        offsets=(0.5, 0.5), clip=False):
     """Generate prior(anchor) boxes from data, sizes and ratios.
 
@@ -124,31 +127,114 @@ def multibox_prior_gpu(data, sizes=(1,), ratios=(1,), steps=(-1, -1), \
     """
     num_sizes = len(sizes)
     num_ratios = len(ratios)
-    oshape = (1, data.shape[2] * data.shape[3] * (num_sizes + num_ratios - 1), 4)
+    oshape = (
+        1, data.shape[2] * data.shape[3] * (num_sizes + num_ratios - 1), 4)
     out = tvm.extern(oshape, [data], lambda ins, outs:
-                     multibox_prior_ir(ins[0], outs[0], sizes, ratios, steps, offsets),
+                     multibox_prior_ir(
+                         ins[0], outs[0], sizes, ratios, steps, offsets),
                      tag="multibox_prior")
     if clip:
         out = topi.clip(out, 0, 1)
     return out
 
 
-def transform_loc_ir(cls_prob, loc_pred, anchor, valid_count, out, clip, threshold, variances):
-    """Low level IR routing for transform location in multibox_detection operator.
+def transform_loc_pre(cls_prob, valid_count, temp_flag, temp_id, temp_score_out, threshold):
+    """Low level IR routing for transform location data preparation.
 
     Parameters
     ----------
     cls_prob : Buffer
         Buffer of class probabilities.
 
+    valid_count : Buffer
+        Buffer of number of valid output boxes.
+
+    temp_flag : Buffer
+        Output intermediate result buffer
+
+    temp_id : Buffer
+        Output intermediate result buffer
+
+    temp_score_out : Buffer
+        Output buffer
+
+    threshold : float
+        Threshold to be a positive prediction.
+
+    Returns
+    -------
+    stmt : Stmt
+        The result IR statement.
+    """
+    batch_size = cls_prob.shape[0]
+    num_classes = cls_prob.shape[1]
+    num_anchors = cls_prob.shape[2]
+
+    max_threads = int(
+        tvm.target.current_target(allow_none=False).max_num_threads)
+    ib = tvm.ir_builder.create()
+    score = ib.buffer_ptr(temp_score_out)
+    cls_id = ib.buffer_ptr(temp_id)
+    flag = ib.buffer_ptr(temp_flag)
+    tx = tvm.thread_axis("threadIdx.x")
+    bx = tvm.thread_axis("blockIdx.x")
+    nthread_tx = max_threads
+    nthread_bx = (batch_size * num_anchors * num_classes) // max_threads + 1
+    ib.scope_attr(tx, "thread_extent", nthread_tx)
+    ib.scope_attr(bx, "thread_extent", nthread_bx)
+    tid = bx * max_threads + tx
+    p_cls_prob = ib.buffer_ptr(cls_prob)
+    p_valid_count = ib.buffer_ptr(valid_count)
+
+    with ib.if_scope(tid < batch_size * num_anchors):
+        n = tid / num_anchors  # number of batches
+        i = tid % num_anchors  # number of anchors
+        score[i] = -1.0
+        cls_id[i] = 0
+        p_valid_count[n] = 0
+        with ib.for_range(0, num_classes-1, name="k") as k:
+            temp = p_cls_prob[n * num_anchors * num_classes + (k + 1) * num_anchors + i]
+            with ib.if_scope(temp > score[i]):
+                cls_id[i] = k + 1
+                score[i] = temp
+        with ib.if_scope(tvm.all(cls_id[i] > 0, score[i] < threshold)):
+            cls_id[i] = 0
+        with ib.if_scope(cls_id[i] > 0):
+            flag[i] = 1
+        with ib.else_scope():
+            flag[i] = 0
+
+        with ib.if_scope(tid < batch_size):
+            with ib.for_range(0, num_anchors, name="k") as k:
+                with ib.if_scope(k > 0):
+                    flag[tid * num_anchors +
+                         k] += flag[tid * num_anchors + k - 1]
+            p_valid_count[n] = flag[tid * num_anchors + num_anchors - 1]
+
+    body = ib.get()
+    return body
+
+
+def transform_loc_ir(loc_pred, anchor, temp_flag, temp_id, temp_score_in, \
+                     out, clip, variances, batch_size, num_classes, num_anchors):
+    """Low level IR routing for transform location in multibox_detection operator.
+
+    Parameters
+    ----------
     loc_pred : Buffer
         Buffer of location regression predictions.
 
     anchor : Buffer
         Buffer of prior anchor boxes.
 
-    valid_count : Buffer
-        Buffer of number of valid output boxes.
+    temp_flag : Buffer
+        Intermediate result buffer.
+
+    temp_id : Buffer
+        Intermediate result buffer.
+
+    temp_score_in : Buffer
+        Input buffer which stores intermediate results.
 
     out : Buffer
         Output buffer.
@@ -156,12 +242,18 @@ def transform_loc_ir(cls_prob, loc_pred, anchor, valid_count, out, clip, thresho
     clip : boolean
         Whether to clip out-of-boundary boxes.
 
-    threshold : float
-        Threshold to be a positive prediction.
-
     variances : tuple of float
         Variances to be decoded from box regression output.
 
+    batch_size : int
+        Batch size
+
+    num_classes : int
+        Number of classes
+
+    num_anchors : int
+        Number of anchors
+
     Returns
     -------
     stmt : Stmt
@@ -187,21 +279,16 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw,
         ow = tvm.exp(pw * vw) * aw / 2.0
         oh = tvm.exp(ph * vh) * ah / 2.0
         return tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, ox - ow)), ox - ow), \
-               tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, oy - oh)), oy - oh), \
-               tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, ox + ow)), ox + ow), \
-               tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, oy + oh)), oy + oh)
-
-    batch_size = cls_prob.shape[0]
-    num_classes = cls_prob.shape[1]
-    num_anchors = cls_prob.shape[2]
+            tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, oy - oh)), oy - oh), \
+            tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, ox + ow)), ox + ow), \
+            tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, oy + oh)), oy + oh)
 
+    max_threads = int(
+        tvm.target.current_target(allow_none=False).max_num_threads)
     ib = tvm.ir_builder.create()
-    temp_score = ib.allocate('float32', (batch_size * (num_classes -1) * num_anchors, \
-                 ), name="temp_score", scope="global")
-    score = ib.allocate('float32', (batch_size * num_anchors, ), name="score", scope="local")
-    cls_id = ib.allocate('int32', (batch_size * num_anchors, ), name="id", scope="local")
-    flag = ib.allocate('int32', (batch_size * num_anchors, ), name="flag", scope="global")
-    max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads)
+    score = ib.buffer_ptr(temp_score_in)
+    cls_id = ib.buffer_ptr(temp_id)
+    flag = ib.buffer_ptr(temp_flag)
     tx = tvm.thread_axis("threadIdx.x")
     bx = tvm.thread_axis("blockIdx.x")
     nthread_tx = max_threads
@@ -209,42 +296,13 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw,
     ib.scope_attr(tx, "thread_extent", nthread_tx)
     ib.scope_attr(bx, "thread_extent", nthread_bx)
     tid = bx * max_threads + tx
-    p_cls_prob = ib.buffer_ptr(cls_prob)
     p_loc_pred = ib.buffer_ptr(loc_pred)
     p_anchor = ib.buffer_ptr(anchor)
-    p_valid_count = ib.buffer_ptr(valid_count)
     p_out = ib.buffer_ptr(out)
-    with ib.if_scope(tid < batch_size * num_anchors * num_classes):
-        n = tid / (num_anchors * num_classes)
-        j = (tid % (num_anchors * num_classes)) / num_anchors
-        i = tid % num_anchors
-        with ib.if_scope(j > 0):
-            temp_score[n * num_anchors * num_classes + i * (num_classes - 1) + j-1] = \
-            p_cls_prob[tid]
-        p_valid_count[n] = 0
-    with ib.if_scope(tid < batch_size * num_anchors):
-        n = tid / num_anchors
-        i = tid % num_anchors
-        score[tid] = -1.0
-        cls_id[tid] = 0
-        with ib.for_range(0, num_classes-1, name="k") as k:
-            temp = temp_score[tid * (num_classes-1) + k]
-            cls_id[tid] = tvm.select(temp > score[tid], k + 1, cls_id[tid])
-            score[tid] = tvm.make.Max(temp, score[tid])
-        with ib.if_scope(tvm.all(cls_id[tid] > 0, score[tid] < threshold)):
-            cls_id[tid] = 0
-        with ib.if_scope(cls_id[tid] > 0):
-            flag[tid] = 1
-        with ib.else_scope():
-            flag[tid] = 0
-    with ib.if_scope(tid < batch_size):
-        with ib.for_range(0, num_anchors, name="k") as k:
-            with ib.if_scope(k > 0):
-                flag[tid * num_anchors + k] += flag[tid * num_anchors + k - 1]
-        p_valid_count[tid] = flag[tid * num_anchors + num_anchors - 1]
+
     with ib.if_scope(tid < batch_size * num_anchors):
-        n = tid / num_anchors
-        i = tid % num_anchors
+        n = tid / num_anchors  # number of batches
+        i = tid % num_anchors  # number of anchors
         with ib.if_scope(cls_id[tid] > 0):
             with ib.if_scope(tid == 0):
                 out_base_idx = n * num_anchors * 6
@@ -253,17 +311,17 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw,
             p_out[out_base_idx] = cls_id[tid] - 1.0
             p_out[out_base_idx + 1] = score[tid]
             p_out[out_base_idx + 2], p_out[out_base_idx + 3], p_out[out_base_idx + 4], \
-            p_out[out_base_idx + 5] = transform_loc(p_loc_pred, tid * 4, p_anchor, i*4,
-                                                    clip, variances[0], variances[1],
-                                                    variances[2], variances[3])
+                p_out[out_base_idx + 5] = transform_loc(p_loc_pred, tid * 4,
+                                                        p_anchor, i*4, clip, variances[0],
+                                                        variances[1], variances[2], variances[3])
 
     body = ib.get()
     return body
 
 
 @multibox_transform_loc.register(["cuda", "gpu"])
-def multibox_transform_loc_gpu(cls_prob, loc_pred, anchor, clip=True, threshold=0.01,
-                               variances=(0.1, 0.1, 0.2, 0.2)):
+def multibox_transform_loc_gpu(cls_prob, loc_pred, anchor, clip=True, \
+                               threshold=0.01, variances=(0.1, 0.1, 0.2, 0.2)):
     """Location transformation for multibox detection
 
     Parameters
@@ -297,20 +355,42 @@ def multibox_transform_loc_gpu(cls_prob, loc_pred, anchor, clip=True, threshold=
         1-D tensor with shape (batch_size,), number of valid anchor boxes.
     """
     batch_size = cls_prob.shape[0]
-    num_anchors = anchor.shape[1]
+    num_classes = cls_prob.shape[1]
+    num_anchors = cls_prob.shape[2]
     oshape = (batch_size, num_anchors, 6)
     # Define data alignment for intermediate buffer
     valid_count_dtype = "int32"
     valid_count_buf = api.decl_buffer((batch_size,), valid_count_dtype,
                                       "valid_count_buf", data_alignment=4)
-    out_buf = api.decl_buffer(oshape, cls_prob.dtype, "out_buf", data_alignment=8)
-    valid_count, out = \
-        tvm.extern([(batch_size,), oshape],
-                   [cls_prob, loc_pred, anchor],
+    out_buf = api.decl_buffer(
+        oshape, cls_prob.dtype, "out_buf", data_alignment=8)
+    size = num_anchors
+    temp_flag_buf = api.decl_buffer(
+        (size,), valid_count_dtype, "flag", data_alignment=8)
+    temp_id_buf = api.decl_buffer(
+        (size,), valid_count_dtype, "cls_id", data_alignment=8)
+    temp_score_buf = api.decl_buffer(
+        (size,), cls_prob.dtype, "score", data_alignment=8)
+
+    valid_count, temp_flag, temp_id, temp_score = \
+        tvm.extern([(batch_size,), (size,), (size,), (size,)],
+                   [cls_prob],
+                   lambda ins, outs: transform_loc_pre(
+                       ins[0], outs[0], outs[1], outs[2], outs[3], threshold),
+                   dtype=[valid_count_dtype,
+                          valid_count_dtype, valid_count_dtype, cls_prob.dtype],
+                   out_buffers=[valid_count_buf,
+                                temp_flag_buf, temp_id_buf, temp_score_buf],
+                   tag="multibox_transform_loc_first_step")
+
+    out = \
+        tvm.extern([oshape],
+                   [loc_pred, anchor, temp_flag, temp_id, temp_score],
                    lambda ins, outs: transform_loc_ir(
-                       ins[0], ins[1], ins[2], outs[0], outs[1], clip, threshold, variances),
-                   dtype=[valid_count_dtype, cls_prob.dtype],
-                   out_buffers=[valid_count_buf, out_buf],
+                       ins[0], ins[1], ins[2], ins[3], ins[4], outs[0], clip, \
+                       variances, batch_size, num_classes, num_anchors),
+                   dtype=[cls_prob.dtype],
+                   out_buffers=[out_buf],
                    tag="multibox_transform_loc")
     return [out, valid_count]
 
@@ -356,5 +436,6 @@ def multibox_detection_gpu(cls_prob, loc_pred, anchor, clip=True, threshold=0.01
     """
     inter_out = multibox_transform_loc(cls_prob, loc_pred, anchor,
                                        clip, threshold, variances)
-    out = nms(inter_out[0], inter_out[1], nms_threshold, force_suppress, nms_topk)
+    out = nms(
+        inter_out[0], inter_out[1], nms_threshold, force_suppress, nms_topk)
     return out
diff --git a/topi/python/topi/cuda/tensor_intrin.py b/topi/python/topi/cuda/tensor_intrin.py
new file mode 100644
index 000000000000..26ae7587c5df
--- /dev/null
+++ b/topi/python/topi/cuda/tensor_intrin.py
@@ -0,0 +1,62 @@
+"""Tensor intrinsics on CUDA."""
+#pylint: disable=invalid-name
+import tvm
+
+
+def dp4a(x_scope='local', y_scope='local', z_scope='local'):
+    """
+    Int8 dot product reduced by every 4 elements using __dp4a
+
+    Parameters
+    ----------
+    x_scope : str, optional
+        The storage scope of buffer for lhs
+    y_scope : str, optional
+        The storage scope of buffer for rhs
+    z_scope : str, optional
+        The storage scope of buffer for result
+
+    Returns
+    -------
+    intrin : TensorIntrin
+        The dp4a TensorIntrin that can be used in tensorizing schedule.
+    """
+
+    n = 4  # dp4a requires operands packed by 4
+    x = tvm.placeholder((n,), name='x', dtype='int8')
+    y = tvm.placeholder((n,), name='y', dtype='int8')
+
+    k = tvm.reduce_axis((0, n), name='rc')
+
+    z = tvm.compute((1,), lambda i: tvm.sum(
+        x[k].astype('int32') * y[k].astype('int32'), axis=[k]))
+
+    def _intrin_func(ins, outs):
+        def _instr(index):
+            xx, yy = ins
+            zz = outs[0]
+
+            if index == 1:
+                return zz.vstore(0, 0)
+
+            ib = tvm.ir_builder.create()
+
+            vec_x = xx.vload(0, dtype='int8x4')
+            vec_y = yy.vload(0, dtype='int8x4')
+            prev_z = 0 if index == 0 else zz.vload(0)
+
+            new_z = tvm.call_pure_extern('int32', '__dp4a', vec_x, vec_y, prev_z)
+            ib.emit(zz.vstore(0, new_z))
+
+            return ib.get()
+
+        return _instr(0), _instr(1), _instr(2) # body, reset, update
+
+    with tvm.build_config(data_alignment=4, offset_factor=1) as cfg:
+        scopes = {x: x_scope, y: y_scope, z: z_scope}
+        binds = {t: tvm.decl_buffer(t.shape, t.dtype, t.op.name,
+                                    data_alignment=cfg.data_alignment,
+                                    offset_factor=cfg.offset_factor,
+                                    scope=scopes[t]) for t in [x, y, z]}
+
+        return tvm.decl_tensor_intrin(z.op, _intrin_func, binds=binds)
diff --git a/topi/python/topi/generic/injective.py b/topi/python/topi/generic/injective.py
index 0a9e394661af..975e4c11ea41 100644
--- a/topi/python/topi/generic/injective.py
+++ b/topi/python/topi/generic/injective.py
@@ -29,5 +29,22 @@ def schedule_injective(outs):
     s[x].fuse(s[x].op.axis)
     return s
 
+@tvm.target.generic_func
+def schedule_concatenate(outs):
+    """Schedule for concatenate op.
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of reduce in the format
+          of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return schedule_injective(outs)
+
 schedule_elemwise = schedule_injective
 schedule_broadcast = schedule_injective
diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py
index 1e01adb899b7..8c303e5be182 100644
--- a/topi/python/topi/generic/nn.py
+++ b/topi/python/topi/generic/nn.py
@@ -55,33 +55,15 @@ def schedule_conv2d_nhwc(outs):
 
 
 @tvm.target.generic_func
-def schedule_conv2d_NCHWc(num_filter, kernel_size, strides,
-                          padding, layout, out_layout, outs):
+def schedule_conv2d_NCHWc(outs):
     """Schedule for conv2d_NCHW[x]c
 
     Parameters
     ----------
-    num_filter : int
-        The number of filter, i.e., the output channel.
-
-    kernel_size : tuple of int
-        (kernel_height, kernel_width)
-
-    strides : tuple of int
-        (stride_of_height, stride_of_width)
-
-    padding : tuple of int
-        (pad_of_height, pad_of_width)
-
-    layout : str
-        Input data layout
-
-    out_layout : str
-        Output data layout
-
     outs : Array of Tensor
         The computation graph description of conv2d_NCHWc
         in the format of an array of tensors.
+        The number of filter, i.e., the output channel.
 
     Returns
     -------
@@ -191,6 +173,42 @@ def schedule_depthwise_conv2d_nhwc(outs):
     """
     return _default_schedule(outs, False)
 
+
+@tvm.target.generic_func
+def schedule_depthwise_conv2d_NCHWc(outs):
+    """Schedule for depthwise_conv2d_NCHWc
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of depthwise_conv2d_nhwc
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _default_schedule(outs, False)
+
+
+@tvm.target.generic_func
+def schedule_group_conv2d_nchw(outs):
+    """Schedule for conv2d_nchw
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of group_conv2d_nchw
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _default_schedule(outs, False)
+
+
 @tvm.target.generic_func
 def schedule_bitserial_conv2d_nchw(outs):
     """Schedule for bitserial_conv2d_nchw
@@ -282,7 +300,7 @@ def schedule_dense(outs):
 
 
 @tvm.target.override_native_generic_func("schedule_pool")
-def schedule_pool(outs):
+def schedule_pool(outs, layout):
     """Schedule for pool
 
     Parameters
@@ -291,6 +309,9 @@ def schedule_pool(outs):
           The computation graph description of pool
           in the format of an array of tensors.
 
+    layout: str
+        Data layout.
+
     Returns
     -------
     sch: Schedule
diff --git a/topi/python/topi/hls/__init__.py b/topi/python/topi/hls/__init__.py
index 69b80514ff56..65f091fc9916 100644
--- a/topi/python/topi/hls/__init__.py
+++ b/topi/python/topi/hls/__init__.py
@@ -3,3 +3,4 @@
 from __future__ import absolute_import as _abs
 
 from .injective import schedule_injective, schedule_elemwise, schedule_broadcast
+from .nn import *
diff --git a/topi/python/topi/hls/nn.py b/topi/python/topi/hls/nn.py
new file mode 100644
index 000000000000..536453fc629c
--- /dev/null
+++ b/topi/python/topi/hls/nn.py
@@ -0,0 +1,388 @@
+# pylint: disable=invalid-name,unused-variable,unused-argument
+"""HLS nn operators"""
+from __future__ import absolute_import as _abs
+import tvm
+from .. import tag
+from .. import generic
+
+
+def _schedule_conv2d(outs):
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    s = tvm.create_schedule([x.op for x in outs])
+    tvm.schedule.AutoInlineInjective(s)
+
+    def traverse(OP):
+        """Internal travserse function"""
+        # inline all one-to-one-mapping operators except the last stage (output)
+        if tag.is_injective(OP.tag):
+            if OP not in s.outputs:
+                s[OP].compute_inline()
+            for tensor in OP.input_tensors:
+                if tensor.op.input_tensors:
+                    traverse(tensor.op)
+        # schedule conv2d
+        elif OP.tag.find("conv2d") >= 0:
+            Conv2d = OP.output(0)
+            if not Conv2d.op in s.outputs:
+                Out = outs[0].op.output(0)
+                s[Conv2d].compute_at(s[Out], s[Out].op.axis[1])
+        else:
+            raise RuntimeError("Unsupported operator: %s" % OP.tag)
+
+    traverse(outs[0].op)
+
+    px, x = s[outs[0]].split(outs[0].op.axis[0], nparts=1)
+    s[outs[0]].bind(px, tvm.thread_axis("pipeline"))
+    return s
+
+
+@generic.schedule_conv2d_nchw.register(["hls"])
+def schedule_conv2d_nchw(outs):
+    """Schedule for conv2d_nchw
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of conv2d_nchw
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _schedule_conv2d(outs)
+
+
+@generic.schedule_conv2d_nhwc.register(["hls"])
+def schedule_conv2d_nhwc(outs):
+    """Schedule for conv2d_nhwc
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of conv2d_nchw
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _schedule_conv2d(outs)
+
+
+@generic.schedule_conv2d_NCHWc.register(["hls"])
+def schedule_conv2d_NCHWc(outs):
+    """Schedule for conv2d_NCHW[x]c
+
+    Parameters
+    ----------
+    outs : Array of Tensor
+        The computation graph description of conv2d_NCHWc
+        in the format of an array of tensors.
+
+    Returns
+    -------
+    sch : Schedule
+        The computation schedule for the op.
+    """
+    return _schedule_conv2d(outs)
+
+
+@generic.schedule_conv2d_transpose_nchw.register(["hls"])
+def schedule_conv2d_transpose_nchw(outs):
+    """Schedule for conv2d_transpose_nchw
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+        The computation graph description of conv2d_transpose_nchw
+        in the format of an array of tensors.
+
+    Returns
+    -------
+    s: Schedule
+        The computation schedule for the op.
+    """
+    return _schedule_conv2d(outs)
+
+
+@generic.schedule_depthwise_conv2d_nchw.register(["hls"])
+def schedule_depthwise_conv2d_nchw(outs):
+    """Schedule for depthwise_conv2d_nchw
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of depthwise_conv2d_nchw
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _schedule_conv2d(outs)
+
+
+@generic.schedule_depthwise_conv2d_nhwc.register(["hls"])
+def schedule_depthwise_conv2d_nhwc(outs):
+    """Schedule for depthwise_conv2d_nhwc
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of depthwise_conv2d_nhwc
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _schedule_conv2d(outs)
+
+@generic.schedule_bitserial_conv2d_nchw.register(["hls"])
+def schedule_bitserial_conv2d_nchw(outs):
+    """Schedule for bitserial_conv2d_nchw
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of bitserial_conv2d_nchw
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _schedule_conv2d(outs)
+
+
+@generic.schedule_bitserial_conv2d_nhwc.register(["hls"])
+def schedule_bitserial_conv2d_nhwc(outs):
+    """Schedule for bitserial_conv2d_nhwc
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of bitserial_conv2d_nchw
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _schedule_conv2d(outs)
+
+
+@generic.schedule_reduce.register(["hls"])
+def schedule_reduce(outs):
+    """Schedule for reduction
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of reduce
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    s = tvm.create_schedule([x.op for x in outs])
+    tvm.schedule.AutoInlineInjective(s)
+
+    def traverse(OP):
+        """Internal travserse function"""
+        # inline all one-to-one-mapping operators except the last stage (output)
+        if tag.is_broadcast(OP.tag):
+            if OP not in s.outputs:
+                s[OP].compute_inline()
+            for tensor in OP.input_tensors:
+                if tensor.op.input_tensors:
+                    traverse(tensor.op)
+        elif OP.tag in ["comm_reduce", "comm_reduce_idx"]:
+            if OP.tag == "comm_reduce":
+                Reduce = OP.output(0)
+            else:
+                Reduce = OP.input_tensors[0]
+            if not Reduce.op in s.outputs:
+                Out = outs[0].op.output(0)
+                s[Reduce].compute_at(s[Out], s[Out].op.axis[0])
+        else:
+            raise RuntimeError("Unsupported operator: %s" % OP.tag)
+
+    traverse(outs[0].op)
+
+    fused = s[outs[0]].fuse()
+    px, x = s[outs[0]].split(fused, nparts=1)
+    s[outs[0]].bind(px, tvm.thread_axis("pipeline"))
+    return s
+
+
+@generic.schedule_softmax.register(["hls"])
+def schedule_softmax(outs):
+    """Schedule for softmax
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of softmax
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    s = tvm.create_schedule([x.op for x in outs])
+    tvm.schedule.AutoInlineInjective(s)
+
+    softmax = outs[0]
+    max_elem = softmax.op.input_tensors[1]
+    expsum = softmax.op.input_tensors[2]
+
+    s[expsum].compute_at(s[softmax], s[softmax].op.axis[1])
+    s[max_elem].compute_at(s[softmax], s[softmax].op.axis[1])
+
+    px, x = s[softmax].split(softmax.op.axis[0], nparts=1)
+    s[softmax].bind(px, tvm.thread_axis("pipeline"))
+    return s
+
+
+@generic.schedule_dense.register(["hls"])
+def schedule_dense(outs):
+    """Schedule for dense
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of dense
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    s = tvm.create_schedule([x.op for x in outs])
+    tvm.schedule.AutoInlineInjective(s)
+
+    def traverse(OP):
+        """Internal travserse function"""
+        # inline all one-to-one-mapping operators except the last stage (output)
+        if tag.is_broadcast(OP.tag):
+            if OP not in s.outputs:
+                s[OP].compute_inline()
+            for tensor in OP.input_tensors:
+                if tensor.op.input_tensors:
+                    traverse(tensor.op)
+        # schedule dense
+        elif OP.tag == 'dense':
+            Dense = OP.output(0)
+            if not Dense.op in s.outputs:
+                Out = outs[0].op.output(0)
+                s[Dense].compute_at(s[Out], s[Out].op.axis[1])
+        else:
+            raise RuntimeError("Unsupported operator: %s" % OP.tag)
+
+    traverse(outs[0].op)
+
+    px, x = s[outs[0]].split(outs[0].op.axis[0], nparts=1)
+    s[outs[0]].bind(px, tvm.thread_axis("pipeline"))
+    return s
+
+
+@generic.schedule_pool.register(["hls"])
+def schedule_pool(outs, layout):
+    """Schedule for pool
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of pool
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    s = tvm.create_schedule([x.op for x in outs])
+    tvm.schedule.AutoInlineInjective(s)
+
+    def traverse(OP):
+        """Internal travserse function"""
+        # inline all one-to-one-mapping operators except the last stage (output)
+        if tag.is_broadcast(OP.tag):
+            if OP not in s.outputs:
+                s[OP].compute_inline()
+            for tensor in OP.input_tensors:
+                if tensor.op.input_tensors:
+                    traverse(tensor.op)
+        # schedule pool
+        elif OP.tag.startswith('pool'):
+            Pool = OP.output(0)
+            if not Pool.op in s.outputs:
+                Out = outs[0].op.output(0)
+                s[Pool].compute_at(s[Out], s[Out].op.axis[1])
+        else:
+            raise RuntimeError("Unsupported operator: %s" % OP.tag)
+
+    traverse(outs[0].op)
+
+    px, x = s[outs[0]].split(outs[0].op.axis[0], nparts=1)
+    s[outs[0]].bind(px, tvm.thread_axis("pipeline"))
+    return s
+
+
+@generic.schedule_global_pool.register(["hls"])
+def schedule_global_pool(outs):
+    """Schedule for global pool
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of global pool
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    s = tvm.create_schedule([x.op for x in outs])
+    tvm.schedule.AutoInlineInjective(s)
+
+    def traverse(OP):
+        """Internal travserse function"""
+        # inline all one-to-one-mapping operators except the last stage (output)
+        if tag.is_broadcast(OP.tag):
+            if OP not in s.outputs:
+                s[OP].compute_inline()
+            for tensor in OP.input_tensors:
+                if tensor.op.input_tensors:
+                    traverse(tensor.op)
+        # schedule global_pool
+        elif OP.tag.startswith('global_pool'):
+            Pool = OP.output(0)
+            if not Pool.op in s.outputs:
+                Out = outs[0].op.output(0)
+                s[Pool].compute_at(s[Out], s[Out].op.axis[1])
+        else:
+            raise RuntimeError("Unsupported operator: %s" % OP.tag)
+
+    traverse(outs[0].op)
+
+    px, x = s[outs[0]].split(outs[0].op.axis[0], nparts=1)
+    s[outs[0]].bind(px, tvm.thread_axis("pipeline"))
+    return s
diff --git a/topi/python/topi/intel_graphics/conv2d.py b/topi/python/topi/intel_graphics/conv2d.py
index 4275bd963d10..d712e71410d7 100644
--- a/topi/python/topi/intel_graphics/conv2d.py
+++ b/topi/python/topi/intel_graphics/conv2d.py
@@ -1,4 +1,4 @@
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return, too-many-arguments, too-many-locals, too-many-statements, no-member, too-many-branches
+# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return, too-many-arguments, too-many-locals, too-many-statements, no-member, too-many-branches, too-many-boolean-expressions
 """conv2d schedule on Intel Graphics"""
 
 from __future__ import absolute_import as _abs
@@ -49,7 +49,11 @@ def _alter_conv2d_layout(attrs, inputs, tinfos):
     stride = ast.literal_eval(attrs['strides'])
 
     wkl = _get_workload(data, kernel, stride, padding, data.dtype)
-    oc_bn = 16
+    oc_bn = 1
+    kernel_shape = util.get_const_tuple(kernel.shape)
+    for oc_bn in range(16, 1, -1):
+        if kernel_shape[0] % oc_bn == 0:
+            break
 
     new_attrs = {k: attrs[k] for k in attrs.keys()}
     new_attrs['kernel_layout'] = 'OIHW%do' % (oc_bn)
@@ -57,8 +61,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos):
     return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs)
 
 @conv2d_NCHWc.register(["intel_graphics"])
-def _decl_conv2d(data, kernel, num_filter, kernel_size, stride, padding, layout,\
-                 out_layout, out_dtype='float32'):
+def _decl_conv2d(data, kernel, stride, padding, dilation, layout, out_layout, out_dtype='float32'):
     """Conv2D operator for Intel Graphics backend.
 
     Parameters
@@ -97,7 +100,7 @@ def _decl_conv2d(data, kernel, num_filter, kernel_size, stride, padding, layout,
     return _decl_cl_spatialpack_NCHWc(data, kernel, stride, padding, out_dtype)
 
 @generic.schedule_conv2d_NCHWc.register(["intel_graphics"])
-def schedule_conv2d_NCHWc(num_filter, kernel_size, stride, padding, layout, out_layout, outs):
+def schedule_conv2d_NCHWc(outs):
     """Schedule for conv2d_nchw for Intel Graphics
 
     Parameters
@@ -123,8 +126,7 @@ def traverse(op):
             for tensor in op.input_tensors:
                 if tensor.op.input_tensors and tensor.op not in scheduled_ops:
                     traverse(tensor.op)
-        if "4_5" in op.tag or "4_4" in op.tag or "2_7" in op.tag or "2_14" in op.tag \
-           or "1_16" in op.tag:
+        if 'conv2d' in op.tag:
             _schedule_cl_spatialpack_NCHWc(s, op)
 
         scheduled_ops.append(op)
@@ -148,39 +150,35 @@ def _decl_cl_spatialpack_NCHWc(data, kernel, stride, padding, out_dtype='float16
     out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1)
     out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1)
     oshape = (batch, out_channel, out_height, out_width)
-    pad_before = [0, 0, pad_top, pad_left]
-    pad_after = [0, 0, pad_down, pad_right]
-    temp = pad(data, pad_before, pad_after, name="pad_temp")
 
     rc = tvm.reduce_axis((0, in_channel), name='rc')
     ry = tvm.reduce_axis((0, kernel_h), name='ry')
     rx = tvm.reduce_axis((0, kernel_w), name='rx')
 
-    block_w = 0
-    block_h = 0
+    block_w = 1
+    block_h = 1
     if stride_h == 2:
         if num_filter + kernel_h == 515:
-            conv_tag = "4_4"
             block_h = 4
             block_w = 4
         else:
-            conv_tag = "4_5"
             block_h = 4
             block_w = 5
     elif kernel_h == 3:
         if num_filter == 512:
-            conv_tag = "2_7"
             block_h = 2
             block_w = 7
         else:
-            conv_tag = "2_14"
             block_h = 2
             block_w = 14
+    elif kernel_h == 7 and padding == 3 and stride == 1:
+        block_h = 3
+        block_w = 4
     else:
-        conv_tag = "1_16"
         block_h = 1
         block_w = 16
 
+    attrs = {'block_h': block_h, 'block_w' : block_w}
     c_h = out_height
     c_w = out_width
 
@@ -190,6 +188,10 @@ def _decl_cl_spatialpack_NCHWc(data, kernel, stride, padding, out_dtype='float16
     if not out_width % block_w == 0:
         c_w = (out_width // block_w + 1) * block_w
 
+    pad_before = [0, 0, pad_top, pad_left]
+    pad_after = [0, 0, pad_down + c_h - block_h, pad_right + c_w - block_w]
+    temp = pad(data, pad_before, pad_after, name="pad_temp")
+
     cshape = (batch, out_channel // nv, c_h, c_w, nv)
 
     conv = tvm.compute(
@@ -198,13 +200,13 @@ def _decl_cl_spatialpack_NCHWc(data, kernel, stride, padding, out_dtype='float16
           tvm.sum(
               temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) *
               kernel[ff, rc, ry, rx, vc].astype(out_dtype),
-              axis=[rc, ry, rx]), tag=conv_tag, name='conv')
+              axis=[rc, ry, rx]), name='conv', attrs=attrs)
 
     output = tvm.compute(
         oshape,
         lambda nn, ff, yy, xx:
         conv[nn][ff//nv][yy][xx][ff%nv],
-        name='output_unpack', tag=conv_tag)
+        name='output_unpack', tag='conv2d')
 
     return output
 
@@ -220,21 +222,10 @@ def _schedule_cl_spatialpack_NCHWc(s, op):
 
     kernel_L = s.cache_read(kernel, "local", [conv_L])
     _, in_channel, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape]
-    if "1_16" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 1
-        OUTPUT_BLOCK_WIDTH = 16
-    elif "2_14" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 2
-        OUTPUT_BLOCK_WIDTH = 14
-    elif "2_7" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 2
-        OUTPUT_BLOCK_WIDTH = 7
-    elif "4_5" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 4
-        OUTPUT_BLOCK_WIDTH = 5
-    elif "4_4" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 4
-        OUTPUT_BLOCK_WIDTH = 4
+
+    attrs = s[conv].op.attrs
+    OUTPUT_BLOCK_HEIGHT = attrs['block_h']
+    OUTPUT_BLOCK_WIDTH = attrs['block_w']
 
     # schedule conv
     z_factor = 1
@@ -263,17 +254,8 @@ def _schedule_cl_spatialpack_NCHWc(s, op):
     s[conv_L].compute_at(s[conv], vci)
     i, oc, h, w, vc = s[conv_L].op.axis
     rc, ry, rx = s[conv_L].op.reduce_axis
-    if in_channel == 2048:
-        rco, rci = s[conv_L].split(rc, nparts=128)
-        s[conv_L].unroll(rci)
-        s[conv_L].reorder(i, oc, rco, rci, ry, rx, vc, h, w)
-        s[temp_W].compute_at(s[conv_L], rco)
-    else:
-        s[conv_L].reorder(i, oc, rc, ry, rx, vc, h, w)
-        s[temp_W].compute_at(s[conv_L], rc)
-    if kernel.shape[3].value != 7:
-        s[conv_L].unroll(ry)
-        s[conv_L].unroll(rx)
+    s[conv_L].reorder(i, oc, rc, ry, rx, vc, h, w)
+    s[temp_W].compute_at(s[conv_L], rc)
     if kernel.shape[3].value != 7:
         s[conv_L].unroll(ry)
         s[conv_L].unroll(rx)
@@ -313,7 +295,7 @@ def _schedule_cl_spatialpack_NCHWc(s, op):
 
 
 @conv2d.register(["intel_graphics"])
-def decl_conv2d(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'):
+def decl_conv2d(data, kernel, stride, padding, dilation, layout='NCHW', out_dtype='float32'):
     """Conv2D operator for Intel Graphics backend.
 
     Parameters
@@ -373,8 +355,7 @@ def traverse(op):
             for tensor in op.input_tensors:
                 if tensor.op.input_tensors and tensor.op not in scheduled_ops:
                     traverse(tensor.op)
-        if "4_5" in op.tag or "4_4" in op.tag or "2_7" in op.tag or "2_14" in op.tag \
-           or "1_16" in op.tag:
+        if 'conv2d' in op.tag:
             _schedule_cl_spatialpack(s, op)
 
         scheduled_ops.append(op)
@@ -396,49 +377,53 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float
     out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1)
     out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1)
     oshape = (batch, out_channel, out_height, out_width)
-    pad_before = [0, 0, pad_top, pad_left]
-    pad_after = [0, 0, pad_down, pad_right]
-    temp = pad(data, pad_before, pad_after, name="pad_temp")
 
     rc = tvm.reduce_axis((0, in_channel), name='rc')
     ry = tvm.reduce_axis((0, kernel_h), name='ry')
     rx = tvm.reduce_axis((0, kernel_w), name='rx')
 
-    block_w = 0
-    block_h = 0
+    block_w = 1
+    block_h = 1
     if stride_h == 2:
         if num_filter + kernel_h == 515:
-            conv_tag = "4_4"
             block_h = 4
             block_w = 4
         else:
-            conv_tag = "4_5"
             block_h = 4
             block_w = 5
     elif kernel_h == 3:
         if num_filter == 512:
-            conv_tag = "2_7"
             block_h = 2
             block_w = 7
         else:
-            conv_tag = "2_14"
             block_h = 2
             block_w = 14
+    elif kernel_h == 7 and padding == 3 and stride == 1:
+        block_h = 3
+        block_w = 4
     else:
-        conv_tag = "1_16"
         block_h = 1
         block_w = 16
 
+    attrs = {'block_h': block_h, 'block_w' : block_w}
     c_h = out_height
     c_w = out_width
 
+    if not out_width % block_w == 0:
+        c_w = (out_width // block_w + 1) * block_w
+
     if not out_height % block_h == 0:
         c_h = (out_height // block_h + 1) * block_h
 
-    if not out_width % block_w == 0:
-        c_w = (out_width // block_w + 1) * block_w
+    pad_before = [0, 0, pad_top, pad_left]
+    pad_after = [0, 0, pad_down + c_h - block_h, pad_right + c_w - block_w]
+    temp = pad(data, pad_before, pad_after, name="pad_temp")
 
     nv = 16
+    if not num_filter % nv == 0:
+        num_filter = (num_filter // nv + 1) * nv
+        out_channel = num_filter
+
     cshape = (batch, out_channel // nv, c_h, c_w, nv)
     kvshape = (num_filter // nv, channel, kernel_h, kernel_w, nv)
 
@@ -453,13 +438,13 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float
           tvm.sum(
               temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) *
               kernel_vec[ff, rc, ry, rx, vc].astype(out_dtype),
-              axis=[rc, ry, rx]), tag=conv_tag, name='conv')
+              axis=[rc, ry, rx]), name='conv', attrs=attrs)
 
     output = tvm.compute(
         oshape,
         lambda nn, ff, yy, xx:
         conv[nn][ff//nv][yy][xx][ff%nv],
-        name='output_unpack', tag=conv_tag)
+        name='output_unpack', tag='conv2d')
 
     return output
 
@@ -477,21 +462,9 @@ def _schedule_cl_spatialpack(s, op):
     kernel_L = s.cache_read(kernel_vec, "local", [conv_L])
     _, in_channel, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape]
 
-    if "1_16" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 1
-        OUTPUT_BLOCK_WIDTH = 16
-    elif "2_14" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 2
-        OUTPUT_BLOCK_WIDTH = 14
-    elif "2_7" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 2
-        OUTPUT_BLOCK_WIDTH = 7
-    elif "4_5" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 4
-        OUTPUT_BLOCK_WIDTH = 5
-    elif "4_4" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 4
-        OUTPUT_BLOCK_WIDTH = 4
+    attrs = s[conv].op.attrs
+    OUTPUT_BLOCK_HEIGHT = attrs['block_h']
+    OUTPUT_BLOCK_WIDTH = attrs['block_w']
 
     # schedule conv
     z_factor = 1
@@ -520,14 +493,8 @@ def _schedule_cl_spatialpack(s, op):
     s[conv_L].compute_at(s[conv], vci)
     i, oc, h, w, vc = s[conv_L].op.axis
     rc, ry, rx = s[conv_L].op.reduce_axis
-    if in_channel == 2048:
-        rco, rci = s[conv_L].split(rc, nparts=128)
-        s[conv_L].unroll(rci)
-        s[conv_L].reorder(i, oc, rco, rci, ry, rx, vc, h, w)
-        s[temp_W].compute_at(s[conv_L], rco)
-    else:
-        s[conv_L].reorder(i, oc, rc, ry, rx, vc, h, w)
-        s[temp_W].compute_at(s[conv_L], rc)
+    s[conv_L].reorder(i, oc, rc, ry, rx, vc, h, w)
+    s[temp_W].compute_at(s[conv_L], rc)
     if kernel.shape[3].value != 7:
         s[conv_L].unroll(ry)
         s[conv_L].unroll(rx)
diff --git a/topi/python/topi/mali/conv2d.py b/topi/python/topi/mali/conv2d.py
index fc6309a7ebf4..d7b1f939ef45 100644
--- a/topi/python/topi/mali/conv2d.py
+++ b/topi/python/topi/mali/conv2d.py
@@ -1,328 +1,166 @@
 # pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return
 """conv2d schedule on ARM Mali GPU"""
-
-from __future__ import absolute_import as _abs
-
 import numpy as np
+
 import tvm
+from tvm import autotvm
+from tvm.autotvm.task.space import get_factors
 
-from .. import generic
-from .. import util
-from .. import tag
-from ..nn import pad
-from ..nn.conv2d import conv2d
-from ..nn.util import get_pad_tuple
-
-##### SCHEDULE UTILITIES #####
-def fuse_and_bind(s, tensor, axis=None, num_thread=None):
-    """ fuse all the axis and bind to GPU threads """
-    axis = axis or s[tensor].op.axis
-    fused = s[tensor].fuse(*axis)
-    max_threads = tvm.target.current_target(allow_none=False).max_num_threads
-    bx, tx = s[tensor].split(fused, num_thread or max_threads)
-    s[tensor].bind(bx, tvm.thread_axis("blockIdx.x"))
-    s[tensor].bind(tx, tvm.thread_axis("threadIdx.x"))
-    return bx, tx
+from ..generic import schedule_conv2d_nchw, schedule_conv2d_winograd_without_weight_transform
+from ..util import traverse_inline, get_const_int, get_const_tuple, const_matrix
+from ..nn import conv2d, conv2d_winograd_without_weight_transform, \
+    get_pad_tuple, pad, conv2d_alter_layout
 
-def tile_and_bind(s, tensor, y, x, y_factor, x_factor=None):
-    """ tile and bind to GPU threads """
-    x_factor = x_factor or y_factor
-    yo, xo, yi, xi = s[tensor].tile(y, x, y_factor, x_factor)
-    s[tensor].bind(xo, tvm.thread_axis("blockIdx.x"))
-    s[tensor].bind(xi, tvm.thread_axis("threadIdx.x"))
-    s[tensor].bind(yo, tvm.thread_axis("blockIdx.y"))
-    s[tensor].bind(yi, tvm.thread_axis("threadIdx.y"))
-    return yo, xo, yi, xi
+# reuse some compute declarations from ARM CPU
+from ..arm_cpu.conv2d import _decl_spatial_pack, _alter_conv2d_layout_arm
 
-def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None):
-    """ tile and bind 3d """
-    y_factor = y_factor or z_factor
-    x_factor = x_factor or y_factor
-    zo, zi = s[tensor].split(z, z_factor)
-    yo, yi = s[tensor].split(y, y_factor)
-    xo, xi = s[tensor].split(x, x_factor)
-    s[tensor].bind(zo, tvm.thread_axis("blockIdx.z"))
-    s[tensor].bind(zi, tvm.thread_axis("threadIdx.z"))
-    s[tensor].bind(yo, tvm.thread_axis("blockIdx.y"))
-    s[tensor].bind(yi, tvm.thread_axis("threadIdx.y"))
-    s[tensor].bind(xo, tvm.thread_axis("blockIdx.x"))
-    s[tensor].bind(xi, tvm.thread_axis("threadIdx.x"))
 
-def pack_tensor(s, tensor, factor, readers):
-    """ do transform X[n, m] -> X[n / factor, m, factor] """
-    tmp = s.cache_read(tensor, 'global', readers)
-    y, x = s[tmp].op.axis
-    yo, yi = s[tmp].split(y, factor)
-    s[tmp].reorder(yo, x, yi)
-    s[tmp].compute_inline()
-    return s.cache_write(tmp, 'global')
-
-def transpose(s, tensor, readers):
-    """ do transform X[n, m] -> X[m, n] """
-    tmp = s.cache_read(tensor, 'global', readers)
-    y, x = s[tmp].op.axis
-    s[tmp].reorder(x, y)
-    s[tmp].compute_inline()
-    return s.cache_write(tmp, "global"), tmp
-
-def const_array(data, name):
-    """ convert an const array to tvm tensor"""
-    row, col = data.shape
-    dtype = str(data.dtype)
-
-    def select_array(i, j):
-        now = tvm.const(0.0, dtype)
-        for ii in range(row):
-            for jj in range(col):
-                now = tvm.select(tvm.all(i % row == ii, j % col == jj),
-                                 tvm.const(data[ii][jj], dtype),
-                                 now)
-        return now
-    return tvm.compute(data.shape, select_array, name=name)
-
-
-@conv2d.register(["mali"])
-def decl_conv2d(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'):
-    """Conv2D operator for ARM Mali GPU backend.
+@autotvm.register_topi_compute(conv2d, 'mali', ['direct'])
+def conv2d_mali(cfg, data, kernel, strides, padding, dilation, layout, out_dtype):
+    """TOPI compute callback for conv2d
 
     Parameters
     ----------
+    cfg: ConfigEntity
+        The config for this template
+
     data : tvm.Tensor
         4-D with shape [batch, in_channel, in_height, in_width]
 
     kernel : tvm.Tensor
-        4-D with shape [num_filter, in_channel, filter_height, filter_width]
+        4-D with shape [num_filter, in_channel, filter_height, filter_width] or
+        pre-packed 5-D with shape [num_filter_chunk, in_channel, filter_height,
+        filter_width, num_filter_block]
+
+    strides : list of two ints
+        [stride_height, stride_width]
 
-    stride : int or a list/tuple of two ints
-        stride size, or [stride_height, stride_width]
+    padding : list of two ints
+        [pad_height, pad_width]
 
-    padding : int or a list/tuple of two ints
-        padding size, or [pad_height, pad_width]
+    dilation : list of two ints
+        [dilation_height, dilation_width]
 
     layout : str
         layout of data
 
+    out_dtype: str
+        The output type. This is used for mixed precision.
+
     Returns
     -------
     output : tvm.Tensor
         4-D with shape [batch, out_channel, out_height, out_width]
     """
-    assert layout == 'NCHW', "only support NCHW convolution on mali"
-    assert data.shape[0].value == 1, "only support batch size=1 convolution on mali"
-    assert data.dtype == kernel.dtype, "Do not support inputs with different data types now."
+    return _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation, layout, out_dtype,
+                              num_tile=3)
 
-    out_dtype = data.dtype
-    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)
-    kernel_shape = util.get_const_tuple(kernel.shape)
-    if isinstance(stride, (tuple, list)):
-        HSTR, WSTR = stride
-    else:
-        HSTR, WSTR = stride, stride
-
-    if (kernel_shape[2:4] == (3, 3) and (HPAD, WPAD) == (1, 1) and kernel_shape[0] >= 64 and
-            (HSTR, WSTR) == (1, 1)):
-        return _decl_winograd(data, kernel, stride, padding, layout, out_dtype)
-    elif kernel_shape[2:4] == (1, 1):
-        return _decl_im2col(data, kernel, stride, padding, layout, out_dtype)
-    else:
-        return _decl_spatialpack(data, kernel, stride, padding, layout, out_dtype)
-
-@generic.schedule_conv2d_nchw.register(["mali"])
-def schedule_conv2d_nchw(outs):
-    """Schedule for conv2d_nchw for ARM Mali GPU
+@autotvm.register_topi_schedule(schedule_conv2d_nchw, 'mali', ['direct', 'winograd'])
+def schedule_conv2d_nchw_mali(cfg, outs):
+    """TOPI schedule callback for conv2d
 
     Parameters
     ----------
+    cfg: ConfigEntity
+        The configuration of this template
     outs: Array of Tensor
-        The computation graph description of conv2d_nchw
+        The computation graph description of convolution2d
         in the format of an array of tensors.
 
     Returns
     -------
     s: Schedule
-        The computation schedule for conv2d_nchw.
+        The computation schedule for conv2d
     """
-    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
     s = tvm.create_schedule([x.op for x in outs])
-    scheduled_ops = []
 
-    def traverse(op):
-        """inline all one-to-one-mapping operators except the last stage (output)"""
-        if tag.is_broadcast(op.tag):
-            if op not in s.outputs:
-                s[op].compute_inline()
-            for tensor in op.input_tensors:
-                if tensor.op.input_tensors and tensor.op not in scheduled_ops:
-                    traverse(tensor.op)
+    def _callback(op):
+        # schedule conv2d
+        if 'spatial_conv2d_output' in op.tag:
+            output = op.output(0)
+            conv = op.input_tensors[0]
 
-        if 'im2col_conv_output' in op.tag:
-            _schedule_im2col_conv2d(s, op)
+            data_vec = conv.op.input_tensors[0]
+            data_pad = data_vec.op.input_tensors[0]
+            s[data_pad].compute_inline()
 
-        if 'spatialpack_conv_output' in op.tag:
-            _schedule_spatialpack_conv2d(s, op)
+            kernel_vec = conv.op.input_tensors[1]
+            if kernel_vec.op.name == 'kernel_vec':
+                kernel = kernel_vec.op.input_tensors[0]
+            else:
+                kernel = kernel_vec
+            if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
+                s[kernel].compute_inline()
 
-        if 'winograd_conv_output' in op.tag:
-            _schedule_winograd(s, op)
+            _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec)
 
-        scheduled_ops.append(op)
+        if 'winograd_conv2d_output' in op.tag:
+            _schedule_winograd(cfg, s, op)
 
-    traverse(outs[0].op)
+    traverse_inline(s, outs[0].op, _callback)
     return s
 
-def _decl_spatialpack(data, kernel, stride, padding, layout, out_dtype):
-    """declare the spatialpack method (spatial packing) for conv2d"""
-    _, CI, IH, IW = [util.get_const_int(x) for x in data.shape]
-    CO, _, KH, KW = [util.get_const_int(x) for x in kernel.shape]
-    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)
-    HCAT, WCAT = KH - 1, KW - 1
 
-    if isinstance(stride, (tuple, list)):
-        HSTR, WSTR = stride
-    else:
-        HSTR, WSTR = stride, stride
-
-    N = 1
-    TH = IH + 2*HPAD
-    TW = IW + 2*WPAD
-    OH = (IH + 2*HPAD - KH) // HSTR + 1
-    OW = (IW + 2*WPAD - KW) // WSTR + 1
-
-    DO_PAD = (HPAD != 0 and WPAD != 0)
-    if DO_PAD:
-        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
-    else:
-        data_pad = data
-
-    # set tunable parameters (tile factor, ...)
-    tune_config = getattr(tvm.target.current_target(), "tune_config", None)
-    if tune_config is None:
-        VH = 1
-        VW, VC = 4, 4
-        # correct tile factor
-        if OW % VW != 0:
-            if OW == 14:
-                VW = 2
-                VC = 8
-            elif OW == 7:
-                VW = 7
-    else:
-        VH = tune_config['VH']
-        VW = tune_config['VW']
-        VC = tune_config['VC']
-
-    if data.dtype == 'float16':
-        VC *= 2
-
-    assert CO % VC == 0
-    assert OH % VH == 0, "OH: %d  VH : %d" % (OH, VH)
-    assert OW % VW == 0, "OW: %d  VW : %d" % (OW, VW)
-
-    dvshape = (N, TH//(VH*HSTR), TW//(VW*WSTR), CI, VH*HSTR+HCAT, VW*WSTR+WCAT)
-    kvshape = (CO // VC, CI, KH, KW, VC)
-    ovshape = (N, CO // VC, OH // VH, OW // VW, VH, VW, VC)
-    oshape = (N, CO, OH, OW)
-
-    data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw:
-                           data_pad[n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw],
-                           name='data_vec')
-
-    kernel_vec = tvm.compute(kvshape, lambda co, ci, kh, kw, vc:
-                             kernel[co*VC+vc][ci][kh][kw],
-                             name='kernel_vec')
-
-    ci = tvm.reduce_axis((0, CI), name='ci')
-    kh = tvm.reduce_axis((0, KH), name='kh')
-    kw = tvm.reduce_axis((0, KW), name='kw')
-
-    conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc:\
-                tvm.sum(data_vec[n, h, w, ci, vh*HSTR+kh, vw*WSTR+kw].astype(out_dtype) *
-                        kernel_vec[co, ci, kh, kw, vc].astype(out_dtype),
-                        axis=[ci, kh, kw]), name='conv')
-
-    output = tvm.compute(oshape, lambda n, co, h, w:
-                         conv[n][co//VC][h/VH][w//VW][h%VH][w%VW][co%VC],
-                         name='output_unpack', tag='spatialpack_conv_output')
-
-    return output
-
-def _schedule_spatialpack_conv2d(s, op):
-    """schedule the spatialpack method (spatial packing) for conv2d"""
-    # get ops and tensors
-    output = op.output(0)
-    output_height = util.get_const_int(output.shape[2])
-
-    conv = op.input_tensors[0]
-    data_vec = s[conv].op.input_tensors[0]
-    kernel_vec = s[conv].op.input_tensors[1]
+def _schedule_spatial_pack(cfg, s, output, conv, data_vec, kernel_vec):
+    """schedule the spatial packing for conv2d"""
     data = s[data_vec].op.input_tensors[0]
-    kernel = s[kernel_vec].op.input_tensors[0]
-
-    # set tunable parameters (tile factor, ...)
-    tune_config = getattr(tvm.target.current_target(), "tune_config", None)
-    if tune_config is None:
-        num_thread = 8
-
-        out_channel = util.get_const_int(kernel.shape[0])
-        in_channel = util.get_const_int(kernel.shape[1])
-        in_width = util.get_const_int(data.shape[2])
-
-        if in_width >= 224:
-            pass
-        elif in_width >= 112:
-            pass
-        elif in_width >= 56:
-            if out_channel != in_channel:
-                num_thread = 16
-        elif in_width >= 28:
-            if out_channel >= 256:
-                num_thread = 16
-        elif in_width >= 14:
-            if in_channel == out_channel:
-                num_thread = 8
-            else:
-                num_thread = 4
-    else:
-        num_thread = tune_config["num_thread"]
-
-    last = 1
-    if output_height == 28:
-        last = 7
-        num_thread = 32
 
-    if data.dtype == 'float16' and (util.get_const_int(conv.shape[1]) == 4 or output_height == 28):
-        num_thread //= 2
-
-    # schedule dilation
-    if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
-        s[kernel].compute_inline()
+    max_unroll = 16
+    vec_size = [1, 2, 4, 8, 16]
+    # get tunable parameters (they are defined in compute)
+    BC, TC, VC = cfg["tile_co"].size
+    BH, TH, VH = cfg["tile_oh"].size
+    BW, TW, VW = cfg["tile_ow"].size
 
     # schedule padding
     if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
         data_pad = data
-        data = data_pad.op.input_tensors[0]
         s[data_pad].compute_inline()
 
     # schedule data packing
-    _, h, w, ci, vh, vw = s[data_vec].op.axis
+    if isinstance(data_vec.op, tvm.tensor.ComputeOp) and data_vec.op.name == 'data_vec_undilated':
+        _, h, w, ci, _, _, vh, vw = s[data_vec].op.axis
+    else:
+        _, h, w, ci, vh, vw = s[data_vec].op.axis
     tile_and_bind3d(s, data_vec, h, w, ci, 1)
-    s[data_vec].unroll(vw)
-
-    # schedule kernel packing
-    co, ci, kh, kw, vc = s[kernel_vec].op.axis
-    tile_and_bind(s, kernel_vec, co, ci, 1)
-    s[kernel_vec].unroll(kh)
-    s[kernel_vec].unroll(kw)
-    s[kernel_vec].vectorize(vc)
+    if vh.dom.extent.value < max_unroll:
+        s[data_vec].unroll(vh)
+    if vw.dom.extent.value < max_unroll:
+        s[data_vec].unroll(vw)
+
+    if isinstance(kernel_vec.op, tvm.tensor.ComputeOp) and kernel_vec.name == 'kernel_vec':
+        if autotvm.GLOBAL_SCOPE.in_tuning:
+            # kernel packing will be pre-computed during compilation, so we skip
+            # this part to make tuning records correct
+            s[kernel_vec].pragma(s[kernel_vec].op.axis[0], 'debug_skip_region')
+        else:
+            max_threads = tvm.target.current_target(allow_none=False).max_num_threads
+            co, ci, kh, kw, vc = s[kernel_vec].op.axis
+            fused = s[kernel_vec].fuse(co, ci, kh, kw, vc)
+            fused, vec = s[kernel_vec].split(fused, VC)
+            bb, tt = s[kernel_vec].split(fused, max_threads)
+            s[kernel_vec].bind(bb, tvm.thread_axis("blockIdx.x"))
+            s[kernel_vec].bind(tt, tvm.thread_axis("threadIdx.x"))
+            if VC in vec_size:
+                s[kernel_vec].vectorize(vec)
 
     # schedule convolution
-    _, c, h, w, vh, vw, vc = s[conv].op.axis
+    n, c, h, w, vh, vw, vc = s[conv].op.axis
     kc, kh, kw = s[conv].op.reduce_axis
-    s[conv].reorder(_, c, h, w, vh, kc, kh, kw, vw, vc)
-    tile_and_bind3d(s, conv, c, h, w, num_thread, 1, last)
-    s[conv].unroll(kh)
-    s[conv].unroll(kw)
-    s[conv].unroll(vw)
-    s[conv].vectorize(vc)
+
+    cfg["reorder_0"].apply(s, conv, [n, c, h, w, kc, kh, kw, vh, vw, vc])
+    tile_and_bind3d(s, conv, c, h, w, TC, TH, TW)
+
+    cfg["ann_reduce"].apply(s, conv, [kh, kw],
+                            axis_lens=[get_const_int(kernel_vec.shape[2]),
+                                       get_const_int(kernel_vec.shape[3])],
+                            max_unroll=max_unroll)
+
+    cfg["ann_spatial"].apply(s, conv, [vh, vw, vc],
+                             axis_lens=[VH, VW, VC],
+                             max_unroll=max_unroll,
+                             vec_size=vec_size,
+                             cfg=cfg)
 
     # schedule output
     if output.op not in s.outputs:  # has bias
@@ -330,364 +168,334 @@ def _schedule_spatialpack_conv2d(s, op):
         output = s.outputs[0]
 
     _, co, oh, ow = s[output].op.axis
-    tile_and_bind3d(s, output, co, oh, ow, num_thread, 1, last)
-
-def _decl_im2col(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'):
-    """declare the Im2Col method for conv2d"""
-    _, CI, IH, IW = [x.value for x in data.shape]
-    CO, _, KH, KW = [x.value for x in kernel.shape]
-    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)
+    tile_and_bind3d(s, output, co, oh, ow, TC, TH, TW)
 
-    if isinstance(stride, (tuple, list)):
-        HSTR, WSTR = stride
-    else:
-        HSTR, WSTR = stride, stride
+    return s
 
-    N = 1
-    OH = (IH + 2*HPAD - KH) // HSTR + 1
-    OW = (IW + 2*WPAD - KW) // WSTR + 1
+##### WINOGRAD TEMPLATE #####
+def _pick_tile_size(data, kernel):
+    N, CI, H, W = get_const_tuple(data.shape)
 
-    DO_PAD = (HPAD != 0 and WPAD != 0)
-    if DO_PAD:
-        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
+    if H % 4 == 0:
+        return 4
     else:
-        data_pad = data
-
-    ALIGN = 16
-    def upround(x, align):
-        return (x + align - 1) // align * align
-
-    # A [CO, CI * KH * KW]
-    reduce_len = upround(CI * KH * KW, ALIGN)
-    A = tvm.compute((upround(CO, ALIGN), reduce_len), lambda i, j:
-                    kernel[i][j // KW // KH][j // KW % KH][j % KW], name='A')
-
-    # B [CI * KH * KW, N * OH * OW]
-    B = tvm.compute((reduce_len, upround(N * OH * OW, ALIGN)), lambda i, j:\
-            tvm.select(tvm.all(i < CI * KH * KW, j < N * OH * OW),
-                       data_pad[j // (OH*OW)][i // (KH*KW)][j // OW % OH*HSTR + i // KW % KH]
-                       [j % OW*WSTR + i % KW],
-                       tvm.const(0, data_pad.dtype)), name='B')
-
-    gemm_n, gemm_l, gemm_m = A.shape[0], reduce_len, B.shape[1]
-
-    # C [CO, N * OH * OW]
-    k = tvm.reduce_axis((0, gemm_l), name='k')
-    C = tvm.compute((gemm_n, gemm_m), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C')
-
-    # output
-    # the last term C[gemm_n-1, gemm_m-1] is for enabling the alignment,
-    # otherwise the alignment above will be eliminated by bound inference
-    output = tvm.compute((N, CO, OH, OW), lambda n, co, h, w:\
-                 C[co][n * OW * OW + h * OW + w] + tvm.const(0, C.dtype) * C[gemm_n-1, gemm_m-1],
-                         name='output', tag='im2col_conv_output')
-
-    return output
-
-def _schedule_im2col_conv2d(s, op):
-    """schedule the Im2Col method for conv2d"""
-
-    # get ops and tensors
-    output = op.output(0)
-    C = op.input_tensors[0]
-    A, B = C.op.input_tensors
-    kernel = A.op.input_tensors[0]
-    data = B.op.input_tensors[0]
-
-    # tuning parameter config
-    tune_config = getattr(tvm.target.current_target(), "tune_config", None)
-    if tune_config is None: # use rule
-        bn = 4
-        unroll_step = 16
-
-        total_work = util.get_const_int(C.shape[0] * C.shape[1])
-        reduce_work = util.get_const_int(A.shape[1])
-        if total_work > 200000:
-            last_work = util.get_const_int(C.shape[1])
-            if last_work > 10000:
-                num_thread = 16
-            elif last_work > 3000:
-                num_thread = 8
-            elif reduce_work > 100:
-                num_thread = 4
-            else:
-                num_thread = 2
-
-            if reduce_work < 50 and last_work < 30000:
-                num_thread = 4
-        elif total_work > 150000:
-            num_thread = 8
-        elif total_work > 50000:
-            num_thread = 4
-        else:
-            num_thread = 2
-
-        if num_thread == 4:
-            unroll_step = 2
+        return 2
+
+@autotvm.register_topi_compute(conv2d, 'mali', ['winograd'])
+def conv2d_mali_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype):
+    tile_size = _pick_tile_size(data, kernel)
+    return _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype,
+                          tile_size)
+
+def _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size):
+    N, CI, IH, IW = get_const_tuple(data.shape)
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
     else:
-        bn = tune_config["bn"]
-        num_thread = tune_config["num_thread"]
-        unroll_step = tune_config["unroll_step"]
-
-    bna = bnb = bn
-    num_thread1 = num_thread2 = num_thread
-    if data.dtype == 'float16':
-        bnb *= 2
-        last_work = util.get_const_int(C.shape[1])
-        if last_work % (bnb * num_thread2) != 0:
-            num_thread1 = num_thread * 2
-            num_thread2 = num_thread // 2
-
-    # schedule dilation
-    if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
-        s[kernel].compute_inline()
-
-    # schedule padding
-    if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
-        data_pad = data
-        s[data_pad].compute_inline()
+        dilation_h, dilation_w = dilation
 
-    ##### SCHEDULE A #####
-    if util.get_const_int(kernel.shape[2]) == 1 and util.get_const_int(kernel.shape[3]) == 1:
-        s[A].compute_inline()
-    else:
-        y, x = s[A].op.axis
-        yo, xo, yi, xi = s[A].tile(y, x, bna, util.get_const_int(kernel.shape[3]))
-        s[A].vectorize(xi)
-        fuse_and_bind(s, A, [yo, xo])
-
-    # pack to vector form
-    packedA = pack_tensor(s, A, bna, [C])
-
-    # vectorize load
-    y, x = s[packedA].op.axis[:2]
-    tmp = s.cache_write(packedA, "local")
-    x, xt = s[packedA].split(x, bna)
-    _, _, _, xi = tile_and_bind(s, packedA, y, x, num_thread)
-    s[tmp].compute_at(s[packedA], xi)
-    s[tmp].vectorize(s[tmp].op.axis[1])
-    s[tmp].unroll(s[tmp].op.axis[2])
-    s[packedA].vectorize(s[packedA].op.axis[2])
-    s[packedA].unroll(xt)
-
-    ##### SCHEDULE B #####
-    y, x = s[B].op.axis
-    yo, xo, yi, xi = s[B].tile(y, x, 1, 1 * bnb)
-    fuse_and_bind(s, B, [yo, xo])
-
-    # transpose and pack to vector form
-    B_transpose, B_tmp = transpose(s, B, [C])
-    s[B_transpose].compute_inline()
-    packedB = pack_tensor(s, B_transpose, bnb, [B_tmp])
-
-    # vectorize load
-    s[packedB].vectorize(s[packedB].op.axis[2])
-    y, x = s[packedB].op.axis[:2]
-    tile_and_bind(s, packedB, y, x, num_thread)
-
-    ##### SCHEDULE C #####
-    # vectorize and unroll dot
-    y, x = s[C].op.axis
-    y, x, yt, xt = s[C].tile(y, x, bna, bnb)
-
-    k = s[C].op.reduce_axis[0]
-    s[C].reorder(k, yt, xt)
-    if unroll_step != 1:
-        k, k_unroll = s[C].split(k, unroll_step)
-        s[C].unroll(k_unroll)
-    s[C].unroll(yt)
-    s[C].vectorize(xt)
-
-    tile_and_bind(s, C, y, x, num_thread1, num_thread2)
-
-    ##### COPY TO OUTPUT #####
-    if output.op in s.outputs:  # no bias
-        output = output
-    else:                       # has bias
-        s[output].compute_inline()
-        output = s.outputs[0]
+    if len(kernel.shape) == 4:
 
-    n, co, h, w = s[output].op.axis
-    h, w, vh, vw = s[output].tile(h, w, 1, bnb)
-    s[output].unroll(vh)
-    if util.get_const_int(s[output].op.output(0).shape[3]) % bnb != 0:
-        pass
+        if dilation_h != 1 or dilation_w != 1:
+            kernel = dilate(kernel, (1, 1, dilation_h, dilation_w))
+        pre_computed = False
+        CO, _, KH, KW = get_const_tuple(kernel.shape)
     else:
-        s[output].vectorize(vw)
-    fuse_and_bind(s, output, [n, co, h, w])
-
-def _decl_winograd(data, kernel, stride, padding, layout, out_dtype):
-    """declare winograd fast convolution F(2x2, 3x3) for conv2d"""
-    N, CI, H, W = [util.get_const_int(x) for x in data.shape]
-    CO, CI, KH, KW = [util.get_const_int(x) for x in kernel.shape]
+        assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation"
+        pre_computed = True
+        H_CAT, W_CAT, CO, CI, VC = get_const_tuple(kernel.shape)
+        CO *= VC
+        KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1
+    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
     HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)
-    if isinstance(stride, (tuple, list)):
-        HSTR, WSTR = stride
-    else:
-        HSTR, WSTR = stride, stride
 
-    assert HSTR == 1 and WSTR == 1 and HPAD == 1 and WPAD == 1 and KH == 3 and KW == 3
+    assert layout == 'NCHW'
+    assert KH == 3 and KW == 3 and HPAD == 1 and WPAD == 1 and HSTR == 1 and WSTR == 1
     data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
 
-    B_data = np.array([
-        [1, 0, 0, 0],
-        [0, 1, -1, 1],
-        [-1, 1, 1, 0],
-        [0, 0, 0, -1]
-    ], out_dtype)
-
-    G_data = np.array([
-        [1, 0, 0],
-        [1.0/2, 1.0/2, 1.0/2],
-        [1.0/2, -1.0/2, 1.0/2],
-        [0, 0, 1],
-    ], out_dtype)
-
-    A_data = np.array([
-        [1, 0],
-        [1, 1],
-        [1, -1],
-        [0, -1],
-    ], out_dtype)
-
-    m = 2
+    if tile_size == 4:
+        G_data = np.array([
+            [1 / 4.0, 0, 0],
+            [-1 / 6.0, -1 / 6.0, -1 / 6.0],
+            [-1 / 6.0, 1 / 6.0, -1 / 6.0],
+            [1 / 24.0, 1 / 12.0, 1 / 6.0],
+            [1 / 24.0, -1 / 12.0, 1 / 6.0],
+            [0, 0, 1]], out_dtype)
+
+        B_data = np.array([
+            [4, 0, 0, 0, 0, 0],
+            [0, -4, 4, -2, 2, 4],
+            [-5, -4, -4, -1, -1, 0],
+            [0, 1, -1, 2, -2, -5],
+            [1, 1, 1, 1, 1, 0],
+            [0, 0, 0, 0, 0, 1]], out_dtype)
+
+        A_data = np.array([
+            [1, 0, 0, 0],
+            [1, 1, 1, 1],
+            [1, -1, 1, -1],
+            [1, 2, 4, 8],
+            [1, -2, 4, -8],
+            [0, 0, 0, 1]], out_dtype)
+    elif tile_size == 2:
+        G_data = np.array([
+            [1, 0, 0],
+            [1.0/2, 1.0/2, 1.0/2],
+            [1.0/2, -1.0/2, 1.0/2],
+            [0, 0, 1]], out_dtype)
+
+        B_data = np.array([
+            [1, 0, 0, 0],
+            [0, 1, -1, 1],
+            [-1, 1, 1, 0],
+            [0, 0, 0, -1]], out_dtype)
+
+        A_data = np.array([
+            [1, 0],
+            [1, 1],
+            [1, -1],
+            [0, -1]], out_dtype)
+    else:
+        raise ValueError("Unsupported tile size for winograd: " + str(tile_size))
+
+    m = A_data.shape[1]
     r = 3
     alpha = m + r - 1
-    K = CO
-    C = CI
 
+    H = (IH + 2 * HPAD - 3) // HSTR + 1
+    W = (IW + 2 * WPAD - 3) // WSTR + 1
     nH, nW = (H + m-1) // m, (W + m-1) // m
     P = N * nH * nW
 
-    bna, bnb = 4, 4
-    if data.dtype == 'float16':
-        bnb *= 2
+    ##### space definition begin #####
+    tile_bna_candidates = [1, 2, 4, 8, 16]
+    factors = get_factors(CO)
+    cfg.define_knob('tile_bna', [x for x in tile_bna_candidates if x in factors])
+    cfg.define_knob('tile_bnb', [1, 2, 4, 8, 16])
+    cfg.define_split('tile_t1', CI, num_outputs=2, max_factor=128)
+    cfg.define_split('tile_t2', CO, num_outputs=2, max_factor=128)
+    cfg.define_split('c_unroll', CI, num_outputs=2, max_factor=8)
+    cfg.define_knob('yt', [1, 2, 4, 8, 16, 32])
+    ##### space definition end #####
+
+    if cfg.is_fallback:
+        cfg['tile_bnb'].val = 4
+        cfg['tile_bna'].val = 4
+        while CO % cfg['tile_bna'].val != 0:
+            cfg['tile_bna'].val //= 2
+        cfg['yt'].val = 8
+        cfg.fallback_split('tile_t1', [-1, 128])
+        cfg.fallback_split('tile_t2', [-1, 128])
+        cfg.fallback_split('c_unroll', [-1, 8])
+
+    bna = cfg['tile_bna'].val
+    bnb = cfg['tile_bnb'].val
+
     P_round = (P + bnb - 1) // bnb * bnb
-    assert K % bna == 0 and P_round % bnb == 0
+    assert CO % bna == 0 and P_round % bnb == 0
 
     # pack input tile
-    input_tile = tvm.compute((C, P_round // bnb, alpha, alpha, bnb),
-                             lambda c, b, eps, nu, bb:
-                             tvm.select(b * bnb + bb < P,\
-                             data_pad[(b*bnb+bb) // (nH*nW)][c][(b*bnb+bb) // nW % nH * m + eps]\
-                             [(b*bnb+bb) % nW * m + nu], tvm.const(0, data_pad.dtype)),
-                             name='d')
+    input_tile = tvm.compute((CI, P_round // bnb, alpha, alpha, bnb), lambda ci, b, eps, nu, bb: \
+         tvm.select(b * bnb + bb < P,
+                    data_pad[(b*bnb+bb) // (nH*nW)][ci][(b*bnb+bb) // nW % nH * m + eps]
+                    [(b*bnb+bb) % nW * m + nu], tvm.const(0, data_pad.dtype)), name='d')
 
     # transform kernel
-    G = const_array(G_data, 'G')
-    r_kh = tvm.reduce_axis((0, KH), 'r_kh')
-    r_kw = tvm.reduce_axis((0, KW), 'r_kw')
-    U = tvm.compute((alpha, alpha, K // bna, C, bna), lambda eps, nu, k, c, kk:
-                    tvm.sum(kernel[k * bna + kk][c][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw],
-                            axis=[r_kh, r_kw]), name='U')
+    if pre_computed:
+        U = kernel
+    else:
+        G = const_matrix(G_data, 'G')
+        r_kh = tvm.reduce_axis((0, KH), 'r_kh')
+        r_kw = tvm.reduce_axis((0, KW), 'r_kw')
+        U = tvm.compute((alpha, alpha, CO // bna, CI, bna), lambda eps, nu, co, ci, vco:
+                        tvm.sum(kernel[co * bna + vco][ci][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw],
+                                axis=[r_kh, r_kw]), name='U')
 
     # transform image
-    B = const_array(B_data, 'B')
-    r_eps = tvm.reduce_axis((0, alpha), 'r_eps')
-    r_nu = tvm.reduce_axis((0, alpha), 'r_nu')
-    V = tvm.compute((alpha, alpha, P_round // bnb, C, bnb), lambda eps, nu, b, c, bb:
-                    tvm.sum(input_tile[c][b][r_eps][r_nu][bb] * B[r_eps][eps] * B[r_nu][nu],
-                            axis=[r_eps, r_nu]), name='V')
+    B = const_matrix(B_data, 'B')
+    r_a = tvm.reduce_axis((0, alpha), 'r_a')
+    r_b = tvm.reduce_axis((0, alpha), 'r_b')
+    V = tvm.compute((alpha, alpha, P_round // bnb, CI, bnb), lambda eps, nu, p, ci, vp:
+                    tvm.sum(input_tile[ci][p][r_a][r_b][vp] * B[r_a][eps] * B[r_b][nu],
+                            axis=[r_a, r_b]), name='V')
 
     # batch gemm
-    c = tvm.reduce_axis((0, C), name='c')
-    M = tvm.compute((alpha, alpha, K, P_round), lambda eps, nu, k, b:
-                    tvm.sum(U[eps][nu][k // bna][c][k % bna] *
-                            V[eps][nu][b // bnb][c][b % bnb], axis=c), name='M')
-
-    # inverse transform
-    A = const_array(A_data, 'A')
-    r_eps = tvm.reduce_axis((0, alpha), 'r_eps')
-    r_nu = tvm.reduce_axis((0, alpha), 'r_nu')
-    Y = tvm.compute((K, P, m, m), lambda k, b, vh, vw:
-                    tvm.sum(M[r_eps][r_nu][k][b] * A[r_eps][vh] * A[r_nu][vw],
-                            axis=[r_eps, r_nu]), name='Y')
+    ci = tvm.reduce_axis((0, CI), name='c')
+    M = tvm.compute((alpha, alpha, CO, P_round), lambda eps, nu, co, p:
+                    tvm.sum(U[eps][nu][co // bna][ci][co % bna] *
+                            V[eps][nu][p // bnb][ci][p % bnb], axis=ci), name='M')
+
+    A = const_matrix(A_data, 'A')
+    r_a = tvm.reduce_axis((0, alpha), 'r_a')
+    r_b = tvm.reduce_axis((0, alpha), 'r_b')
+    Y = tvm.compute((CO, P, m, m), lambda co, p, vh, vw:
+                    tvm.sum(M[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw],
+                            axis=[r_a, r_b]), name='Y')
 
     # unpack output
-    output = tvm.compute((N, K, H, W), lambda n, k, h, w:
-                         Y[k][n * nH * nW + (h//m) * nW + w//m][h % m][w % m]
+    output = tvm.compute((N, CO, H, W), lambda n, co, h, w:
+                         Y[co][n * nH * nW + (h//m) * nW + w//m][h % m][w % m]
                          # thw following term is used to make the padding effective,
                          # otherwise the padding will be eliminated by bound inference
-                         + tvm.const(0, out_dtype) * M[alpha-1][alpha-1][K-1][P_round-1],
-                         name='output', tag='winograd_conv_output')
+                         + tvm.const(0, out_dtype) * M[alpha-1][alpha-1][CO-1][P_round-1],
+                         name='output', tag='winograd_conv2d_output')
 
+    # we have to manually assign effective GFLOP for winograd
+    cfg.add_flop(2 * N * CO * H * W * KH * KW * CI)
     return output
 
-def _schedule_winograd(s, op):
+def _schedule_winograd(cfg, s, op):
     """schedule winograd fast convolution F(2x2, 3x3) for conv2d"""
-
     # get ops and tensors
     output = op.output(0)
 
     Y = op.input_tensors[0]
     M, A = s[Y].op.input_tensors
     U, V = s[M].op.input_tensors
-    kernel, G = s[U].op.input_tensors
     d, B = s[V].op.input_tensors
     data_pad = s[d].op.input_tensors[0]
-    data = s[data_pad].op.input_tensors[0]
-
-    # dilation
-    if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
-        s[kernel].compute_inline()
 
     # padding
     s[data_pad].compute_inline()
 
-    # pack input tiles
-    c, b, eps, nu, bb = s[d].op.axis
-    s[d].reorder(eps, nu, bb)
-    aha = s[d].fuse(eps, nu)
-    s[d].unroll(bb)
-    tile_and_bind3d(s, d, c, b, aha, 4, 1, 1)
-
     # transform kernel
-    s[G].compute_inline()
-    eps, nu, k, c, kk, = s[U].op.axis
-    r_kh, r_kw = s[U].op.reduce_axis
-    s[U].reorder(k, c, kk, eps, nu, r_kh, r_kw)
-    _ = [s[U].unroll(x) for x in [eps, nu, r_kh, r_kw]]
-    s[U].vectorize(kk)
-    tile_and_bind(s, U, k, c, 1, 256)
+    if isinstance(U.op, tvm.tensor.ComputeOp):
+        kernel, G = s[U].op.input_tensors
+        s[G].compute_inline()
+        eps, nu, co, ci, vco, = s[U].op.axis
+        if autotvm.GLOBAL_SCOPE.in_tuning:
+            # kernel transformation will be pre-computed during compilation, so we skip
+            # this part to make tuning records correct
+            s[U].pragma(eps, 'debug_skip_region')
+        else:
+            r_kh, r_kw = s[U].op.reduce_axis
+            s[U].reorder(co, ci, eps, nu, r_kh, r_kw, vco)
+            _ = [s[U].unroll(x) for x in [eps, nu, r_kh, r_kw]]
+            s[U].vectorize(vco)
+            tile_and_bind(s, U, co, ci, 1, 256)
+
+        # dilation
+        if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
+            s[kernel].compute_inline()
 
     # transform image
     s[B].compute_inline()
-    eps, nu, b, c, bb = s[V].op.axis
-    r_eps, r_nu = s[V].op.reduce_axis
-    s[V].reorder(b, c, bb, eps, nu, r_nu, r_eps)
-    _ = [s[V].unroll(x) for x in [eps, nu, r_eps, r_nu]]
-    s[V].vectorize(bb)
-    tile_and_bind(s, V, b, c, 2, 1)
+    VL = s.cache_write(V, 'local')
+
+    eps, nu, p, ci, vp = s[V].op.axis
+    s[V].reorder(p, ci, eps, nu, vp)
+    for axis in [eps, nu]:
+        s[V].unroll(axis)
+    s[V].vectorize(vp)
+    fused = s[V].fuse(p, ci)
+
+    bb, tt = cfg['tile_t1'].apply(s, V, fused)
+    s[V].bind(bb, tvm.thread_axis('blockIdx.x'))
+    s[V].bind(tt, tvm.thread_axis('threadIdx.x'))
+
+    eps, nu, p, ci, vp = s[VL].op.axis
+    r_a, r_b = s[VL].op.reduce_axis
+    for axis in [eps, nu, r_a, r_b]:
+        s[VL].unroll(axis)
+    s[VL].vectorize(vp)
+    s[d].compute_at(s[V], tt)
+    s[VL].compute_at(s[V], tt)
 
     # batch gemm
-    bna, bnb = 4, 4
-    if data.dtype == 'float16':
-        bnb *= 2
+    bna = cfg['tile_bna'].val
+    bnb = cfg['tile_bnb'].val
 
     eps, nu, k, b = s[M].op.axis
+    alpha = eps.dom.extent
     c = s[M].op.reduce_axis[0]
     yo, xo, yi, xi = s[M].tile(k, b, bna, bnb)
-    s[M].reorder(c, yi, xi)
-    c, c_unroll = s[M].split(c, 2)
+    c, c_unroll = cfg['c_unroll'].apply(s, M, c)
+    s[M].reorder(yo, xo, c, c_unroll, yi, xi)
     s[M].unroll(c_unroll)
     s[M].unroll(yi)
     s[M].vectorize(xi)
     z = s[M].fuse(eps, nu)
-    tile_and_bind3d(s, M, z, yo, xo, 1, 8, 1)
+    tile_and_bind3d(s, M, z, yo, xo, 1, cfg['yt'].val, 1)
 
     # inverse transform
     s[A].compute_inline()
     k, b, vh, vw = s[Y].op.axis
-    r_eps, r_nu = s[Y].op.reduce_axis
-    _ = [s[Y].unroll(x) for x in [vh, vw, r_eps, r_nu]]
-    tile_and_bind(s, Y, k, b, 4, 1)
+    r_a, r_b = s[Y].op.reduce_axis
+    for axis in [vh, vw, r_a, r_b]:
+        s[Y].unroll(axis)
 
-    # schedule output
-    if output.op in s.outputs:  # no bias
-        output = output
-    else:                       # has bias
+    # schedule output and fusion
+    if output.op not in s.outputs:
         s[output].compute_inline()
         output = s.outputs[0]
 
-    _, k, h, w = s[output].op.axis
-    tile_and_bind3d(s, output, k, h, w, 1, 2, 2)
+    n, co, h, w = s[output].op.axis
+    m = alpha - 3 + 1
+    h, w, hi, wi = s[output].tile(h, w, m, m)
+    s[output].unroll(hi)
+    s[output].unroll(wi)
+    fused = s[output].fuse(n, co, h, w)
+    bb, tt = cfg['tile_t2'].apply(s, output, fused)
+    s[output].bind(bb, tvm.thread_axis('blockIdx.x'))
+    s[output].bind(tt, tvm.thread_axis('threadIdx.x'))
+
+    s[Y].compute_at(s[output], tt)
+
+##### REGISTER TOPI COMPUTE / SCHEDULE FOR WINOGRAD WITH WEIGHT TRANSFORM #####
+@autotvm.register_topi_compute(conv2d_winograd_without_weight_transform, 'mali', ['winograd'])
+def conv2d_winograd_ww(cfg, data, kernel, strides, padding, dilation, layout, out_dtype, tile_size):
+    """TOPI compute callback"""
+    return _decl_winograd(cfg, data, kernel, strides, padding, dilation, layout, out_dtype,
+                          tile_size)
+
+
+@autotvm.register_topi_schedule(schedule_conv2d_winograd_without_weight_transform,
+                                'mali', ['winograd'])
+def schedule_conv2d_winograd_without_weight_transform_(cfg, outs):
+    """TOPI schedule callback"""
+    s = tvm.create_schedule([x.op for x in outs])
+
+    def _callback(op):
+        if 'winograd_conv2d_output' in op.tag:
+            _schedule_winograd(cfg, s, op)
+
+    traverse_inline(s, outs[0].op, _callback)
+    return s
+
+
+##### REGISTER ALTER OP LAYOUT #####
+@conv2d_alter_layout.register(["mali"])
+def _alter_conv2d_layout(attrs, inputs, tinfos):
+    try:
+        return _alter_conv2d_layout_arm(attrs, inputs, tinfos)
+    except KeyError:  # to filter out fallback opencl templates
+        return None
+
+
+##### SCHECULE UTILITIES #####
+def tile_and_bind(s, tensor, y, x, y_factor, x_factor=None):
+    """ tile and bind to GPU threads """
+    x_factor = x_factor or y_factor
+    yo, xo, yi, xi = s[tensor].tile(y, x, y_factor, x_factor)
+    s[tensor].bind(xo, tvm.thread_axis("blockIdx.x"))
+    s[tensor].bind(xi, tvm.thread_axis("threadIdx.x"))
+    s[tensor].bind(yo, tvm.thread_axis("blockIdx.y"))
+    s[tensor].bind(yi, tvm.thread_axis("threadIdx.y"))
+    return yo, xo, yi, xi
+
+
+def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None):
+    """ tile and bind 3d """
+    y_factor = y_factor or z_factor
+    x_factor = x_factor or y_factor
+    zo, zi = s[tensor].split(z, z_factor)
+    yo, yi = s[tensor].split(y, y_factor)
+    xo, xi = s[tensor].split(x, x_factor)
+    s[tensor].bind(zo, tvm.thread_axis("blockIdx.z"))
+    s[tensor].bind(zi, tvm.thread_axis("threadIdx.z"))
+    s[tensor].bind(yo, tvm.thread_axis("blockIdx.y"))
+    s[tensor].bind(yi, tvm.thread_axis("threadIdx.y"))
+    s[tensor].bind(xo, tvm.thread_axis("blockIdx.x"))
+    s[tensor].bind(xi, tvm.thread_axis("threadIdx.x"))
+    s[tensor].reorder(zo, yo, xo, zi, yi, xi)
+    return zo, yo, xo, zi, yi, xi
diff --git a/topi/python/topi/mali/dense.py b/topi/python/topi/mali/dense.py
index 165d80a5ceef..ec21b806d0ad 100644
--- a/topi/python/topi/mali/dense.py
+++ b/topi/python/topi/mali/dense.py
@@ -4,17 +4,21 @@
 from __future__ import absolute_import as _abs
 
 import tvm
+from tvm import autotvm
 
-from .. import generic
-from .. import util
-from .. import tag
+from .. import generic, nn
+from ..util import traverse_inline
 
-@generic.schedule_dense.register(["mali"])
-def schedule_dense(outs):
+autotvm.register_topi_compute(nn.dense, 'mali', 'direct', nn.dense.fdefault)
+
+@autotvm.register_topi_schedule(generic.schedule_dense, 'mali', 'direct')
+def schedule_dense(cfg, outs):
     """Schedule for dense operator.
 
     Parameters
     ----------
+    cfg: ConfigEntity
+        The config entity for this template
     outs: Array of Tensor
         The computation graph description of dense
         in the format of an array of tensors.
@@ -26,80 +30,65 @@ def schedule_dense(outs):
     """
     outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
     s = tvm.create_schedule([x.op for x in outs])
-    def _schedule(dense):
-        data = s[dense].op.input_tensors[0]
-        weight = s[dense].op.input_tensors[1]
-
-        hidden = util.get_const_int(weight.shape[1])
-        out = util.get_const_int(weight.shape[0])
-
-        # set tunable parameter
-        tune_config = getattr(tvm.target.current_target(), "tune_config", None)
-        if tune_config is None:
-            if hidden > 8192:
-                num_thread = 32
-                unroll_step = 32
-            else:
-                if out <= 1024:
-                    num_thread = 32
-                    unroll_step = 16
-                else:
-                    num_thread = 256
-                    unroll_step = 32
-
-            if data.dtype == 'float16':
-                if hidden > 8192:
-                    num_thread = 2
-                    unroll_step = 32
-                else:
-                    num_thread = 8
-                    unroll_step = 256
-        else:
-            num_thread = tune_config['num_thread']
-            unroll_step = tune_config['unroll_step']
-
-        def fuse_and_bind(s, tensor, axis=None, num_thread=None):
-            """ fuse all the axis and bind to GPU threads """
-            axis = axis or s[tensor].op.axis
-            fused = s[tensor].fuse(*axis)
-            max_threads = tvm.target.current_target(allow_none=False).max_num_threads
-            bx, tx = s[tensor].split(fused, num_thread or max_threads)
-            s[tensor].bind(bx, tvm.thread_axis("blockIdx.x"))
-            s[tensor].bind(tx, tvm.thread_axis("threadIdx.x"))
-            return bx, tx
-
-        output = outs[0]
-        bx, tx = fuse_and_bind(s, output, num_thread=num_thread)
-
-        k = s[dense].op.reduce_axis[0]
-        k, k_unroll = s[dense].split(k, unroll_step)
-        s[dense].unroll(k_unroll)
-
-        if dense.op not in s.outputs:
+
+    def _callback(op):
+        if op.tag == 'dense':
+            vec_size = [1, 2, 4, 8, 16]
+            max_unroll = 32
+
+            dense = op.output(0)
+            output = outs[0]
+
+            y, x = s[output].op.axis
+            c = s[dense].op.reduce_axis[0]
+
+            ##### space definition begin #####
+            cfg.define_split('tile_y', y, num_outputs=3)
+            cfg.define_split('tile_x', x, num_outputs=3)
+            cfg.define_split('c_unroll', c, num_outputs=2, max_factor=64)
+
+            # fallback support
+            if cfg.is_fallback:
+                ref_log = autotvm.tophub.load_reference_log(
+                    'mali', 'rk3399', 'dense', 'direct')
+                cfg.fallback_with_reference_log(ref_log)
+            ##### space definition end #####
+
+            if dense.op in s.outputs:
+                dense = s.cache_write(output, 'local')
+
+            by, ty, yi = cfg['tile_y'].apply(s, output, y)
+            bx, tx, xi = cfg['tile_x'].apply(s, output, x)
+
+            s[output].bind(by, tvm.thread_axis('blockIdx.y'))
+            s[output].bind(bx, tvm.thread_axis('blockIdx.x'))
+            s[output].bind(ty, tvm.thread_axis('threadIdx.y'))
+            s[output].bind(tx, tvm.thread_axis('threadIdx.x'))
+
+            if cfg['tile_y'].size[-1] < max_unroll:
+                s[output].unroll(yi)
+            if cfg['tile_x'].size[-1] in vec_size:
+                s[output].vectorize(xi)
             s[dense].compute_at(s[output], tx)
 
-#        bias = s[outs[0]].op.input_tensors[1]
-#        print(tvm.lower(s, [data, weight, bias, outs[0]], simple_mode=True))
-
-    scheduled_ops = []
-
-    def traverse(OP):
-        """Internal travserse function"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_broadcast(OP.tag):
-            if OP not in s.outputs:
-                s[OP].compute_inline()
-            for tensor in OP.input_tensors:
-                if tensor.op.input_tensors and tensor.op not in scheduled_ops:
-                    traverse(tensor.op)
-        # schedule dense
-        elif OP.tag == 'dense':
-            dense = OP.output(0)
-            _schedule(dense)
-        else:
-            raise RuntimeError("Unsupported operator: %s" % OP.tag)
-
-        scheduled_ops.append(OP)
-
-    traverse(outs[0].op)
+            k = s[dense].op.reduce_axis[0]
+            y, x = s[dense].op.axis
+            k, k_unroll = cfg['c_unroll'].apply(s, dense, k)
+            s[dense].reorder(k, k_unroll, y, x)
+            s[dense].unroll(k_unroll)
+            if cfg['tile_y'].size[-1] < max_unroll:
+                s[dense].unroll(y)
+            if cfg['tile_x'].size[-1] in vec_size:
+                s[dense].vectorize(x)
+
+    traverse_inline(s, outs[0].op, _callback)
     return s
+
+def fuse_and_bind(s, tensor, axis=None, num_thread=None):
+    """ fuse all the axis and bind to GPU threads """
+    axis = axis or s[tensor].op.axis
+    fused = s[tensor].fuse(*axis)
+    bx, tx = s[tensor].split(fused, num_thread)
+    s[tensor].bind(bx, tvm.thread_axis("blockIdx.x"))
+    s[tensor].bind(tx, tvm.thread_axis("threadIdx.x"))
+    return bx, tx
diff --git a/topi/python/topi/mali/depthwise_conv2d.py b/topi/python/topi/mali/depthwise_conv2d.py
index cad0733a153f..8652ba583260 100644
--- a/topi/python/topi/mali/depthwise_conv2d.py
+++ b/topi/python/topi/mali/depthwise_conv2d.py
@@ -1,21 +1,28 @@
 # pylint: disable=invalid-name,unused-variable,unused-argument
 """depthwise_conv2d schedule on ARM Mali GPU"""
 
-from __future__ import absolute_import as _abs
 import tvm
+from tvm import autotvm
 
-from .. import generic
-from .. import util
-from .. import tag
+from ..generic import schedule_depthwise_conv2d_nchw
+from ..nn import depthwise_conv2d_nchw
+from ..util import traverse_inline
 
-@generic.schedule_depthwise_conv2d_nchw.register(["mali"])
-def schedule_depthwise_conv2d_nchw(outs):
-    """Schedule for depthwise_conv2d nchw forward.
+# register original implementation of depthwise_conv2d_nchw since we don't need to change this part
+autotvm.register_topi_compute(depthwise_conv2d_nchw, 'mali', 'direct',
+                              depthwise_conv2d_nchw.fdefault)
+
+# register customized schedule for arm cpu.
+@autotvm.register_topi_schedule(schedule_depthwise_conv2d_nchw, 'mali', 'direct')
+def schedule_depthwise_conv2d_nchw_mali(cfg, outs):
+    """Schedule depthwise conv2d
 
     Parameters
     ----------
+    cfg: ConfigEntity
+        The configuration of this template
     outs: Array of Tensor
-        The computation graph description of depthwise_conv2d
+        The computation graph description of depthwise convolution2d
         in the format of an array of tensors.
 
     Returns
@@ -25,89 +32,95 @@ def schedule_depthwise_conv2d_nchw(outs):
     """
     outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
     s = tvm.create_schedule([x.op for x in outs])
-    def _schedule(pad_data, kernel, conv):
-        raw_data = s[pad_data].op.input_tensors[0]
 
-        if conv.op not in s.outputs:  # has bias or relu
-            output = outs[0]
-        else:                         # no bias or relu
-            output = conv
+    def _schedule(pad_data, kernel, conv):
+        """schedule depthwise_conv2d"""
+        max_unroll = 16
+        vec_size = [1, 2, 4, 8, 16]
 
-        def tile_and_bind3d(tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None):
-            """ tile and bind 3d """
-            y_factor = y_factor or z_factor
-            x_factor = x_factor or y_factor
-            zo, zi = s[tensor].split(z, z_factor)
-            yo, yi = s[tensor].split(y, y_factor)
-            xo, xi = s[tensor].split(x, x_factor)
-            s[tensor].bind(zo, tvm.thread_axis("blockIdx.z"))
-            s[tensor].bind(zi, tvm.thread_axis("threadIdx.z"))
-            s[tensor].bind(yo, tvm.thread_axis("blockIdx.y"))
-            s[tensor].bind(yi, tvm.thread_axis("threadIdx.y"))
-            s[tensor].bind(xo, tvm.thread_axis("blockIdx.x"))
-            s[tensor].bind(xi, tvm.thread_axis("threadIdx.x"))
-            return zo, zi, yo, yi, xo, xi
-
-        # set tunable parameters
-        VH = 1
-        VW = 1
-        num_thread = 4
-        while util.get_const_int(conv.shape[3]) % (VW * 2) == 0 and VW * 2 <= 4:
-            VW = VW * 2
-        while util.get_const_int(conv.shape[2]) % (VH * 2) == 0 and VH * 2 <= 2:
-            VH = VH * 2
-        if raw_data.dtype == 'float16':
-            if util.get_const_int(conv.shape[3]) % (VW * 2) == 0:
-                VW *= 2
-                num_thread *= 2
-            else:
-                num_thread *= 2
+        ##### space definition begin #####
+        n, c, y, x = s[conv].op.axis
+        bc, tc, ci = cfg.define_split("tile_c", c, num_outputs=3)
+        by, ty, yi = cfg.define_split('tile_y', y, num_outputs=3)
+        bx, tx, xi = cfg.define_split("tile_x", x, num_outputs=3)
+        cfg.define_annotate('ann_spatial', [ci, yi, xi], policy='try_unroll_vec')
 
-        # schedule padding
-        _, c, y, x = s[pad_data].op.axis
-        tile_and_bind3d(pad_data, c, y, x, num_thread, 1, 1)
+        # fallback support
+        if cfg.is_fallback:
+            ref_log = autotvm.tophub.load_reference_log(
+                'mali', 'rk3399', 'depthwise_conv2d_nchw', 'direct')
+            cfg.fallback_with_reference_log(ref_log)
+        ###### space definition end ######
 
-        # schedule conv
-        di, dj = s[conv].op.reduce_axis
-        s[conv].unroll(di)
-        s[conv].unroll(dj)
 
-        _, c, y, x = s[output].op.axis
-        y, x, yi, xi = s[output].tile(y, x, VH, VW)
-        s[output].unroll(yi)
-        s[output].vectorize(xi)
+        # schedule padding
+        n, c, y, x = s[pad_data].op.axis
+        tile_and_bind3d(s, pad_data, c, y, x, cfg["tile_c"].size[1], 1, 1)
 
-        _, _, _, _, _, ji = tile_and_bind3d(output, c, y, x, num_thread, 1, 1)
+        # schedule dilation
+        if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
+            s[kernel].compute_inline()
 
+        # schedule conv
         if conv.op not in s.outputs:
-            _, c, y, x = s[conv].op.axis
-            y, x, yi, xi = s[conv].tile(y, x, VH, VW)
-            s[conv].unroll(yi)
-            s[conv].vectorize(xi)
-            s[conv].compute_at(s[output], ji)
-
-    scheduled_ops = []
-
-    def traverse(op):
-        """Internal travserse function"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_broadcast(op.tag):
-            if op not in s.outputs:
-                s[op].compute_inline()
-            for tensor in op.input_tensors:
-                if tensor.op.input_tensors and tensor.op not in scheduled_ops:
-                    traverse(tensor.op)
+            s[conv].set_scope('local')
+            OL = conv
+            output = s.outputs[0].output(0)
+        else:
+            OL = s.cache_write(conv, 'local')
+            output = conv
 
+        n, c, y, x = s[output].op.axis
+        bc, tc, ci = cfg['tile_c'].apply(s, output, c)
+        by, ty, yi = cfg['tile_y'].apply(s, output, y)
+        bx, tx, xi = cfg['tile_x'].apply(s, output, x)
+
+        bc = s[output].fuse(n, bc)
+        s[output].bind(bc, tvm.thread_axis("blockIdx.z"))
+        s[output].bind(tc, tvm.thread_axis("threadIdx.z"))
+        s[output].bind(by, tvm.thread_axis("blockIdx.y"))
+        s[output].bind(ty, tvm.thread_axis("threadIdx.y"))
+        s[output].bind(bx, tvm.thread_axis("blockIdx.x"))
+        s[output].bind(tx, tvm.thread_axis("threadIdx.x"))
+
+        di, dj = s[OL].op.reduce_axis
+        s[OL].unroll(di)
+        s[OL].unroll(dj)
+
+        s[OL].compute_at(s[output], tx)
+        n, ci, yi, xi = s[OL].op.axis
+
+        cfg["ann_spatial"].apply(s, OL, [ci, yi, xi],
+                                 axis_lens=[cfg['tile_c'].size[2], cfg['tile_y'].size[2],
+                                            cfg['tile_x'].size[2]],
+                                 max_unroll=max_unroll,
+                                 vec_size=vec_size,
+                                 cfg=cfg)
+
+    def _callback(op):
+        """traverse to find op to schedule"""
         # schedule depthwise_conv2d
         if op.tag == 'depthwise_conv2d_nchw':
             pad_data = op.input_tensors[0]
             kernel = op.input_tensors[1]
-            if isinstance(kernel.op, tvm.tensor.ComputeOp) and 'dilate' in kernel.op.tag:
-                s[kernel].compute_inline()
             conv = op.output(0)
             _schedule(pad_data, kernel, conv)
 
-        scheduled_ops.append(op)
-
-    traverse(outs[0].op)
+    traverse_inline(s, outs[0].op, _callback)
     return s
+
+
+def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None):
+    """ tile and bind 3d """
+    y_factor = y_factor or z_factor
+    x_factor = x_factor or y_factor
+    zo, zi = s[tensor].split(z, z_factor)
+    yo, yi = s[tensor].split(y, y_factor)
+    xo, xi = s[tensor].split(x, x_factor)
+    s[tensor].bind(zo, tvm.thread_axis("blockIdx.z"))
+    s[tensor].bind(zi, tvm.thread_axis("threadIdx.z"))
+    s[tensor].bind(yo, tvm.thread_axis("blockIdx.y"))
+    s[tensor].bind(yi, tvm.thread_axis("threadIdx.y"))
+    s[tensor].bind(xo, tvm.thread_axis("blockIdx.x"))
+    s[tensor].bind(xi, tvm.thread_axis("threadIdx.x"))
+    return zo, zi, yo, yi, xo, xi
diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py
index e0d2c403d4b4..a85d1268dbf8 100644
--- a/topi/python/topi/nn/conv2d.py
+++ b/topi/python/topi/nn/conv2d.py
@@ -16,7 +16,7 @@
                        'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride'])
 
 @tvm.target.generic_func
-def conv2d(input, filter, strides, padding, layout='NCHW', out_dtype=None):
+def conv2d(input, filter, strides, padding, dilation, layout='NCHW', out_dtype=None):
     """Conv2D operator.
 
     Parameters
@@ -33,6 +33,9 @@ def conv2d(input, filter, strides, padding, layout='NCHW', out_dtype=None):
     padding : int or a list/tuple of two ints
         padding size, or [pad_height, pad_width]
 
+    dilation: int or a list/tuple of two ints
+        dilation size, or [dilation_height, dilation_width]
+
     layout : str
         layout of data
 
@@ -44,11 +47,11 @@ def conv2d(input, filter, strides, padding, layout='NCHW', out_dtype=None):
     # search platform specific declaration first
     # default declaration
     if layout == 'NCHW':
-        return conv2d_nchw(input, filter, strides, padding, out_dtype)
+        return conv2d_nchw(input, filter, strides, padding, dilation, out_dtype)
     elif layout == 'HWCN':
-        return conv2d_hwcn(input, filter, strides, padding, out_dtype)
+        return conv2d_hwcn(input, filter, strides, padding, dilation, out_dtype)
     elif layout == 'NHWC':
-        return conv2d_nhwc(input, filter, strides, padding, out_dtype)
+        return conv2d_nhwc(input, filter, strides, padding, dilation, out_dtype)
     else:
         raise ValueError("not support this layout {} yet".format(layout))
 
@@ -79,46 +82,13 @@ def _get_workload(data, kernel, stride, padding, out_dtype):
         HSTR, WSTR = stride
     else:
         HSTR, WSTR = stride, stride
-    assert data.dtype == kernel.dtype, \
+    assert (data.dtype == kernel.dtype) or (data.dtype == 'uint8' and kernel.dtype == 'int8'), \
         "Do not support inputs with different data types now. ' \
         '{} vs. {}".format(data.dtype, kernel.dtype)
     return Workload(data.dtype, out_dtype, IH, IW, CI, CO, KH, KW, HPAD, WPAD, HSTR, WSTR)
 
 
-@tvm.target.generic_func
-def _get_alter_layout_schedule(wkl):
-    # pylint: disable=unreachable
-    """ Get the platform specific schedule for conv2d_alter_layout. """
-    target = tvm.target.current_target()
-    raise RuntimeError(
-        "No schedule for current target:{}".format(target))
-    # This return has no use, merely to supress pylint warning
-    return wkl
-
-
-@tvm.target.generic_func
-def _get_schedule(wkl):
-    # pylint: disable=unreachable
-    """ Get the platform specific schedule. """
-    target = tvm.target.current_target()
-    raise RuntimeError(
-        "No schedule for current target:{}".format(target))
-    # This return has no use, merely to supress pylint warning
-    return wkl
-
-
-@tvm.target.generic_func
-def _get_schedule_NCHWc(wkl, layout, out_layout):
-    # pylint: disable=unreachable
-    """ Get the platform specific schedule. """
-    target = tvm.target.current_target()
-    raise RuntimeError(
-        "No schedule for current target:{}".format(target))
-    # This return has no use, merely to supress pylint warning
-    return wkl
-
-
-def conv2d_nchw(Input, Filter, stride, padding, out_dtype=None):
+def conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=None):
     """Convolution operator in NCHW layout.
 
     Parameters
@@ -135,6 +105,9 @@ def conv2d_nchw(Input, Filter, stride, padding, out_dtype=None):
     padding : int or str
         Padding size, or ['VALID', 'SAME']
 
+    dilation: int or a list/tuple of two ints
+        dilation size, or [dilation_height, dilation_width]
+
     Returns
     -------
     Output : tvm.Tensor
@@ -143,18 +116,27 @@ def conv2d_nchw(Input, Filter, stride, padding, out_dtype=None):
     if out_dtype is None:
         out_dtype = Input.dtype
     assert isinstance(stride, int) or len(stride) == 2
-    batch, in_channel, in_height, in_width = Input.shape
-    num_filter, channel, kernel_h, kernel_w = Filter.shape
+    assert isinstance(dilation, int) or len(dilation) == 2
     if isinstance(stride, int):
         stride_h = stride_w = stride
     else:
         stride_h, stride_w = stride
-    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
-        padding, (kernel_h, kernel_w))
+
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+
+    batch, in_channel, in_height, in_width = Input.shape
+    num_filter, channel, kernel_h, kernel_w = Filter.shape
     # compute the output shape
+    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
+    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
+    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
+        padding, (dilated_kernel_h, dilated_kernel_w))
     out_channel = num_filter
-    out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1)
-    out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1)
+    out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
+    out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
     # compute graph
     pad_before = [0, 0, pad_top, pad_left]
     pad_after = [0, 0, pad_down, pad_right]
@@ -166,12 +148,13 @@ def conv2d_nchw(Input, Filter, stride, padding, out_dtype=None):
     return tvm.compute(
         (batch, out_channel, out_height, out_width),
         lambda nn, ff, yy, xx: tvm.sum(
-            temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) *
+            temp[nn, rc, yy * stride_h + ry * dilation_h,
+                 xx * stride_w + rx * dilation_w].astype(out_dtype) *
             Filter[ff, rc, ry, rx].astype(out_dtype),
             axis=[rc, ry, rx]), tag="conv2d_nchw")
 
 
-def conv2d_hwcn(Input, Filter, stride, padding, out_dtype=None):
+def conv2d_hwcn(Input, Filter, stride, padding, dilation, out_dtype=None):
     """Convolution operator in HWCN layout.
 
     Parameters
@@ -188,6 +171,9 @@ def conv2d_hwcn(Input, Filter, stride, padding, out_dtype=None):
     padding : int or str
         Padding size, or ['VALID', 'SAME']
 
+    dilation: int or a list/tuple of two ints
+        dilation size, or [dilation_height, dilation_width]
+
     Returns
     -------
     output : tvm.Tensor
@@ -196,19 +182,28 @@ def conv2d_hwcn(Input, Filter, stride, padding, out_dtype=None):
     if out_dtype is None:
         out_dtype = Input.dtype
     assert isinstance(stride, int) or len(stride) == 2
-    in_height, in_width, in_channel, batch = Input.shape
-    kernel_h, kernel_w, channel, num_filter = Filter.shape
+    assert isinstance(dilation, int) or len(dilation) == 2
+
     if isinstance(stride, int):
         stride_h = stride_w = stride
     else:
         stride_h, stride_w = stride
 
-    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
-        padding, (kernel_h, kernel_w))
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+
+    in_height, in_width, in_channel, batch = Input.shape
+    kernel_h, kernel_w, channel, num_filter = Filter.shape
     # compute the output shape
+    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
+    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
+    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
+        padding, (dilated_kernel_h, dilated_kernel_w))
     out_channel = num_filter
-    out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1)
-    out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1)
+    out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
+    out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
     pad_before = [pad_top, pad_left, 0, 0]
     pad_after = [pad_down, pad_right, 0, 0]
     PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput")
@@ -218,13 +213,14 @@ def conv2d_hwcn(Input, Filter, stride, padding, out_dtype=None):
     Output = tvm.compute(
         (out_height, out_width, out_channel, batch),
         lambda yy, xx, ff, nn: tvm.sum(
-            PaddedInput[yy * stride_h + ry, xx * stride_w + rx, rc, nn].astype(out_dtype) *
+            PaddedInput[yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w,
+                        rc, nn].astype(out_dtype) *
             Filter[ry, rx, rc, ff].astype(out_dtype), axis=[ry, rx, rc]),
         name="Conv2dOutput", tag="conv2d_hwcn")
     return Output
 
 
-def conv2d_nhwc(Input, Filter, stride, padding, out_dtype='float32'):
+def conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype='float32'):
     """Convolution operator in NHWC layout.
 
     Parameters
@@ -241,25 +237,37 @@ def conv2d_nhwc(Input, Filter, stride, padding, out_dtype='float32'):
     padding : int or str
         Padding size, or ['VALID', 'SAME']
 
+    dilation: int or a list/tuple of two ints
+        dilation size, or [dilation_height, dilation_width]
+
     Returns
     -------
     output : tvm.Tensor
         4-D with shape [batch, out_height, out_width, out_channel]
     """
     assert isinstance(stride, int) or len(stride) == 2
-    batch, in_height, in_width, in_channel = Input.shape
-    kernel_h, kernel_w, channel, num_filter = Filter.shape
+    assert isinstance(dilation, int) or len(dilation) == 2
+
     if isinstance(stride, int):
         stride_h = stride_w = stride
     else:
         stride_h, stride_w = stride
 
-    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
-        padding, (kernel_h, kernel_w))
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+
+    batch, in_height, in_width, in_channel = Input.shape
+    kernel_h, kernel_w, channel, num_filter = Filter.shape
     # compute the output shape
+    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
+    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
+    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
+        padding, (dilated_kernel_h, dilated_kernel_w))
     out_channel = num_filter
-    out_height = simplify((in_height - kernel_h + pad_top + pad_down) // stride_h + 1)
-    out_width = simplify((in_width - kernel_w + pad_left + pad_right) // stride_w + 1)
+    out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
+    out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
     pad_before = [0, pad_top, pad_left, 0]
     pad_after = [0, pad_down, pad_right, 0]
     PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput")
@@ -269,15 +277,15 @@ def conv2d_nhwc(Input, Filter, stride, padding, out_dtype='float32'):
     Output = tvm.compute(
         (batch, out_height, out_width, out_channel),
         lambda nn, yy, xx, ff: tvm.sum(
-            PaddedInput[nn, yy * stride_h + ry, xx * stride_w + rx, rc].astype(out_dtype) *
+            PaddedInput[nn, yy * stride_h + ry * dilation_h,
+                        xx * stride_w + rx * dilation_w, rc].astype(out_dtype) *
             Filter[ry, rx, rc, ff].astype(out_dtype), axis=[ry, rx, rc]),
         name="Conv2dOutput", tag="conv2d_nhwc")
     return Output
 
 
 @tvm.target.generic_func
-def conv2d_NCHWc(data, kernel, num_filter, kernel_size, stride,
-                 padding, layout, out_layout, out_dtype='float32'):
+def conv2d_NCHWc(data, kernel, stride, padding, dilation, layout, out_layout, out_dtype='float32'):
     """Conv2D operator for nChw[x]c layout.
 
     Parameters
@@ -290,18 +298,15 @@ def conv2d_NCHWc(data, kernel, num_filter, kernel_size, stride,
         [num_filter_chunk, in_channel_chunk, filter_height, filter_width,
         in_channel_block, num_filter_block]
 
-    num_filter : int
-        number of filters, i.e., output channel size
-
-    kernel_size : tuple of two ints
-       [kernel_height, kernel_width]
-
     stride : int or a list/tuple of two ints
         stride size, or [stride_height, stride_width]
 
     padding : int or a list/tuple of two ints
         padding size, or [pad_height, pad_width]
 
+    dilation: int or a list/tuple of two ints
+        dilation size, or [dilation_height, dilation_width]
+
     layout : str
         Input data layout
 
@@ -373,7 +378,7 @@ def conv2d_winograd_weight_transform(kernel, tile_size):
 
 
 @tvm.target.generic_func
-def conv2d_winograd_without_weight_transform(input, filter, strides, padding,
+def conv2d_winograd_without_weight_transform(input, filter, strides, padding, dilation,
                                              layout, out_dtype, tile_size):
     """Compute convolution in winograd algorithm. The filter is supposed to be transformed
     in advance.
@@ -397,3 +402,80 @@ def conv2d_winograd_without_weight_transform(input, filter, strides, padding,
         4-D with shape [batch, out_height, out_width, out_channel]
     """
     raise ValueError("missing register for topi.nn.conv2d_winograd_without_weight_transform")
+
+
+@tvm.target.generic_func
+def group_conv2d_nchw(Input, Filter, stride, padding, dilation, groups, out_dtype=None):
+    """Group convolution operator in NCHW layout.
+
+    Parameters
+    ----------
+    Input : tvm.Tensor
+        4-D with shape [batch, in_channel, in_height, in_width]
+
+    Filter : tvm.Tensor
+        4-D with shape [num_filter, in_channel // groups, filter_height, filter_width]
+
+    stride : int or a list/tuple of two ints
+        Stride size, or [stride_height, stride_width]
+
+    padding : int or str
+        Padding size, or ['VALID', 'SAME']
+
+    dilation : int or a list/tuple of two ints
+        dilation size, or [dilation_height, dilation_width]
+
+    groups : int
+        number of groups
+
+    out_dtype : str
+        The output type. This is used for mixed precision.
+
+    Returns
+    -------
+    Output : tvm.Tensor
+        4-D with shape [batch, out_channel, out_height, out_width]
+    """
+    if out_dtype is None:
+        out_dtype = Input.dtype
+    assert isinstance(stride, int) or len(stride) == 2
+    assert isinstance(dilation, int) or len(dilation) == 2
+    if isinstance(stride, int):
+        stride_h = stride_w = stride
+    else:
+        stride_h, stride_w = stride
+
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+
+    batch, in_channel, in_height, in_width = get_const_tuple(Input.shape)
+    num_filter, _, kernel_h, kernel_w = get_const_tuple(Filter.shape)
+
+    assert in_channel % groups == 0, "input channels must divide group size"
+    assert num_filter % groups == 0, "output channels must divide group size"
+
+    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
+        padding, (kernel_h, kernel_w))
+    # compute the output shape
+    out_channel = num_filter
+    out_height = simplify(
+        (in_height - (kernel_h - 1) * dilation_h - 1 + pad_top + pad_down) // stride_h + 1)
+    out_width = simplify(
+        (in_width - (kernel_w - 1) * dilation_w - 1 + pad_left + pad_right) // stride_w + 1)
+    # compute graph
+    pad_before = [0, 0, pad_top, pad_left]
+    pad_after = [0, 0, pad_down, pad_right]
+    temp = pad(Input, pad_before, pad_after, name="pad_temp")
+    rc = tvm.reduce_axis((0, in_channel // groups), name='rc')
+    ry = tvm.reduce_axis((0, kernel_h), name='ry')
+    rx = tvm.reduce_axis((0, kernel_w), name='rx')
+    return tvm.compute(
+        (batch, out_channel, out_height, out_width),
+        lambda nn, ff, yy, xx: tvm.sum(
+            temp[nn, ff // (num_filter//groups) * (in_channel//groups) + rc,
+                 yy * stride_h + ry * dilation_h,
+                 xx * stride_w + rx * dilation_w].astype(out_dtype) *
+            Filter[ff, rc, ry, rx].astype(out_dtype),
+            axis=[rc, ry, rx]), tag="conv2d_nchw")
diff --git a/topi/python/topi/nn/depthwise_conv2d.py b/topi/python/topi/nn/depthwise_conv2d.py
index c7906d3a4373..ca24b08dd0bb 100644
--- a/topi/python/topi/nn/depthwise_conv2d.py
+++ b/topi/python/topi/nn/depthwise_conv2d.py
@@ -1,6 +1,7 @@
-# pylint: disable=invalid-name, unused-variable, too-many-locals
+# pylint: disable=invalid-name, unused-variable, too-many-locals, unused-argument
 """Depthwise convolution operators"""
 from __future__ import absolute_import as _abs
+from collections import namedtuple
 import tvm
 
 from .dilate import dilate
@@ -8,9 +9,30 @@
 from .util import get_pad_tuple
 from ..util import simplify
 
+# workload description of depthwise-conv2d
+Workload = namedtuple('Workload',
+                      ['in_dtype', 'out_dtype', 'height', 'width', 'in_filter', 'out_filter',
+                       'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride'])
+
+def _get_workload(data, kernel, stride, padding, out_dtype):
+    """ Get the workload structure. """
+    _, in_channel, height, width = [x.value for x in data.shape]
+    channel, channel_multiplier, kh, kw = [x.value for x in kernel.shape]
+    out_channel = channel * channel_multiplier
+    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)
+    if isinstance(stride, (tuple, list)):
+        HSTR, WSTR = stride
+    else:
+        HSTR, WSTR = stride, stride
+    assert (data.dtype == kernel.dtype) or (data.dtype == 'uint8' and kernel.dtype == 'int8'), \
+        "Do not support inputs with different data types now. ' \
+        '{} vs. {}".format(data.dtype, kernel.dtype)
+    return Workload(data.dtype, out_dtype, height, width, in_channel,
+                    out_channel, kh, kw, HPAD, WPAD, HSTR, WSTR)
+
 
 @tvm.target.generic_func
-def depthwise_conv2d_nchw(Input, Filter, stride, padding, out_dtype=None):
+def depthwise_conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=None):
     """Depthwise convolution nchw forward operator.
 
     Parameters
@@ -27,6 +49,9 @@ def depthwise_conv2d_nchw(Input, Filter, stride, padding, out_dtype=None):
     padding : int or str
         Padding size, or ['VALID', 'SAME']
 
+    dilation: int or a list/tuple of two ints
+        dilation size, or [dilation_height, dilation_width]
+
     out_dtype: str, optional
         Output data type
 
@@ -37,18 +62,27 @@ def depthwise_conv2d_nchw(Input, Filter, stride, padding, out_dtype=None):
     """
     out_dtype = Input.dtype if out_dtype is None else out_dtype
 
-    batch, in_channel, in_height, in_width = Input.shape
-    filter_channel, channel_multiplier, filter_height, filter_width = Filter.shape
     if isinstance(stride, int):
         stride_h = stride_w = stride
     else:
         stride_h, stride_w = stride
 
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+
+    batch, in_channel, in_height, in_width = Input.shape
+    # shape of dilated kernel
+    filter_channel, channel_multiplier, filter_height, filter_width = Filter.shape
+
+    dilated_kernel_h = (filter_height - 1) * dilation_h + 1
+    dilated_kernel_w = (filter_width - 1) * dilation_w + 1
     pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
-        padding, (filter_height, filter_width))
+        padding, (dilated_kernel_h, dilated_kernel_w))
     out_channel = simplify(in_channel * channel_multiplier)
-    out_height = simplify((in_height - filter_height + pad_top + pad_down) // stride_h + 1)
-    out_width = simplify((in_width - filter_width + pad_left + pad_right) // stride_w + 1)
+    out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
+    out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
 
     # padding stage
     pad_before = [0, 0, pad_top, pad_left]
@@ -60,7 +94,8 @@ def depthwise_conv2d_nchw(Input, Filter, stride, padding, out_dtype=None):
     Output = tvm.compute(
         (batch, out_channel, out_height, out_width),
         lambda b, c, i, j: tvm.sum(
-            (PaddedInput[b, c/channel_multiplier, i*stride_h+di, j*stride_w+dj].astype(out_dtype) *
+            (PaddedInput[b, c/channel_multiplier, i*stride_h+di*dilation_h,
+                         j*stride_w+dj*dilation_w].astype(out_dtype) *
              Filter[c/channel_multiplier, c%channel_multiplier, di, dj].astype(out_dtype)),
             axis=[di, dj]),
         name='DepthwiseConv2d', tag="depthwise_conv2d_nchw")
@@ -68,7 +103,7 @@ def depthwise_conv2d_nchw(Input, Filter, stride, padding, out_dtype=None):
 
 
 @tvm.target.generic_func
-def depthwise_conv2d_nhwc(Input, Filter, stride, padding, out_dtype=None):
+def depthwise_conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype=None):
     """Depthwise convolution nhwc forward operator.
 
     Parameters
@@ -85,6 +120,9 @@ def depthwise_conv2d_nhwc(Input, Filter, stride, padding, out_dtype=None):
     padding : int or str
         Padding size, or ['VALID', 'SAME']
 
+    dilation: int or a list/tuple of two ints
+        dilation size, or [dilation_height, dilation_width]
+
     out_dtype: str, optional
         Output data type
 
@@ -95,18 +133,27 @@ def depthwise_conv2d_nhwc(Input, Filter, stride, padding, out_dtype=None):
     """
     out_dtype = Input.dtype if out_dtype is None else out_dtype
 
-    batch, in_height, in_width, in_channel = Input.shape
-    filter_height, filter_width, filter_channel, channel_multiplier = Filter.shape
     if isinstance(stride, int):
         stride_h = stride_w = stride
     else:
         stride_h, stride_w = stride
 
+    if isinstance(dilation, int):
+        dilation_h = dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
+
+    batch, in_height, in_width, in_channel = Input.shape
+    # shape of dilated kernel
+    filter_height, filter_width, filter_channel, channel_multiplier = Filter.shape
+
+    dilated_kernel_h = (filter_height - 1) * dilation_h + 1
+    dilated_kernel_w = (filter_width - 1) * dilation_w + 1
     pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
-        padding, (filter_height, filter_width))
+        padding, (dilated_kernel_h, dilated_kernel_w))
     out_channel = simplify(in_channel * channel_multiplier)
-    out_height = simplify((in_height - filter_height + pad_top + pad_down) // stride_h + 1)
-    out_width = simplify((in_width - filter_width + pad_left + pad_right) // stride_w + 1)
+    out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
+    out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
 
     # padding stage
     pad_before = [0, pad_top, pad_left, 0]
@@ -118,8 +165,8 @@ def depthwise_conv2d_nhwc(Input, Filter, stride, padding, out_dtype=None):
     Output = tvm.compute(
         (batch, out_height, out_width, out_channel),
         lambda b, i, j, c: tvm.sum(
-            (PaddedInput[b, i*stride_h + di, j*stride_w + dj, c/channel_multiplier].astype(
-                out_dtype) *
+            (PaddedInput[b, i*stride_h + di*dilation_h, j*stride_w + dj*dilation_w,
+                         c/channel_multiplier].astype(out_dtype) *
              Filter[di, dj, c/channel_multiplier, c%channel_multiplier].astype(out_dtype)),
             axis=[di, dj]),
         name='DepthwiseConv2d', tag="depthwise_conv2d_nhwc")
@@ -232,3 +279,44 @@ def depthwise_conv2d_backward_weight_nhwc(Input, Out_grad, oshape, fshape, strid
         tag='depthwise_conv2d_backward_weight_nhwc')
 
     return Weight_grad
+
+
+@tvm.target.generic_func
+def depthwise_conv2d_NCHWc(Input, Filter, stride, padding, dilation,
+                           layout, out_layout, out_dtype=None):
+    """Depthwise convolution NCHW[x]c forward operator.
+
+    Parameters
+    ----------
+    Input : tvm.Tensor
+        5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block]
+
+    Filter : tvm.Tensor
+        4-D with shape [out_channel_chunk, filter_height, filter_width, out_channel_block]
+        In NCHWc depthwise convolution,
+        we group kernel's in_channel and channel_multiplier together then do the tiling.
+
+    stride : tuple of two ints
+        The spatial stride along height and width
+
+    padding : int or str
+        Padding size, or ['VALID', 'SAME']
+
+    dilation: int or a list/tuple of two ints
+         dilation size, or [dilation_height, dilation_width]
+
+    layout : str
+        Input data layout
+
+    out_layout : str
+        Output data layout
+
+    out_dtype: str, optional
+        Output data type
+
+    Returns
+    -------
+    Output : tvm.Tensor
+        4-D with shape [batch, out_channel, out_height, out_width]
+    """
+    raise ValueError("missing register for topi.nn.depthwise_conv2d_NCHWc")
diff --git a/topi/python/topi/nn/upsampling.py b/topi/python/topi/nn/upsampling.py
index 55f7844319f3..757d8fe674c2 100644
--- a/topi/python/topi/nn/upsampling.py
+++ b/topi/python/topi/nn/upsampling.py
@@ -1,6 +1,7 @@
 """TVM operator upsampling compute."""
 from __future__ import absolute_import
 import topi
+from ..util import simplify
 
 
 def upsampling(data, scale, layout="NCHW", method='NEAREST_NEIGHBOR'):
@@ -31,9 +32,9 @@ def upsampling(data, scale, layout="NCHW", method='NEAREST_NEIGHBOR'):
     """
 
     if layout == "NCHW":
-        out_shape = (data.shape[2] * scale, data.shape[3] * scale)
+        out_shape = (simplify(data.shape[2] * scale), simplify(data.shape[3] * scale))
     elif layout == "NHWC":
-        out_shape = (data.shape[1] * scale, data.shape[2] * scale)
+        out_shape = (simplify(data.shape[1] * scale), simplify(data.shape[2] * scale))
     else:
         raise ValueError("not support this layout {} yet".format(layout))
 
diff --git a/topi/python/topi/opengl/pooling.py b/topi/python/topi/opengl/pooling.py
index 8195ea91d8a6..d6dbf0eac5c2 100644
--- a/topi/python/topi/opengl/pooling.py
+++ b/topi/python/topi/opengl/pooling.py
@@ -1,4 +1,4 @@
-# pylint: disable=invalid-name, unused-variable
+# pylint: disable=invalid-name, unused-variable, unused-argument
 """Schedule for pooling operators"""
 import tvm
 from .. import tag
@@ -54,7 +54,7 @@ def traverse(OP):
 
 
 @generic.schedule_pool.register(["opengl"])
-def schedule_pool(outs):
+def schedule_pool(outs, layout):
     """Schedule for pool.
 
     Parameters
@@ -63,6 +63,9 @@ def schedule_pool(outs):
         The computation graph description of pool
         in the format of an array of tensors.
 
+    layout: str
+        Data layout.
+
     Returns
     -------
     s: Schedule
diff --git a/topi/python/topi/reduction.py b/topi/python/topi/reduction.py
index 9f88953bb770..7a4f161a8fee 100644
--- a/topi/python/topi/reduction.py
+++ b/topi/python/topi/reduction.py
@@ -1,9 +1,7 @@
 # pylint: disable=redefined-builtin,consider-using-enumerate,no-member
 """Reduce operators"""
 from __future__ import absolute_import as _abs
-import tvm
-from . import tag
-from .util import ravel_index
+from . import cpp
 
 def _get_real_axis(ndim, axis):
     if axis is None:
@@ -26,131 +24,6 @@ def _get_real_axis(ndim, axis):
     return real_axis
 
 
-def get_reduce_out_shape(src_shape, axis=None, keepdims=False):
-    """Get the output shape for the reduction OPs
-
-    Parameters
-    ----------
-    src_shape : tuple of int or tvm.expr.IntImm
-
-    axis : None or int or tuple of int
-
-    keepdims : bool
-
-    Returns
-    -------
-    dst_shape : tuple of int or tvm.expr.IntImm
-    """
-    real_axis = _get_real_axis(len(src_shape), axis)
-    if keepdims:
-        dst_shape = [src_shape[i] if i in real_axis else 1 for i in range(len(src_shape))]
-    else:
-        dst_shape = []
-        for i in range(len(src_shape)):
-            if i not in real_axis:
-                dst_shape.append(src_shape[i])
-    return dst_shape
-
-
-def _argmax_comp(lhs, rhs):
-    """Compare function of argmax"""
-    idx = tvm.make.Select((lhs[1] >= rhs[1]), lhs[0], rhs[0])
-    val = tvm.make.Select((lhs[1] >= rhs[1]), lhs[1], rhs[1])
-    return idx, val
-
-
-def _argmax_init(idx_typ, val_typ):
-    """Initial ind and val of argmax"""
-    return tvm.const(-1, idx_typ), tvm.min_value(val_typ)
-
-
-def _argmin_comp(lhs, rhs):
-    """Compare function of argmin"""
-    idx = tvm.make.Select((lhs[1] <= rhs[1]), lhs[0], rhs[0])
-    val = tvm.make.Select((lhs[1] <= rhs[1]), lhs[1], rhs[1])
-    return idx, val
-
-
-def _argmin_init(idx_typ, val_typ):
-    """Initial ind and val of argmax"""
-    return tvm.const(-1, idx_typ), tvm.max_value(val_typ)
-
-
-def _choose_idx(idx, _, *indices):
-    """Chose the idx from idx and val"""
-    return idx(*indices)
-
-
-def comm_reduce(data, axis=None, keepdims=False, func=tvm.sum, is_idx_reduce=False):
-    """Reducing the data
-
-    Parameters
-    ----------
-    data : tvm.Tensor
-        The input data
-
-    axis : None or int or tuple of int
-        Axis or axes along which a sum is performed.
-        The default, axis=None, will sum all of the elements of the input array.
-        If axis is negative it counts from the last to the first axis.
-
-    keepdims : bool
-        If this is set to True, the axes which are reduced are left in the result as dimensions
-         with size one.
-        With this option, the result will broadcast correctly against the input array.
-
-    func : function
-        functions like tvm.sum, tvm.max, tvm.min
-
-    Returns
-    -------
-    ret : tvm.Tensor
-    """
-    ndim = len(data.shape)
-    assert ndim != 0, "Reduce a dim-0 input is not supported!"
-    real_axis = _get_real_axis(ndim, axis)
-    reduce_axes = [tvm.reduce_axis((0, data.shape[i]), "k%d" %i) for i in real_axis]
-    if keepdims:
-        target_shape = [1 if i in real_axis else data.shape[i] for i in range(ndim)]
-    else:
-        target_shape = []
-        for i in range(ndim):
-            if i not in real_axis:
-                target_shape.append(tvm.convert(data.shape[i]))
-    def _compute(*indices):
-        eval_range = []
-        eval_indices = []
-        if not keepdims:
-            arg_counter = 0
-        else:
-            arg_counter = None
-        red_counter = 0
-        for i in range(len(data.shape)):
-            if i in real_axis:
-                eval_range.append(reduce_axes[red_counter])
-                eval_indices.append(reduce_axes[red_counter].var)
-                red_counter += 1
-            else:
-                if not keepdims:
-                    eval_range.append(indices[arg_counter])
-                    arg_counter += 1
-                else:
-                    eval_range.append(indices[i])
-        if not is_idx_reduce:
-            return func(data[tuple(eval_range)], axis=reduce_axes)
-        idx = ravel_index(eval_indices, [data.shape[i] for i in real_axis])
-        return func((idx, data[tuple(eval_range)]), axis=reduce_axes)
-    if is_idx_reduce:
-        temp_idx, temp_val = tvm.compute(target_shape, _compute, name=data.name + "_red_temp")
-        out = tvm.compute(target_shape,
-                          lambda *indices: _choose_idx(temp_idx, temp_val, *indices),
-                          name=data.name + "_red")
-    else:
-        out = tvm.compute(target_shape, _compute, name=data.name + "_red")
-    return out
-
-
-@tvm.tag_scope(tag=tag.COMM_REDUCE)
 def sum(data, axis=None, keepdims=False):
     """Sum of array elements over a given axis or a list of axes
 
@@ -173,10 +46,9 @@ def sum(data, axis=None, keepdims=False):
     -------
     ret : tvm.Tensor
     """
-    return comm_reduce(data, axis=axis, keepdims=keepdims, func=tvm.sum)
+    return cpp.sum(data, axis, keepdims)
 
 
-@tvm.tag_scope(tag=tag.COMM_REDUCE)
 def max(data, axis=None, keepdims=False):
     """Maximum of array elements over a given axis or a list of axes
 
@@ -199,10 +71,9 @@ def max(data, axis=None, keepdims=False):
     -------
     ret : tvm.Tensor
     """
-    return comm_reduce(data, axis=axis, keepdims=keepdims, func=tvm.max)
+    return cpp.max(data, axis, keepdims)
 
 
-@tvm.tag_scope(tag=tag.COMM_REDUCE)
 def min(data, axis=None, keepdims=False):
     """Minimum of array elements over a given axis or a list of axes
 
@@ -225,10 +96,9 @@ def min(data, axis=None, keepdims=False):
     -------
     ret : tvm.Tensor
     """
-    return comm_reduce(data, axis=axis, keepdims=keepdims, func=tvm.min)
+    return cpp.min(data, axis, keepdims)
 
 
-@tvm.tag_scope(tag=tag.COMM_REDUCE_IDX)
 def argmax(data, axis=None, keepdims=False):
     """Returns the indices of the maximum values along an axis.
 
@@ -251,11 +121,9 @@ def argmax(data, axis=None, keepdims=False):
     -------
     ret : tvm.Tensor
     """
-    _argmax = tvm.comm_reducer(fcombine=_argmax_comp, fidentity=_argmax_init, name='argmax')
-    return comm_reduce(data, axis=axis, keepdims=keepdims, func=_argmax, is_idx_reduce=True)
+    return cpp.argmax(data, axis, keepdims)
 
 
-@tvm.tag_scope(tag=tag.COMM_REDUCE_IDX)
 def argmin(data, axis=None, keepdims=False):
     """Returns the indices of the minimum values along an axis.
 
@@ -278,5 +146,29 @@ def argmin(data, axis=None, keepdims=False):
     -------
     ret : tvm.Tensor
     """
-    _argmin = tvm.comm_reducer(fcombine=_argmin_comp, fidentity=_argmin_init, name='argmin')
-    return comm_reduce(data, axis=axis, keepdims=keepdims, func=_argmin, is_idx_reduce=True)
+    return cpp.argmin(data, axis, keepdims)
+
+
+def prod(data, axis=None, keepdims=False):
+    """Product of array elements over a given axis or a list of axes
+
+    Parameters
+    ----------
+    data : tvm.Tensor
+        The input tvm tensor
+
+    axis : None or int or tuple of int
+        Axis or axes along which a prod operation is performed.
+        The default, axis=None, will get the prod element over all of the elements of the
+        input array. If axis is negative it counts from the last to the first axis.
+
+    keepdims : bool
+        If this is set to True, the axes which are reduced are left in the result as dimensions
+        with size one.
+        With this option, the result will broadcast correctly against the input array.
+
+    Returns
+    -------
+    ret : tvm.Tensor
+    """
+    return cpp.prod(data, axis, keepdims)
diff --git a/topi/python/topi/rocm/conv2d.py b/topi/python/topi/rocm/conv2d.py
index 1aa125f8f68f..b5839c0c866b 100644
--- a/topi/python/topi/rocm/conv2d.py
+++ b/topi/python/topi/rocm/conv2d.py
@@ -1,26 +1,29 @@
-# pylint: disable=invalid-name, no-member, too-many-locals, too-many-statements, too-many-arguments, too-many-branches, line-too-long
-"""Compute and schedule for rocm conv2d_nchw with auto fusion"""
+# pylint: disable=invalid-name
+"""Compute definition for conv2d with rocm backend"""
 import tvm
+from tvm import autotvm
 from tvm.contrib import miopen
-import topi
-from .. import generic
-from ..nn.conv2d import conv2d
-from ..util import get_const_int
 
+from .. import nn, generic
+from ..util import get_const_tuple
+from ..cuda.conv2d import conv2d_cuda, schedule_conv2d_nchw_cuda
 
-@conv2d.register("rocm")
-def conv2d_rocm(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'):
+@autotvm.register_topi_compute(nn.conv2d, 'rocm', ['direct', 'winograd'])
+def conv2d_rocm(cfg, data, kernel, strides, padding, dilation, layout='NCHW', out_dtype='float32'):
     """Conv2D operator for rocm backend.
 
     Parameters
     ----------
+    cfg: ConfigEntity
+        The config for this template
+
     input : tvm.Tensor
         4-D with shape [batch, in_channel, in_height, in_width]
 
     filter : tvm.Tensor
         4-D with shape [num_filter, in_channel, filter_height, filter_width]
 
-    stride : int or a list/tuple of two ints
+    strides : int or a list/tuple of two ints
         stride size, or [stride_height, stride_width]
 
     padding : int or a list/tuple of two ints
@@ -34,31 +37,25 @@ def conv2d_rocm(data, kernel, stride, padding, layout='NCHW', out_dtype='float32
     output : tvm.Tensor
         4-D with shape [batch, out_channel, out_height, out_width]
     """
-    assert layout == 'NCHW', "Only NCHW layout is supported."
-    assert isinstance(stride, int) or len(stride) == 2
-    if isinstance(stride, int):
-        stride_h = stride_w = stride
-    else:
-        stride_h, stride_w = stride
-    if isinstance(padding, int):
-        pad_h = pad_w = padding
-    else:
-        pad_h, pad_w = padding
-    # handle dilation
-    dilation_h = dilation_w = 1
-    kernel_tvm = kernel
-    kernel_cudnn = kernel
-    if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
-        kernel_before_dilation = kernel.op.input_tensors[0]
-        kernel_cudnn = kernel_before_dilation
-        dilation_h = (get_const_int(kernel.shape[2]) + get_const_int(kernel_before_dilation.shape[2]) - 1) \
-            // get_const_int(kernel_before_dilation.shape[2])
-        dilation_w = (get_const_int(kernel.shape[3]) + get_const_int(kernel_before_dilation.shape[3]) - 1) \
-            // get_const_int(kernel_before_dilation.shape[2])
+
     target = tvm.target.current_target()
     if "miopen" in target.libs:
+        assert layout == 'NCHW', "Only NCHW layout is supported."
+        CO, CI, KH, KW = get_const_tuple(kernel.shape)
+        N, _, H, W = get_const_tuple(data.shape)
+
+        # handle dilation
+        stride_h, stride_w = (strides, strides) if isinstance(strides, int) else strides
+        pad_h, pad_w = (padding, padding) if isinstance(padding, int) else padding
+        dilation_h, dilation_w = (dilation, dilation) if isinstance(dilation, int) else dilation
+
+        OH = (H + 2 * pad_h - KH) // stride_h + 1
+        OW = (W + 2 * pad_w - KW) // stride_w + 1
+        cfg.add_flop(2 * N * OH * OW * CO * CI * ((KH - 1) * dilation_h + 1) *\
+                    ((KW - 1) * dilation_w + 1))
+
         return miopen.conv2d_forward(data,
-                                     kernel_cudnn,
+                                     kernel_before_dilation,
                                      stride_h,
                                      stride_w,
                                      pad_h,
@@ -66,25 +63,30 @@ def conv2d_rocm(data, kernel, stride, padding, layout='NCHW', out_dtype='float32
                                      dilation_h,
                                      dilation_w,
                                      conv_mode=0)
-    return topi.nn.conv2d_nchw(data, kernel_tvm, stride, padding, out_dtype)
+
+    return conv2d_cuda(cfg, data, kernel, strides, padding, dilation, layout, out_dtype)
 
 
-@generic.schedule_conv2d_nchw.register(["rocm"])
-def schedule_conv2d_nchw(outs):
-    """Schedule for conv2d_nchw with rocm backend.
+@autotvm.register_topi_schedule(generic.schedule_conv2d_nchw, 'rocm', ["direct", 'winograd'])
+def schedule_conv2d_nchw_rocm(cfg, outs):
+    """TOPI schedule callback of conv2d for rocm
 
     Parameters
     ----------
+    cfg: ConfigEntity
+        The config for this template
+
     outs: Array of Tensor
-        The computation graph description of conv2d_nchw
+        The computation graph description of conv2d
         in the format of an array of tensors.
 
     Returns
     -------
     s: Schedule
-        The computation schedule for conv2d_nchw.
+        The computation schedule for conv2d.
     """
     target = tvm.target.current_target()
     if target and "miopen" in target.libs:
-        return topi.generic.schedule_extern(outs)
-    return topi.cuda.schedule_conv2d_nchw(outs)
+        return generic.schedule_extern(outs)
+
+    return schedule_conv2d_nchw_cuda(cfg, outs)
diff --git a/topi/python/topi/sparse/__init__.py b/topi/python/topi/sparse/__init__.py
new file mode 100644
index 000000000000..bfac967d2f76
--- /dev/null
+++ b/topi/python/topi/sparse/__init__.py
@@ -0,0 +1,7 @@
+# pylint: disable=wildcard-import
+"""Sparse operators"""
+from __future__ import absolute_import as _abs
+
+from .csrmv import csrmv
+from .csrmm import csrmm
+from .dense import dense
diff --git a/topi/python/topi/sparse/csrmm.py b/topi/python/topi/sparse/csrmm.py
new file mode 100644
index 000000000000..f0574bf3df6d
--- /dev/null
+++ b/topi/python/topi/sparse/csrmm.py
@@ -0,0 +1,94 @@
+"""TVM operator compute SpMM in CSR format."""
+from __future__ import absolute_import
+import tvm
+from .. import tag
+from ..util import simplify
+
+def csrmm_default(data, indices, indptr, weight, bias=None):
+    # pylint: disable=invalid-name
+    """The default implementation of csrmm in topi.
+
+    Parameters
+    ----------
+    data : tvm.Tensor
+        1-D with shape [nonzeros]
+
+    indices : tvm.Tensor
+        1-D with shape [nonzeros]
+
+    indptr : tvm.Tensor
+        1-D with shape [m+1]
+
+    weight : tvm.Tensor
+        2-D with shape [k, n]
+
+    bias : tvm.Tensor, optional
+        1-D with shape [m]
+
+    Returns
+    -------
+    output : tvm.Tensor
+        2-D with shape [m, n]
+    """
+    assert len(data.shape) == 1 and len(indices.shape) == 1 and len(indptr.shape) == 1 \
+        and len(weight.shape) == 2, "only support 2-dim csrmm"
+    assert isinstance(weight, tvm.tensor.Tensor), \
+        "weight matrix is assumed to be tvm.Tensor, but weight is `%s`" % (type(weight))
+    if bias is not None:
+        assert len(bias.shape) == 1
+    M = simplify(indptr.shape[0]-1)
+    _, N = weight.shape
+    def csrmm_default_ir(data, indices, indptr, weight, out):
+        """define ir for csrmm"""
+        irb = tvm.ir_builder.create()
+        data_ptr = irb.buffer_ptr(data)
+        indices_ptr = irb.buffer_ptr(indices)
+        indptr_ptr = irb.buffer_ptr(indptr)
+        weight_ptr = irb.buffer_ptr(weight)
+        out_ptr = irb.buffer_ptr(out)
+        M = simplify(indptr.shape[0]-1)
+        _, N = weight.shape
+        with irb.for_range(0, N, for_type="vectorize", name='n') as n:
+            with irb.for_range(0, M, for_type="parallel", name='row') as row:
+                dot = irb.allocate('float32', (1,), name='dot', scope='local')
+                out_ptr[row*N+n] = 0.
+                dot[0] = 0.
+                row_start = indptr_ptr[row]
+                row_end = indptr_ptr[row+1]
+                row_elems = row_end-row_start
+                with irb.for_range(0, row_elems, name='idx') as idx:
+                    elem = row_start+idx
+                    dot[0] += data_ptr[elem] * weight_ptr[indices_ptr[elem]*N+n]
+                out_ptr[row*N+n] += dot[0]
+        return irb.get()
+    oshape = (M, N)
+    matmul = tvm.extern(oshape, [data, indices, indptr, weight],
+                        lambda ins, outs: csrmm_default_ir(ins[0], ins[1], ins[2], ins[3], outs[0]),
+                        tag="csrmm", dtype='float32', name='out')
+    if bias is not None:
+        matmul = tvm.compute(oshape, lambda i, j: matmul[i, j] + bias[i], \
+                             tag=tag.BROADCAST)
+    return matmul
+
+
+def csrmm(a, b, c=None):
+    """The `csrmm` routine performs a matrix-matrix operation defined as :math:`C := A*B + C`,
+    where `B` and `C` are dense matrices, `A` is an m-by-k sparse matrix in the CSR format.
+
+    Parameters
+    ----------
+    a : tvm.contrib.sparse.CSRNDArray
+        2-D sparse matrix with shape [m, k]
+
+    b : tvm.Tensor
+        2-D dense matrix with shape [k, n]
+
+    c : tvm.Tensor, optional
+        1-D dense vector with shape [n]
+
+    Returns
+    -------
+    output : tvm.Tensor
+        2-D with shape [m, n]
+    """
+    return csrmm_default(a.data, a.indices, a.indptr, b, c)
diff --git a/topi/python/topi/sparse/csrmv.py b/topi/python/topi/sparse/csrmv.py
new file mode 100644
index 000000000000..7cd101711cca
--- /dev/null
+++ b/topi/python/topi/sparse/csrmv.py
@@ -0,0 +1,90 @@
+"""TVM operator compute SpMV in CSR format."""
+from __future__ import absolute_import
+import tvm
+from .. import tag
+
+def csrmv_default(data, indices, indptr, weight, bias=None):
+    """The default implementation of csrmv in topi.
+
+    Parameters
+    ----------
+    data : tvm.Tensor
+        1-D with shape [nonzeros]
+
+    indices : tvm.Tensor
+        1-D with shape [nonzeros]
+
+    indptr : tvm.Tensor
+        1-D with shape [m+1]
+
+    weight : tvm.Tensor
+        2-D with shape [k, 1]
+
+    bias : tvm.Tensor, optional
+        1-D with shape [1]
+
+    Returns
+    -------
+    output : tvm.Tensor
+        2-D with shape [m, 1]
+    """
+    assert len(data.shape) == 1 and len(weight.shape) == 2, \
+        "only support 2-dim csrmv"
+    assert isinstance(weight, tvm.tensor.Tensor), \
+        "weight matrix is assumed to be tvm.Tensor, but weight is `%s`" % (type(weight))
+    if bias is not None:
+        assert len(bias.shape) == 1
+    batch = indptr.shape[0]-1
+    def csrmv_default_ir(data, indices, indptr, weight, out):
+        """define ir for csrmv"""
+        irb = tvm.ir_builder.create()
+        data_ptr = irb.buffer_ptr(data)
+        indices_ptr = irb.buffer_ptr(indices)
+        indptr_ptr = irb.buffer_ptr(indptr)
+        weight_ptr = irb.buffer_ptr(weight)
+        out_ptr = irb.buffer_ptr(out)
+        num_rows = indptr.shape[0]-1
+        with irb.for_range(0, num_rows, for_type="parallel", name='row') as row:
+            dot = irb.allocate('float32', (1,), name='dot', scope='local')
+            out_ptr[row] = 0.
+            dot[0] = 0.
+            row_start = indptr_ptr[row]
+            row_end = indptr_ptr[row+1]
+            row_elems = row_end-row_start
+            with irb.for_range(0, row_elems, name='elemidx') as elemidx:
+                elem = row_start+elemidx
+                dot[0] += data_ptr[elem] * weight_ptr[indices_ptr[elem]]
+            out_ptr[row] += dot[0]
+        return irb.get()
+    oshape = (batch, 1)
+    matmul = tvm.extern(oshape, [data, indices, indptr, weight],
+                        lambda ins, outs: csrmv_default_ir(ins[0], ins[1], ins[2], ins[3], outs[0]),
+                        tag="csrmv", dtype='float32', name='csrmv')
+    if bias is not None:
+        matmul = tvm.compute((batch, 1), lambda i, j: matmul[i, 0] + bias[i], \
+                             tag=tag.BROADCAST)
+    return matmul
+
+
+def csrmv(a, x, y=None):
+    """The `csrmv` routine performs a matrix-vector operation defined as :math:`y := A*x + y`,
+    where `x` and `y` are vectors, `A` is an m-by-k sparse matrix in the CSR format.
+
+    Parameters
+
+    ----------
+    a : tvm.contrib.sparse.CSRNDArray
+        2-D sparse matrix with shape [m, k]
+
+    x : tvm.Tensor
+        2-D dense matrix with shape [k, 1]
+
+    y : tvm.Tensor, optional
+        1-D dense vector with shape [1]
+
+    Returns
+    -------
+    output : tvm.Tensor
+        2-D dense matrix with shape [m, 1]
+    """
+    return csrmv_default(a.data, a.indices, a.indptr, x, y)
diff --git a/topi/python/topi/sparse/dense.py b/topi/python/topi/sparse/dense.py
new file mode 100644
index 000000000000..01f323bc8ce9
--- /dev/null
+++ b/topi/python/topi/sparse/dense.py
@@ -0,0 +1,173 @@
+"""TVM operator compute Dense in CSR format."""
+from __future__ import absolute_import
+import tvm
+from .. import tag
+from ..util import simplify
+
+def dense_si(data, indices, indptr, weight, bias=None):
+    # pylint: disable=invalid-name
+    """The implementation of dense in topi, assuming sparse input.
+
+    Parameters
+    ----------
+    data : tvm.Tensor
+        1-D with shape [num_nonzeros]
+
+    indices : tvm.Tensor
+        1-D with shape [num_nonzeros]
+
+    indptr : tvm.Tensor
+        1-D with shape [m+1]
+
+    weight : tvm.Tensor
+        2-D with shape [k, n]
+
+    bias : tvm.Tensor, optional
+        1-D with shape [m]
+
+    Returns
+    -------
+    output : tvm.Tensor
+        2-D with shape [m, n]
+    """
+    assert len(data.shape) == 1 and len(indices.shape) == 1 and len(indptr.shape) == 1 \
+        and len(weight.shape) == 2, "only support 2-dim dense"
+    assert isinstance(weight, tvm.tensor.Tensor), \
+        "weight matrix is assumed to be tvm.Tensor, but weight is `%s`" % (type(weight))
+    if bias is not None:
+        assert len(bias.shape) == 1
+    dtype = data.dtype
+    M = simplify(indptr.shape[0]-1)
+    N, _ = weight.shape
+    def dense_default_ir(data, indices, indptr, weight, out):
+        """Define IR for Dense"""
+        dtype = data.dtype
+        irb = tvm.ir_builder.create()
+        data_ptr = irb.buffer_ptr(data)
+        indices_ptr = irb.buffer_ptr(indices)
+        indptr_ptr = irb.buffer_ptr(indptr)
+        weight_ptr = irb.buffer_ptr(weight)
+        out_ptr = irb.buffer_ptr(out)
+        M = simplify(indptr.shape[0]-1)
+        N, K = weight.shape
+        with irb.for_range(0, N, for_type="vectorize", name='n') as n:
+            with irb.for_range(0, M, for_type="parallel", name='m') as m:
+                dot = irb.allocate(dtype, (1,), name='dot', scope='local')
+                out_ptr[m*N+n] = tvm.const(0, dtype)
+                dot[0] = tvm.const(0, dtype)
+                row_start = indptr_ptr[m]
+                row_elems = indptr_ptr[m+1]-row_start
+                with irb.for_range(0, row_elems, name='k') as k:
+                    elem = row_start+k
+                    dot[0] += data_ptr[elem] * weight_ptr[indices_ptr[elem]+n*K]
+                out_ptr[m*N+n] += dot[0]
+        return irb.get()
+    oshape = (M, N)
+    matmul = tvm.extern(oshape, [data, indices, indptr, weight],
+                        lambda ins, outs: dense_default_ir(ins[0], ins[1], ins[2], ins[3], outs[0]),
+                        tag="dense", dtype=dtype, name='out')
+    if bias is not None:
+        matmul = tvm.compute(oshape, lambda i, j: matmul[i, j] + bias[j], \
+                             tag=tag.BROADCAST)
+    return matmul
+
+
+def dense_sw(data, w_data, w_indices, w_indptr, bias=None):
+    # pylint: disable=invalid-name
+    """The implementation of dense in topi, assuming sparse weight.
+
+    Parameters
+    ----------
+    data : tvm.Tensor
+        2-D with shape [m, k]
+
+    w_data : tvm.Tensor
+        1-D with shape [nonzeros]
+
+    w_indices : tvm.Tensor
+        1-D with shape [nonzeros]
+
+    w_indptr : tvm.Tensor
+        1-D with shape [n+1]
+
+    bias : tvm.Tensor, optional
+        1-D with shape [n]
+
+    Returns
+    -------
+    output : tvm.Tensor
+        2-D with shape [m, n]
+    """
+    assert len(w_data.shape) == 1 and len(w_indices.shape) == 1 and len(w_indptr.shape) == 1 \
+        and len(data.shape) == 2, "only support 2-dim dense"
+    assert isinstance(data, tvm.tensor.Tensor), \
+        "data matrix is assumed to be tvm.Tensor, but weight is `%s`" % (type(data))
+    if bias is not None:
+        assert len(bias.shape) == 1
+    dtype = data.dtype
+    M, _ = data.shape
+    N = simplify(w_indptr.shape[0]-1)
+    def dense_default_ir(data, w_data, w_indices, w_indptr, out):
+        """Define IR for Dense"""
+        dtype = data.dtype
+        irb = tvm.ir_builder.create()
+        data_ptr = irb.buffer_ptr(data)
+        w_data_ptr = irb.buffer_ptr(w_data)
+        w_indices_ptr = irb.buffer_ptr(w_indices)
+        w_indptr_ptr = irb.buffer_ptr(w_indptr)
+        out_ptr = irb.buffer_ptr(out)
+        M, K = data.shape
+        N = simplify(w_indptr.shape[0]-1)
+        with irb.for_range(0, M, for_type="vectorize", name='m') as m:
+            with irb.for_range(0, N, for_type="parallel", name='n') as n:
+                dot = irb.allocate(dtype, (1,), name='dot', scope='local')
+                out_ptr[m*N+n] = tvm.const(0, dtype)
+                dot[0] = tvm.const(0, dtype)
+                row_start = w_indptr_ptr[n]
+                row_elems = w_indptr_ptr[n+1]-row_start
+                with irb.for_range(0, row_elems, name='k') as k:
+                    elem = row_start+k
+                    dot[0] += w_data_ptr[elem] * data_ptr[w_indices_ptr[elem]+m*K]
+                out_ptr[m*N+n] += dot[0]
+        return irb.get()
+    oshape = (M, N)
+    matmul = tvm.extern(oshape, [data, w_data, w_indices, w_indptr],
+                        lambda ins, outs: dense_default_ir(ins[0], ins[1], ins[2], ins[3], outs[0]),
+                        tag="dense", dtype=dtype, name='out')
+    if bias is not None:
+        matmul = tvm.compute(oshape, lambda i, j: matmul[i, j] + bias[j], \
+                             tag=tag.BROADCAST)
+    return matmul
+
+
+def dense(data, weight, bias=None):
+    """Applies a linear transformation: :math:`Y = XW^T + b`.
+    Either data or weight should be tvm.contrib.sparse.CSRNDArray.
+
+    Parameters
+    ----------
+    data : tvm.contrib.sparse.CSRNDArray or tvm.tensor.Tensor
+        2-D with shape [batch, in_dim]
+
+    weight : tvm.tensor.Tensor or tvm.contrib.sparse.CSRNDArray
+        2-D with shape [out_dim, in_dim]
+
+    bias : tvm.tensor.Tensor, optional
+        1-D with shape [out_dim]
+
+    Returns
+    -------
+    output : tvm.Tensor
+        2-D with shape [batch, out_dim]
+    """
+    ret = None
+    if isinstance(data, tvm.contrib.sparse.CSRPlaceholderOp) and \
+       isinstance(weight, tvm.tensor.Tensor):
+        ret = dense_si(data.data, data.indices, data.indptr, weight, bias)
+    elif isinstance(data, tvm.tensor.Tensor) and \
+       isinstance(weight, tvm.contrib.sparse.CSRPlaceholderOp):
+        ret = dense_sw(data, weight.data, weight.indices, weight.indptr, bias)
+    else:
+        raise NotImplementedError("implementation for %s as data and %s as weights, "
+                                  "is not supported yet." % (type(data), type(weight), ))
+    return ret
diff --git a/topi/python/topi/tensor.py b/topi/python/topi/tensor.py
index 6fcddedbe445..06f23bbe7703 100644
--- a/topi/python/topi/tensor.py
+++ b/topi/python/topi/tensor.py
@@ -1,11 +1,8 @@
 # pylint: disable=invalid-name,consider-using-enumerate,unused-argument,len-as-condition
 """Elementwise operators"""
 from __future__ import absolute_import as _abs
-import tvm
 from . import cpp
-from . import tag
 
-@tvm.tag_scope(tag=tag.ELEMWISE)
 def elemwise_sum(xs):
     """Perform element-wise sum on inputs
 
@@ -22,7 +19,6 @@ def elemwise_sum(xs):
     return cpp.elemwise_sum(xs)
 
 
-@tvm.tag_scope(tag=tag.ELEMWISE)
 def full(shape, dtype, fill_value):
     """Fill tensor with fill_value
 
@@ -43,7 +39,6 @@ def full(shape, dtype, fill_value):
     return cpp.full(shape, dtype, fill_value)
 
 
-@tvm.tag_scope(tag=tag.ELEMWISE)
 def full_like(x, fill_value):
     """Construct a tensor with same shape as input tensor,
        then fill tensor with fill_value.
diff --git a/topi/python/topi/testing/__init__.py b/topi/python/topi/testing/__init__.py
index c9d995a38686..c496e08c1835 100644
--- a/topi/python/topi/testing/__init__.py
+++ b/topi/python/topi/testing/__init__.py
@@ -15,7 +15,8 @@
 from .bilinear_resize_python import bilinear_resize_python
 from .reorg_python import reorg_python
 from .region_python import region_python
-from .yolo_python import yolo_python
 from .shortcut_python import shortcut_python
 from .lrn_python import lrn_python
 from .l2_normalize_python import l2_normalize_python
+from .gather_nd_python import gather_nd_python
+from .strided_slice_python import strided_slice_python
diff --git a/topi/python/topi/testing/conv2d_nchw_python.py b/topi/python/topi/testing/conv2d_nchw_python.py
index 4a40d02d215c..7d2aa0d0fedf 100644
--- a/topi/python/topi/testing/conv2d_nchw_python.py
+++ b/topi/python/topi/testing/conv2d_nchw_python.py
@@ -4,8 +4,8 @@
 import scipy.signal
 
 
-def conv2d_nchw_python(a_np, w_np, stride, padding):
-    """Convolution operator in HWCN layout.
+def _conv2d_nchw_python(a_np, w_np, stride, padding):
+    """Convolution operator in NCHW layout.
 
     Parameters
     ----------
@@ -66,3 +66,36 @@ def conv2d_nchw_python(a_np, w_np, stride, padding):
                     apad, np.rot90(np.rot90(w_np[f, c])), mode='valid')
                 b_np[n, f] += out[::stride_h, ::stride_w]
     return b_np
+
+
+def conv2d_nchw_python(a_np, w_np, stride, padding, groups=1):
+    """Convolution operator in NCHW layout.
+
+    Parameters
+    ----------
+    a_np : numpy.ndarray
+        4-D with shape [batch, in_channel, in_height, in_width]
+
+    w_np : numpy.ndarray
+        4-D with shape [num_filter, in_channel // groups, filter_height, filter_width]
+
+    stride : int or a list/tuple of two ints
+        Stride size, or [stride_height, stride_width]
+
+    padding : int or str or a list/tuple of two ints
+        Padding size, or ['VALID', 'SAME'], or [pad_height, pad_width]
+
+    groups : int
+        Number of groups
+
+    Returns
+    -------
+    b_np : np.ndarray
+        4-D with shape [batch, out_channel, out_height, out_width]
+    """
+    a_slices = np.array_split(a_np, groups, axis=1)
+    w_slices = np.array_split(w_np, groups, axis=0)
+    b_slices = [_conv2d_nchw_python(a_slice, w_slice, stride, padding)
+                for a_slice, w_slice in zip(a_slices, w_slices)]
+    b_np = np.concatenate(b_slices, axis=1)
+    return b_np
diff --git a/topi/python/topi/testing/conv2d_nhwc_python.py b/topi/python/topi/testing/conv2d_nhwc_python.py
index 880088a6f89f..a872bddab09b 100644
--- a/topi/python/topi/testing/conv2d_nhwc_python.py
+++ b/topi/python/topi/testing/conv2d_nhwc_python.py
@@ -13,7 +13,7 @@ def conv2d_nhwc_python(a_np, w_np, stride, padding):
         4-D with shape [batch, in_height, in_width, in_channel]
 
     w_np : numpy.ndarray
-        4-D with shape [num_filter, filter_height, filter_width, in_channel]
+        4-D with shape [filter_height, filter_width, in_channel, num_filter]
 
     stride : int or a list/tuple of two ints
         Stride size, or [stride_height, stride_width]
@@ -63,5 +63,5 @@ def conv2d_nhwc_python(a_np, w_np, stride, padding):
                     apad = at[n, c]
                 out = scipy.signal.convolve2d(
                     apad, np.rot90(np.rot90(wt[f, c])), mode='valid')
-                bt[n, f] += out[::stride, ::stride]
+                bt[n, f] += out[::stride_h, ::stride_w]
     return bt.transpose((0, 2, 3, 1))
diff --git a/topi/python/topi/testing/gather_nd_python.py b/topi/python/topi/testing/gather_nd_python.py
new file mode 100644
index 000000000000..e2d74cfee1fd
--- /dev/null
+++ b/topi/python/topi/testing/gather_nd_python.py
@@ -0,0 +1,36 @@
+# pylint: disable=invalid-name, line-too-long, unused-variable, too-many-locals
+"""gather_nd in python"""
+import numpy as np
+
+def gather_nd_python(a_np, indices_np):
+    """ Python version of GatherND operator
+
+    Parameters
+    ----------
+    a_np : numpy.ndarray
+        Numpy array
+
+    indices_np : numpy.ndarray
+        Numpy array
+
+    Returns
+    -------
+    b_np : numpy.ndarray
+        Numpy array
+    """
+    a_shape = a_np.shape
+    indices_np = indices_np.astype('int32')
+    indices_shape = indices_np.shape
+    assert len(indices_shape) > 1
+    assert indices_shape[0] <= len(a_shape)
+    b_shape = list(indices_shape[1:])
+    for i in range(indices_shape[0], len(a_shape)):
+        b_shape.append(a_shape[i])
+    b_np = np.zeros(b_shape)
+    for idx in np.ndindex(*indices_shape[1:]):
+        a_idx = []
+        for i in range(indices_shape[0]):
+            indices_pos = tuple([i] + list(idx))
+            a_idx.append(indices_np[indices_pos])
+        b_np[idx] = a_np[tuple(a_idx)]
+    return b_np
diff --git a/topi/python/topi/testing/strided_slice_python.py b/topi/python/topi/testing/strided_slice_python.py
new file mode 100644
index 000000000000..4407b3bec1c7
--- /dev/null
+++ b/topi/python/topi/testing/strided_slice_python.py
@@ -0,0 +1,32 @@
+"""gather_nd in python"""
+
+def strided_slice_python(data, begin, end, strides):
+    """Python version of strided slice operator.
+
+    Parameters
+    ----------
+    data : numpy.ndarray
+        Input data
+
+    begin : list
+        Begining of the slices.
+
+    end : list
+        End of the slices.
+
+    strides : list
+        The stride of each slice.
+
+    Returns
+    -------
+    result : numpy.ndarray
+        The sliced result.
+    """
+    strides = [] if strides is None else strides
+    slices = []
+    for i in range(len(data.shape)):
+        slices.append(slice(
+            begin[i] if i < len(begin) else None,
+            end[i] if i < len(end) else None,
+            strides[i] if i < len(strides) else None))
+    return data[tuple(slices)]
diff --git a/topi/python/topi/testing/yolo_python.py b/topi/python/topi/testing/yolo_python.py
deleted file mode 100644
index a6b3a41203c6..000000000000
--- a/topi/python/topi/testing/yolo_python.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# pylint: disable=invalid-name, line-too-long, unused-variable, too-many-locals
-"""Yolo operator in python"""
-import numpy as np
-
-def entry_index(batch, w, h, outputs, classes, coords, location, entry):
-    n = int(location/(w*h))
-    loc = location%(w*h)
-    return batch*outputs + n*w*h*(coords+classes+1) + entry*w*h + loc
-
-def yolo_python(a_np, N, classes):
-    """Yolo operator
-    Parameters
-    ----------
-    a_np : numpy.ndarray
-        4-D with shape [batch, in_channel, in_height, in_width]
-
-    N : int
-        Darknet layer parameter n
-
-    classes : int
-        Darknet layer parameter classes
-
-    Returns
-    -------
-    b_np : np.ndarray
-        4-D with shape [batch, out_channel, out_height, out_width]
-    """
-
-    batch, in_channel, in_height, in_width = a_np.shape
-    a_np_temp = np.reshape(a_np, batch*in_channel*in_height*in_width)
-    outputs = batch*in_channel*in_height*in_width
-    b_np = np.zeros(batch*in_channel*in_height*in_width)
-    for i in range(batch*in_channel*in_height*in_width):
-        b_np[i] = a_np_temp[i]
-    for b in range(batch):
-        for n in range(N):
-            index = entry_index(b, in_width, in_height, outputs, classes, 4, n*in_width*in_height, 0)
-            b_np[index: index+2*in_width*in_height] = 1/(1+np.exp(-1*b_np[index: index+2*in_width*in_height]))
-            index = entry_index(b, in_width, in_height, outputs, classes, 4, n*in_width*in_height, 4)
-            b_np[index: index+(1+classes)*in_width*in_height] = 1/(1+np.exp(-1*b_np[index: index+(1+classes)*in_width*in_height]))
-
-    b_np = np.reshape(b_np, (batch, in_channel, in_height, in_width))
-    return b_np
diff --git a/topi/python/topi/transform.py b/topi/python/topi/transform.py
index 2ad01852c8b9..b9a7bd4f2992 100644
--- a/topi/python/topi/transform.py
+++ b/topi/python/topi/transform.py
@@ -3,11 +3,9 @@
 from __future__ import absolute_import as _abs
 import tvm
 import topi
-from . import tag
-from .util import ravel_index, unravel_index, get_const_int, get_const_tuple
 from . import cpp
 
-@tvm.tag_scope(tag=tag.BROADCAST)
+
 def expand_dims(a, axis, num_newaxis=1):
     """Expand the shape of an array.
 
@@ -23,15 +21,9 @@ def expand_dims(a, axis, num_newaxis=1):
     -------
     ret : tvm.Tensor
     """
-    axis = len(a.shape) + axis + 1 if axis < 0 else axis
-    new_shape = a.shape[:axis] + ([1] * num_newaxis) + a.shape[axis:]
-    def _compute(*indices):
-        idx = indices[:axis] + indices[axis + num_newaxis:]
-        return a(*idx)
-    return tvm.compute(new_shape, _compute)
+    return cpp.expand_dims(a, axis, num_newaxis)
 
 
-@tvm.tag_scope(tag=tag.BROADCAST)
 def expand_like(a, shape_like, axis):
     """Expand an input array with the shape of second array.
     This operation can always be composed of unsqueezing and
@@ -85,7 +77,6 @@ def _compute(*idxs):
     return tvm.compute(shape_like.shape, _compute)
 
 
-@tvm.tag_scope(tag=tag.INJECTIVE)
 def transpose(a, axes=None):
     """Permute the dimensions of an array.
 
@@ -101,17 +92,9 @@ def transpose(a, axes=None):
     -------
     ret : tvm.Tensor
     """
-    ndim = len(a.shape)
-    axes = axes if axes else tuple(reversed(range(ndim)))
-    new_shape = [a.shape[x] for x in axes]
-    def _compute(*indices):
-        idx = [1] * len(axes)
-        for i, k in enumerate(axes):
-            idx[k] = indices[i]
-        return a(*idx)
-    return tvm.compute(new_shape, _compute)
-
-@tvm.tag_scope(tag=tag.INJECTIVE)
+    return cpp.transpose(a, axes)
+
+
 def flip(a, axis=0):
     """Flip/reverse elements of an array in a particular axis.
 
@@ -129,7 +112,6 @@ def flip(a, axis=0):
     """
     return cpp.flip(a, axis)
 
-@tvm.tag_scope(tag=tag.INJECTIVE)
 def strided_slice(a, begin, end, strides=None):
     """Slice of an array.
 
@@ -155,7 +137,7 @@ def strided_slice(a, begin, end, strides=None):
     """
     return cpp.strided_slice(a, begin, end, strides)
 
-@tvm.tag_scope(tag=tag.INJECTIVE)
+
 def reshape(a, newshape):
     """Reshape the array
 
@@ -170,13 +152,9 @@ def reshape(a, newshape):
     -------
     ret : tvm.Tensor
     """
-    ndim = len(a.shape)
-    a_shape = [a.shape[i] for i in range(ndim)]
-    return tvm.compute(newshape,
-                       lambda *indices: a(*unravel_index(ravel_index(indices, newshape), a_shape)))
+    return cpp.reshape(a, newshape)
 
 
-@tvm.tag_scope(tag=tag.INJECTIVE)
 def squeeze(a, axis=None):
     """Remove single-dimensional entries from the shape of an array.
 
@@ -192,44 +170,9 @@ def squeeze(a, axis=None):
     -------
     squeezed : tvm.Tensor
     """
-    a_ndim = len(a.shape)
-    a_shape = get_const_tuple(a.shape)
-    if axis is None:
-        axis = []
-        for i, ele in enumerate(a_shape):
-            if ele == 1:
-                axis.append(i)
-    else:
-        if isinstance(axis, int):
-            axis = axis + a_ndim if axis < 0 else axis
-            assert a_shape[axis] == 1
-            axis = [axis]
-        else:
-            axis = [ele + a_ndim if ele < 0 else ele for ele in axis]
-            for ele in axis:
-                assert a_shape[ele] == 1
-    out_shape = []
-    search_axis = set(axis)
-    for i, a_dim in enumerate(a_shape):
-        if i not in search_axis:
-            out_shape.append(a_dim)
-    if not out_shape:
-        out_shape.append(1)
-    def _compute(*indices):
-        real_indices = []
-        flag = 0
-        for i in range(a_ndim):
-            if i not in search_axis:
-                real_indices.append(indices[i - flag])
-            else:
-                real_indices.append(0)
-                flag += 1
-        return a(*real_indices)
-
-    return tvm.compute(out_shape, _compute)
-
-
-@tvm.tag_scope(tag=tag.INJECTIVE)
+    return cpp.squeeze(a, axis)
+
+
 def concatenate(a_tuple, axis=0):
     """Join a sequence of arrays along an existing axis.
 
@@ -245,28 +188,9 @@ def concatenate(a_tuple, axis=0):
     -------
     ret : tvm.Tensor
     """
-    assert isinstance(a_tuple, (list, tuple))
-    if axis < 0:
-        axis += len(a_tuple[0].shape)
-    assert axis < len(a_tuple[0].shape)
-    axis_sizes = [a_tuple[i].shape[axis] for i in range(len(a_tuple))]
-    out_shape = [a_tuple[0].shape[i] for i in range(0, axis)] + [sum(axis_sizes)]\
-                + [a_tuple[0].shape[i] for i in range(axis + 1, len(a_tuple[0].shape))]
-    out_shape[axis] = tvm.ir_pass.Simplify(out_shape[axis])
-
-    def _compute(*indices):
-        ret = a_tuple[0](*indices)
-        ind = indices[axis]
-        for i in range(len(a_tuple) - 1):
-            ind -= axis_sizes[i]
-            ret = tvm.select(ind >= 0,
-                             a_tuple[i + 1](*(indices[0:axis] + (ind,) + indices[axis + 1:])),
-                             ret)
-        return ret
-    return tvm.compute(out_shape, _compute)
-
-
-@tvm.tag_scope(tag=tag.INJECTIVE)
+    return cpp.concatenate(a_tuple, axis)
+
+
 def split(ary, indices_or_sections, axis=0):
     """Split an array into multiple sub-arrays.
 
@@ -282,40 +206,9 @@ def split(ary, indices_or_sections, axis=0):
     -------
     ret : tuple of tvm.Tensor
     """
-    def _compute(begin, *indices):
-        real_indices = indices[:axis] + (indices[axis] + begin, ) + indices[axis + 1:]
-        return ary(*real_indices)
-
-    if axis < 0:
-        axis += len(ary.shape)
-    src_axis_size = get_const_int(ary.shape[axis])
-    if isinstance(indices_or_sections, int):
-        assert indices_or_sections > 0
-        assert src_axis_size % indices_or_sections == 0
-        seg_size = src_axis_size // indices_or_sections
-        begin_ids = [seg_size * i for i in range(indices_or_sections)]
-    elif isinstance(indices_or_sections, (tuple, list)):
-        assert tuple(indices_or_sections) == tuple(sorted(indices_or_sections)),\
-            "Should be sorted, recieved %s" % str(indices_or_sections)
-        begin_ids = [0] + list(indices_or_sections)
-    else:
-        raise NotImplementedError()
-    out_shapes = []
-    for i in range(len(begin_ids)):
-        if i == len(begin_ids) - 1:
-            out_axis_size = src_axis_size - begin_ids[i]
-        else:
-            out_axis_size = begin_ids[i + 1] - begin_ids[i]
-        out_shapes.append([ary.shape[i] for i in range(axis)] + [out_axis_size] +\
-                          [ary.shape[i] for i in range(axis + 1, len(ary.shape))])
-    # pylint: disable=cell-var-from-loop
-    return [tvm.compute(out_shape,
-                        lambda *indices: _compute(begin_id, *indices), name="s%d" %i)
-            for i, (out_shape, begin_id) in enumerate(zip(out_shapes, begin_ids))]
-    # pylint: enable=cell-var-from-loop
-
-
-@tvm.tag_scope(tag=tag.INJECTIVE)
+    return cpp.split(ary, indices_or_sections, axis)
+
+
 def take(a, indices, axis=None):
     """Take elements from an array along an axis.
 
@@ -338,3 +231,61 @@ def take(a, indices, axis=None):
     if axis is None:
         return cpp.take(a, indices)
     return cpp.take(a, indices, int(axis))
+
+
+def gather_nd(a, indices):
+    """Gather elements from a n-dimension array..
+
+    Parameters
+    ----------
+    a : tvm.Tensor
+        The source array.
+
+    indices : tvm.Tensor
+        The indices of the values to extract.
+
+    Returns
+    -------
+    ret : tvm.Tensor
+    """
+    return cpp.gather_nd(a, indices)
+
+
+def matmul(a, b, transp_a=False, transp_b=False):
+    """
+    Creates an operation that calculates a matrix multiplication (row-major notation):
+        A(i, k) * B(k, j)
+    if trans_a == trans_b, the usual transposed combinations, otherwise
+
+    Parameters
+    ----------
+    a : The matrix A
+    b : The matrix B
+    trans_a : Is A's layout transposed?
+    trans_b : Is B's layout transposed?
+
+    Returns
+    -------
+    A Tensor whose op member is the matmul operation
+    """
+    return cpp.matmul(a, b, transp_a, transp_b)
+
+
+def tensordot(a, b, axes):
+    """A generalization of matrix multiplication to tensor.
+
+    Parameters
+    ----------
+    a : The tensor A
+    b : The tensor B
+    axes : The number of dimensions to reduce over
+
+    Returns
+    -------
+    A Tensor computing the result
+    """
+    if isinstance(axes, int):
+        return cpp.tensordot(a, b, axes)
+    if isinstance(axes[0], int):
+        return cpp.tensordot(a, b, (axes[0],), (axes[1],))
+    return cpp.tensordot(a, b, axes[0], axes[1])
diff --git a/topi/python/topi/util.py b/topi/python/topi/util.py
index 71e123e83475..edfb0e467e1f 100644
--- a/topi/python/topi/util.py
+++ b/topi/python/topi/util.py
@@ -1,8 +1,9 @@
 # pylint: disable=invalid-name
 """Common topi utilities"""
 from __future__ import absolute_import as _abs
-import tvm
+from numbers import Integral
 
+import tvm
 from . import tag
 
 def traverse_inline(s, final_op, callback):
@@ -68,13 +69,35 @@ def get_const_int(expr):
     out_value : int
         The output.
     """
-    if isinstance(expr, int):
+    if isinstance(expr, Integral):
         return expr
     if not isinstance(expr, (tvm.expr.IntImm, tvm.expr.UIntImm)):
         expr = tvm.ir_pass.Simplify(expr)
     if not isinstance(expr, (tvm.expr.IntImm, tvm.expr.UIntImm)):
         raise ValueError("Expect value to be constant int")
-    return expr.value
+    return int(expr.value)
+
+
+def get_const_float(expr):
+    """Verifies expr is a floating point and get the constant value.
+
+    Parameters
+    ----------
+    expr : tvm.Expr or float
+        The input expression.
+
+    Returns
+    -------
+    out_value : float
+        The output.
+    """
+    if isinstance(expr, float):
+        return float(expr)
+    if not isinstance(expr, tvm.expr.FloatImm):
+        expr = tvm.ir_pass.Simplify(expr)
+    if not isinstance(expr, tvm.expr.FloatImm):
+        raise ValueError("Expect value to be constant float")
+    return float(expr.value)
 
 
 def equal_const_int(expr, value):
@@ -90,7 +113,7 @@ def equal_const_int(expr, value):
     equal : bool
         Whether they equals.
     """
-    if isinstance(expr, int):
+    if isinstance(expr, Integral):
         return expr == value
     if not isinstance(expr, (tvm.expr.IntImm, tvm.expr.UIntImm)):
         expr = tvm.ir_pass.Simplify(expr)
@@ -119,6 +142,26 @@ def get_const_tuple(in_tuple):
     return out_tuple
 
 
+def get_float_tuple(in_tuple):
+    """Verifies input tuple is FloatImm, returns tuple of float.
+
+    Parameters
+    ----------
+    in_tuple : tuple of Expr
+        The input.
+
+    Returns
+    -------
+    out_tuple : tuple of float
+        The output.
+    """
+    out_tuple = ()
+    for elem in in_tuple:
+        value = get_const_float(elem)
+        out_tuple = out_tuple + (value, )
+    return out_tuple
+
+
 def simplify(expr):
     """Simplify the expression if it is Expr, directly return if it is int.
 
diff --git a/topi/python/topi/vision/ssd/multibox.py b/topi/python/topi/vision/ssd/multibox.py
index a8f97146519b..4e6e6ab27fea 100644
--- a/topi/python/topi/vision/ssd/multibox.py
+++ b/topi/python/topi/vision/ssd/multibox.py
@@ -164,10 +164,10 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw,
         oy = py * vy * ah + ay
         ow = tvm.exp(pw * vw) * aw / 2.0
         oh = tvm.exp(ph * vh) * ah / 2.0
-        return tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, ox - ow)), ox - ow), \
-               tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, oy - oh)), oy - oh), \
-               tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, ox + ow)), ox + ow), \
-               tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, oy + oh)), oy + oh)
+        return tvm.select(clip, tvm.max(0, tvm.min(1, ox - ow)), ox - ow), \
+               tvm.select(clip, tvm.max(0, tvm.min(1, oy - oh)), oy - oh), \
+               tvm.select(clip, tvm.max(0, tvm.min(1, ox + ow)), ox + ow), \
+               tvm.select(clip, tvm.max(0, tvm.min(1, oy + oh)), oy + oh)
 
     batch_size = cls_prob.shape[0]
     num_classes = cls_prob.shape[1]
@@ -191,7 +191,7 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw,
                 with ib.if_scope(j > 0):
                     temp = p_cls_prob[n * num_anchors * num_classes + j * num_anchors + i]
                     cls_id[0] = tvm.select(temp > score[0], j, cls_id[0])
-                    score[0] = tvm.make.Max(temp, score[0])
+                    score[0] = tvm.max(temp, score[0])
             with ib.if_scope(tvm.all(cls_id[0] > 0, score[0] < threshold)):
                 cls_id[0] = 0
             # [id, prob, xmin, ymin, xmax, ymax]
diff --git a/topi/python/topi/vision/yolo/__init__.py b/topi/python/topi/vision/yolo/__init__.py
index 2c0a165f8aac..c0e9899a41aa 100644
--- a/topi/python/topi/vision/yolo/__init__.py
+++ b/topi/python/topi/vision/yolo/__init__.py
@@ -3,4 +3,3 @@
 from __future__ import absolute_import as _abs
 
 from .region import *
-from .yolo import *
diff --git a/topi/python/topi/vision/yolo/yolo.py b/topi/python/topi/vision/yolo/yolo.py
deleted file mode 100644
index 6ae630a86d8f..000000000000
--- a/topi/python/topi/vision/yolo/yolo.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# pylint: disable=invalid-name, unused-variable
-"""
-YOLO Operator
-=============
-YOLO operator, used in darknet.
-"""
-from __future__ import absolute_import as _abs
-import tvm
-from ... import cpp
-
-@tvm.target.generic_func
-def yolo(data, num, classes):
-    """YOLO forward operators.
-    Parameters
-    ----------
-    data : tvm.Tensor
-        4-D with shape [batch, c_in, h_in, w_in]
-
-    num : int
-        Darknet layer parameter n
-
-    classes : int
-        Darknet layer parameter classes
-
-    Returns
-    -------
-    out : tvm.Tensor
-        4-D with shape [batch, c_in, h_in, w_in]
-    """
-    return cpp.yolo.yolo(data, num, classes)
diff --git a/topi/python/topi/x86/__init__.py b/topi/python/topi/x86/__init__.py
index c146419fcec9..9e0e94e6cd2d 100644
--- a/topi/python/topi/x86/__init__.py
+++ b/topi/python/topi/x86/__init__.py
@@ -9,3 +9,4 @@
 from .injective import *
 from .pooling import schedule_pool, schedule_global_pool
 from .bitserial_conv2d import schedule_bitserial_conv2d
+from .depthwise_conv2d import schedule_depthwise_conv2d_NCHWc
diff --git a/topi/python/topi/x86/check_targets.py b/topi/python/topi/x86/check_targets.py
new file mode 100644
index 000000000000..fad74eaf582a
--- /dev/null
+++ b/topi/python/topi/x86/check_targets.py
@@ -0,0 +1,12 @@
+# pylint: disable=invalid-name,unused-variable,invalid-name,unused-argument
+"""Checks different x86 targets for target specific schedules"""
+
+def check_skylake(target):
+    """
+    Checks if the target is skylake
+    """
+
+    for opt in target.options:
+        if opt == '-mcpu=skylake-avx512':
+            return True
+    return False
diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py
index 721c7c169d99..fe38b38d38e0 100644
--- a/topi/python/topi/x86/conv2d.py
+++ b/topi/python/topi/x86/conv2d.py
@@ -1,193 +1,157 @@
-# pylint: disable=invalid-name,unused-variable,invalid-name,unused-argument
+# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
 """Conv2D schedule on x86"""
 import tvm
+from tvm import autotvm
+from tvm.autotvm.task.topi_integration import deserialize_args
+from tvm.autotvm.task import get_config
 from .. import generic, tag
 from .. import nn
-from ..nn.util import infer_pad, infer_stride
-from ..nn.conv2d import conv2d, conv2d_NCHWc, conv2d_alter_layout, \
-    _get_workload, _get_schedule, _get_schedule_NCHWc, \
-    _get_alter_layout_schedule, Workload
+from ..util import get_const_tuple
+from ..nn.conv2d import conv2d, conv2d_NCHWc, \
+    conv2d_alter_layout, _get_workload as _get_conv2d_workload
+from ..nn.depthwise_conv2d import _get_workload as _get_depthwise_conv2d_workload
+from ..nn.depthwise_conv2d import depthwise_conv2d_NCHWc, depthwise_conv2d_nchw
+from ..nn.pad import pad
 
 from . import conv2d_avx_1x1, conv2d_avx_common
-from .conv2d_avx_common import AVXConvCommonFwd
-from .conv2d_avx_1x1 import AVXConv1x1Fwd
-
-@_get_schedule.register("cpu")
-def _get_schedule_conv(wkl):
-    _WORKLOADS_AVX = [
-        # workloads of resnet18_v1 on imagenet
-        Workload('float32', 'float32', 224, 224, 3, 64, 7, 7, 3, 3, 2, 2),
-        Workload('float32', 'float32', 56, 56, 64, 64, 3, 3, 1, 1, 1, 1),
-        Workload('float32', 'float32', 56, 56, 64, 64, 1, 1, 0, 0, 1, 1),
-        Workload('float32', 'float32', 56, 56, 64, 128, 3, 3, 1, 1, 2, 2),
-        Workload('float32', 'float32', 56, 56, 64, 128, 1, 1, 0, 0, 2, 2),
-        Workload('float32', 'float32', 28, 28, 128, 128, 3, 3, 1, 1, 1, 1),
-        Workload('float32', 'float32', 28, 28, 128, 256, 3, 3, 1, 1, 2, 2),
-        Workload('float32', 'float32', 28, 28, 128, 256, 1, 1, 0, 0, 2, 2),
-        Workload('float32', 'float32', 14, 14, 256, 256, 3, 3, 1, 1, 1, 1),
-        Workload('float32', 'float32', 14, 14, 256, 512, 3, 3, 1, 1, 2, 2),
-        Workload('float32', 'float32', 14, 14, 256, 512, 1, 1, 0, 0, 2, 2),
-        Workload('float32', 'float32', 7, 7, 512, 512, 3, 3, 1, 1, 1, 1),
-        # workloads of resnet34_v1 on imagenet, no extra workload required
-        # workloads of resnet50_v1 on imagenet
-        Workload('float32', 'float32', 56, 56, 64, 256, 1, 1, 0, 0, 1, 1),
-        Workload('float32', 'float32', 56, 56, 256, 64, 1, 1, 0, 0, 1, 1),
-        Workload('float32', 'float32', 56, 56, 256, 128, 1, 1, 0, 0, 2, 2),
-        Workload('float32', 'float32', 28, 28, 128, 512, 1, 1, 0, 0, 1, 1),
-        Workload('float32', 'float32', 56, 56, 256, 512, 1, 1, 0, 0, 2, 2),
-        Workload('float32', 'float32', 28, 28, 512, 128, 1, 1, 0, 0, 1, 1),
-        Workload('float32', 'float32', 28, 28, 512, 256, 1, 1, 0, 0, 2, 2),
-        Workload('float32', 'float32', 14, 14, 256, 1024, 1, 1, 0, 0, 1, 1),
-        Workload('float32', 'float32', 28, 28, 512, 1024, 1, 1, 0, 0, 2, 2),
-        Workload('float32', 'float32', 14, 14, 1024, 256, 1, 1, 0, 0, 1, 1),
-        Workload('float32', 'float32', 14, 14, 1024, 512, 1, 1, 0, 0, 2, 2),
-        Workload('float32', 'float32', 7, 7, 512, 2048, 1, 1, 0, 0, 1, 1),
-        Workload('float32', 'float32', 14, 14, 1024, 2048, 1, 1, 0, 0, 2, 2),
-        Workload('float32', 'float32', 7, 7, 2048, 512, 1, 1, 0, 0, 1, 1),
-        # workloads of resnet101_v1 on imagenet, no extra workload required
-        # workloads of resnet152_v1 on imagenet, no extra workload required
-        # workloads of resnet18_v2 on imagenet, no extra workload required
-        # workloads of resnet34_v2 on imagenet, no extra workload required
-    ]
-
-    fp32_vec_len = 8
-    target = tvm.target.current_target(allow_none=False)
-    for opt in target.options:
-        if opt == '-mcpu=skylake-avx512':
-            fp32_vec_len = 16
-
-    _SCHEDULES_AVX = [
-        # workloads of resnet18_v1 on imagenet
-        AVXConvCommonFwd(3, fp32_vec_len, 28, False),
-        AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 28, False),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 1, 28),
-        AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 28, False),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 1, 28),
-        AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 28, False),
-        AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 14, False),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 14),
-        AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 14, True),
-        AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 7, True),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 1, 7),
-        AVXConvCommonFwd(fp32_vec_len, fp32_vec_len, 7, True),
-        # workloads of resnet34_v1 on imagenet, no extra workload required
-        # workloads of resnet50_v1 on imagenet
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 28),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 14),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 14),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 14),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 14),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 7),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 7),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 7),
-        AVXConv1x1Fwd(fp32_vec_len, fp32_vec_len, 2, 7),
-        # workloads of resnet101_v1 on imagenet, no extra workload required
-        # workloads of resnet152_v1 on imagenet, no extra workload required
-        # workloads of resnet18_v2 on imagenet, no extra workload required
-        # workloads of resnet34_v2 on imagenet, no extra workload required
-    ]
-
-    if wkl not in _WORKLOADS_AVX:
-        if wkl.hkernel == 1 and wkl.wkernel == 1:
-            return conv2d_avx_1x1._get_default_schedule(wkl, fp32_vec_len)
-        return conv2d_avx_common._get_default_schedule(wkl, fp32_vec_len)
-    idx = _WORKLOADS_AVX.index(wkl)
-    sch = _SCHEDULES_AVX[idx]
-    return sch
-
-@_get_schedule_NCHWc.register("cpu")
-def _get_schedule_NCHWc_x86(wkl, layout, out_layout):
-    return _get_schedule_conv(wkl)
-
-@_get_alter_layout_schedule.register("cpu")
-def _get_alter_layout_schedule_x86(wkl):
-    return _get_schedule_conv(wkl)
-
-@conv2d.register("cpu")
-def _declaration_conv(data, kernel, stride, padding, layout, out_dtype):
-    _AVX_SCH_TO_DECL_FUNC = {
-        AVXConvCommonFwd: conv2d_avx_common._declaration_conv,
-        AVXConv1x1Fwd: conv2d_avx_1x1._declaration_conv
-    }
+
+def _get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depthwise=False):
+    """
+    Get default schedule config for the workload
+    """
+    if is_depthwise:
+        wkl = _get_depthwise_conv2d_workload(data, kernel, strides, padding, out_dtype)
+        from .depthwise_conv2d import _fallback_schedule
+        _fallback_schedule(cfg, wkl)
+    else:
+        wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype)
+        is_kernel_1x1 = wkl.hkernel == 1 and wkl.wkernel == 1
+        if is_kernel_1x1:
+            conv2d_avx_1x1._fallback_schedule(cfg, wkl)
+        else:
+            conv2d_avx_common._fallback_schedule(cfg, wkl)
+
+
+def _create_tuning_space(cfg, data, kernel, strides, padding, dilation, layout):
+    """Create schedule configuration from input arguments"""
+    dshape = get_const_tuple(data.shape)
+    kshape = get_const_tuple(kernel.shape)
+    if layout == 'NCHW':
+        n, ic, h, w = dshape
+        oc, _, kh, kw = kshape
+    else:
+        raise ValueError("Not support this layout {} with "
+                         "schedule template.".format(layout))
+    is_kernel_1x1 = kh == 1 and kw == 1
+    ph, pw = padding if isinstance(padding, (tuple, list)) else (padding, padding)
+    sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides)
+    oh = (h - kh + 2 * ph) // sh + 1
+    ow = (w - kw + 2 * pw) // sw + 1
+
+    # Create schedule config
+    cfg.define_split("tile_ic", ic, num_outputs=2)
+    cfg.define_split("tile_oc", oc, num_outputs=2)
+    cfg.define_split("tile_ow", ow, num_outputs=2, filter=lambda y: y.size[-1] <= 64)
+    if is_kernel_1x1:
+        cfg.define_knob("tile_oh", [1, 2] if oh > 1 else [1])
+    else:
+        cfg.define_knob("unroll_kw", [True, False])
+
+
+@autotvm.register_topi_compute(conv2d, 'cpu', 'direct')
+def _declaration_conv(cfg, data, kernel, strides, padding, dilation, layout, out_dtype):
     out_dtype = data.dtype if out_dtype is None else out_dtype
-    target = tvm.target.current_target(allow_none=False)
-    wkl = _get_workload(data, kernel, stride, padding, out_dtype)
+    padding = padding if isinstance(padding, (tuple, list)) else (padding, padding)
+    strides = strides if isinstance(strides, (tuple, list)) else (strides, strides)
+    dilation = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)
     if layout == 'NCHW':
-        sch = _get_schedule(wkl)
-        return _AVX_SCH_TO_DECL_FUNC[type(sch)](data, kernel, stride, padding, layout, out_dtype)
+        _create_tuning_space(cfg, data, kernel, strides, padding, dilation, layout)
+        if cfg.is_fallback:
+            _get_default_config(cfg, data, kernel, strides, padding, out_dtype)
+        return _declaration_conv_impl(cfg, data, kernel, strides,
+                                      padding, dilation, layout, out_dtype)
     elif layout == 'HWCN':
-        return nn.conv2d_hwcn(data, kernel, stride, padding, out_dtype)
+        return nn.conv2d_hwcn(data, kernel, strides, padding, dilation, out_dtype)
     elif layout == 'NHWC':
-        return nn.conv2d_nhwc(data, kernel, stride, padding, out_dtype)
+        return nn.conv2d_nhwc(data, kernel, strides, padding, dilation, out_dtype)
     else:
         raise ValueError("not support this layout {} yet".format(layout))
 
 
-@conv2d_alter_layout.register("cpu")
-def _alter_conv2d_layout(attrs, inputs, tinfos):
-    import nnvm.symbol as sym
-    copy_inputs = [s for s in inputs]
-    new_attrs = {k : attrs[k] for k in attrs.keys()}
-    # only optimize for NCHW, groups=1 conv
-    if attrs['layout'] != 'NCHW' or attrs.get_int("groups") != 1:
-        return None
-
-    data = tinfos[0]
-    kernel = tinfos[1]
-
-    import ast
-    padding = ast.literal_eval(attrs['padding'])
-    stride = ast.literal_eval(attrs['strides'])
+def _declaration_conv_impl(cfg, data, kernel, strides, padding, dilation, layout, out_dtype):
+    out_dtype = data.dtype if out_dtype is None else out_dtype
+    assert layout == 'NCHW', "only support NCHW convolution for AVX"
 
-    wkl = _get_workload(data, kernel, stride, padding, data.dtype)
-    sch = _get_alter_layout_schedule(wkl)
-    is_kernel_1x1 = isinstance(sch, AVXConv1x1Fwd)
-    ic_bn, oc_bn = sch.ic_bn, sch.oc_bn
+    assert isinstance(dilation, int) or len(dilation) == 2
+    if isinstance(dilation, int):
+        dilation_h, dilation_w = dilation
+    else:
+        dilation_h, dilation_w = dilation
 
-    new_attrs['layout'] = 'NCHW%dc' % ic_bn
-    new_attrs['out_layout'] = 'NCHW%dc' % oc_bn
+    HPAD, WPAD = padding
+    HSTR, WSTR = strides
 
-    if is_kernel_1x1:
-        # (oc, ic, h, w) -> (OC, IC, ic, oc, h, w)
-        new_attrs['kernel_layout'] = 'OI%di%doHW' % (ic_bn, oc_bn)
-    else:
-        # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc)
-        new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn)
+    batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
+    num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape)
 
-    return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs)
+    pad_height = in_height + 2 * HPAD
+    pad_width = in_width + 2 * WPAD
 
+    dilated_kernel_h = (kernel_height - 1) * dilation_h + 1
+    dilated_kernel_w = (kernel_width - 1) * dilation_w + 1
+    out_height = (in_height + 2 * HPAD - dilated_kernel_h) // HSTR + 1
+    out_width = (in_width + 2 * WPAD - dilated_kernel_w) // WSTR + 1
 
-@conv2d_NCHWc.register("cpu")
-def _declaration_conv_NCHWc(data, kernel, num_filter, kernel_size, stride,
-                            padding, layout, out_layout, out_dtype):
-    _AVX_SCH_TO_DECL_FUNC = {
-        AVXConvCommonFwd: conv2d_avx_common._declaration_conv_NCHWc,
-        AVXConv1x1Fwd: conv2d_avx_1x1._declaration_conv_NCHWc
-    }
-    n, ic_chunk, h, w, ic_block = [x.value for x in data.shape]
-    ic = ic_chunk * ic_block
-    kh, kw = kernel_size
-    wkl = _get_workload(tvm.placeholder((n, ic, h, w), dtype=out_dtype),
-                        tvm.placeholder((num_filter, ic, kh, kw), dtype=out_dtype),
-                        stride, padding, out_dtype)
-    sch = _get_schedule_NCHWc(wkl, layout, out_layout)
-    return _AVX_SCH_TO_DECL_FUNC[type(sch)](wkl, sch, data, kernel)
-
-
-@generic.schedule_conv2d_nchw.register(["cpu"])
-def schedule_conv2d(outs):
+    # pack data
+    DOPAD = (HPAD != 0 or WPAD != 0)
+    if DOPAD:
+        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
+    else:
+        data_pad = data
+
+    # fetch schedule
+    ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
+
+    shape = (batch_size, in_channel // ic_bn, pad_height, ic_bn, pad_width)
+    data_vec = tvm.compute(shape,
+                           lambda n, C, h, c, w: data_pad[n, C * ic_bn + c, h, w],
+                           name='data_vec')
+
+    # pack kernel
+    shape = (num_filter//oc_bn, in_channel//ic_bn,
+             kernel_height, kernel_width, ic_bn, oc_bn)
+    kernel_vec = tvm.compute(shape,
+                             lambda CO, CI, h, w, ci, co:
+                             kernel[CO * oc_bn + co, CI * ic_bn + ci, h, w],
+                             name='kernel_vec')
+
+    # convolution
+    oshape = (batch_size, num_filter//oc_bn, out_height, out_width, oc_bn)
+    unpack_shape = (batch_size, num_filter, out_height, out_width)
+
+    ic = tvm.reduce_axis((0, in_channel), name='ic')
+    kh = tvm.reduce_axis((0, kernel_height), name='kh')
+    kw = tvm.reduce_axis((0, kernel_width), name='kw')
+
+    conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
+                       tvm.sum(data_vec[n, ic//ic_bn, oh*HSTR+kh*dilation_h, ic%ic_bn,
+                                        ow*WSTR+kw*dilation_w].astype(out_dtype) *
+                               kernel_vec[oc_chunk, ic//ic_bn, kh, kw, ic%ic_bn,
+                                          oc_block].astype(out_dtype),
+                               axis=[ic, kh, kw]), name='conv')
+
+    unpack = tvm.compute(unpack_shape,
+                         lambda n, c, h, w: conv[n, c // oc_bn, h, w, c % oc_bn]
+                         .astype(out_dtype),
+                         name='output_unpack',
+                         tag='conv2d_nchw')
+    return unpack
+
+
+@autotvm.register_topi_schedule(generic.schedule_conv2d_nchw, 'cpu', ['direct'])
+def schedule_conv2d(cfg, outs):
     """Create schedule for tensors"""
-    _AVX_SCH_TO_SCH_FUNC = {
-        AVXConvCommonFwd: conv2d_avx_common._schedule_conv,
-        AVXConv1x1Fwd: conv2d_avx_1x1._schedule_conv
-    }
     s = tvm.create_schedule([x.op for x in outs])
-    target = tvm.target.current_target(allow_none=False)
     scheduled_ops = []
 
     def traverse(op):
@@ -213,16 +177,14 @@ def traverse(op):
             if isinstance(data.op, tvm.tensor.ComputeOp) and "pad" in data.op.tag:
                 data_pad = data
                 data = data_pad.op.input_tensors[0]
-            padding = infer_pad(data, data_pad)
-            if data_pad is None:
-                stride = infer_stride(data, kernel, output)
-            else:
-                stride = infer_stride(data_pad, kernel, output)
 
-            wkl = _get_workload(data, kernel, stride, padding, output.dtype)
-            sch = _get_schedule(wkl)
-            _AVX_SCH_TO_SCH_FUNC[type(sch)](s, data, data_pad, data_vec,
-                                            kernel, kernel_vec, conv_out, output, outs[0])
+            _, _, kh, kw = get_const_tuple(kernel.shape)
+            is_kernel_1x1 = kh == 1 and kw == 1
+            args = [s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, outs[0]]
+            if is_kernel_1x1:
+                conv2d_avx_1x1._schedule_conv(*args)
+            else:
+                conv2d_avx_common._schedule_conv(*args)
 
         scheduled_ops.append(op)
 
@@ -230,7 +192,7 @@ def traverse(op):
     return s
 
 
-@generic.schedule_conv2d_nhwc.register(["cpu"])
+@generic.schedule_conv2d_nhwc.register("cpu")
 def schedule_conv2d_nhwc(outs):
     """Create schedule for tensors"""
     s = tvm.create_schedule([x.op for x in outs])
@@ -285,14 +247,183 @@ def traverse(op):
     return s
 
 
-@generic.schedule_conv2d_NCHWc.register(["cpu"])
-def schedule_conv2d_NCHWc(num_filter, kernel_size, stride, padding,
-                          layout, out_layout, outs):
+# Define template function for autotvm task
+# We define schedule template in this function instead of
+# declaration function since actual input arguments need
+# to be altered by the schedule selected.
+@autotvm.task.register("topi_x86_conv2d_NCHWc")
+def _topi_nn_conv2d_NCHWc(*args, **kwargs):
+    assert not kwargs, "Do not support kwargs in template function call"
+    data, kernel, strides, padding, dilation, origin_layout, dtype = deserialize_args(args)
+    raw_data_shape = get_const_tuple(data.shape)
+    raw_kernel_shape = get_const_tuple(kernel.shape)
+
+    # get config here
+    cfg = get_config()
+    _create_tuning_space(cfg, data, kernel, strides, padding, dilation, origin_layout)
+
+    # change shape with the value in config
+    ic_bn, oc_bn, ow_bn = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1],
+                           cfg["tile_ow"].size[-1])
+    new_data_shape = (raw_data_shape[0], raw_data_shape[1] // ic_bn,
+                      raw_data_shape[2], raw_data_shape[3], ic_bn)
+    data_layout = "NCHW%dc" % ic_bn
+    out_layout = "NCHW%dc" % oc_bn
+    new_kernel_shape = (raw_kernel_shape[0] // oc_bn, raw_kernel_shape[1] // ic_bn,
+                        raw_kernel_shape[2], raw_kernel_shape[3], ic_bn, oc_bn)
+    new_data = tvm.placeholder(new_data_shape, data.dtype)
+    new_kernel = tvm.placeholder(new_kernel_shape, kernel.dtype)
+
+    C = _declaration_conv_NCHWc(cfg, new_data, new_kernel, strides, padding, dilation,
+                                data_layout, out_layout, dtype)
+    s = _schedule_conv2d_NCHWc(cfg, [C])
+    return s, [new_data, new_kernel, C]
+
+
+@conv2d_alter_layout.register("cpu")
+def _alter_conv2d_layout(attrs, inputs, tinfo):
+    import nnvm.symbol as sym
+    copy_inputs = [s for s in inputs]
+    new_attrs = {k : attrs[k] for k in attrs.keys()}
+    data, kernel = tinfo[0], tinfo[1]
+    batch_size, in_channel, height, width = get_const_tuple(data.shape)
+
+    groups = attrs.get_int("groups")
+    out_channel = attrs.get_int("channels")
+    padding = attrs.get_int_tuple("padding")
+    strides = attrs.get_int_tuple("strides")
+    dilation = attrs.get_int_tuple("dilation")
+    layout = attrs['layout']
+    kh, kw = attrs.get_int_tuple("kernel_size")
+
+    dtype = data.dtype
+    out_dtype = dtype if attrs["out_dtype"] == "same" else attrs["out_dtype"]
+    is_depthwise = groups == in_channel and groups == out_channel
+
+    # only optimize for NCHW
+    if layout != 'NCHW':
+        return None
+    if groups != 1 and not is_depthwise:
+        return None
+
+    dispatch_ctx = autotvm.task.DispatchContext.current
+    target = tvm.target.current_target()
+    # query schedule and fallback if necessary
+    workload = autotvm.task.args_to_workload(
+        [data, kernel, strides, padding, dilation, out_dtype], depthwise_conv2d_nchw) \
+        if is_depthwise else \
+        autotvm.task.args_to_workload(
+            [data, kernel, strides, padding, dilation, layout, out_dtype], conv2d)
+    cfg = dispatch_ctx.query(target, workload)
+    if cfg.is_fallback:
+        _get_default_config(cfg, data, kernel, strides, padding, out_dtype, is_depthwise)
+
+    ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
+    new_attrs['layout'] = 'NCHW%dc' % ic_bn
+    new_attrs['out_layout'] = 'NCHW%dc' % oc_bn
+
+    new_data = tvm.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn),
+                               dtype=data.dtype)
+    if is_depthwise:
+        # channel, channel_multiplier, kh, kw -> out_channel_chunk, kh, kw, out_channel_block
+        # in which out_channel = merge(channel, channel_multiplier)
+        kernel_sym = copy_inputs[1]
+        kernel_sym = sym.reshape(kernel_sym, shape=(out_channel//oc_bn, oc_bn, kh, kw))
+        kernel_sym = sym.transpose(kernel_sym, axes=(0, 2, 3, 1))
+        copy_inputs[1] = kernel_sym
+
+        # Store altered operator's config
+        new_kernel = tvm.placeholder((out_channel//oc_bn, kh, kw, oc_bn), dtype=kernel.dtype)
+        new_workload = autotvm.task.args_to_workload(
+            [new_data, new_kernel, strides, padding, dilation, new_attrs['layout'],
+             new_attrs['out_layout'], out_dtype], depthwise_conv2d_NCHWc)
+    else:
+        out_channel, _, kh, kw = get_const_tuple(kernel.shape)
+        # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc)
+        new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn)
+
+        # Store altered operator's config
+        new_kernel = tvm.placeholder((out_channel//oc_bn, in_channel//ic_bn, kh, kw, ic_bn, oc_bn),
+                                     dtype=kernel.dtype)
+        new_workload = autotvm.task.args_to_workload(
+            [new_data, new_kernel, strides, padding, dilation, new_attrs['layout'],
+             new_attrs['out_layout'], out_dtype], conv2d_NCHWc)
+
+    dispatch_ctx.update(target, new_workload, cfg)
+    return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs)
+
+
+@autotvm.register_topi_compute(conv2d_NCHWc, 'cpu', 'direct')
+def _declaration_conv_NCHWc(cfg, data, kernel, strides,
+                            padding, dilation, layout, out_layout, out_dtype):
+    # layout and out_layout are not used here,
+    # we keep them for debug convenience when dumping autotvm workload
+    HPAD, WPAD = padding if isinstance(padding, (tuple, list)) else (padding, padding)
+    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
+    dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)
+    assert (dh, dw) == (1, 1), "Does not support dilation"
+
+    n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape)
+    in_channel = ic_chunk * ic_bn
+    if data.dtype == 'uint8':
+        oc_chunk, _, kernel_height, kernel_width, _, oc_bn, _ = get_const_tuple(kernel.shape)
+    else:
+        oc_chunk, _, kernel_height, kernel_width, _, oc_bn = get_const_tuple(kernel.shape)
+    num_filter = oc_chunk * oc_bn
+
+    if cfg.is_fallback:
+        _get_default_config(cfg, tvm.placeholder((n, in_channel, ih, iw), dtype=data.dtype),
+                            tvm.placeholder((num_filter, in_channel, kernel_height, kernel_width),
+                                            dtype=kernel.dtype),
+                            strides, padding, out_dtype)
+
+    # output shape
+    out_height = (ih + 2 * HPAD - kernel_height) // HSTR + 1
+    out_width = (iw + 2 * WPAD - kernel_width) // WSTR + 1
+    oshape = (n, oc_chunk, out_height, out_width, oc_bn)
+
+    # DOPAD
+    DOPAD = (HPAD != 0 or WPAD != 0)
+    if DOPAD:
+        data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad")
+    else:
+        data_pad = data
+
+    ic = tvm.reduce_axis((0, in_channel), name='ic')
+    kh = tvm.reduce_axis((0, kernel_height), name='kh')
+    kw = tvm.reduce_axis((0, kernel_width), name='kw')
+
+    if data.dtype == 'uint8':
+        assert out_dtype == "int32", \
+            "INT8 convolution requires input dtype = uint8 and output dtype=int32"
+        # Intel performs dot product of 2 "4" Int8 values
+        # Current implementation requires ic_bn to be a multiple of 4
+        n_elems = 4
+        assert ic_bn % n_elems == 0
+
+        ic_outer = tvm.reduce_axis((0, in_channel//ic_bn), name='ic_outer')
+        ic_f_inner = tvm.reduce_axis((0, ic_bn//n_elems), name='ic_f_inner')
+        ic_s_inner = tvm.reduce_axis((0, n_elems), name='ic_s_inner')
+        return tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
+                           tvm.sum(data_pad[n, ic_outer, oh*HSTR+kh, ow*WSTR+kw,
+                                            ic_f_inner * n_elems +  ic_s_inner]
+                                   .astype(out_dtype) *
+                                   kernel[oc_chunk, ic_outer, kh, kw, ic_f_inner,
+                                          oc_block, ic_s_inner].astype(out_dtype),
+                                   axis=[kh, kw, ic_outer, ic_f_inner, ic_s_inner]),
+                           name='conv2d_NCHWc_int8', tag="conv2d_NCHWc_int8")
+    # else: fp implementation
+    return tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
+                       tvm.sum(data_pad[n, ic//ic_bn, oh*HSTR+kh, ow*WSTR+kw,
+                                        ic%ic_bn].astype(out_dtype) *
+                               kernel[oc_chunk, ic//ic_bn, kh, kw, ic%ic_bn, oc_block],
+                               axis=[ic, kh, kw]),
+                       name='conv2d_NCHWc', tag="conv2d_NCHWc")
+
+
+@autotvm.register_topi_schedule(generic.schedule_conv2d_NCHWc, 'cpu', ['direct'])
+def _schedule_conv2d_NCHWc(cfg, outs):
     """Create schedule for tensors"""
-    _AVX_SCH_TO_SCH_FUNC = {
-        AVXConvCommonFwd: conv2d_avx_common._schedule_conv_NCHWc,
-        AVXConv1x1Fwd: conv2d_avx_1x1._schedule_conv_NCHWc
-    }
     s = tvm.create_schedule([x.op for x in outs])
     scheduled_ops = []
 
@@ -317,17 +448,20 @@ def traverse(op):
                 data_pad = data
                 data = data_pad.op.input_tensors[0]
 
-            n, ic_chunk, h, w, ic_block = [x.value for x in data.shape]
-            ic = ic_chunk * ic_block
-            original_data = tvm.placeholder((n, ic, h, w), dtype=conv_out.dtype)
-
-            kh, kw = kernel_size
-            original_kernel = tvm.placeholder((num_filter, ic, kh, kw), dtype=conv_out.dtype)
-
-            wkl = _get_workload(original_data, original_kernel, stride, padding, conv_out.dtype)
-            sch = _get_schedule_NCHWc(wkl, layout, out_layout)
-            _AVX_SCH_TO_SCH_FUNC[type(sch)](s, wkl, sch, data_vec,
-                                            kernel, conv_out, outs[0])
+            args = [s, cfg, data_vec, conv_out, outs[0]]
+            if data.dtype == 'uint8':
+                # int8 conv kernel is 7-dim
+                _, _, kh, kw, _, _, _ = get_const_tuple(kernel.shape)
+                if kh == 1 and kw == 1:
+                    conv2d_avx_1x1._schedule_conv_NCHWc_int8(*args)
+                else:
+                    conv2d_avx_common._schedule_conv_NCHWc_int8(*args)
+            else:
+                _, _, kh, kw, _, _, = get_const_tuple(kernel.shape)
+                if kh == 1 and kw == 1:
+                    conv2d_avx_1x1._schedule_conv_NCHWc(*args)
+                else:
+                    conv2d_avx_common._schedule_conv_NCHWc(*args)
 
         scheduled_ops.append(op)
 
diff --git a/topi/python/topi/x86/conv2d_avx_1x1.py b/topi/python/topi/x86/conv2d_avx_1x1.py
index 7d820701e1f4..d44e3899293d 100644
--- a/topi/python/topi/x86/conv2d_avx_1x1.py
+++ b/topi/python/topi/x86/conv2d_avx_1x1.py
@@ -1,18 +1,17 @@
 # pylint: disable=invalid-name,unused-variable,unused-argument,invalid-name
 """1x1 Conv2D schedule on for Intel CPU"""
 from __future__ import absolute_import as _abs
-from collections import namedtuple
 import tvm
+from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
 
+from ..nn.util import infer_pad
 from ..util import get_const_tuple
-from ..nn.conv2d import _get_schedule, _get_workload
-from ..nn.util import infer_pad, infer_stride
-from ..nn.pad import pad
+from .tensor_intrin import dot_16x1x16_int8_int8_int32
+from .check_targets import check_skylake
+from .util import get_fp32_len
 
-AVXConv1x1Fwd = namedtuple('AVXConv1x1Fwd', ['ic_bn', 'oc_bn', 'oh_factor', 'ow_factor'])
-
-
-def _get_default_schedule(wkl, simd_width):
+def _fallback_schedule(cfg, wkl):
+    simd_width = get_fp32_len()
     HPAD, WPAD = wkl.hpad, wkl.wpad
     HSTR, WSTR = wkl.hstride, wkl.wstride
     out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1
@@ -34,67 +33,22 @@ def _get_default_schedule(wkl, simd_width):
         if out_width % ow_factor == 0:
             for oh_factor in range(out_height, 0, -1):
                 if out_height % oh_factor == 0 and ow_factor * oh_factor < 32:
-                    return AVXConv1x1Fwd(ic_bn, oc_bn, oh_factor, ow_factor)
-
+                    cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn])
+                    cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn])
+                    cfg["tile_oh"] = OtherOptionEntity(oh_factor)
+                    cfg["tile_ow"] = SplitEntity([out_width // ow_factor, ow_factor])
+                    return
     raise ValueError("cannot decide default schedule for workload: {}".format(wkl))
 
 
-def _declaration_conv(data, kernel, stride, padding, layout, out_dtype):
-    assert layout == 'NCHW', "only support NCHW convolution for AVX"
-    wkl = _get_workload(data, kernel, stride, padding, out_dtype)
-    sch = _get_schedule(wkl)
-
-    HPAD, WPAD = wkl.hpad, wkl.wpad
-    HSTR, WSTR = wkl.hstride, wkl.wstride
-
-    batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
-    num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape)
-
-    pad_height = in_height + 2 * HPAD
-    pad_width = in_width + 2 * WPAD
-
-    out_height = (in_height + 2 * HPAD - kernel_height) // HSTR + 1
-    out_width = (in_width + 2 * WPAD - kernel_width) // WSTR + 1
+def _schedule_conv(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, last):
+    # fetch schedule
+    ic_bn, oc_bn, oh_factor, ow_factor = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1],
+                                          cfg["tile_oh"].val, cfg["tile_ow"].size[-1])
 
-    DOPAD = (HPAD != 0 or WPAD != 0)
-    if DOPAD:
-        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
-    else:
-        data_pad = data
-    shape = (batch_size, in_channel // sch.ic_bn, pad_height, pad_width, sch.ic_bn)
-    data_vec = tvm.compute(shape, lambda n, C, h, w, c: data_pad[n, C * sch.ic_bn + c, h, w])
-
-    shape = (num_filter // sch.oc_bn, in_channel // sch.ic_bn, sch.ic_bn, sch.oc_bn, 1, 1)
-    kernel_vec = tvm.compute(shape, lambda CO, CI, ci, co, h, w:
-                             kernel[CO * sch.oc_bn + co, CI * sch.ic_bn + ci, h, w],
-                             name='kernel_vec')
-
-    oshape = (batch_size, num_filter // sch.oc_bn, out_height, out_width, sch.oc_bn)
-    ic = tvm.reduce_axis((0, in_channel), name='ic')
-    conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
-                       tvm.sum(data_vec[n, ic//sch.ic_bn, oh*HSTR, ow*WSTR, ic%sch.ic_bn] *
-                               kernel_vec[oc_chunk, ic//sch.ic_bn, ic%sch.ic_bn, oc_block, 0, 0],
-                               axis=[ic]), name='conv')
-
-    oshape = (batch_size, num_filter, out_height, out_width)
-    unpack = tvm.compute(oshape, lambda n, oc, oh, ow:
-                         conv[n, oc // sch.oc_bn, oh, ow, oc % sch.oc_bn],
-                         tag='conv2d_nchw')
-    return unpack
-
-
-def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, output, last):
     # no stride and padding info here
     padding = infer_pad(data, data_pad)
-    if data_pad is None:
-        stride = infer_stride(data, kernel, output)
-    else:
-        stride = infer_stride(data_pad, kernel, output)
-
-    wkl = _get_workload(data, kernel, stride, padding, output.dtype)
-    sch = _get_schedule(wkl)
-
-    HPAD, WPAD = wkl.hpad, wkl.wpad
+    HPAD, WPAD = padding
     DOPAD = (HPAD != 0 or WPAD != 0)
 
     A, W = data, kernel_vec
@@ -109,7 +63,7 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou
     # schedule kernel pack
     oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[W].op.axis
     s[W].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block)
-    if sch.oc_bn > 1:
+    if oc_bn > 1:
         s[W].vectorize(oc_block)
     parallel_axis = s[W].fuse(oc_chunk, oh)
     s[W].parallel(parallel_axis)
@@ -118,17 +72,17 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou
     CC = s.cache_write(C, 'global')
 
     batch, oc_chunk, oh, ow, oc_block = s[C].op.axis
-    oh_outer, oh_inner = s[C].split(oh, factor=sch.oh_factor)
+    oh_outer, oh_inner = s[C].split(oh, factor=oh_factor)
     s[C].vectorize(oc_block)
 
     s[CC].compute_at(s[C], oh_outer)
     _, oc_chunk, oh, ow, oc_block = s[CC].op.axis
-    ic, = s[CC].op.reduce_axis
+    ic, _, _ = s[CC].op.reduce_axis
 
-    ic_chunk, ic_block = s[CC].split(ic, factor=sch.ic_bn)
+    ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn)
 
-    oh_outer, oh_inner = s[CC].split(oh, factor=sch.oh_factor)
-    ow_outer, ow_inner = s[CC].split(ow, factor=sch.ow_factor)
+    oh_outer, oh_inner = s[CC].split(oh, factor=oh_factor)
+    ow_outer, ow_inner = s[CC].split(ow, factor=ow_factor)
 
     s[CC].reorder(oc_chunk, oh_outer, ow_outer, ic_chunk, ic_block, oh_inner, ow_inner, oc_block)
     s[CC].vectorize(oc_block)
@@ -140,9 +94,9 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou
         s[O0].compute_inline()
     batch, oc, oh, ow = s[O].op.axis
 
-    oc_chunk, oc_block = s[O].split(oc, factor=sch.oc_bn)
-    oh_outer, oh_inner = s[O].split(oh, factor=sch.oh_factor)
-    ow_outer, ow_inner = s[O].split(ow, factor=sch.ow_factor)
+    oc_chunk, oc_block = s[O].split(oc, factor=oc_bn)
+    oh_outer, oh_inner = s[O].split(oh, factor=oh_factor)
+    ow_outer, ow_inner = s[O].split(ow, factor=ow_factor)
     s[O].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block)
 
     parallel_axis = s[O].fuse(oc_chunk, oh_outer)
@@ -154,33 +108,80 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou
     return s
 
 
-def _declaration_conv_NCHWc(wkl, sch, data, kernel):
-    out_dtype = wkl.out_dtype
-    HPAD, WPAD = wkl.hpad, wkl.wpad
-    HSTR, WSTR = wkl.hstride, wkl.wstride
+def _schedule_conv_NCHWc(s, cfg, data, conv_out, last):
+    # fetch schedule
+    oh_factor, ow_factor = cfg["tile_oh"].val, cfg["tile_ow"].size[-1]
+    _, _, _, _, ic_bn = get_const_tuple(data.shape)
 
-    batch_size = data.shape[0]
-    out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1
-    out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1
+    # schedule data
+    A = data
+    if isinstance(s[A].op, tvm.tensor.ComputeOp):
+        batch, ic_chunk, ih, iw, ic_block = s[A].op.axis
+        parallel_axis = s[A].fuse(ic_chunk, ih)
+        s[A].parallel(parallel_axis)
 
-    DOPAD = (HPAD != 0 or WPAD != 0)
-    if DOPAD:
-        data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad")
-    else:
-        data_pad = data
+    C, O = conv_out, last
+    CC = s.cache_write(C, 'global')
+
+    batch, oc_chunk, oh, ow, oc_block = s[C].op.axis
+    oh_outer, oh_inner = s[C].split(oh, factor=oh_factor)
+    ow_outer, ow_inner = s[C].split(ow, factor=ow_factor)
+    s[C].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block)
+    s[C].vectorize(oc_block)
 
-    oshape = (batch_size, wkl.out_filter//sch.oc_bn, out_height, out_width, sch.oc_bn)
-    ic = tvm.reduce_axis((0, wkl.in_filter), name='ic')
-    conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
-                       tvm.sum(data_pad[n, ic//sch.ic_bn, oh*HSTR, ow*WSTR, ic%sch.ic_bn]
-                               .astype(out_dtype) *
-                               kernel[oc_chunk, ic // sch.ic_bn, ic % sch.ic_bn, oc_block, 0, 0],
-                               axis=[ic]), name='conv2d_NCHWc', tag='conv2d_NCHWc')
+    parallel_axis = s[C].fuse(oc_chunk, oh_outer)
+    s[CC].compute_at(s[C], parallel_axis)
+    if C == O:
+        s[C].parallel(parallel_axis)
+
+    _, oc_chunk, oh, ow, oc_block = s[CC].op.axis
+    ic, _, _ = s[CC].op.reduce_axis
 
-    return conv
+    ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn)
 
+    oh_outer, oh_inner = s[CC].split(oh, factor=oh_factor)
+    ow_outer, ow_inner = s[CC].split(ow, factor=ow_factor)
+
+    s[CC].reorder(oc_chunk, oh_outer, ow_outer, ic_chunk, ic_block, oh_inner, ow_inner, oc_block)
+    s[CC].fuse(oc_chunk, oh_outer)
+    s[CC].vectorize(oc_block)
+
+    s[CC].unroll(ow_inner)
+    s[CC].unroll(oh_inner)
+
+    if C != O:
+        batch, oc_chunk, oh, ow, oc_block = s[O].op.axis
+        oh_outer, oh_inner = s[O].split(oh, factor=oh_factor)
+        ow_outer, ow_inner = s[O].split(ow, factor=ow_factor)
+        s[O].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block)
+
+        parallel_axis = s[O].fuse(oc_chunk, oh_outer)
+        s[C].compute_at(s[O], parallel_axis)
+        s[O].vectorize(oc_block)
+        s[O].parallel(parallel_axis)
+
+    return s
+
+
+def _schedule_conv_NCHWc_int8(s, cfg, data, conv_out, last):
+    """
+    Defines the schedule for INT8 for intel machines
+    Uses the Intel intrinsics to use INT8 operations
+    More details - https://software.intel.com/en-us/articles/
+    lower-numerical-precision-deep-learning-inference-and-training
+    """
+    target = tvm.target.current_target(allow_none=False)
+    int32_lanes = -1
+    if check_skylake(target):
+        int32_lanes = 16
+    else:
+        return s
+    assert int32_lanes != -1
+
+    oh_factor, ow_factor = cfg["tile_oh"].val, cfg["tile_ow"].size[-1]
+    _, _, _, _, ic_bn = get_const_tuple(data.shape)
+    _, _, _, _, oc_bn = get_const_tuple(conv_out.shape)
 
-def _schedule_conv_NCHWc(s, wkl, sch, data, kernel, conv_out, last):
     # schedule data
     A = data
     if isinstance(s[A].op, tvm.tensor.ComputeOp):
@@ -192,8 +193,8 @@ def _schedule_conv_NCHWc(s, wkl, sch, data, kernel, conv_out, last):
     CC = s.cache_write(C, 'global')
 
     batch, oc_chunk, oh, ow, oc_block = s[C].op.axis
-    oh_outer, oh_inner = s[C].split(oh, factor=sch.oh_factor)
-    ow_outer, ow_inner = s[C].split(ow, factor=sch.ow_factor)
+    oh_outer, oh_inner = s[C].split(oh, factor=oh_factor)
+    ow_outer, ow_inner = s[C].split(ow, factor=ow_factor)
     s[C].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block)
     s[C].vectorize(oc_block)
 
@@ -203,24 +204,29 @@ def _schedule_conv_NCHWc(s, wkl, sch, data, kernel, conv_out, last):
         s[C].parallel(parallel_axis)
 
     _, oc_chunk, oh, ow, oc_block = s[CC].op.axis
-    ic, = s[CC].op.reduce_axis
+    kh, kw, ic_outer, ic_f_inner, ic_s_inner = s[CC].op.reduce_axis
 
-    ic_chunk, ic_block = s[CC].split(ic, factor=sch.ic_bn)
+    # Skylake and future processors have 16 vector lanes
+    assert oc_bn % int32_lanes == 0
 
-    oh_outer, oh_inner = s[CC].split(oh, factor=sch.oh_factor)
-    ow_outer, ow_inner = s[CC].split(ow, factor=sch.ow_factor)
+    oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=int32_lanes)
 
-    s[CC].reorder(oc_chunk, oh_outer, ow_outer, ic_chunk, ic_block, oh_inner, ow_inner, oc_block)
+    oh_outer, oh_inner = s[CC].split(oh, factor=oh_factor)
+    ow_outer, ow_inner = s[CC].split(ow, factor=ow_factor)
+
+    s[CC].reorder(oc_chunk, oh_outer, ow_outer, kh, kw, ic_outer, ic_f_inner, oh_inner,
+                  ow_inner, oc_f_inner, oc_s_inner, ic_s_inner)
     s[CC].fuse(oc_chunk, oh_outer)
-    s[CC].vectorize(oc_block)
 
+    pc = dot_16x1x16_int8_int8_int32()
+    s[CC].tensorize(oc_s_inner, pc)
     s[CC].unroll(ow_inner)
     s[CC].unroll(oh_inner)
 
     if C != O:
         batch, oc_chunk, oh, ow, oc_block = s[O].op.axis
-        oh_outer, oh_inner = s[O].split(oh, factor=sch.oh_factor)
-        ow_outer, ow_inner = s[O].split(ow, factor=sch.ow_factor)
+        oh_outer, oh_inner = s[O].split(oh, factor=oh_factor)
+        ow_outer, ow_inner = s[O].split(ow, factor=ow_factor)
         s[O].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block)
 
         parallel_axis = s[O].fuse(oc_chunk, oh_outer)
diff --git a/topi/python/topi/x86/conv2d_avx_common.py b/topi/python/topi/x86/conv2d_avx_common.py
index 8f8086fdebb4..1b8ee5fe9be4 100644
--- a/topi/python/topi/x86/conv2d_avx_common.py
+++ b/topi/python/topi/x86/conv2d_avx_common.py
@@ -1,21 +1,19 @@
 # pylint: disable=invalid-name,unused-variable,unused-argument,invalid-name
 """Conv2D schedule on for Intel CPU"""
 from __future__ import absolute_import as _abs
-from collections import namedtuple
 import tvm
+from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
 
+from ..nn.util import infer_pad
 from ..util import get_const_tuple
-from ..nn.conv2d import _get_schedule, _get_workload
-from ..nn.util import infer_pad, infer_stride
-from ..nn.pad import pad
+from .tensor_intrin import dot_16x1x16_int8_int8_int32
+from .check_targets import check_skylake
+from .util import get_fp32_len
 
-AVXConvCommonFwd = namedtuple('AVXConvCommonFwd', ['ic_bn', 'oc_bn', 'reg_n', 'unroll_kw'])
-
-
-def _get_default_schedule(wkl, simd_width):
+def _fallback_schedule(cfg, wkl):
+    simd_width = get_fp32_len()
     HPAD, WPAD = wkl.hpad, wkl.wpad
     HSTR, WSTR = wkl.hstride, wkl.wstride
-    out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1
     out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1
 
     oc_bn = 1
@@ -36,81 +34,20 @@ def _get_default_schedule(wkl, simd_width):
             reg_n = n
             break
 
-    return AVXConvCommonFwd(ic_bn, oc_bn, reg_n, False)
-
+    cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn])
+    cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn])
+    cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n])
+    cfg["unroll_kw"] = OtherOptionEntity(False)
 
-def _declaration_conv(data, kernel, stride, padding, layout, out_dtype):
-    out_dtype = data.dtype if out_dtype is None else out_dtype
-    assert layout == 'NCHW', "only support NCHW convolution for AVX"
-    wkl = _get_workload(data, kernel, stride, padding, out_dtype)
-    sch = _get_schedule(wkl)
 
-    HPAD, WPAD = wkl.hpad, wkl.wpad
-    HSTR, WSTR = wkl.hstride, wkl.wstride
+def _schedule_conv(s, cfg, data, data_pad, data_vec, kernel_vec, conv_out, output, last):
+    # fetch schedule
+    ic_bn, oc_bn, reg_n, unroll_kw = (cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1],
+                                      cfg["tile_ow"].size[-1], cfg["unroll_kw"].val)
 
-    batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
-    num_filter, _, kernel_height, kernel_width = get_const_tuple(kernel.shape)
-
-    pad_height = in_height + 2 * HPAD
-    pad_width = in_width + 2 * WPAD
-
-    out_height = (in_height + 2 * HPAD - kernel_height) // HSTR + 1
-    out_width = (in_width + 2 * WPAD - kernel_width) // WSTR + 1
-
-    # pack data
-    DOPAD = (HPAD != 0 or WPAD != 0)
-    if DOPAD:
-        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
-    else:
-        data_pad = data
-
-    shape = (batch_size, in_channel // sch.ic_bn, pad_height, sch.ic_bn, pad_width)
-    data_vec = tvm.compute(shape,
-                           lambda n, C, h, c, w: data_pad[n, C * sch.ic_bn + c, h, w],
-                           name='data_vec')
-
-    # pack kernel
-    shape = (num_filter//sch.oc_bn, in_channel//sch.ic_bn,
-             kernel_height, kernel_width, sch.ic_bn, sch.oc_bn)
-    kernel_vec = tvm.compute(shape, lambda CO, CI, h, w, ci, co:
-                             kernel[CO * sch.oc_bn + co, CI * sch.ic_bn + ci, h, w],
-                             name='kernel_vec')
-
-    # convolution
-    oshape = (batch_size, num_filter//sch.oc_bn, out_height, out_width, sch.oc_bn)
-    unpack_shape = (batch_size, num_filter, out_height, out_width)
-
-    ic = tvm.reduce_axis((0, in_channel), name='ic')
-    kh = tvm.reduce_axis((0, kernel_height), name='kh')
-    kw = tvm.reduce_axis((0, kernel_width), name='kw')
-
-    conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
-                       tvm.sum(data_vec[n, ic//sch.ic_bn, oh*HSTR+kh, ic%sch.ic_bn, ow*WSTR+kw]
-                               .astype(out_dtype) *
-                               kernel_vec[oc_chunk, ic//sch.ic_bn, kh, kw, ic%sch.ic_bn, oc_block]
-                               .astype(out_dtype),
-                               axis=[ic, kh, kw]),
-                       name='conv')
-
-    unpack = tvm.compute(unpack_shape,
-                         lambda n, c, h, w: conv[n, c // sch.oc_bn, h, w, c % sch.oc_bn]
-                         .astype(out_dtype),
-                         name='output_unpack',
-                         tag='conv2d_nchw')
-    return unpack
-
-
-def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, output, last):
     # no stride and padding info here
     padding = infer_pad(data, data_pad)
-    if data_pad is None:
-        stride = infer_stride(data, kernel, output)
-    else:
-        stride = infer_stride(data_pad, kernel, output)
-    wkl = _get_workload(data, kernel, stride, padding, output.dtype)
-    sch = _get_schedule(wkl)
-
-    HPAD, WPAD = wkl.hpad, wkl.wpad
+    HPAD, WPAD = padding
     DOPAD = (HPAD != 0 or WPAD != 0)
 
     A, W = data, kernel_vec
@@ -126,7 +63,7 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou
     # schedule kernel pack
     oc_chunk, ic_chunk, oh, ow, ic_block, oc_block = s[W].op.axis
     s[W].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block)
-    if sch.oc_bn > 1:
+    if oc_bn > 1:
         s[W].vectorize(oc_block)
     parallel_axis = s[W].fuse(oc_chunk, oh)
     s[W].parallel(parallel_axis)
@@ -136,7 +73,7 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou
     CC = s.cache_write(C, 'global')
 
     _, oc_chunk, oh, ow, oc_block = s[C].op.axis
-    ow_chunk, ow_block = s[C].split(ow, factor=sch.reg_n)
+    ow_chunk, ow_block = s[C].split(ow, factor=reg_n)
     s[C].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
     s[C].fuse(oc_chunk, oh)
     s[C].vectorize(oc_block)
@@ -145,10 +82,10 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou
     _, oc_chunk, oh, ow, oc_block = s[CC].op.axis
     ic, kh, kw = s[CC].op.reduce_axis
 
-    ow_chunk, ow_block = s[CC].split(ow, factor=sch.reg_n)
-    ic_chunk, ic_block = s[CC].split(ic, factor=sch.ic_bn)
+    ow_chunk, ow_block = s[CC].split(ow, factor=reg_n)
+    ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn)
 
-    if sch.unroll_kw:
+    if unroll_kw:
         s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw, ow_block, oc_block)
         s[CC].unroll(kw)
     else:
@@ -162,8 +99,8 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou
         s[O0].compute_inline()
 
     batch, oc, oh, ow = s[O].op.axis
-    ow_chunk, ow_block = s[O].split(ow, factor=sch.reg_n)
-    oc_chunk, oc_block = s[O].split(oc, factor=sch.oc_bn)
+    ow_chunk, ow_block = s[O].split(ow, factor=reg_n)
+    oc_chunk, oc_block = s[O].split(oc, factor=oc_bn)
     s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
     parallel_axis = s[O].fuse(oc_chunk, oh)
     s[C].compute_at(s[O], parallel_axis)
@@ -174,43 +111,86 @@ def _schedule_conv(s, data, data_pad, data_vec, kernel, kernel_vec, conv_out, ou
     return s
 
 
-def _declaration_conv_NCHWc(wkl, sch, data, kernel):
-    out_dtype = wkl.out_dtype
-    HPAD, WPAD = wkl.hpad, wkl.wpad
-    HSTR, WSTR = wkl.hstride, wkl.wstride
+def _schedule_conv_NCHWc(s, cfg, data, conv_out, last):
+    # fetch schedule
+    reg_n, unroll_kw = cfg["tile_ow"].size[-1], cfg["unroll_kw"].val
+    _, _, _, _, ic_bn = get_const_tuple(data.shape)
 
-    batch_size = data.shape[0]
-    out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1
-    out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1
+    # schedule data
+    A = data
+    if isinstance(s[A].op, tvm.tensor.ComputeOp):
+        batch, ic_chunk, ih, iw, ic_block = s[A].op.axis
+        parallel_axis = s[A].fuse(ic_chunk, ih)
+        s[A].parallel(parallel_axis)
 
-    # pack data
-    DOPAD = (HPAD != 0 or WPAD != 0)
-    if DOPAD:
-        data_pad = pad(data, (0, 0, HPAD, WPAD, 0), name="data_pad")
+    # schedule 5-D NCHW[x]c conv
+    C, O = conv_out, last
+    CC = s.cache_write(C, 'global')
+
+    _, oc_chunk, oh, ow, oc_block = s[C].op.axis
+    ow_chunk, ow_block = s[C].split(ow, factor=reg_n)
+    s[C].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
+    parallel_axis = s[C].fuse(oc_chunk, oh)
+    s[C].vectorize(oc_block)
+    if C == O:
+        s[C].parallel(parallel_axis)
+
+    s[CC].compute_at(s[C], ow_chunk)
+    _, oc_chunk, oh, ow, oc_block = s[CC].op.axis
+    ic, kh, kw = s[CC].op.reduce_axis
+
+    ow_chunk, ow_block = s[CC].split(ow, factor=reg_n)
+    ic_chunk, ic_block = s[CC].split(ic, factor=ic_bn)
+
+    if unroll_kw:
+        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw, ow_block, oc_block)
+        s[CC].unroll(kw)
     else:
-        data_pad = data
+        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block, ow_block, oc_block)
 
-    # convolution
-    oshape = (batch_size, wkl.out_filter//sch.oc_bn, out_height, out_width, sch.oc_bn)
+    s[CC].vectorize(oc_block)
+    s[CC].unroll(ow_block)
 
-    ic = tvm.reduce_axis((0, wkl.in_filter), name='ic')
-    kh = tvm.reduce_axis((0, wkl.hkernel), name='kh')
-    kw = tvm.reduce_axis((0, wkl.wkernel), name='kw')
+    if C != O:
+        batch, oc_chunk, oh, ow, oc_block = s[O].op.axis
+        ow_chunk, ow_block = s[O].split(ow, factor=reg_n)
+        s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
+        parallel_axis = s[O].fuse(oc_chunk, oh)
+        s[C].compute_at(s[O], parallel_axis)
+        s[O].vectorize(oc_block)
+        s[O].parallel(parallel_axis)
 
-    conv = tvm.compute(oshape, lambda n, oc_chunk, oh, ow, oc_block:
-                       tvm.sum(data_pad[n, ic//sch.ic_bn, oh*HSTR+kh, ow*WSTR+kw, ic%sch.ic_bn]
-                               .astype(out_dtype) *
-                               kernel[oc_chunk, ic//sch.ic_bn, kh, kw, ic%sch.ic_bn, oc_block],
-                               axis=[ic, kh, kw]), name='conv2d_NCHWc', tag="conv2d_NCHWc")
+    return s
 
-    return conv
 
+def _schedule_conv_NCHWc_int8(s, cfg, data, conv_out, last):
+    """
+    Defines the schedule for INT8 for intel machines
+    Uses the Intel intrinsics to use INT8 operations
+    More details - https://software.intel.com/en-us/articles/
+    lower-numerical-precision-deep-learning-inference-and-training
+    """
+
+    # Currently INT8 operations are supported for only Skylake
+    # In future the _intrin_reduce4int8 will be updated for VNNI instructions
+    # In case of unsupported target, the schedule will go to the original
+    # compute
+
+    target = tvm.target.current_target(allow_none=False)
+    int32_lanes = -1
+    if check_skylake(target):
+        int32_lanes = 16
+    else:
+        return s
+    assert int32_lanes != -1
+
+    reg_n, unroll_kw = cfg["tile_ow"].size[-1], cfg["unroll_kw"].val
+    _, _, _, _, ic_bn = get_const_tuple(data.shape)
+    _, _, _, _, oc_bn = get_const_tuple(conv_out.shape)
 
-def _schedule_conv_NCHWc(s, wkl, sch, data, kernel, conv_out, last):
-    # schedule data
     A = data
     if isinstance(s[A].op, tvm.tensor.ComputeOp):
-        batch, ic_chunk, ih, iw, ic_block = s[A].op.axis
+        batch, ic_chunk, ih, iw, _ = s[A].op.axis
         parallel_axis = s[A].fuse(ic_chunk, ih)
         s[A].parallel(parallel_axis)
 
@@ -219,7 +199,7 @@ def _schedule_conv_NCHWc(s, wkl, sch, data, kernel, conv_out, last):
     CC = s.cache_write(C, 'global')
 
     _, oc_chunk, oh, ow, oc_block = s[C].op.axis
-    ow_chunk, ow_block = s[C].split(ow, factor=sch.reg_n)
+    ow_chunk, ow_block = s[C].split(ow, factor=reg_n)
     s[C].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
     parallel_axis = s[C].fuse(oc_chunk, oh)
     s[C].vectorize(oc_block)
@@ -228,23 +208,32 @@ def _schedule_conv_NCHWc(s, wkl, sch, data, kernel, conv_out, last):
 
     s[CC].compute_at(s[C], ow_chunk)
     _, oc_chunk, oh, ow, oc_block = s[CC].op.axis
-    ic, kh, kw = s[CC].op.reduce_axis
+    kh, kw, ic_outer, ic_f_inner, ic_s_inner = s[CC].op.reduce_axis
 
-    ow_chunk, ow_block = s[CC].split(ow, factor=sch.reg_n)
-    ic_chunk, ic_block = s[CC].split(ic, factor=sch.ic_bn)
+    ow_chunk, ow_block = s[CC].split(ow, factor=reg_n)
 
-    if sch.unroll_kw:
-        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, ic_block, kw, ow_block, oc_block)
+    # Skylake and future processors have 16 vector lanes
+    assert oc_bn % int32_lanes == 0
+
+    oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=int32_lanes)
+
+    if unroll_kw:
+        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_outer, kh, ic_f_inner, kw,
+                      ow_block, oc_f_inner, oc_s_inner, ic_s_inner)
         s[CC].unroll(kw)
     else:
-        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_chunk, kh, kw, ic_block, ow_block, oc_block)
+        s[CC].reorder(oc_chunk, oh, ow_chunk, ic_outer, kh, kw, ic_f_inner,
+                      ow_block, oc_f_inner, oc_s_inner, ic_s_inner)
 
-    s[CC].vectorize(oc_block)
+
+    pc = dot_16x1x16_int8_int8_int32()
+    s[CC].tensorize(oc_s_inner, pc)
     s[CC].unroll(ow_block)
+    s[CC].unroll(oc_f_inner)
 
     if C != O:
         batch, oc_chunk, oh, ow, oc_block = s[O].op.axis
-        ow_chunk, ow_block = s[O].split(ow, factor=sch.reg_n)
+        ow_chunk, ow_block = s[O].split(ow, factor=reg_n)
         s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
         parallel_axis = s[O].fuse(oc_chunk, oh)
         s[C].compute_at(s[O], parallel_axis)
diff --git a/topi/python/topi/x86/depthwise_conv2d.py b/topi/python/topi/x86/depthwise_conv2d.py
new file mode 100644
index 000000000000..64858df91cdc
--- /dev/null
+++ b/topi/python/topi/x86/depthwise_conv2d.py
@@ -0,0 +1,203 @@
+# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
+"""Depthwise Conv2D schedule on x86"""
+import tvm
+from tvm import autotvm
+from tvm.autotvm.task import get_config
+from tvm.autotvm.task.space import SplitEntity
+from tvm.autotvm.task.topi_integration import deserialize_args
+from .. import generic, tag
+from ..nn.pad import pad
+from ..util import get_const_tuple
+from ..nn.util import get_pad_tuple
+from ..nn.depthwise_conv2d import depthwise_conv2d_NCHWc, _get_workload
+
+from .util import get_fp32_len
+
+def _fallback_schedule(cfg, wkl):
+    """
+    Get default schedule for the workload
+    Parameters
+    ----------
+    cfg : tvm.autotvm.task.space.FallbackConfigEntity
+        Fallback config to be updated
+    wkl : topi.nn.depthwise_conv2d.Workload
+        Convolution workload
+    """
+    simd_width = get_fp32_len()
+
+    HPAD, WPAD = wkl.hpad, wkl.wpad
+    HSTR, WSTR = wkl.hstride, wkl.wstride
+    out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1
+    out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1
+
+    oc_bn = 1
+    for bn in range(simd_width, 0, -1):
+        if wkl.out_filter % bn == 0:
+            oc_bn = bn
+            break
+
+    ic_bn = 1
+    for bn in range(oc_bn, 0, -1):
+        if wkl.in_filter % bn == 0:
+            ic_bn = bn
+            break
+
+    reg_n = 1
+    for n in range(31, 0, -1):
+        if out_width % n == 0:
+            reg_n = n
+            break
+
+    cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn])
+    cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn])
+    cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n])
+
+
+@autotvm.register_topi_compute(depthwise_conv2d_NCHWc, 'cpu', 'direct')
+def _depthwise_conv2d_NCHWc_cpu(cfg, data, kernel, strides, padding, dilation,
+                                layout, out_layout, out_dtype=None):
+    out_dtype = data.dtype if out_dtype is None else out_dtype
+    batch, in_channel_chunk, in_height, in_width, in_channel_block = get_const_tuple(data.shape)
+    out_channel_chunk, filter_height, filter_width, out_channel_block \
+        = get_const_tuple(kernel.shape)
+
+    strides = strides if isinstance(strides, (tuple, list)) else (strides, strides)
+    HSTR, WSTR = strides
+    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (filter_height, filter_width))
+
+    dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)
+    assert (dh, dw) == (1, 1), "Does not support dilation"
+
+    in_channel = in_channel_chunk * in_channel_block
+    out_channel = out_channel_chunk * out_channel_block
+    channel_multiplier = out_channel // in_channel
+
+    out_height = (in_height - filter_height + pad_top + pad_down) // HSTR + 1
+    out_width = (in_width - filter_width + pad_left + pad_right) // WSTR + 1
+
+    # get workload and related schedule config
+    wkl = _get_workload(tvm.placeholder((batch, in_channel, in_height, in_width), dtype=data.dtype),
+                        tvm.placeholder((out_channel, in_channel, filter_height, filter_width),
+                                        dtype=kernel.dtype),
+                        strides, padding, out_dtype)
+    if cfg.is_fallback:
+        _fallback_schedule(cfg, wkl)
+
+    # padding stage
+    DOPAD = (pad_top != 0 or pad_left != 0 or pad_down != 0 or pad_right != 0)
+    if DOPAD:
+        pad_before = [0, 0, pad_top, pad_left, 0]
+        pad_after = [0, 0, pad_down, pad_right, 0]
+        data_pad = pad(data, pad_before, pad_after, name="PaddedInput")
+    else:
+        data_pad = data
+
+    # depthconv stage
+    kh = tvm.reduce_axis((0, filter_height), name='kh')
+    kw = tvm.reduce_axis((0, filter_width), name='kw')
+    Output = tvm.compute(
+        (batch, out_channel_chunk, out_height, out_width, out_channel_block),
+        lambda b, oco, oh, ow, oci: tvm.sum(
+            (data_pad[b, (oco * out_channel_block + oci) // channel_multiplier // in_channel_block,
+                      oh*HSTR+kh, ow*WSTR+kw,
+                      ((oco * out_channel_block + oci) // channel_multiplier) % in_channel_block]
+             .astype(out_dtype) *
+             kernel[oco, kh, kw, oci].astype(out_dtype)),
+            axis=[kh, kw]),
+        name='DepthwiseConv2d', tag="depthwise_conv2d_NCHWc")
+    return Output
+
+
+@autotvm.register_topi_schedule(generic.schedule_depthwise_conv2d_NCHWc, 'cpu', ['direct'])
+def schedule_depthwise_conv2d_NCHWc(cfg, outs):
+    """CPU schedule for depthwise conv2d in NCHW[x]c layout"""
+    s = tvm.create_schedule([x.op for x in outs])
+    scheduled_ops = []
+    def traverse(op):
+        """Traverse operators from computation graph"""
+        # inline all one-to-one-mapping operators except the last stage (output)
+        if tag.is_broadcast(op.tag):
+            if op not in s.outputs:
+                s[op].compute_inline()
+            for tensor in op.input_tensors:
+                if tensor.op.input_tensors and tensor.op not in scheduled_ops:
+                    traverse(tensor.op)
+        if 'depthwise_conv2d_NCHWc' in op.tag:
+            conv_out = op.output(0)
+            data = conv_out.op.input_tensors[0]
+            kernel = conv_out.op.input_tensors[1]
+            _schedule_depthwise_conv2d_NCHWc_impl(s, cfg, data, kernel, conv_out, outs[0])
+        scheduled_ops.append(op)
+    traverse(outs[0].op)
+    return s
+
+def _schedule_depthwise_conv2d_NCHWc_impl(s, cfg, data, kernel, conv_out, output):
+    tile_ow = cfg["tile_ow"].size[-1]
+    # schedule data
+    A = data
+    if isinstance(s[A].op, tvm.tensor.ComputeOp):
+        batch, ic_chunk, ih, iw, ic_block = s[A].op.axis
+        p = s[A].fuse(ic_chunk, ih)
+        s[A].parallel(p)
+
+    C, O = conv_out, output
+    CC = s.cache_write(C, 'global')
+
+    _, ic_chunk, oh, ow, ic_block = s[C].op.axis
+    ow_chunk, ow_block = s[C].split(ow, factor=tile_ow)
+    s[C].reorder(ic_chunk, oh, ow_chunk, ow_block, ic_block)
+    parallel_axis = s[C].fuse(ic_chunk, oh)
+    s[C].parallel(parallel_axis)
+    s[CC].compute_at(s[C], ow_chunk)
+
+    _, ic_chunk, oh, ow, ic_block = s[CC].op.axis
+    kh, kw = s[CC].op.reduce_axis
+    ow_chunk, ow_block = s[CC].split(ow, factor=tile_ow)
+    s[CC].reorder(ic_chunk, oh, kh, kw, ow_block, ic_block)
+    s[CC].vectorize(ic_block)
+    s[CC].unroll(ow_block)
+
+    if C != O:
+        batch, oc_chunk, oh, ow, oc_block = s[O].op.axis
+        ow_chunk, ow_block = s[O].split(ow, factor=tile_ow)
+        s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
+        parallel_axis = s[O].fuse(oc_chunk, oh)
+        s[C].compute_at(s[O], parallel_axis)
+        s[O].vectorize(oc_block)
+        s[O].parallel(parallel_axis)
+    return s
+
+
+@autotvm.task.register("topi_x86_depthwise_conv2d_NCHWc_from_nchw")
+def _topi_nn_depthwise_conv2d_NCHWc(*args, **kwargs):
+    assert not kwargs, "Do not support kwargs in template function call"
+    data, kernel, strides, padding, dilation, dtype = deserialize_args(args)
+
+    batch, in_channel, height, width = get_const_tuple(data.shape)
+    filter_channel, channel_multiplier, kh, kw = get_const_tuple(kernel.shape)
+    ph, pw = padding if isinstance(padding, (tuple, list)) else (padding, padding)
+    sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides)
+    out_height = (height - kh + 2 * ph) // sh + 1
+    out_width = (width - kw + 2 * pw) // sw + 1
+    out_channel = filter_channel * channel_multiplier
+
+    # get config here
+    cfg = get_config()
+    cfg.define_split("tile_ic", in_channel, num_outputs=2)
+    cfg.define_split("tile_oc", out_channel, num_outputs=2)
+    cfg.define_split("tile_ow", out_width, num_outputs=2, filter=lambda y: y.size[-1] <= 64)
+
+    # change shape with the value in config
+    ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
+    new_data_shape = (batch, in_channel // ic_bn, height, width, ic_bn)
+    new_kernel_shape = (out_channel // oc_bn, kh, kw, oc_bn)
+    new_data = tvm.placeholder(new_data_shape, data.dtype)
+    new_kernel = tvm.placeholder(new_kernel_shape, kernel.dtype)
+
+    data_layout = "NCHW%dc" % ic_bn
+    out_layout = "NCHW%dc" % oc_bn
+
+    C = _depthwise_conv2d_NCHWc_cpu(cfg, new_data, new_kernel, strides, padding, dilation,
+                                    data_layout, out_layout, dtype)
+    s = schedule_depthwise_conv2d_NCHWc(cfg, [C])
+    return s, [new_data, new_kernel, C]
diff --git a/topi/python/topi/x86/injective.py b/topi/python/topi/x86/injective.py
index b43ebb98b82f..06847bf9f427 100644
--- a/topi/python/topi/x86/injective.py
+++ b/topi/python/topi/x86/injective.py
@@ -29,6 +29,52 @@ def schedule_injective(outs):
     elif len(s[x].op.axis) >= 3:
         fused = s[x].fuse(s[x].op.axis[0], s[x].op.axis[1])
         s[x].parallel(fused)
+    elif len(s[x].op.axis) >= 1:
+        s[x].parallel(s[x].op.axis[0])
+    return s
+
+@generic.schedule_concatenate.register(["cpu"])
+def schedule_concatenate(outs):
+    """X86 schedule for concatenate op.
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of injective in the format
+          of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    def vectorize(sch, tensor, vectorize_limit):
+        """Internal vectorization function for concatenate."""
+        inner_axis = s[tensor].op.axis[len(s[tensor].op.axis) - 1]
+        inner_length = tensor.shape[len(tensor.shape) - 1].value
+        if inner_length <= vectorize_limit:
+            sch[tensor].vectorize(inner_axis)
+        else:
+            split_factor = 1
+            for i in range(vectorize_limit, 1, -1):
+                if inner_length % i == 0:
+                    split_factor = i
+                    break
+            if split_factor > 1:
+                _, inner_i = sch[tensor].split(inner_axis, split_factor)
+                sch[tensor].vectorize(inner_i)
+
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    x = outs[0]
+    s = tvm.create_schedule([x.op for x in outs])
+    tvm.schedule.AutoInlineInjective(s)
+    if len(s[x].op.axis) >= 5:
+        fused = s[x].fuse(s[x].op.axis[0], s[x].op.axis[1], s[x].op.axis[2])
+        vectorize(s, x, 64)
+        s[x].parallel(fused)
+    elif len(s[x].op.axis) >= 3:
+        fused = s[x].fuse(s[x].op.axis[0], s[x].op.axis[1])
+        s[x].parallel(fused)
     else:
         s[x].parallel(s[x].op.axis[0])
     return s
diff --git a/topi/python/topi/x86/nn.py b/topi/python/topi/x86/nn.py
index 03e07222c420..6802d4c01e60 100644
--- a/topi/python/topi/x86/nn.py
+++ b/topi/python/topi/x86/nn.py
@@ -2,8 +2,9 @@
 """x86 nn operators"""
 from __future__ import absolute_import as _abs
 import tvm
+
 from .. import generic
-from .. import tag
+from ..util import traverse_inline
 
 @generic.schedule_softmax.register(["cpu"])
 def schedule_softmax(outs):
@@ -53,44 +54,38 @@ def schedule_dense(outs):
 
     outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
     s = tvm.create_schedule([x.op for x in outs])
-    scheduled_ops = []
-
-    def traverse(op):
-        """Traverse operators from computation graph"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_broadcast(op.tag):
-            if op not in s.outputs:
-                s[op].compute_inline()
-            for tensor in op.input_tensors:
-                if tensor.op.input_tensors and tensor.op not in scheduled_ops:
-                    traverse(tensor.op)
 
+    def _callback(op):
         if 'dense' in op.tag:
-            C = op.output(0)
-            x, y = C.op.axis
+            output = outs[0]
+            dense = op.output(0)
 
             # Write cache for blocks
-            CC = s.cache_write(C, 'global')
+            if dense.op in s.outputs:
+                CC = s.cache_write(dense, 'local')
+            else:
+                CC = dense
 
             # Tile
             bnx = 1
             bny = 4
-            _, yo, _, yi = s[C].tile(x, y, bnx, bny)
-            s[CC].compute_at(s[C], yo)
+            x, y = output.op.axis
+            xo, yo, xi, yi = s[output].tile(x, y, bnx, bny)
+
             xc, yc = s[CC].op.axis
             k, = s[CC].op.reduce_axis
             ko, ki = s[CC].split(k, factor=4)
             s[CC].reorder(ko, xc, ki, yc)
+
             s[CC].unroll(ki)
             s[CC].vectorize(yc)
 
-            # Vectorization
-            s[C].vectorize(yi)
-
-            # Parallelization
-            s[C].parallel(yo)
+            s[output].unroll(xi)
+            s[output].vectorize(yi)
 
-        scheduled_ops.append(op)
+            fused = s[output].fuse(xo, yo)
+            s[output].parallel(fused)
+            s[CC].compute_at(s[output], fused)
 
-    traverse(outs[0].op)
+    traverse_inline(s, outs[0].op, _callback)
     return s
diff --git a/topi/python/topi/x86/pooling.py b/topi/python/topi/x86/pooling.py
index 998edf7a0e16..5fce5f32afb6 100644
--- a/topi/python/topi/x86/pooling.py
+++ b/topi/python/topi/x86/pooling.py
@@ -4,19 +4,47 @@
 from .. import generic
 from .. import tag
 
-def _parallel_sch(sch):
+def _parallel_sch(sch, oshape, do_vectorize=False):
+    def vectorize(fused_axis, num_parallel_axis, vectorize_limit=64):
+        """Internal vectorization utility function."""
+        reorder_axis = [fused_axis]
+        for i in range(num_parallel_axis, len(sch.op.axis) - 1):
+            reorder_axis.append(sch.op.axis[i])
+        kw, kh = sch.op.reduce_axis
+        fuse_k = sch.fuse(kw, kh)
+        c = sch.op.axis[len(sch.op.axis) - 1]
+        reorder_axis += [fuse_k, c]
+        sch.reorder(*reorder_axis)
+        inner_length = oshape[len(oshape) - 1].value
+        if inner_length <= vectorize_limit:
+            sch.vectorize(c)
+        else:
+            split_factor = 1
+            for i in range(vectorize_limit, 1, -1):
+                if inner_length % i == 0:
+                    split_factor = i
+                    break
+            if split_factor > 1:
+                _, c_i = sch.split(c, split_factor)
+                sch.vectorize(c_i)
+
     if len(sch.op.axis) >= 5:
         fused = sch.fuse(sch.op.axis[0], sch.op.axis[1], sch.op.axis[2])
-        sch.parallel(fused)
+        if do_vectorize:
+            vectorize(fused, 3)
+
     elif len(sch.op.axis) >= 3:
         fused = sch.fuse(sch.op.axis[0], sch.op.axis[1])
-        sch.parallel(fused)
+        if do_vectorize:
+            vectorize(fused, 2)
     else:
         sch.parallel(sch.op.axis[0])
+        return
+    sch.parallel(fused)
 
 
 @generic.schedule_pool.register(["cpu"])
-def schedule_pool(outs):
+def schedule_pool(outs, layout):
     """Schedule for pool
 
     Parameters
@@ -25,6 +53,9 @@ def schedule_pool(outs):
           The computation graph description of pool
           in the format of an array of tensors.
 
+    layout: str
+        Data layout.
+
     Returns
     -------
     sch: Schedule
@@ -37,7 +68,8 @@ def schedule_pool(outs):
     def _schedule(PaddedInput, Pool):
         if isinstance(PaddedInput.op, tvm.tensor.ComputeOp):
             s[PaddedInput].compute_inline()
-        _parallel_sch(s[Pool])
+        do_vectorize = layout[-1] not in "HWhw"
+        _parallel_sch(s[Pool], outs[0].shape, do_vectorize)
 
     def traverse(OP):
         """Internal travserse function"""
@@ -93,7 +125,7 @@ def traverse(OP):
         # schedule pool
         elif OP.tag.startswith('global_pool'):
             Pool = OP.output(0)
-            _parallel_sch(s[Pool])
+            _parallel_sch(s[Pool], outs[0].shape)
         else:
             raise RuntimeError("Unsupported operator: %s" % OP.tag)
 
diff --git a/topi/python/topi/x86/tensor_intrin.py b/topi/python/topi/x86/tensor_intrin.py
new file mode 100644
index 000000000000..28e57f1c10f8
--- /dev/null
+++ b/topi/python/topi/x86/tensor_intrin.py
@@ -0,0 +1,84 @@
+"""Core kernel of dot product of 4 Int8 operations"""
+#pylint: disable=invalid-name
+import tvm
+
+
+def dot_16x1x16_int8_int8_int32():
+    """
+    Int8 dot product by every 4 elements using AVX2 Skylake instructions.
+    This function takes two arrays of int8 datatype -- data[4] and
+    kernel[16][4] -- and computes a dot product of data[4] with every
+    4 elements of kernels, resulting in output[16] of int32 datatype.
+    The pseudo code is as follows.
+    .. code-block:: c
+        void dot_16x1x16_int8_int8_int32(int8 data[4], int8 kernel[16][4],
+                int32 output[16]){
+            for (int i = 0; i < 16; i++){
+                out[i] = 0;
+                for (int k = 0; k < 4; k++){
+                    out[i] += data[k] * kernel[i][k]
+                }
+            }
+        }
+
+    Physically, the kernel array sits in an AVX512 vector register and
+    the data[4] is broadcasted to another AVX512 vector register. This
+    function returns a TensorIntrin that can be used to tensorize
+    a schedule.
+
+    Returns
+    -------
+    intrin : TensorIntrin
+        The Skylake int8 TensorIntrin that can be used in tensorizing schedule
+    """
+
+    int32_lanes = 16 # 16 int32 lanes in AVX512
+    num_int8_elements = 4 # 4 int8 elements in int32
+    data = tvm.placeholder((num_int8_elements,), dtype='uint8', name='data')
+    kernel = tvm.placeholder((int32_lanes, num_int8_elements), dtype='int8', name='kernel')
+    k = tvm.reduce_axis((0, num_int8_elements), name='k')
+    C = tvm.compute((int32_lanes,),
+                    lambda i: tvm.sum(data[k].astype('int32') *
+                                      kernel[i, k].astype('int32'),
+                                      axis=k),
+                    name="C")
+
+    a_buffer = tvm.decl_buffer(data.shape, dtype='uint8', name="a_buffer",
+                               offset_factor=1,
+                               strides=[1])
+    b_buffer = tvm.decl_buffer(kernel.shape, dtype='int8', name="b_buffer",
+                               offset_factor=1,
+                               strides=[tvm.var('ldw'), 1])
+
+    def _intrin_func(ins, outs):
+        def _instr(index):
+            ib = tvm.ir_builder.create()
+            if index == 1:
+                ib.emit(outs[0].vstore(0, tvm.const(0, 'int32x16')))
+                return ib.get()
+
+            a_int8 = ins[0].vload([0], "uint8x4")
+            re_int32 = tvm.call_pure_intrin('int32', 'reinterpret', a_int8)
+            vec_ai32 = re_int32.astype('int32x16')
+            vec_a = tvm.call_pure_intrin('int8x64', 'reinterpret', vec_ai32)
+            vec_b = ins[1].vload([0, 0], "int8x64")
+            vec_one = tvm.const(1, "int16x32")
+            pair_reduction = tvm.call_llvm_intrin('int16x32',
+                                                  'llvm.x86.avx512.pmaddubs.w.512',
+                                                  tvm.const(0, 'uint32'),
+                                                  vec_a, vec_b)
+            quad_reduction = tvm.call_llvm_intrin('int32x16',
+                                                  'llvm.x86.avx512.pmaddw.d.512',
+                                                  tvm.const(0, 'uint32'),
+                                                  pair_reduction, vec_one)
+            if index == 0:
+                ib.emit(outs[0].vstore(0, quad_reduction))
+            else:
+                ib.emit(outs[0].vstore(0, quad_reduction + outs[0].vload([0], 'int32x16')))
+            return ib.get()
+
+        # body, reset, update
+        return _instr(0), _instr(1), _instr(2)
+
+    with tvm.build_config(offset_factor=1, partition_const_loop=True):
+        return tvm.decl_tensor_intrin(C.op, _intrin_func, binds={data:a_buffer, kernel:b_buffer})
diff --git a/topi/python/topi/x86/util.py b/topi/python/topi/x86/util.py
new file mode 100644
index 000000000000..678ff8e24cff
--- /dev/null
+++ b/topi/python/topi/x86/util.py
@@ -0,0 +1,12 @@
+"""Common x86 related utilities"""
+from __future__ import absolute_import as _abs
+import tvm
+
+def get_fp32_len():
+    fp32_vec_len = 8
+    target = tvm.target.current_target()
+    if target is not None:
+        for opt in target.options:
+            if opt == '-mcpu=skylake-avx512':
+                fp32_vec_len = 16
+    return fp32_vec_len
diff --git a/topi/recipe/broadcast/test_broadcast_map.py b/topi/recipe/broadcast/test_broadcast_map.py
index 9c4e521ddd0d..11a4a34647db 100644
--- a/topi/recipe/broadcast/test_broadcast_map.py
+++ b/topi/recipe/broadcast/test_broadcast_map.py
@@ -48,7 +48,7 @@ def test_broadcast_to(in_shape, out_shape):
     out_nd = tvm.nd.array(np.empty(out_shape).astype(B.dtype), tvm.gpu())
     for _ in range(2):
         fcuda(data_nd, out_nd)
-    np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+    tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
 
 def test_broadcast_binary_op(lhs_shape, rhs_shape, typ="add"):
@@ -95,7 +95,7 @@ def test_broadcast_binary_op(lhs_shape, rhs_shape, typ="add"):
     out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), tvm.gpu())
     for _ in range(2):
         fcuda(lhs_nd, rhs_nd, out_nd)
-    np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+    tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
 
 if __name__ == "__main__":
diff --git a/topi/recipe/conv/depthwise_conv2d_test.py b/topi/recipe/conv/depthwise_conv2d_test.py
index d02f088e989a..cce36517a5ea 100644
--- a/topi/recipe/conv/depthwise_conv2d_test.py
+++ b/topi/recipe/conv/depthwise_conv2d_test.py
@@ -106,9 +106,9 @@ def check_device(device):
         for c in range(in_channel * channel_multiplier):
             scale_shift_scipy[:,c,:,:] = depthwise_conv2d_scipy[:,c,:,:] * scale_np[c] + shift_np[c]
         relu_scipy = np.maximum(scale_shift_scipy, 0)
-        np.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5)
-        np.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5)
-        np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
+        tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5)
+        tvm.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5)
+        tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
         print("success")
 
     for device in ['cuda', 'opencl', 'rocm']:
@@ -195,9 +195,9 @@ def check_device(device):
         for c in range(in_channel * channel_multiplier):
             scale_shift_scipy[:,:,:,c] = depthwise_conv2d_scipy[:,:,:,c] * scale_np[c] + shift_np[c]
         relu_scipy = np.maximum(scale_shift_scipy, 0)
-        np.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5)
-        np.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5)
-        np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
+        tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5)
+        tvm.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5)
+        tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
         print("success")
 
     for device in ['cuda', 'opencl', 'rocm']:
diff --git a/topi/recipe/conv/test_conv2d_hwcn_map.py b/topi/recipe/conv/test_conv2d_hwcn_map.py
index 8c8471d7af9c..c92dcc5d8fe7 100644
--- a/topi/recipe/conv/test_conv2d_hwcn_map.py
+++ b/topi/recipe/conv/test_conv2d_hwcn_map.py
@@ -64,10 +64,10 @@ def check_device(device):
                               unroll_explicit=device == 'rocm'):
             func1 = tvm.build(s1, [A, W, B], device)
             func1(a, w, b)
-            np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+            tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
             func2 = tvm.build(s2, [A, W, C], device)
             func2(a, w, c)
-            np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+            tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
 
     for device in ['cuda', 'opencl', 'rocm']:
         check_device(device)
diff --git a/topi/recipe/conv/test_conv_int8_intel.py b/topi/recipe/conv/test_conv_int8_intel.py
new file mode 100644
index 000000000000..593f913db15d
--- /dev/null
+++ b/topi/recipe/conv/test_conv_int8_intel.py
@@ -0,0 +1,134 @@
+#pylint: disable-msg=too-many-arguments, too-many-locals, assignment-from-no-return
+""" Conv Int8 functional and performance testing"""
+import sys
+import logging
+import numpy as np
+import tvm
+import topi
+
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+LOGGER = logging.getLogger('test_conv_int8_intel')
+LOGGER.disabled = False
+
+# All the WORKLOADS from Resnet except first layer
+# Workload is ['height', 'width', 'in_filter', 'out_filter',
+#              'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride'])
+WORKLOADS = [(56, 56, 64, 64, 3, 3, 1, 1, 1, 1),
+             (56, 56, 64, 64, 1, 1, 0, 0, 1, 1),
+             (56, 56, 64, 128, 3, 3, 1, 1, 2, 2),
+             (56, 56, 64, 128, 1, 1, 0, 0, 2, 2),
+             (28, 28, 128, 128, 3, 3, 1, 1, 1, 1),
+             (28, 28, 128, 256, 3, 3, 1, 1, 2, 2),
+             (28, 28, 128, 256, 1, 1, 0, 0, 2, 2),
+             (14, 14, 256, 256, 3, 3, 1, 1, 1, 1),
+             (14, 14, 256, 512, 3, 3, 1, 1, 2, 2),
+             (14, 14, 256, 512, 1, 1, 0, 0, 2, 2),
+             (7, 7, 512, 512, 3, 3, 1, 1, 1, 1),
+             (56, 56, 64, 256, 1, 1, 0, 0, 1, 1),
+             (56, 56, 256, 64, 1, 1, 0, 0, 1, 1),
+             (56, 56, 256, 128, 1, 1, 0, 0, 2, 2),
+             (28, 28, 128, 512, 1, 1, 0, 0, 1, 1),
+             (56, 56, 256, 512, 1, 1, 0, 0, 2, 2),
+             (28, 28, 512, 128, 1, 1, 0, 0, 1, 1),
+             (28, 28, 512, 256, 1, 1, 0, 0, 2, 2),
+             (14, 14, 256, 1024, 1, 1, 0, 0, 1, 1),
+             (28, 28, 512, 1024, 1, 1, 0, 0, 2, 2),
+             (14, 14, 1024, 256, 1, 1, 0, 0, 1, 1),
+             (14, 14, 1024, 512, 1, 1, 0, 0, 2, 2),
+             (7, 7, 512, 2048, 1, 1, 0, 0, 1, 1),
+             (14, 14, 1024, 2048, 1, 1, 0, 0, 2, 2),
+             (7, 7, 2048, 512, 1, 1, 0, 0, 1, 1)
+            ]
+
+
+TARGET_NAME = 'llvm -mcpu=skylake-avx512'
+NUM_VEC_LANES = 16
+CTX = tvm.context(TARGET_NAME, 0)
+
+def get_shape(im_height, im_width, in_filter, out_filter, k_h, k_w, hpad, wpad,
+              hstride, wstride, out_dtype):
+    """
+    Finds out the shape of all data structures
+    """
+    ## Find shapes
+    data_shape = (1, in_filter//NUM_VEC_LANES, im_height, im_width, NUM_VEC_LANES)
+
+    if out_dtype == 'int32':
+        kernel_shape = (out_filter//NUM_VEC_LANES, in_filter//NUM_VEC_LANES, k_h, k_w,
+                        NUM_VEC_LANES//4, NUM_VEC_LANES, 4)
+    elif out_dtype == 'float32':
+        kernel_shape = (out_filter//NUM_VEC_LANES, in_filter//NUM_VEC_LANES, k_h, k_w,
+                        NUM_VEC_LANES, NUM_VEC_LANES)
+    out_height = (im_height + 2 * hpad - k_h) // hstride + 1
+    out_width = (im_width + 2 * wpad - k_w) // wstride + 1
+    o_shape = (1, out_filter//NUM_VEC_LANES, out_height, out_width, NUM_VEC_LANES)
+    return (data_shape, kernel_shape, o_shape)
+
+
+
+def run_inference(data_dtype, kernel_dtype, out_dtype, im_height, im_width, in_filter,
+                  out_filter, k_h, k_w, hpad, wpad, hstride, wstride):
+    """
+    Runs the inference and checks the functional correctness between
+    compute and schedule outputs
+    """
+    (data_shape, kernel_shape, o_shape) = get_shape(im_height, im_width, in_filter,
+                                                    out_filter, k_h, k_w, hpad, wpad,
+                                                    hstride, wstride, out_dtype)
+
+    # Create TVM placeholders
+    data = tvm.placeholder(data_shape, name='data', dtype=data_dtype)
+    kernel = tvm.placeholder(kernel_shape, name='kernel', dtype=kernel_dtype)
+
+    # Create the numpy arrays to be used for executing conv models
+    if data_dtype == 'float32':
+        data_array = tvm.nd.array(np.random.rand(*data_shape).astype(dtype=data_dtype), CTX)
+        kernel_array = tvm.nd.array(np.random.rand(*kernel_shape).astype(dtype=kernel_dtype), CTX)
+    else:
+        data_array = tvm.nd.array(np.random.randint(100, size=data_shape).astype(data_dtype))
+        kernel_array = tvm.nd.array(np.random.randint(100, size=kernel_shape).astype(kernel_dtype))
+
+    # c_orig will be used for declaration ouptut
+    # c_sch will be used for scheduled computation output
+    c_orig = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), CTX)
+    c_sch = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), CTX)
+
+
+    with tvm.target.create(TARGET_NAME):
+        conv = topi.nn.conv2d_NCHWc(data, kernel, stride=hstride,
+                                    padding=hpad, layout='NCHWc',
+                                    out_layout='NCHWc', out_dtype=out_dtype)
+        out = topi.nn.relu(conv)
+        sch = tvm.create_schedule(out.op)
+        func = tvm.build(sch, [data, kernel, out], target=TARGET_NAME, name='out')
+        func(data_array, kernel_array, c_orig)
+        LOGGER.debug(tvm.lower(sch, [data, kernel], simple_mode=True))
+
+        # Generate and run the optimized schedule
+        sconv = topi.generic.nn.schedule_conv2d_NCHWc(outs=[out])
+        func = tvm.build(sconv, [data, kernel, out], target=TARGET_NAME, name='conv')
+        func(data_array, kernel_array, c_sch)
+
+        # Functional check
+        if data_dtype == 'uint8':
+            np.testing.assert_equal(c_orig.asnumpy(), c_sch.asnumpy())
+        else:
+            assert np.allclose(c_orig.asnumpy(), c_sch.asnumpy())
+
+        evaluator = func.time_evaluator(func.entry_name, CTX, number=1000)
+        LOGGER.debug(tvm.lower(sconv, [data, kernel], simple_mode=True))
+        return evaluator(data_array, kernel_array, c_sch).mean
+
+if __name__ == "__main__":
+    LOGGER.info("Workload, Kernel_size, FP32_time, INT8_time, Speedup")
+    SPEEDUP_ARRAY = []
+    for i, wkl in enumerate(WORKLOADS):
+        fp32_time = run_inference('float32', 'float32', 'float32', *wkl)
+        int8_time = run_inference('uint8', 'int8', 'int32', *wkl)
+        kernel_h = wkl[4]
+        kernel_w = wkl[5]
+        LOGGER.info("Workload#" + str(i) + ", " + str(kernel_h) + "x" + str(kernel_w) + ", "
+                    + str(fp32_time) + ", " + str(int8_time) + ", " + str(fp32_time/int8_time))
+
+        SPEEDUP_ARRAY.append(fp32_time/int8_time)
+    LOGGER.info("Average speedup --> %s" % str(sum(SPEEDUP_ARRAY)/float(len(SPEEDUP_ARRAY))))
diff --git a/topi/recipe/gemm/cuda_gemm_square.py b/topi/recipe/gemm/cuda_gemm_square.py
index f2cabb26bb66..2a47e22e0b59 100644
--- a/topi/recipe/gemm/cuda_gemm_square.py
+++ b/topi/recipe/gemm/cuda_gemm_square.py
@@ -118,7 +118,7 @@ def check_device(device):
         c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
         for i in range(2):
             f(a, b, c)
-        np.testing.assert_allclose(
+        tvm.testing.assert_allclose(
             c.asnumpy(), np.dot(b_np.T, a_np), rtol=1e-5)
 
         num_flops = 2 * nn * nn * nn
diff --git a/topi/recipe/gemm/gemm_int8.py b/topi/recipe/gemm/gemm_int8.py
new file mode 100644
index 000000000000..43029094a25c
--- /dev/null
+++ b/topi/recipe/gemm/gemm_int8.py
@@ -0,0 +1,156 @@
+"Example code to perform int8 GEMM"
+import logging
+import sys
+import numpy as np
+import tvm
+from tvm import autotvm
+from topi.cuda.tensor_intrin import dp4a
+
+DO_TUNING = True
+PRETUNED_INDEX = 75333
+
+intrin_dp4a = dp4a('local', 'local', 'local')
+
+@autotvm.template
+def gemm_int8(n, m, l):
+    A = tvm.placeholder((n, l), name='A', dtype='int8')
+    B = tvm.placeholder((m, l), name='B', dtype='int8')
+
+    k = tvm.reduce_axis((0, l), name='k')
+    C = tvm.compute((n, m), lambda i, j: tvm.sum(A[i, k].astype('int32') * B[j, k].astype(
+        'int32'), axis=k), name='C')
+
+    cfg = autotvm.get_config()
+    s = tvm.create_schedule(C.op)
+    y, x = C.op.axis
+
+    AA = s.cache_read(A, 'shared', [C])
+    BB = s.cache_read(B, 'shared', [C])
+    AL = s.cache_read(AA, 'local', [C])
+    BL = s.cache_read(BB, 'local', [C])
+    CC = s.cache_write(C, 'local')
+
+    k = CC.op.reduce_axis[0]
+
+    cfg.define_split('tile_k', cfg.axis(k), num_outputs=3,
+                     filter=lambda entity: entity.size[2] == 4 and \
+                     entity.size[0] * 2 >= entity.size[1])
+
+    ko, kt, ki = cfg['tile_k'].apply(s, CC, k)
+
+    s[CC].tensorize(ki, intrin_dp4a)
+
+    block_x = tvm.thread_axis('blockIdx.x')
+    block_y = tvm.thread_axis('blockIdx.y')
+    thread_x = tvm.thread_axis('threadIdx.x')
+    thread_y = tvm.thread_axis('threadIdx.y')
+
+    def block_size_filter(entity):
+        return entity.size[0] * 2 >= entity.size[1] * 2 and \
+                entity.size[1] <= 16 and entity.size[3] <= 4
+    cfg.define_split('tile_y', cfg.axis(y), num_outputs=4, filter=block_size_filter)
+    cfg.define_split('tile_x', cfg.axis(x), num_outputs=4, filter=block_size_filter)
+    by, tyz, ty, yi = cfg['tile_y'].apply(s, C, y)
+    bx, txz, tx, xi = cfg['tile_x'].apply(s, C, x)
+
+    s[C].bind(by, block_y)
+    s[C].bind(bx, block_x)
+    s[C].bind(tyz, tvm.thread_axis('vthread'))
+    s[C].bind(txz, tvm.thread_axis('vthread'))
+    s[C].bind(ty, thread_y)
+    s[C].bind(tx, thread_x)
+    s[C].reorder(by, bx, tyz, txz, ty, tx, yi, xi)
+
+    s[CC].compute_at(s[C], tx)
+
+    yo, xo = CC.op.axis
+    s[CC].reorder(ko, kt, yo, xo, ki)
+    s[CC].unroll(kt)
+
+    for stage in [AL, BL]:
+        s[stage].compute_at(s[CC], kt)
+        _, xi = s[stage].split(stage.op.axis[1], factor=4)
+        s[stage].vectorize(xi)
+        s[stage].double_buffer()
+
+    cfg.define_knob('storage_align', [16, 48])
+    for stage in [AA, BB]:
+        s[stage].storage_align(s[stage].op.axis[0],
+                               cfg['storage_align'].val, 0)
+        s[stage].compute_at(s[CC], ko)
+
+        fused = s[stage].fuse(*s[stage].op.axis)
+        ty, tx = s[stage].split(fused, nparts=cfg['tile_y'].size[2])
+        tx, xi = s[stage].split(tx, nparts=cfg['tile_x'].size[2])
+        _, xi = s[stage].split(xi, factor=16)
+
+        s[stage].bind(ty, thread_y)
+        s[stage].bind(tx, thread_x)
+        s[stage].vectorize(xi)
+
+    cfg.define_knob('auto_unroll_max_step', [512, 1500])
+    s[C].pragma(by, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
+    s[C].pragma(by, 'unroll_explicit', False)
+
+    cfg.add_flop(n*m*l*2)
+    return s, [A, B, C]
+
+
+if __name__ == '__main__':
+    N = 2048
+    n = m = l = N
+
+    logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
+    task = autotvm.task.create(gemm_int8, args=(n, m, l), target='cuda')
+    print(task.config_space)
+
+    measure_option = autotvm.measure_option(
+        builder=autotvm.LocalBuilder(),
+        runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4)
+    )
+
+    log_name = 'gemm_int8.log'
+    if DO_TUNING:
+        tuner = autotvm.tuner.XGBTuner(task)
+        tuner.tune(n_trial=1000, measure_option=measure_option,
+                   callbacks=[autotvm.callback.log_to_file(log_name)])
+
+        dispatch_context = autotvm.apply_history_best(log_name)
+        best_config = dispatch_context.query(task.target, task.workload)
+        print('\nBest config:')
+        print(best_config)
+    else:
+        config = task.config_space.get(PRETUNED_INDEX)
+        dispatch_context = autotvm.task.ApplyConfig(config)
+        print("Using pretuned config:")
+        print(config)
+
+    with dispatch_context:
+        with tvm.target.create('cuda'):
+            s, arg_bufs = gemm_int8(n, m, l)
+            f = tvm.build(s, arg_bufs, 'cuda', name='gemm_int8')
+
+    ctx = tvm.context('cuda', 0)
+
+    a_np = np.random.randint(size=(n, l), low=-128, high=127, dtype='int8')
+    b_np = np.random.randint(size=(m, l), low=-128, high=127, dtype='int8')
+
+    a = tvm.nd.array(a_np, ctx)
+    b = tvm.nd.array(b_np, ctx)
+    c = tvm.nd.array(np.zeros((n, m), dtype='int32'), ctx)
+    f(a, b, c)
+
+    tvm.testing.assert_allclose(
+        c.asnumpy(),
+        np.dot(
+            a_np.astype('int32'),
+            b_np.T.astype('int32')),
+        rtol=1e-5)
+
+    num_ops = 2 * l * m * n
+    num_runs = 1000
+    timer_f = f.time_evaluator(f.entry_name, ctx, number=num_runs)
+    t = timer_f(a, b, c).mean
+    GOPS = num_ops / (t * 1e3) / 1e6
+    print("average time cost of %d runs = %g ms, %g GOPS." %
+          (num_runs, t * 1e3, GOPS))
diff --git a/topi/recipe/reduce/test_reduce_map.py b/topi/recipe/reduce/test_reduce_map.py
index 6e9befaff2ec..5fadf10d94f9 100644
--- a/topi/recipe/reduce/test_reduce_map.py
+++ b/topi/recipe/reduce/test_reduce_map.py
@@ -67,7 +67,7 @@ def test_reduce_map(in_shape, axis, keepdims, type="sum", test_id=0):
 
     for _ in range(2):
         fcuda(data_tvm, out_tvm)
-    np.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 4E-4, 4E-4)
+    tvm.testing.assert_allclose(out_tvm.asnumpy(), out_npy, rtol=4e-4, atol=4e-4)
 
 if __name__ == "__main__":
     test_reduce_map(in_shape=(128, 24, 128, 24),
diff --git a/topi/recipe/rnn/lstm.py b/topi/recipe/rnn/lstm.py
index 53ccbe598c3d..f627d6ce8f8e 100644
--- a/topi/recipe/rnn/lstm.py
+++ b/topi/recipe/rnn/lstm.py
@@ -1,8 +1,6 @@
 """LSTM Example, still work in progress.."""
 import tvm
-import time
 import os
-import argparse
 from tvm.contrib import nvcc
 import numpy as np
 
@@ -14,16 +12,19 @@
 SKIP_CHECK = False
 UNROLL_WLOAD = True
 
+
 @tvm.register_func
 def tvm_callback_cuda_compile(code):
     """Use nvcc compiler for better perf."""
     ptx =  nvcc.compile_cuda(code, target="ptx")
     return ptx
 
+
 def write_code(code, fname):
     with open(fname, "w") as f:
         f.write(code)
 
+
 @tvm.register_func
 def tvm_callback_cuda_postproc(code):
     if not os.path.exists("perf"):
@@ -33,16 +34,16 @@ def tvm_callback_cuda_postproc(code):
         code = open("perf/%s_manual.cu" % TASK).read()
     return code
 
+
 def lstm():
     if not PERSIST_KERNEL:
         raise ValueError("Non persist LSTM not yet supported")
-    detect_global_barrier = DETECT_GLOBAL_BARRIER
     num_thread_y = 8
-    num_thread_x = 16 * 3 / 2
+    num_thread_x = 16 * 3 // 2
     num_sm = 24
     n_num_step = 128
     num_step = tvm.var('num_step')
-    num_hidden = 1152 / 2
+    num_hidden = 1152 // 2
     batch_size = 1
     # Global transition matrix
     # Input hidden channel can be pre-caculated by a gemm
@@ -165,11 +166,9 @@ def check_device(target):
         flstm(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a)
         ctx.sync()
         # measure time cost of second step.
-        tstart = time.time()
-        flstm(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a)
-        ctx.sync()
-        tgap = time.time() - tstart
-        print("Time cost=%g" % tgap)
+        evaluator = flstm.time_evaluator(flstm.entry_name, ctx, 1, repeat=1000)
+        eval_result = evaluator(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a)
+        print("Time cost=%g" % eval_result.mean)
 
     # set unroll_explicit for more readable code.
     with tvm.build_config(
diff --git a/topi/recipe/rnn/matexp.py b/topi/recipe/rnn/matexp.py
index 13f6357209c6..dddadb8ba5f3 100644
--- a/topi/recipe/rnn/matexp.py
+++ b/topi/recipe/rnn/matexp.py
@@ -145,7 +145,7 @@ def check_device(target):
                 for j in range(n_num_hidden):
                     if abs(res_cmp[i,0,j] - res_gpu[i,0,j]) > 1e-5:
                         print("%d, %d: %g vs %g" % (i,j, res_cmp[i,0,j], res_gpu[i,0,j]))
-            np.testing.assert_allclose(res_gpu, res_cmp, rtol=1e-3)
+            tvm.testing.assert_allclose(res_gpu, res_cmp, rtol=1e-3)
     check_device("cuda")
 
 if __name__ == "__main__":
diff --git a/topi/src/topi.cc b/topi/src/topi.cc
index 4cdab4401459..fe2af0561ea7 100644
--- a/topi/src/topi.cc
+++ b/topi/src/topi.cc
@@ -3,6 +3,8 @@
 * \brief Registration of TVM operators and schedules
 * \file topi.cc
 */
+#define TOPI_REDUCE_ATLEAST1D 0
+
 #include <tvm/runtime/packed_func.h>
 #include <tvm/runtime/module.h>
 #include <tvm/runtime/registry.h>
@@ -30,7 +32,6 @@
 #include <topi/vision/reorg.h>
 #include <topi/image/resize.h>
 #include <topi/vision/yolo/region.h>
-#include <topi/vision/yolo/yolo.h>
 #include <topi/generic/default.h>
 #include <topi/generic/extern.h>
 #include <topi/generic/injective.h>
@@ -58,9 +59,9 @@ using namespace tvm;
 using namespace tvm::runtime;
 
 /*! \brief Canonicalize an argument that may be Array<Expr> or int to Array<Expr> */
-Array<Expr> ArrayOrInt(TVMArgValue arg) {
+Array<Integer> ArrayOrInt(TVMArgValue arg) {
   if (arg.type_code() == kDLInt || arg.type_code() == kDLUInt) {
-    Array<Expr> result;
+    Array<Integer> result;
     result.push_back(arg.operator int());
     return result;
   } else {
@@ -230,6 +231,11 @@ TVM_REGISTER_GLOBAL("topi.argmax")
   *rv = topi::argmax(args[0], ArrayOrInt(args[1]), args[2]);
   });
 
+TVM_REGISTER_GLOBAL("topi.prod")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = topi::prod(args[0], ArrayOrInt(args[1]), args[2]);
+  });
+
 /* Ops from transform.h */
 TVM_REGISTER_GLOBAL("topi.expand_dims")
 .set_body([](TVMArgs args, TVMRetValue *rv) {
@@ -285,6 +291,32 @@ TVM_REGISTER_GLOBAL("topi.where")
   *rv = where(args[0], args[1], args[2]);
 });
 
+TVM_REGISTER_GLOBAL("topi.gather_nd")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  *rv = gather_nd(args[0], args[1]);
+});
+
+TVM_REGISTER_GLOBAL("topi.matmul")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  switch ( args.size() ) {
+    case 2: *rv = matmul(args[0], args[1]); break;
+    case 3: *rv = matmul(args[0], args[1], args[2]); break;
+    case 4: *rv = matmul(args[0], args[1], args[2], args[3]); break;
+    default: CHECK(0) << "topi.matmul expects 2, 3 or 4 arguments";
+  }});
+
+TVM_REGISTER_GLOBAL("topi.tensordot")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+  if (args.size() == 2) {
+    *rv = tensordot(args[0], args[1]);
+  } else if (args.size() == 3) {
+    *rv = tensordot(args[0], args[1], args[2]);
+  } else {
+    Array<Expr> axes = args[3];
+    *rv = tensordot(args[0], args[1], args[2], axes);
+  }
+  });
+
 TVM_REGISTER_GLOBAL("topi.strided_slice")
 .set_body([](TVMArgs args, TVMRetValue *rv) {
   *rv = strided_slice(args[0], args[1], args[2], args[3]);
@@ -397,11 +429,6 @@ TVM_REGISTER_GLOBAL("topi.vision.yolo.region")
   *rv = vision::yolo::region(args[0], args[1], args[2], args[3], args[4], args[5]);
   });
 
-TVM_REGISTER_GLOBAL("topi.vision.yolo.yolo")
-.set_body([](TVMArgs args, TVMRetValue *rv) {
-  *rv = vision::yolo::yolo(args[0], args[1], args[2]);
-  });
-
 /* Ops from image/resize.h */
 TVM_REGISTER_GLOBAL("topi.image.resize")
 .set_body([](TVMArgs args, TVMRetValue *rv) {
diff --git a/topi/tests/python/common.py b/topi/tests/python/common.py
new file mode 100644
index 000000000000..f34f3b331fd1
--- /dev/null
+++ b/topi/tests/python/common.py
@@ -0,0 +1,27 @@
+"""Common utility for topi test"""
+
+from tvm import autotvm
+from tvm.autotvm.task.space import FallbackConfigEntity
+
+
+def get_all_backend():
+    """return all supported target
+
+    Returns
+    -------
+    targets: list
+        A list of all supported targets
+    """
+    return ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx',
+            'llvm -device=arm_cpu', 'opencl -device=mali', 'aocl_sw_emu']
+
+
+class NCHWcInt8Fallback(autotvm.FallbackContext):
+    def _query_inside(self, target, workload):
+        key = (target, workload)
+        if key in self.memory:
+            return self.memory[key]
+        cfg = FallbackConfigEntity()
+        cfg.template_key = 'int8'
+        self.memory[key] = cfg
+        return cfg
diff --git a/topi/tests/python/test_topi_bitserial_conv2d.py b/topi/tests/python/test_topi_bitserial_conv2d.py
index 6df18483a45f..6979cf1ce437 100644
--- a/topi/tests/python/test_topi_bitserial_conv2d.py
+++ b/topi/tests/python/test_topi_bitserial_conv2d.py
@@ -1,11 +1,8 @@
-import os
 import numpy as np
 import tvm
 import topi
 import topi.testing
-from tvm.contrib.pickle_memoize import memoize
 from topi.util import get_const_tuple
-from tvm.contrib import util
 from tvm.contrib.pickle_memoize import memoize
 
 def generate_quantized_np(shape, bits, out_dtype):
@@ -16,23 +13,23 @@ def generate_quantized_np(shape, bits, out_dtype):
 def verify_bitserial_conv2d_nchw(batch, in_size, in_channel, num_filter, kernel, stride, padding, 
     activation_bits, weight_bits, dorefa):
     in_height = in_width = in_size
-    input_type='uint32'
-    out_dtype='int32'
+    input_type = 'uint32'
+    out_dtype = 'int32'
 
     with tvm.target.create('llvm'):
         A = tvm.placeholder((batch, in_channel, in_height, in_width), dtype=input_type, name='A')
         W = tvm.placeholder((num_filter, in_channel, kernel, kernel), dtype=input_type, name='W')
         B = topi.nn.bitserial_conv2d(A, W, stride, padding, activation_bits, weight_bits, 
-            out_dtype=out_dtype, layout="NCHW", dorefa=dorefa)
+                                     out_dtype=out_dtype, layout="NCHW", dorefa=dorefa)
         s = topi.generic.schedule_bitserial_conv2d_nchw([B])
 
     a_shape = get_const_tuple(A.shape)
     w_shape = get_const_tuple(W.shape)
-    dtype = A.dtype
 
+    @memoize("topi.tests.test_topi_bitseral_conv2d_nchw")
     def get_ref_data():
-        a_np = generate_quantized_np(get_const_tuple(A.shape), activation_bits, input_type)
-        w_np = generate_quantized_np(get_const_tuple(W.shape), weight_bits, input_type)
+        a_np = generate_quantized_np(get_const_tuple(a_shape), activation_bits, input_type)
+        w_np = generate_quantized_np(get_const_tuple(w_shape), weight_bits, input_type)
         if dorefa:
             w_ = np.copy(w_np).astype(out_dtype)
             for x in np.nditer(w_, op_flags=['readwrite']):
@@ -49,7 +46,7 @@ def get_ref_data():
     b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
     func = tvm.build(s, [A, W, B], "llvm")
     func(a, w, b)
-    np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+    tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
 def verify_bitserial_conv2d_nhwc(batch, in_size, in_channel, num_filter, kernel, stride, padding, 
                         activation_bits, weight_bits, dorefa):
@@ -61,16 +58,16 @@ def verify_bitserial_conv2d_nhwc(batch, in_size, in_channel, num_filter, kernel,
         A = tvm.placeholder((batch, in_height, in_width, in_channel), dtype=input_type, name='A')
         W = tvm.placeholder((kernel, kernel, in_channel, num_filter), dtype=input_type, name='W')
         B = topi.nn.bitserial_conv2d(A, W, stride, padding, activation_bits, weight_bits, out_dtype=out_dtype, 
-                            layout="NHWC", dorefa=dorefa)
+                                     layout="NHWC", dorefa=dorefa)
         s = topi.generic.schedule_bitserial_conv2d_nhwc([B])
 
     a_shape = get_const_tuple(A.shape)
     w_shape = get_const_tuple(W.shape)
-    dtype = A.dtype
 
+    @memoize("topi.tests.test_topi_bitseral_conv2d_nhwc")
     def get_ref_data():
-        a_np = generate_quantized_np(get_const_tuple(A.shape), activation_bits, input_type)
-        w_np = generate_quantized_np(get_const_tuple(W.shape), weight_bits, input_type)
+        a_np = generate_quantized_np(get_const_tuple(a_shape), activation_bits, input_type)
+        w_np = generate_quantized_np(get_const_tuple(w_shape), weight_bits, input_type)
         if dorefa:
             w_ = np.copy(w_np).astype(out_dtype)
             for x in np.nditer(w_, op_flags=['readwrite']):
@@ -88,7 +85,7 @@ def get_ref_data():
     func = tvm.build(s, [A, W, B], 'llvm')
 
     func(a, w, b)
-    np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+    tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
 def test_bitserial_conv2d():
     in_size = 56
@@ -109,4 +106,4 @@ def test_bitserial_conv2d():
     verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 2, False)
 
 if __name__ == "__main__":
-    test_bitserial_conv2d()
\ No newline at end of file
+    test_bitserial_conv2d()
diff --git a/topi/tests/python/test_topi_bitserial_conv2d_rasp.py b/topi/tests/python/test_topi_bitserial_conv2d_rasp.py
index 3de954abc291..de467818d37f 100644
--- a/topi/tests/python/test_topi_bitserial_conv2d_rasp.py
+++ b/topi/tests/python/test_topi_bitserial_conv2d_rasp.py
@@ -4,10 +4,6 @@
 import tvm
 import topi
 import topi.testing
-from topi.util import get_const_tuple
-from tvm.contrib import util
-
-target = 'llvm -target=armv7l-none-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon'
 
 def generate_quantized_np(shape, bits, out_dtype):
     np.random.seed(0)
@@ -17,20 +13,19 @@ def generate_quantized_np(shape, bits, out_dtype):
 
 # Verify that certain special instructions from the tensorize pass exist
 def verify_bitserial_conv2d_nhwc(batch, in_size, in_channel, num_filter, kernel, stride, padding, 
-                        activation_bits, weight_bits, dorefa):
+                                 activation_bits, weight_bits, dorefa):
     in_height = in_width = in_size
-    input_type='uint32'
-    out_dtype='int32'
+    input_type = 'uint32'
+    out_dtype = 'int32'
 
     with tvm.target.arm_cpu('rasp3b'):
         A = tvm.placeholder((batch, in_height, in_width, in_channel), dtype=input_type, name='A')
         W = tvm.placeholder((kernel, kernel, in_channel, num_filter), dtype=input_type, name='W')
         B = topi.nn.bitserial_conv2d(A, W, stride, padding, activation_bits, weight_bits, out_dtype=out_dtype, 
-                            layout="NHWC", dorefa=dorefa)
+                                     layout="NHWC", dorefa=dorefa)
         s = topi.generic.schedule_bitserial_conv2d_nhwc([B])
 
-    
-    func = tvm.build(s, [A, W, B], target)
+    func = tvm.build(s, [A, W, B], tvm.target.arm_cpu('rasp3b'))
    
     assembly = func.get_source('asm')
     matches = re.findall("vpadal", assembly)
@@ -47,7 +42,6 @@ def test_bitserial_conv2d():
     stride = 1
     pad = 1
 
-
     verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 1, 1, False)
     verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, False)
 
diff --git a/topi/tests/python/test_topi_bnn.py b/topi/tests/python/test_topi_bnn.py
index 90abc68e6b68..7d132bfff0fe 100644
--- a/topi/tests/python/test_topi_bnn.py
+++ b/topi/tests/python/test_topi_bnn.py
@@ -28,7 +28,7 @@ def get_ref_data():
         a_np = (np.random.randint(2, size=(batch, in_dim)) * 2 - 1).astype(dtype)
         b_np = (np.random.randint(2, size=(out_dim, in_dim)) * 2 - 1).astype(dtype)
         c_np = np.dot(a_np, b_np.T)
-        return (a_np, b_np, c_np)
+        return a_np, b_np, c_np
 
     a_np, b_np, c_np = get_ref_data()
 
@@ -44,7 +44,7 @@ def get_ref_data():
     f1(a, bnn_a)
     f2(b, bnn_b)
     f3(bnn_a, bnn_b, bnn_c)
-    np.testing.assert_allclose(bnn_c.asnumpy(), c_np, rtol=1e-5)
+    tvm.testing.assert_allclose(bnn_c.asnumpy(), c_np, rtol=1e-5)
 
 def test_binary_dense():
     verify_binary_dense(1, 4096, 1024)
diff --git a/topi/tests/python/test_topi_broadcast.py b/topi/tests/python/test_topi_broadcast.py
index f888033b3914..3be938852fdf 100644
--- a/topi/tests/python/test_topi_broadcast.py
+++ b/topi/tests/python/test_topi_broadcast.py
@@ -1,5 +1,5 @@
 """Test code for broadcasting operators."""
-import os
+from common import get_all_backend
 import numpy as np
 import tvm
 import topi
@@ -8,6 +8,7 @@ def verify_broadcast_to_ele(in_shape, out_shape, fbcast):
     # Build the logic and compile the function
     A = tvm.placeholder(shape=in_shape, name="A")
     B = fbcast(A, out_shape)
+
     def check_device(device):
         ctx = tvm.context(device, 0)
         if not ctx.exist:
@@ -21,16 +22,11 @@ def check_device(device):
         out_npy = np.broadcast_to(data_npy, out_shape)
         data_nd = tvm.nd.array(data_npy, ctx)
         out_nd = tvm.nd.array(np.empty(out_shape).astype(B.dtype), ctx)
-        for _ in range(1):
-            foo(data_nd, out_nd)
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
-
-    check_device("vulkan")
-    check_device("opencl")
-    check_device("cuda")
-    check_device("metal")
-    check_device("rocm")
-    check_device("nvptx")
+        foo(data_nd, out_nd)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+
+    for target in get_all_backend():
+        check_device(target)
     check_device("sdaccel")
 
 
@@ -45,9 +41,10 @@ def verify_broadcast_binary_ele(lhs_shape, rhs_shape,
     B = (tvm.var("B", dtype=dtype) if rhs_shape is None
          else tvm.placeholder(shape=rhs_shape, name="B", dtype=dtype))
     C = ftopi(A, B)
-    if (isinstance(A, tvm.expr.Expr) and isinstance(B, tvm.expr.Expr)):
+    if isinstance(A, tvm.expr.Expr) and isinstance(B, tvm.expr.Expr):
         assert(isinstance(C, tvm.expr.Expr))
         return
+
     def check_device(device):
         ctx = tvm.context(device, 0)
         if not ctx.exist:
@@ -80,14 +77,10 @@ def check_device(device):
         out_npy = fnumpy(lhs_npy, rhs_npy)
         out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(C.dtype), ctx)
         foo(lhs_nd, rhs_nd, out_nd)
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4)
-
-    check_device("opencl")
-    check_device("vulkan")
-    check_device("cuda")
-    check_device("metal")
-    check_device("rocm")
-    check_device("nvptx")
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4)
+
+    for target in get_all_backend():
+        check_device(target)
     check_device("sdaccel")
 
 def test_broadcast_to():
diff --git a/topi/tests/python/test_topi_clip.py b/topi/tests/python/test_topi_clip.py
index ffc89aeb9bc3..128a45c46f60 100644
--- a/topi/tests/python/test_topi_clip.py
+++ b/topi/tests/python/test_topi_clip.py
@@ -5,6 +5,7 @@
 from topi.util import get_const_tuple
 from tvm.contrib.pickle_memoize import memoize
 
+from common import get_all_backend
 
 def verify_clip(N, a_min, a_max, dtype):
     A = tvm.placeholder((N, N), dtype=dtype, name='A')
@@ -32,9 +33,9 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
         f = tvm.build(s, [A, B], device, name="clip")
         f(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
-    for device in ['llvm', 'opencl', 'sdaccel']:
+    for device in get_all_backend():
         check_device(device)
 
 def test_clip():
diff --git a/topi/tests/python/test_topi_conv2d.py b/topi/tests/python/test_topi_conv2d.py
deleted file mode 100644
index 365fdf551c4f..000000000000
--- a/topi/tests/python/test_topi_conv2d.py
+++ /dev/null
@@ -1,47 +0,0 @@
-"""Example code to do conv2d."""
-import os
-import numpy as np
-import tvm
-from tvm import autotvm
-import topi
-import topi.testing
-from tvm.contrib.pickle_memoize import memoize
-from topi.util import get_const_tuple
-
-
-def verify_conv2d(batch, in_size, in_channel, num_filter, kernel, stride, padding):
-    in_height = in_width = in_size
-
-    with tvm.target.arm_cpu():
-        A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
-        W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W')
-        B = topi.nn.conv2d(A, W, (stride, stride), (padding, padding), 'NCHW', 'float32')
-        s = topi.generic.schedule_conv2d_nchw([B])
-
-    a_shape = get_const_tuple(A.shape)
-    w_shape = get_const_tuple(W.shape)
-    dtype = A.dtype
-
-    @memoize("topi.tests.test_topi_conv2d.verify_conv2d")
-    def get_ref_data():
-        a_np = np.random.uniform(size=a_shape).astype(dtype)
-        w_np = np.random.uniform(size=w_shape).astype(dtype)
-        b_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding)
-        return a_np, w_np, b_np
-
-    a_np, w_np, b_np = get_ref_data()
-
-    ctx = tvm.cpu(0)
-    a = tvm.nd.array(a_np, ctx)
-    w = tvm.nd.array(w_np, ctx)
-    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
-    func = tvm.build(s, [A, W, B], "llvm")
-    func(a, w, b)
-    np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
-
-def test_conv2d():
-    with autotvm.tophub.context(tvm.target.arm_cpu('rasp3b'), allow_fallback=True):
-        verify_conv2d(1, 56, 64, 64, 3, 1, 1)
-
-if __name__ == "__main__":
-    test_conv2d()
diff --git a/topi/tests/python/test_topi_conv2d_NCHWc.py b/topi/tests/python/test_topi_conv2d_NCHWc.py
new file mode 100644
index 000000000000..a3af43c8d810
--- /dev/null
+++ b/topi/tests/python/test_topi_conv2d_NCHWc.py
@@ -0,0 +1,205 @@
+"""Test for NCHW[x]c convolution"""
+
+import numpy as np
+import tvm
+from tvm import autotvm
+import topi
+import topi.testing
+from tvm.contrib.pickle_memoize import memoize
+from topi.util import get_const_tuple
+
+from common import get_all_backend
+
+def _transform_data(data, bn):
+    # NCHW -> NCHW[x]c
+    batch_size, channel, height, width = data.shape
+    data = np.reshape(data, (batch_size, channel//bn, bn, height, width))
+    data = np.transpose(data, (0, 1, 3, 4, 2))
+    return data
+
+def _transform_kernel(kernel, ic_bn, oc_bn):
+    # OIHW -> OIHW[x]i[x]o
+    out_channel, in_channel, kh, kw = kernel.shape
+    kernel = np.reshape(kernel, (out_channel//oc_bn, oc_bn, in_channel//ic_bn, ic_bn, kh, kw))
+    kernel = np.transpose(kernel, (0, 2, 4, 5, 3, 1))
+    return kernel
+
+def _transform_bias(bias, bn):
+    # [num_filter, 1, 1] -> [num_filter//bn, 1, 1, bn]
+    num_filter, h, w = bias.shape
+    bias = np.reshape(bias, (num_filter//bn, bn, h, w))
+    bias = np.transpose(bias, (0, 2, 3, 1))
+    return bias
+
+def verify_conv2d_NCHWc(batch, in_channel, in_size, num_filter, kernel, stride,
+                        padding, dilation=1, add_bias=False, add_relu=False, dtype="float32"):
+    assert dilation == 1, "conv2d_NCHWc does not support dilation for now."
+    print("Workload: (%d, %d, %d, %d, %d, %d, %d)" %
+          (batch, in_channel, in_size, num_filter, kernel, stride, padding))
+
+    in_height = in_width = in_size
+
+    # for testing functionality,
+    # we choose arbitrary block size that can divide the channel,
+    # regardless of the performance.
+    oc_block = 1
+    for bn in range(16, 0, -1):
+        if num_filter % bn == 0:
+            oc_block = bn
+            break
+
+    ic_block = 1
+    for bn in range(oc_block, 0, -1):
+        if in_channel % bn == 0:
+            ic_block = bn
+            break
+
+    A = tvm.placeholder((batch, in_channel//ic_block, in_height, in_width, ic_block), name='A')
+    W = tvm.placeholder((num_filter//oc_block, in_channel//ic_block, kernel, kernel, ic_block, oc_block), name='W')
+    bias = tvm.placeholder((num_filter//oc_block, 1, 1, oc_block), name='bias')
+
+    @memoize("topi.tests.test_topi_conv2d_NCHWc.verify_conv2d_NCHWc")
+    def get_ref_data():
+        a_np = np.random.uniform(size=(batch, in_channel, in_height, in_width)).astype(dtype)
+        w_np = np.random.uniform(size=(num_filter, in_channel, kernel, kernel)).astype(dtype)
+        b_np = np.random.uniform(size=(num_filter, 1, 1)).astype(dtype)
+        c_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding)
+        if add_bias:
+            c_np += b_np
+        if add_relu:
+            c_np = np.maximum(c_np, 0)
+        return _transform_data(a_np, ic_block), _transform_kernel(w_np, ic_block, oc_block), \
+               _transform_bias(b_np, oc_block), _transform_data(c_np, oc_block)
+
+    a_np, w_np, b_np, c_np = get_ref_data()
+
+    def check_device(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+        print("Running on target: %s" % device)
+        with tvm.target.create(device):
+            C = topi.nn.conv2d_NCHWc(A, W, (stride, stride), (padding, padding),
+                                     (dilation, dilation),
+                                     layout='NCHW%dc'%ic_block,
+                                     out_layout="NCHW%dc"%oc_block,
+                                     out_dtype=dtype)
+            if add_bias:
+                C = topi.add(C, bias)
+            if add_relu:
+                C = topi.nn.relu(C)
+            s = topi.generic.schedule_conv2d_NCHWc([C])
+
+        a = tvm.nd.array(a_np, ctx)
+        w = tvm.nd.array(w_np, ctx)
+        b = tvm.nd.array(b_np, ctx)
+        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
+        if add_bias:
+            func = tvm.build(s, [A, W, bias, C], device,
+                             name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
+                                  (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
+            func(a, w, b, c)
+        else:
+            func = tvm.build(s, [A, W, C], device,
+                             name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
+                                  (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
+            func(a, w, c)
+        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+
+    # test llvm only for now since conv2d_NCHWc implement is missing in other backend.
+    for device in ["llvm"]:
+        with autotvm.tophub.context(device):  # load tophub pre-tuned parameters
+            check_device(device)
+
+
+def test_conv2d_NCHWc():
+    # ResNet18 workloads
+    verify_conv2d_NCHWc(1,   3, 224,  64, 7, 2, 3)
+    verify_conv2d_NCHWc(1,  64,  56,  64, 3, 1, 1)
+    verify_conv2d_NCHWc(1,  64,  56,  64, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  64,  56, 128, 3, 2, 1)
+    verify_conv2d_NCHWc(1,  64,  56, 128, 1, 2, 0)
+    verify_conv2d_NCHWc(1, 128,  28, 128, 3, 1, 1)
+    verify_conv2d_NCHWc(1, 128,  28, 256, 3, 2, 1)
+    verify_conv2d_NCHWc(1, 128,  28, 256, 1, 2, 0)
+    verify_conv2d_NCHWc(1, 256,  14, 256, 3, 1, 1)
+    verify_conv2d_NCHWc(1, 256,  14, 512, 3, 2, 1)
+    verify_conv2d_NCHWc(1, 256,  14, 512, 1, 2, 0)
+    verify_conv2d_NCHWc(1, 512,   7, 512, 3, 1, 1)
+
+    # bias, relu
+    verify_conv2d_NCHWc(1, 64, 56, 64, 3, 1, 1, add_relu=True)
+    verify_conv2d_NCHWc(1, 64, 56, 64, 3, 1, 1, add_bias=True)
+    verify_conv2d_NCHWc(1, 64, 56, 64, 3, 1, 1, add_bias=True, add_relu=True)
+
+    # disable dilation test since it is not supported by NCHW[x]c conv for now.
+    # verify_conv2d_NCHWc(1, 64, 56, 64, 3, 1, 1, dilation=2)
+
+    # batch size
+    verify_conv2d_NCHWc(4, 64, 56, 64, 3, 1, 1)
+    verify_conv2d_NCHWc(9, 64, 56, 64, 3, 1, 1)
+
+    # weird workloads
+    verify_conv2d_NCHWc(2, 2, 2, 2, 2, 2, 2)
+    verify_conv2d_NCHWc(3, 3, 3, 3, 3, 3, 3)
+    verify_conv2d_NCHWc(4, 4, 4, 4, 4, 4, 4)
+    verify_conv2d_NCHWc(5, 5, 5, 5, 5, 5, 5)
+    verify_conv2d_NCHWc(6, 6, 6, 6, 6, 6, 6)
+
+    # disable these tests due to some bugs of llvm with nvptx
+    # verify_conv2d_NCHWc(1, 1, 1, 1, 1, 1, 1, dilation=1)
+    # verify_conv2d_NCHWc(1, 1, 1, 1, 1, 1, 1, dilation=2)
+    # verify_conv2d_NCHWc(2, 13, 71, 59, 3, 1, 1)
+
+    # inception v3 workloads
+    verify_conv2d_NCHWc(1,    3, 299,  32, 3, 2, 0)
+    verify_conv2d_NCHWc(1,   32, 149,  32, 3, 1, 0)
+    verify_conv2d_NCHWc(1,   32, 147,  64, 3, 1, 1)
+    verify_conv2d_NCHWc(1,   64,  73,  80, 1, 1, 0)
+    verify_conv2d_NCHWc(1,   80,  73, 192, 3, 1, 0)
+    verify_conv2d_NCHWc(1,  192,  35,  64, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  192,  35,  48, 1, 1, 0)
+    verify_conv2d_NCHWc(1,   48,  35,  64, 5, 1, 2)
+    verify_conv2d_NCHWc(1,   64,  35,  96, 3, 1, 1)
+    verify_conv2d_NCHWc(1,   96,  35,  96, 3, 1, 1)
+    verify_conv2d_NCHWc(1,  192,  35,  32, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  256,  35,  64, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  256,  35,  48, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  288,  35,  64, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  288,  35,  48, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  288,  35, 384, 3, 2, 0)
+    verify_conv2d_NCHWc(1,   96,  35,  96, 3, 2, 0)
+    verify_conv2d_NCHWc(1,  768,  17, 192, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  768,  17, 128, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  128,  17, 128, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  128,  17, 192, 7, 1, 3)
+    verify_conv2d_NCHWc(1,  128,  17, 128, 7, 1, 3)
+    verify_conv2d_NCHWc(1,  128,  17, 192, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  768,  17, 160, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  160,  17, 160, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  160,  17, 192, 7, 1, 3)
+    verify_conv2d_NCHWc(1,  160,  17, 160, 7, 1, 3)
+    verify_conv2d_NCHWc(1,  160,  17, 192, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  192,  17, 192, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  192,  17, 192, 7, 1, 3)
+    verify_conv2d_NCHWc(1,  192,  17, 320, 3, 2, 0)
+    verify_conv2d_NCHWc(1,  192,  17, 192, 3, 2, 0)
+    verify_conv2d_NCHWc(1, 1280,   8, 320, 1, 1, 0)
+    verify_conv2d_NCHWc(1, 1280,   8, 384, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  384,   8, 384, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  384,   8, 384, 3, 1, 1)
+    verify_conv2d_NCHWc(1, 1280,   8, 448, 1, 1, 0)
+    verify_conv2d_NCHWc(1,  448,   8, 384, 3, 1, 1)
+    verify_conv2d_NCHWc(1, 1280,   8, 192, 1, 1, 0)
+    verify_conv2d_NCHWc(1, 2048,   8, 320, 1, 1, 0)
+    verify_conv2d_NCHWc(1, 2048,   8, 384, 1, 1, 0)
+    verify_conv2d_NCHWc(1, 2048,   8, 448, 1, 1, 0)
+    verify_conv2d_NCHWc(1, 2048,   8, 192, 1, 1, 0)
+    verify_conv2d_NCHWc(1, 1024,  19,  84, 3, 1, 1)
+    verify_conv2d_NCHWc(1, 2048,  10, 126, 3, 1, 1)
+    verify_conv2d_NCHWc(1,  512,   5, 126, 3, 1, 1)
+    verify_conv2d_NCHWc(1,  256,   3, 126, 3, 1, 1)
+
+if __name__ == "__main__":
+    test_conv2d_NCHWc()
\ No newline at end of file
diff --git a/topi/tests/python/test_topi_conv2d_hwcn.py b/topi/tests/python/test_topi_conv2d_hwcn.py
index 1ff4b02470c4..1af7fa4938dd 100644
--- a/topi/tests/python/test_topi_conv2d_hwcn.py
+++ b/topi/tests/python/test_topi_conv2d_hwcn.py
@@ -13,8 +13,7 @@ def verify_conv2d_hwcn(batch, in_channel, in_size, num_filter, kernel, stride, p
 
     A = tvm.placeholder((in_height, in_width, in_channel, batch), name='A')
     W = tvm.placeholder((kernel, kernel, in_channel, num_filter), name='W')
-    dW = topi.nn.dilate(W, (dilation, dilation, 1, 1))
-    B = topi.nn.conv2d_hwcn(A, dW, stride, padding)
+    B = topi.nn.conv2d_hwcn(A, W, stride, padding, dilation)
     C = topi.nn.relu(B)
     s1 = topi.cuda.schedule_conv2d_hwcn([B])
     s2 = topi.cuda.schedule_conv2d_hwcn([C])
@@ -43,14 +42,12 @@ def check_device(device):
         w = tvm.nd.array(w_np, ctx)
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
-        with tvm.build_config(auto_unroll_max_step=128,
-                              unroll_explicit=(device != "cuda")):
-            func1 = tvm.build(s1, [A, W, B], device)
-            func2 = tvm.build(s2, [A, W, C], device)
-            func1(a, w, b)
-            func2(a, w, c)
-            np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
-            np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+        func1 = tvm.build(s1, [A, W, B], device)
+        func2 = tvm.build(s2, [A, W, C], device)
+        func1(a, w, b)
+        func2(a, w, c)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
 
     for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
         check_device(device)
diff --git a/topi/tests/python/test_topi_conv2d_int8.py b/topi/tests/python/test_topi_conv2d_int8.py
new file mode 100644
index 000000000000..272a72f82619
--- /dev/null
+++ b/topi/tests/python/test_topi_conv2d_int8.py
@@ -0,0 +1,172 @@
+"""Example code to do convolution."""
+
+import numpy as np
+import tvm
+from tvm import autotvm
+from tvm.autotvm.task.space import FallbackConfigEntity
+import topi
+import topi.testing
+from tvm.contrib.pickle_memoize import memoize
+from topi.util import get_const_tuple
+
+from common import get_all_backend, NCHWcInt8Fallback
+
+oc_block_factor = 4
+
+
+def verify_conv2d_NCHWc_int8(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False):
+    print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
+
+    in_height = in_width = in_size
+
+    A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A', dtype='int8')
+    W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W', dtype='int8')
+    bias = tvm.placeholder((num_filter // oc_block_factor, 1, 1, oc_block_factor), name='bias',
+                            dtype='int8')
+
+    a_shape = get_const_tuple(A.shape)
+    w_shape = get_const_tuple(W.shape)
+    bias_shape = get_const_tuple(bias.shape)
+    dtype = A.dtype
+
+    @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw")
+    def get_ref_data():
+        a_np = np.random.randint(low=-128, high=127, size=a_shape).astype(dtype)
+        w_np = np.random.randint(low=-128, high=128, size=w_shape).astype(dtype)
+        b_np = np.random.uniform(size=bias_shape).astype(dtype)
+        dw_np = topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
+        c_np = topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding).astype(dtype)
+
+        # convert to NCHWc
+        _, _, out_height, out_width = c_np.shape
+        c_np = c_np.reshape((batch, num_filter // oc_block_factor, oc_block_factor, \
+                out_height, out_width)).transpose(0, 1, 3, 4, 2)
+
+        if add_bias:
+            b_np = np.random.uniform(size=bias_shape).astype(dtype)
+            c_np += b_np
+        if add_relu:
+            c_np = np.maximum(c_np, 0)
+
+        return a_np, w_np, b_np, c_np
+
+    a_np, w_np, b_np, c_np = get_ref_data()
+
+    def check_device(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+        if device == "cuda" and not tvm.contrib.nvcc.have_int8(ctx.compute_version):
+            print("Skip because int8 intrinsics are not available")
+            return
+
+        print("Running on target: %s" % device)
+        with tvm.target.create(device):
+            C = topi.nn.conv2d(A, W, (stride, stride), (padding, padding), (dilation, dilation),
+                               layout='NCHW', out_dtype=dtype)
+            if add_bias:
+                C = topi.add(C, bias)
+            if add_relu:
+                C = topi.nn.relu(C)
+            s = topi.generic.schedule_conv2d_nchw([C])
+
+        a = tvm.nd.array(a_np, ctx)
+        w = tvm.nd.array(w_np, ctx)
+        b = tvm.nd.array(b_np, ctx)
+        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
+        if add_bias:
+            tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
+            func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
+            func(a, w, b, c)
+        else:
+            func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
+            func(a, w, c)
+        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+
+    for device in ["cuda"]:
+        check_device(device)
+
+
+def test_conv2d_nchw():
+    with NCHWcInt8Fallback():
+        # ResNet18 workloads where channels in / out are multiple of oc_block_factor
+        verify_conv2d_NCHWc_int8(1,  64,  56,  64, 3, 1, 1)
+        verify_conv2d_NCHWc_int8(1,  64,  56,  64, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  64,  56, 128, 3, 2, 1)
+        verify_conv2d_NCHWc_int8(1,  64,  56, 128, 1, 2, 0)
+        verify_conv2d_NCHWc_int8(1, 128,  28, 128, 3, 1, 1)
+        verify_conv2d_NCHWc_int8(1, 128,  28, 256, 3, 2, 1)
+        verify_conv2d_NCHWc_int8(1, 128,  28, 256, 1, 2, 0)
+        verify_conv2d_NCHWc_int8(1, 256,  14, 256, 3, 1, 1)
+        verify_conv2d_NCHWc_int8(1, 256,  14, 512, 3, 2, 1)
+        verify_conv2d_NCHWc_int8(1, 256,  14, 512, 1, 2, 0)
+        verify_conv2d_NCHWc_int8(1, 512,   7, 512, 3, 1, 1)
+
+        # bias, relu
+        verify_conv2d_NCHWc_int8(1, 64, 56, 64, 3, 1, 1, add_relu=True)
+        verify_conv2d_NCHWc_int8(1, 64, 56, 64, 3, 1, 1, add_bias=True)
+        verify_conv2d_NCHWc_int8(1, 64, 56, 64, 3, 1, 1, add_bias=True, add_relu=True)
+
+        # dilation = 2
+        verify_conv2d_NCHWc_int8(1, 64, 56, 64, 3, 1, 1, dilation=2)
+
+        # batch size
+        verify_conv2d_NCHWc_int8(4, 64, 56, 64, 3, 1, 1)
+        verify_conv2d_NCHWc_int8(9, 64, 56, 64, 3, 1, 1)
+
+        # weird workloads
+        verify_conv2d_NCHWc_int8(4, 4, 4, 4, 4, 4, 4)
+
+        # inception v3 workloads where channels in / out are multiple of oc_block_factor
+        verify_conv2d_NCHWc_int8(1,   32, 149,  32, 3, 1, 0)
+        verify_conv2d_NCHWc_int8(1,   32, 147,  64, 3, 1, 1)
+        verify_conv2d_NCHWc_int8(1,   64,  73,  80, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,   80,  73, 192, 3, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  192,  35,  64, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  192,  35,  48, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,   48,  35,  64, 5, 1, 2)
+        verify_conv2d_NCHWc_int8(1,   64,  35,  96, 3, 1, 1)
+        verify_conv2d_NCHWc_int8(1,   96,  35,  96, 3, 1, 1)
+        verify_conv2d_NCHWc_int8(1,  192,  35,  32, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  256,  35,  64, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  256,  35,  48, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  288,  35,  64, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  288,  35,  48, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  288,  35, 384, 3, 2, 0)
+        verify_conv2d_NCHWc_int8(1,   96,  35,  96, 3, 2, 0)
+        verify_conv2d_NCHWc_int8(1,  768,  17, 192, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  768,  17, 128, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  128,  17, 128, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  128,  17, 192, 7, 1, 3)
+        verify_conv2d_NCHWc_int8(1,  128,  17, 128, 7, 1, 3)
+        verify_conv2d_NCHWc_int8(1,  128,  17, 192, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  768,  17, 160, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  160,  17, 160, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  160,  17, 192, 7, 1, 3)
+        verify_conv2d_NCHWc_int8(1,  160,  17, 160, 7, 1, 3)
+        verify_conv2d_NCHWc_int8(1,  160,  17, 192, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  192,  17, 192, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  192,  17, 192, 7, 1, 3)
+        verify_conv2d_NCHWc_int8(1,  192,  17, 320, 3, 2, 0)
+        verify_conv2d_NCHWc_int8(1,  192,  17, 192, 3, 2, 0)
+        verify_conv2d_NCHWc_int8(1, 1280,   8, 320, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1, 1280,   8, 384, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  384,   8, 384, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  384,   8, 384, 3, 1, 1)
+        verify_conv2d_NCHWc_int8(1, 1280,   8, 448, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1,  448,   8, 384, 3, 1, 1)
+        verify_conv2d_NCHWc_int8(1, 1280,   8, 192, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1, 2048,   8, 320, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1, 2048,   8, 384, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1, 2048,   8, 448, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1, 2048,   8, 192, 1, 1, 0)
+        verify_conv2d_NCHWc_int8(1, 1024,  19,  84, 3, 1, 1)
+
+        # batch > 1
+        verify_conv2d_NCHWc_int8(7,   32, 149,  32, 3, 1, 0)
+        verify_conv2d_NCHWc_int8(8,   32, 149,  32, 3, 1, 0)
+        verify_conv2d_NCHWc_int8(32,  32, 149,  32, 3, 1, 0)
+
+if __name__ == "__main__":
+    test_conv2d_nchw()
diff --git a/topi/tests/python/test_topi_conv2d_nchw.py b/topi/tests/python/test_topi_conv2d_nchw.py
index c663384b8187..abd1d61c34ed 100644
--- a/topi/tests/python/test_topi_conv2d_nchw.py
+++ b/topi/tests/python/test_topi_conv2d_nchw.py
@@ -1,31 +1,41 @@
 """Example code to do convolution."""
-import os
+
 import numpy as np
 import tvm
+from tvm import autotvm
 import topi
 import topi.testing
 from tvm.contrib.pickle_memoize import memoize
 from topi.util import get_const_tuple
 
-def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1):
-    print("Workload: (%d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding))
+from common import get_all_backend
+
+def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False):
+    print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
 
     in_height = in_width = in_size
 
     A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
     W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W')
+    bias = tvm.placeholder((num_filter, 1, 1), name='bias')
 
     a_shape = get_const_tuple(A.shape)
     w_shape = get_const_tuple(W.shape)
+    bias_shape = get_const_tuple(bias.shape)
     dtype = A.dtype
 
     @memoize("topi.tests.test_topi_conv2d_nchw.verify_conv2d_nchw")
     def get_ref_data():
         a_np = np.random.uniform(size=a_shape).astype(dtype)
         w_np = np.random.uniform(size=w_shape).astype(dtype)
+        b_np = np.random.uniform(size=bias_shape).astype(dtype)
         dw_np = topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
-        b_np = topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding)
-        c_np = np.maximum(b_np, 0)
+        c_np = topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding)
+        if add_bias:
+            b_np = np.random.uniform(size=bias_shape).astype(dtype)
+            c_np += b_np
+        if add_relu:
+            c_np = np.maximum(c_np, 0)
         return a_np, w_np, b_np, c_np
 
     a_np, w_np, b_np, c_np = get_ref_data()
@@ -37,67 +47,119 @@ def check_device(device):
             return
         print("Running on target: %s" % device)
         with tvm.target.create(device):
-            dW = topi.nn.dilate(W, (1, 1, dilation, dilation))
-            B = topi.nn.conv2d(A, dW, stride, padding, layout='NCHW')
-            C = topi.nn.relu(B)
-            s1 = topi.generic.schedule_conv2d_nchw([B])
-            s2 = topi.generic.schedule_conv2d_nchw([C])
+            C = topi.nn.conv2d(A, W, (stride, stride), (padding, padding),
+                               (dilation, dilation), layout='NCHW', out_dtype=dtype)
+            if add_bias:
+                C = topi.add(C, bias)
+            if add_relu:
+                C = topi.nn.relu(C)
+            s = topi.generic.schedule_conv2d_nchw([C])
+
         a = tvm.nd.array(a_np, ctx)
         w = tvm.nd.array(w_np, ctx)
-        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
+        b = tvm.nd.array(b_np, ctx)
         c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
-        no_unroll_explicit = device in ["cuda", "nvptx", "rocm"]
-        with tvm.build_config(auto_unroll_max_step=1400,
-                              unroll_explicit=not no_unroll_explicit):
-            func1 = tvm.build(s1, [A, W, B], device, name="conv2d_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
-            func2 = tvm.build(s2, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
-            func1(a, w, b)
-            func2(a, w, c)
-            np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
-            np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+        if add_bias:
+            func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
+            func(a, w, b, c)
+        else:
+            func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
+            func(a, w, c)
+        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
 
-    for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
-        check_device(device)
+    for device in get_all_backend():
+        with autotvm.tophub.context(device):  # load tophub pre-tuned parameters
+            check_device(device)
 
 
 def test_conv2d_nchw():
     # ResNet18 workloads
-    verify_conv2d_nchw(1, 3, 224, 64, 7, 2, 3)
-    verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1)
-    verify_conv2d_nchw(1, 64, 56, 64, 1, 1, 0)
-    verify_conv2d_nchw(1, 64, 56, 128, 3, 2, 1)
-    verify_conv2d_nchw(1, 64, 56, 128, 1, 2, 0)
-    verify_conv2d_nchw(1, 128, 28, 128, 3, 1, 1)
-    verify_conv2d_nchw(1, 128, 28, 256, 3, 2, 1)
-    verify_conv2d_nchw(1, 128, 28, 256, 1, 2, 0)
-    verify_conv2d_nchw(1, 256, 14, 256, 3, 1, 1)
-    verify_conv2d_nchw(1, 256, 14, 512, 3, 2, 1)
-    verify_conv2d_nchw(1, 256, 14, 512, 1, 2, 0)
-    verify_conv2d_nchw(1, 512, 7, 512, 3, 1, 1)
-    # ResNet50 workloads
-    verify_conv2d_nchw(1, 64, 56, 256, 1, 1, 0)
-    verify_conv2d_nchw(1, 256, 56, 64, 1, 1, 0)
-    verify_conv2d_nchw(1, 256, 56, 128, 1, 2, 0)
-    verify_conv2d_nchw(1, 128, 28, 512, 1, 1, 0)
-    verify_conv2d_nchw(1, 256, 56, 512, 1, 2, 0)
-    verify_conv2d_nchw(1, 512, 28, 128, 1, 1, 0)
-    verify_conv2d_nchw(1, 512, 28, 256, 1, 2, 0)
-    verify_conv2d_nchw(1, 256, 14, 1024, 1, 1, 0)
-    verify_conv2d_nchw(1, 512, 28, 1024, 1, 2, 0)
-    verify_conv2d_nchw(1, 1024, 14, 256, 1, 1, 0)
-    verify_conv2d_nchw(1, 1024, 14, 512, 1, 2, 0)
-    verify_conv2d_nchw(1, 512, 7, 2048, 1, 2, 0)
-    verify_conv2d_nchw(1, 1024, 14, 2048, 1, 2, 0)
-    verify_conv2d_nchw(1, 2048, 7, 512, 1, 1, 0)
-    # Vgg16 workloads
-    verify_conv2d_nchw(1, 128, 122, 128, 3, 1, 1)
-    # Super resolution workloads
-    verify_conv2d_nchw(1, 1, 224, 64, 5, 1, 2)
-    verify_conv2d_nchw(1, 64, 224, 64, 3, 1, 1)
-    verify_conv2d_nchw(1, 64, 224, 32, 3, 1, 1)
-    verify_conv2d_nchw(1, 32, 224, 9, 3, 1, 1)
+    verify_conv2d_nchw(1,   3, 224,  64, 7, 2, 3)
+    verify_conv2d_nchw(1,  64,  56,  64, 3, 1, 1)
+    verify_conv2d_nchw(1,  64,  56,  64, 1, 1, 0)
+    verify_conv2d_nchw(1,  64,  56, 128, 3, 2, 1)
+    verify_conv2d_nchw(1,  64,  56, 128, 1, 2, 0)
+    verify_conv2d_nchw(1, 128,  28, 128, 3, 1, 1)
+    verify_conv2d_nchw(1, 128,  28, 256, 3, 2, 1)
+    verify_conv2d_nchw(1, 128,  28, 256, 1, 2, 0)
+    verify_conv2d_nchw(1, 256,  14, 256, 3, 1, 1)
+    verify_conv2d_nchw(1, 256,  14, 512, 3, 2, 1)
+    verify_conv2d_nchw(1, 256,  14, 512, 1, 2, 0)
+    verify_conv2d_nchw(1, 512,   7, 512, 3, 1, 1)
+
+    # bias, relu
+    verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, add_relu=True)
+    verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, add_bias=True)
+    verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, add_bias=True, add_relu=True)
+
     # dilation = 2
-    verify_conv2d_nchw(1, 128, 122, 128, 3, 1, 1, dilation=2)
+    verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, dilation=2)
+
+    # batch size
+    verify_conv2d_nchw(4, 64, 56, 64, 3, 1, 1)
+    verify_conv2d_nchw(9, 64, 56, 64, 3, 1, 1)
+
+    # weird workloads
+    verify_conv2d_nchw(2, 2, 2, 2, 2, 2, 2)
+    verify_conv2d_nchw(3, 3, 3, 3, 3, 3, 3)
+    verify_conv2d_nchw(4, 4, 4, 4, 4, 4, 4)
+    verify_conv2d_nchw(5, 5, 5, 5, 5, 5, 5)
+    verify_conv2d_nchw(6, 6, 6, 6, 6, 6, 6)
+
+    # disable these tests due to some bugs of llvm with nvptx
+    # verify_conv2d_nchw(1, 1, 1, 1, 1, 1, 1, dilation=1)
+    # verify_conv2d_nchw(1, 1, 1, 1, 1, 1, 1, dilation=2)
+    # verify_conv2d_nchw(2, 13, 71, 59, 3, 1, 1)
+
+    # inception v3 workloads
+    verify_conv2d_nchw(1,    3, 299,  32, 3, 2, 0)
+    verify_conv2d_nchw(1,   32, 149,  32, 3, 1, 0)
+    verify_conv2d_nchw(1,   32, 147,  64, 3, 1, 1)
+    verify_conv2d_nchw(1,   64,  73,  80, 1, 1, 0)
+    verify_conv2d_nchw(1,   80,  73, 192, 3, 1, 0)
+    verify_conv2d_nchw(1,  192,  35,  64, 1, 1, 0)
+    verify_conv2d_nchw(1,  192,  35,  48, 1, 1, 0)
+    verify_conv2d_nchw(1,   48,  35,  64, 5, 1, 2)
+    verify_conv2d_nchw(1,   64,  35,  96, 3, 1, 1)
+    verify_conv2d_nchw(1,   96,  35,  96, 3, 1, 1)
+    verify_conv2d_nchw(1,  192,  35,  32, 1, 1, 0)
+    verify_conv2d_nchw(1,  256,  35,  64, 1, 1, 0)
+    verify_conv2d_nchw(1,  256,  35,  48, 1, 1, 0)
+    verify_conv2d_nchw(1,  288,  35,  64, 1, 1, 0)
+    verify_conv2d_nchw(1,  288,  35,  48, 1, 1, 0)
+    verify_conv2d_nchw(1,  288,  35, 384, 3, 2, 0)
+    verify_conv2d_nchw(1,   96,  35,  96, 3, 2, 0)
+    verify_conv2d_nchw(1,  768,  17, 192, 1, 1, 0)
+    verify_conv2d_nchw(1,  768,  17, 128, 1, 1, 0)
+    verify_conv2d_nchw(1,  128,  17, 128, 1, 1, 0)
+    verify_conv2d_nchw(1,  128,  17, 192, 7, 1, 3)
+    verify_conv2d_nchw(1,  128,  17, 128, 7, 1, 3)
+    verify_conv2d_nchw(1,  128,  17, 192, 1, 1, 0)
+    verify_conv2d_nchw(1,  768,  17, 160, 1, 1, 0)
+    verify_conv2d_nchw(1,  160,  17, 160, 1, 1, 0)
+    verify_conv2d_nchw(1,  160,  17, 192, 7, 1, 3)
+    verify_conv2d_nchw(1,  160,  17, 160, 7, 1, 3)
+    verify_conv2d_nchw(1,  160,  17, 192, 1, 1, 0)
+    verify_conv2d_nchw(1,  192,  17, 192, 1, 1, 0)
+    verify_conv2d_nchw(1,  192,  17, 192, 7, 1, 3)
+    verify_conv2d_nchw(1,  192,  17, 320, 3, 2, 0)
+    verify_conv2d_nchw(1,  192,  17, 192, 3, 2, 0)
+    verify_conv2d_nchw(1, 1280,   8, 320, 1, 1, 0)
+    verify_conv2d_nchw(1, 1280,   8, 384, 1, 1, 0)
+    verify_conv2d_nchw(1,  384,   8, 384, 1, 1, 0)
+    verify_conv2d_nchw(1,  384,   8, 384, 3, 1, 1)
+    verify_conv2d_nchw(1, 1280,   8, 448, 1, 1, 0)
+    verify_conv2d_nchw(1,  448,   8, 384, 3, 1, 1)
+    verify_conv2d_nchw(1, 1280,   8, 192, 1, 1, 0)
+    verify_conv2d_nchw(1, 2048,   8, 320, 1, 1, 0)
+    verify_conv2d_nchw(1, 2048,   8, 384, 1, 1, 0)
+    verify_conv2d_nchw(1, 2048,   8, 448, 1, 1, 0)
+    verify_conv2d_nchw(1, 2048,   8, 192, 1, 1, 0)
+    verify_conv2d_nchw(1, 1024,  19,  84, 3, 1, 1)
+    verify_conv2d_nchw(1, 2048,  10, 126, 3, 1, 1)
+    verify_conv2d_nchw(1,  512,   5, 126, 3, 1, 1)
+    verify_conv2d_nchw(1,  256,   3, 126, 3, 1, 1)
+
 
 if __name__ == "__main__":
     test_conv2d_nchw()
diff --git a/topi/tests/python/test_topi_conv2d_nhwc.py b/topi/tests/python/test_topi_conv2d_nhwc.py
index 7e41517c5d61..af55f5bc172c 100644
--- a/topi/tests/python/test_topi_conv2d_nhwc.py
+++ b/topi/tests/python/test_topi_conv2d_nhwc.py
@@ -13,18 +13,17 @@ def verify_conv2d_nhwc(batch, in_channel, in_size, num_filter, kernel, stride, p
 
     A = tvm.placeholder((batch, in_height, in_width, in_channel), name='A')
     W = tvm.placeholder((kernel, kernel, in_channel, num_filter), name='W')
-    dW = topi.nn.dilate(W, (1, dilation, dilation, 1))
-    B = topi.nn.conv2d_nhwc(A, dW, stride, padding)
+    B = topi.nn.conv2d_nhwc(A, W, stride, padding, dilation)
 
     a_shape = get_const_tuple(A.shape)
     w_shape = get_const_tuple(W.shape)
     dtype = A.dtype
 
-    @memoize("topi.tests.test_topi_conv2d_nhwc.verify_nhwc")
+    @memoize("topi.tests.test_topi_conv2d_nhwc.verify_nhwc.v2")
     def get_ref_data():
         a_np = np.random.uniform(size=a_shape).astype(dtype)
         w_np = np.random.uniform(size=w_shape).astype(dtype)
-        dw_np = topi.testing.dilate_python(w_np, (1, dilation, dilation, 1))
+        dw_np = topi.testing.dilate_python(w_np, (dilation, dilation, 1, 1))
         b_np = topi.testing.conv2d_nhwc_python(a_np, dw_np, stride, padding)
         return a_np, w_np, b_np
     a_np, w_np, b_np = get_ref_data()
@@ -42,7 +41,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         func = tvm.build(s, [A, W, B], device)
         func(a, w, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['llvm']:
         check_device(device)
diff --git a/topi/tests/python/test_topi_conv2d_transpose_nchw.py b/topi/tests/python/test_topi_conv2d_transpose_nchw.py
index 0c985400031a..296772f4e9f5 100644
--- a/topi/tests/python/test_topi_conv2d_transpose_nchw.py
+++ b/topi/tests/python/test_topi_conv2d_transpose_nchw.py
@@ -6,14 +6,13 @@
 from tvm.contrib.pickle_memoize import memoize
 from topi.util import get_const_tuple
 
+from common import get_all_backend
 
 def verify_conv2d_transpose_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding):
     in_height = in_width = in_size
 
     A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
     W = tvm.placeholder((in_channel, num_filter, kernel, kernel), name='W')
-    B = topi.nn.conv2d_transpose_nchw(A, W, [stride, stride], padding, A.dtype)
-    C = topi.nn.relu(B)
 
     a_shape = get_const_tuple(A.shape)
     w_shape = get_const_tuple(W.shape)
@@ -36,22 +35,23 @@ def check_device(device):
             return
         print("Running on target: %s" % device)
         with tvm.target.create(device):
+            B = topi.nn.conv2d_transpose_nchw(A, W, [stride, stride], [padding, padding], A.dtype)
+            C = topi.nn.relu(B)
             s1 = topi.generic.schedule_conv2d_transpose_nchw([B])
             s2 = topi.generic.schedule_conv2d_transpose_nchw([C])
         a = tvm.nd.array(a_np, ctx)
         w = tvm.nd.array(w_np, ctx)
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
-        with tvm.build_config(auto_unroll_max_step=128,
-                              unroll_explicit=(device != "cuda")):
-            func1 = tvm.build(s1, [A, W, B], device)
-            func2 = tvm.build(s2, [A, W, C], device)
-            func1(a, w, b)
-            func2(a, w, c)
-            np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
-            np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
 
-    for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
+        func1 = tvm.build(s1, [A, W, B], device)
+        func2 = tvm.build(s2, [A, W, C], device)
+        func1(a, w, b)
+        func2(a, w, c)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+
+    for device in get_all_backend():
         check_device(device)
 
 
diff --git a/topi/tests/python/test_topi_conv2d_winograd.py b/topi/tests/python/test_topi_conv2d_winograd.py
new file mode 100644
index 000000000000..1ca7240a41b0
--- /dev/null
+++ b/topi/tests/python/test_topi_conv2d_winograd.py
@@ -0,0 +1,109 @@
+"""Example code to do convolution."""
+
+import numpy as np
+import tvm
+from tvm import autotvm
+from tvm.autotvm.task.space import FallbackConfigEntity
+import topi
+import topi.testing
+from tvm.contrib.pickle_memoize import memoize
+from topi.util import get_const_tuple
+
+
+def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False):
+    print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
+
+    in_height = in_width = in_size
+
+    A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
+    W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W')
+    bias = tvm.placeholder((num_filter, 1, 1), name='bias')
+
+    a_shape = get_const_tuple(A.shape)
+    w_shape = get_const_tuple(W.shape)
+    bias_shape = get_const_tuple(bias.shape)
+    dtype = A.dtype
+
+    @memoize("topi.tests.test_topi_conv2d_nchw.verify_conv2d_nchw")
+    def get_ref_data():
+        a_np = np.random.uniform(size=a_shape).astype(dtype)
+        w_np = np.random.uniform(size=w_shape).astype(dtype)
+        b_np = np.random.uniform(size=bias_shape).astype(dtype)
+        dw_np = topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
+        c_np = topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding)
+        if add_bias:
+            b_np = np.random.uniform(size=bias_shape).astype(dtype)
+            c_np += b_np
+        if add_relu:
+            c_np = np.maximum(c_np, 0)
+        return a_np, w_np, b_np, c_np
+
+    a_np, w_np, b_np, c_np = get_ref_data()
+
+    def check_device(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+        print("Running on target: %s" % device)
+        with tvm.target.create(device):
+            C = topi.nn.conv2d(A, W, stride, padding, dilation, layout='NCHW', out_dtype=dtype)
+            if add_bias:
+                C = topi.add(C, bias)
+            if add_relu:
+                C = topi.nn.relu(C)
+            s = topi.generic.schedule_conv2d_nchw([C])
+
+        a = tvm.nd.array(a_np, ctx)
+        w = tvm.nd.array(w_np, ctx)
+        b = tvm.nd.array(b_np, ctx)
+        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
+        if add_bias:
+            func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
+            func(a, w, b, c)
+        else:
+            func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
+            func(a, w, c)
+        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+
+
+    for device in ['cuda', 'llvm -device=arm_cpu', 'opencl -device=mali']:
+        check_device(device)
+
+
+class WinogradFallback(autotvm.FallbackContext):
+    def _query_inside(self, target, workload):
+        key = (target, workload)
+        if key in self.memory:
+            return self.memory[key]
+        cfg = FallbackConfigEntity()
+        cfg.template_key = 'winograd'
+        self.memory[key] = cfg
+        return cfg
+
+
+def test_conv2d_nchw():
+    autotvm.DispatchContext.current.silent = True
+
+    with WinogradFallback():
+        # resnet 18 workloads
+        verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1)
+        verify_conv2d_nchw(1, 128, 28, 128, 3, 1, 1)
+        verify_conv2d_nchw(1, 256, 14, 256, 3, 1, 1)
+        verify_conv2d_nchw(1, 512, 7, 512, 3, 1, 1)
+
+        # batch size = 2
+        verify_conv2d_nchw(2, 64, 56, 64, 3, 1, 1)
+
+        # relu, bias
+        verify_conv2d_nchw(2, 64, 56, 64, 3, 1, 1, add_bias=True)
+        verify_conv2d_nchw(2, 64, 56, 64, 3, 1, 1, add_relu=True)
+        verify_conv2d_nchw(2, 64, 56, 64, 3, 1, 1, add_relu=True, add_bias=True)
+
+        # werid workloads
+        verify_conv2d_nchw(1, 1, 1, 1, 3, 1, 1)
+        verify_conv2d_nchw(3, 3, 3, 3, 3, 1, 1)
+        verify_conv2d_nchw(2, 13, 71, 59, 3, 1, 1)
+
+if __name__ == "__main__":
+    test_conv2d_nchw()
diff --git a/topi/tests/python/test_topi_dense.py b/topi/tests/python/test_topi_dense.py
index 2df43eb30887..60ef4be4c8e0 100644
--- a/topi/tests/python/test_topi_dense.py
+++ b/topi/tests/python/test_topi_dense.py
@@ -6,13 +6,12 @@
 from topi.util import get_const_tuple
 from tvm.contrib.pickle_memoize import memoize
 
+from common import get_all_backend
 
 def verify_dense(batch, in_dim, out_dim, use_bias=True):
     A = tvm.placeholder((batch, in_dim), name='A')
     B = tvm.placeholder((out_dim, in_dim), name='B')
     C = tvm.placeholder((out_dim,), name='C')
-    D = topi.nn.dense(A, B, C if use_bias else None)
-    D = topi.nn.relu(D)
     dtype = A.dtype
 
     # use memoize to pickle the test data for next time use
@@ -36,6 +35,8 @@ def check_device(device):
             return
         print("Running on target: %s" % device)
         with tvm.target.create(device):
+            D = topi.nn.dense(A, B, C if use_bias else None)
+            D = topi.nn.relu(D)
             s = topi.generic.schedule_dense(D)
         a = tvm.nd.array(a_np, ctx)
         b = tvm.nd.array(b_np, ctx)
@@ -43,15 +44,17 @@ def check_device(device):
         d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx)
         f = tvm.build(s, [A, B, C, D], device, name="dense")
         f(a, b, c, d)
-        np.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5)
+        tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5)
 
-    for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
+    for device in get_all_backend():
         check_device(device)
 
 def test_dense():
     verify_dense(1, 1024, 1000, use_bias=True)
     verify_dense(1, 1024, 1000, use_bias=False)
 
+    verify_dense(2, 1024, 1000, use_bias=True)
+
 
 if __name__ == "__main__":
     test_dense()
diff --git a/topi/tests/python/test_topi_depthwise_conv2d.py b/topi/tests/python/test_topi_depthwise_conv2d.py
index 3086054ba487..98c93dff9993 100644
--- a/topi/tests/python/test_topi_depthwise_conv2d.py
+++ b/topi/tests/python/test_topi_depthwise_conv2d.py
@@ -1,27 +1,35 @@
 import tvm
+from tvm import autotvm
 import topi
 import topi.testing
 import numpy as np
-from scipy import signal
 from topi.util import get_const_tuple
+from topi.nn.util import get_pad_tuple
 from tvm.contrib.pickle_memoize import memoize
-from topi.cuda.depthwise_conv2d import schedule_depthwise_conv2d_nhwc
 
+from common import get_all_backend
 
 def depthwise_conv2d_with_workload_nchw(batch, in_channel, in_height, channel_multiplier, filter_height, stride, padding, dilation=1):
     in_width = in_height
     filter_channel = in_channel
     filter_width = filter_height
+    stride_h = stride_w = stride
+
+    if dilation == 1:
+        # here we transform the padding argument from 'str' to  'tuple' ,
+        # because we need this to match the "workload" tuple to the records in TopHub
+        pad_h, pad_w, _, _ = get_pad_tuple(padding, (filter_height, filter_width))
+        padding_args = (pad_h, pad_w)
+    else:
+        padding_args = padding
+
     # placeholder
     Input = tvm.placeholder((batch, in_channel, in_height, in_width), name='Input')
     Filter = tvm.placeholder((filter_channel, channel_multiplier, filter_height, filter_width), name='Filter')
-    DilatedFilter = topi.nn.dilate(Filter, (1, 1, dilation, dilation), name='DilatedFilter')
     Scale = tvm.placeholder((in_channel * channel_multiplier,), name='Scale')
     Shift = tvm.placeholder((in_channel * channel_multiplier,), name='Shift')
-    # declare
-    DepthwiseConv2d = topi.nn.depthwise_conv2d_nchw(Input, DilatedFilter, stride=stride, padding=padding)
-    ScaleShift = topi.nn.scale_shift_nchw(DepthwiseConv2d, Scale, Shift)
-    Relu = topi.nn.relu(ScaleShift)
+
+    dtype = 'float32'
 
     def check_device(device):
         ctx = tvm.context(device, 0)
@@ -30,6 +38,11 @@ def check_device(device):
             return
         print("Running on target: %s" % device)
         with tvm.target.create(device):
+            # declare
+            DepthwiseConv2d = topi.nn.depthwise_conv2d_nchw(Input, Filter,
+                (stride_h, stride_w), padding_args, dilation, dtype)
+            ScaleShift = topi.nn.scale_shift_nchw(DepthwiseConv2d, Scale, Shift)
+            Relu = topi.nn.relu(ScaleShift)
             # schedule
             s1 = topi.generic.schedule_depthwise_conv2d_nchw(DepthwiseConv2d)
             s2 = topi.generic.schedule_depthwise_conv2d_nchw(ScaleShift)
@@ -40,7 +53,6 @@ def check_device(device):
         f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device)
 
         # Prepare pod type for test data closure
-        dtype = Input.dtype
         input_shape = get_const_tuple(Input.shape)
         filter_shape = get_const_tuple(Filter.shape)
         scale_shape = get_const_tuple(Scale.shape)
@@ -57,7 +69,7 @@ def get_ref_data():
             shift_np = np.random.uniform(size=shift_shape).astype(dtype)
             # correctness with scipy
             depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nchw(
-                input_np, dilated_filter_np, stride=stride, padding=padding)
+                input_np, dilated_filter_np, stride, padding)
             scale_shift_scipy = np.zeros(shape=scale_shift_shape)
             for c in range(in_channel * channel_multiplier):
                 scale_shift_scipy[:,c,:,:] = depthwise_conv2d_scipy[:,c,:,:] * scale_np[c] + shift_np[c]
@@ -84,16 +96,13 @@ def get_ref_data():
         # launch kernel 3 (depthwise_conv2d + scale_shift + relu)
         timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1)
         tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean
-        np.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5)
-        np.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5)
-        np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
+        tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5)
+        tvm.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5)
+        tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
 
-    check_device("opencl")
-    check_device("cuda")
-    check_device("metal")
-    check_device("rocm")
-    check_device("vulkan")
-    check_device("nvptx")
+    for device in get_all_backend():
+        with autotvm.tophub.context(device):  # load tophub pre-tuned parameters
+            check_device(device)
 
 
 def depthwise_conv2d_with_workload_nhwc(batch, in_channel, in_height, channel_multiplier, filter_height, stride_h, padding, dilation=1):
@@ -101,17 +110,22 @@ def depthwise_conv2d_with_workload_nhwc(batch, in_channel, in_height, channel_mu
     filter_channel = in_channel
     filter_width = filter_height
     stride_w = stride_h
+
+    if dilation == 1:
+        # here we transform the padding argument from 'str' to  'tuple' ,
+        # because we need this to match the "workload" tuple to the records in TopHub
+        pad_h, pad_w, _, _ = get_pad_tuple(padding, (filter_height, filter_width))
+        padding_args = (pad_h, pad_w)
+    else:
+        padding_args = padding
+
     # placeholder
     Input = tvm.placeholder((batch, in_height, in_width, in_channel), name='Input')
     Filter = tvm.placeholder((filter_height, filter_width,filter_channel, channel_multiplier), name='Filter')
-    DilatedFilter = topi.nn.dilate(Filter, (1, 1, dilation, dilation), name='DilatedFilter')
     Scale = tvm.placeholder((in_channel * channel_multiplier,), name='Scale')
     Shift = tvm.placeholder((in_channel * channel_multiplier,), name='Shift')
-    # declare
-    DepthwiseConv2d = topi.nn.depthwise_conv2d_nhwc(Input, DilatedFilter, stride=[stride_h, stride_w], padding=padding)
-    ScaleShift = topi.nn.scale_shift_nhwc(DepthwiseConv2d, Scale, Shift)
-    Relu = topi.nn.relu(ScaleShift)
-    # schedule
+
+    dtype = 'float32'
 
     def check_device(device):
         ctx = tvm.context(device, 0)
@@ -121,6 +135,12 @@ def check_device(device):
         print("Running on target: %s" % device)
 
         with tvm.target.create(device):
+            # declare
+            DepthwiseConv2d = topi.nn.depthwise_conv2d_nhwc(Input, Filter,
+                (stride_h, stride_w), padding_args, dilation, dtype)
+            ScaleShift = topi.nn.scale_shift_nhwc(DepthwiseConv2d, Scale, Shift)
+            Relu = topi.nn.relu(ScaleShift)
+            # schedule
             s1 = topi.generic.schedule_depthwise_conv2d_nhwc(DepthwiseConv2d)
             s2 = topi.generic.schedule_depthwise_conv2d_nhwc(ScaleShift)
             s3 = topi.generic.schedule_depthwise_conv2d_nhwc(Relu)
@@ -130,7 +150,6 @@ def check_device(device):
         f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device)
 
         # Prepare pod type for test data closure
-        dtype = Input.dtype
         input_shape = get_const_tuple(Input.shape)
         filter_shape = get_const_tuple(Filter.shape)
         scale_shape = get_const_tuple(Scale.shape)
@@ -138,11 +157,11 @@ def check_device(device):
         scale_shift_shape = get_const_tuple(ScaleShift.shape)
 
         # Use memoize, pickle the test data for next time use.
-        @memoize("topi.tests.test_topi_depthwise_conv2d.nhwc")
+        @memoize("topi.tests.test_topi_depthwise_conv2d.nhwc.v2")
         def get_ref_data():
             input_np = np.random.uniform(size=input_shape).astype(dtype)
             filter_np = np.random.uniform(size=filter_shape).astype(dtype)
-            dilated_filter_np = topi.testing.dilate_python(filter_np, (1, 1, dilation, dilation))
+            dilated_filter_np = topi.testing.dilate_python(filter_np, (dilation, dilation, 1, 1))
             scale_np = np.random.uniform(size=scale_shape).astype(dtype)
             shift_np = np.random.uniform(size=shift_shape).astype(dtype)
             # correctness with scipy
@@ -176,40 +195,159 @@ def get_ref_data():
         timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1)
         tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean
         relu_scipy = np.maximum(scale_shift_scipy, 0)
-        np.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5)
-        np.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5)
-        np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
+        tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5)
+        tvm.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5)
+        tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
+
+    for device in get_all_backend():
+        with autotvm.tophub.context(device):  # load tophub pre-tuned parameters
+            check_device(device)
+
+def _transform_data(data, bn):
+    # NCHW -> NCHW[x]c
+    batch_size, channel, height, width = data.shape
+    data = np.reshape(data, (batch_size, channel//bn, bn, height, width))
+    data = np.transpose(data, (0, 1, 3, 4, 2))
+    return data
+
+def _transform_kernel(kernel, bn):
+    # channel, channel_multiplier, kh, kw -> out_channel_chunk, kh, kw, out_channel_block
+    channel, channel_multiplier, kh, kw = kernel.shape
+    out_channel = channel * channel_multiplier
+    kernel = np.reshape(kernel, (out_channel//bn, bn, kh, kw))
+    kernel = np.transpose(kernel, (0, 2, 3, 1))
+    return kernel
+
+def depthwise_conv2d_with_workload_NCHWc(batch, in_channel, in_height, channel_multiplier, filter_height, stride, padding, dilation=1):
+    in_width = in_height
+    filter_channel = in_channel
+    filter_width = filter_height
+    stride_h = stride_w = stride
+
+    assert dilation == 1, "depthwise_conv2d_NCHWc currently does not support dilation."
+    pad_h, pad_w, _, _ = get_pad_tuple(padding, (filter_height, filter_width))
+    padding_args = (pad_h, pad_w)
+
+    out_channel = filter_channel * channel_multiplier
+    # for testing functionality,
+    # we choose arbitrary block size that can divide the channel,
+    # regardless of the performance.
+    oc_block = 1
+    for bn in range(16, 0, -1):
+        if out_channel % bn == 0:
+            oc_block = bn
+            break
+
+    ic_block = 1
+    for bn in range(oc_block, 0, -1):
+        if in_channel % bn == 0:
+            ic_block = bn
+            break
+
+    # placeholder
+    Input = tvm.placeholder((batch, in_channel//ic_block, in_height, in_width, ic_block), name='Input')
+    Filter = tvm.placeholder((out_channel//oc_block, filter_height, filter_width, oc_block), name='Filter')
+    in_layout = "NCHW%dc" % ic_block
+    out_layout = "NCHW%dc" % oc_block
+    dtype = 'float32'
+
+    def check_device(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+        print("Running on target: %s" % device)
+        with tvm.target.create(device):
+            # declare
+            DepthwiseConv2d = topi.nn.depthwise_conv2d_NCHWc(Input, Filter,
+                                                             (stride_h, stride_w),
+                                                             padding_args,
+                                                             (dilation, dilation),
+                                                             in_layout,
+                                                             out_layout, dtype)
+            # TODO: add scale_shift implement for NCHWc and add test here
+            Relu = topi.nn.relu(DepthwiseConv2d)
+            # schedule
+            s1 = topi.generic.schedule_depthwise_conv2d_nchw(DepthwiseConv2d)
+            s2 = topi.generic.schedule_depthwise_conv2d_nchw(Relu)
+        # build the kernels
+        f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device)
+        f2 = tvm.build(s2, [Input, Filter, Relu], device)
+
+        # Prepare pod type for test data closure
+        input_shape = (batch, in_channel, in_height, in_width)
+        filter_shape = (filter_channel, channel_multiplier, filter_height, filter_width)
+
+        # Use memoize, pickle the test data for next time use.
+        @memoize("topi.tests.test_topi_depthwise_conv2d.NCHWc")
+        def get_ref_data():
+            input_np = np.random.uniform(size=input_shape).astype(dtype)
+            filter_np = np.random.uniform(size=filter_shape).astype(dtype)
+            # correctness with scipy
+            depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nchw(
+                input_np, filter_np, stride, padding)
+            relu_scipy = np.maximum(depthwise_conv2d_scipy, 0)
+            return (_transform_data(input_np, ic_block),
+                    _transform_kernel(filter_np, oc_block),
+                    _transform_data(depthwise_conv2d_scipy, oc_block),
+                    _transform_data(relu_scipy, oc_block))
+
+        # Get the test data
+        (input_np, filter_np, depthwise_conv2d_scipy, relu_scipy) = get_ref_data()
+
+        input_tvm = tvm.nd.array(input_np, ctx)
+        filter_tvm = tvm.nd.array(filter_np, ctx)
+        depthwise_conv2d_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape),
+                                                     dtype=DepthwiseConv2d.dtype), ctx)
+        relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx)
+        # launch kernel 1 (depthwise_conv2d)
+        f1(input_tvm, filter_tvm, depthwise_conv2d_tvm)
+        # launch kernel 2 (depthwise_conv2d + relu)
+        f2(input_tvm, filter_tvm, relu_tvm)
+        tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5)
+        tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
+
+    # test llvm only for now since depthwise_conv2d_NCHWc implement is missing in other backend.
+    for device in ["llvm"]:
+        with autotvm.tophub.context(device):  # load tophub pre-tuned parameters
+            check_device(device)
 
-    check_device("opencl")
-    check_device("cuda")
-    check_device("metal")
-    check_device("rocm")
-    check_device("vulkan")
-    check_device("nvptx")
 
 def test_depthwise_conv2d():
-    print("testing nchw")
-    depthwise_conv2d_with_workload_nchw(1, 728, 64, 1, 3, 1, "SAME")
+    # mobilenet workloads
+    depthwise_conv2d_with_workload_nchw(1, 32, 112, 1, 3, 1, "SAME")
+    depthwise_conv2d_with_workload_nchw(1, 64, 112, 1, 3, 2, "SAME")
+    depthwise_conv2d_with_workload_nchw(1, 128, 56, 1, 3, 1, "SAME")
+    depthwise_conv2d_with_workload_nchw(1, 128, 56, 1, 3, 2, "SAME")
+    depthwise_conv2d_with_workload_nchw(1, 256, 28, 1, 3, 1, "SAME")
+    depthwise_conv2d_with_workload_nchw(1, 256, 28, 1, 3, 2, "SAME")
+    depthwise_conv2d_with_workload_nchw(1, 512, 14, 1, 3, 1, "SAME")
+    depthwise_conv2d_with_workload_nchw(1, 512, 14, 1, 3, 2, "SAME")
+    depthwise_conv2d_with_workload_nchw(1, 1024, 7, 1, 3, 1, "SAME")
+
+    # NCHW
     depthwise_conv2d_with_workload_nchw(1, 728, 32, 1, 3, 1, "SAME")
     depthwise_conv2d_with_workload_nchw(4, 256, 64, 2, 5, 2, "SAME")
-    depthwise_conv2d_with_workload_nchw(4, 256, 32, 2, 5, 2, "SAME")
-    depthwise_conv2d_with_workload_nchw(1, 728, 64, 1, 3, 1, "VALID")
     depthwise_conv2d_with_workload_nchw(1, 728, 32, 1, 3, 1, "VALID")
     depthwise_conv2d_with_workload_nchw(4, 256, 64, 2, 5, 2, "VALID")
-    depthwise_conv2d_with_workload_nchw(4, 256, 32, 2, 5, 2, "VALID")
     # dilation = 2
     depthwise_conv2d_with_workload_nchw(1, 728, 64, 1, 3, 1, "SAME", dilation=2)
-    print("testing nhwc")
-    depthwise_conv2d_with_workload_nhwc(1, 728, 64, 1, 3, 1, "SAME")
+
+    # NHWC
     depthwise_conv2d_with_workload_nhwc(1, 728, 32, 1, 3, 1, "SAME")
     depthwise_conv2d_with_workload_nhwc(4, 256, 64, 2, 5, 2, "SAME")
-    depthwise_conv2d_with_workload_nhwc(4, 256, 32, 2, 5, 2, "SAME")
-    depthwise_conv2d_with_workload_nhwc(1, 728, 64, 1, 3, 1, "VALID")
     depthwise_conv2d_with_workload_nhwc(1, 728, 32, 1, 3, 1, "VALID")
     depthwise_conv2d_with_workload_nhwc(4, 256, 64, 2, 5, 2, "VALID")
-    depthwise_conv2d_with_workload_nhwc(4, 256, 32, 2, 5, 2, "VALID")
     # dilation = 2
-    depthwise_conv2d_with_workload_nhwc(1, 728, 64, 1, 3, 1, "SAME", dilation=2)
+    # disabled because it uses too large shared memory on cuda
+    # depthwise_conv2d_with_workload_nhwc(1, 728, 64, 1, 3, 1, "SAME", dilation=2)
+
+    # NCHW[x]c
+    depthwise_conv2d_with_workload_NCHWc(1, 728, 32, 1, 3, 1, "SAME")
+    depthwise_conv2d_with_workload_NCHWc(4, 256, 64, 2, 5, 2, "SAME")
+    depthwise_conv2d_with_workload_NCHWc(1, 728, 32, 1, 3, 1, "VALID")
+    depthwise_conv2d_with_workload_NCHWc(4, 256, 64, 2, 5, 2, "VALID")
+
 
 if __name__ == "__main__":
     test_depthwise_conv2d()
diff --git a/topi/tests/python/test_topi_depthwise_conv2d_back_input.py b/topi/tests/python/test_topi_depthwise_conv2d_back_input.py
index f7c027344840..78b01ef42167 100644
--- a/topi/tests/python/test_topi_depthwise_conv2d_back_input.py
+++ b/topi/tests/python/test_topi_depthwise_conv2d_back_input.py
@@ -80,7 +80,7 @@ def get_ref_data():
         # launch the kernel
         timer = f.time_evaluator(f.entry_name, ctx, number=1)
         tcost = timer(filter_tvm, out_grad_tvm, in_grad_tvm).mean
-        np.testing.assert_allclose(in_grad_np, in_grad_tvm.asnumpy(), rtol=1e-5)
+        tvm.testing.assert_allclose(in_grad_np, in_grad_tvm.asnumpy(), rtol=1e-5)
 
     check_device("opencl")
     check_device("cuda")
diff --git a/topi/tests/python/test_topi_depthwise_conv2d_back_weight.py b/topi/tests/python/test_topi_depthwise_conv2d_back_weight.py
index da5b0351ae3c..50838a7c863f 100644
--- a/topi/tests/python/test_topi_depthwise_conv2d_back_weight.py
+++ b/topi/tests/python/test_topi_depthwise_conv2d_back_weight.py
@@ -73,7 +73,7 @@ def get_ref_data():
         # launch the kernel
         timer = f.time_evaluator(f.entry_name, ctx, number=1)
         tcost = timer(input_tvm, out_grad_tvm, weight_grad_tvm).mean
-        np.testing.assert_allclose(weight_grad_np, weight_grad_tvm.asnumpy(), rtol=1e-4)
+        tvm.testing.assert_allclose(weight_grad_np, weight_grad_tvm.asnumpy(), rtol=1e-4)
 
     check_device("opencl")
     check_device("cuda")
diff --git a/topi/tests/python/test_topi_dilate.py b/topi/tests/python/test_topi_dilate.py
index 9cc44719745a..d1e157f5e52f 100644
--- a/topi/tests/python/test_topi_dilate.py
+++ b/topi/tests/python/test_topi_dilate.py
@@ -19,7 +19,7 @@ def _test_dilate(input_size, strides):
         output_tvm = tvm.nd.array(np.zeros(shape=output_size).astype(Output.dtype), ctx=ctx)
         f = tvm.build(schedule, [Input, Output], target)
         f(input_tvm, output_tvm)
-        np.testing.assert_allclose(output_tvm.asnumpy(), output_np, rtol=1e-5)
+        tvm.testing.assert_allclose(output_tvm.asnumpy(), output_np, rtol=1e-5)
 
     _test_dilate((32,), (2,))
     _test_dilate((32,32), (2,2))
diff --git a/topi/tests/python/test_topi_group_conv2d.py b/topi/tests/python/test_topi_group_conv2d.py
new file mode 100644
index 000000000000..c1ff656fcd93
--- /dev/null
+++ b/topi/tests/python/test_topi_group_conv2d.py
@@ -0,0 +1,215 @@
+"""Example code to do group convolution."""
+
+import numpy as np
+import tvm
+from tvm import autotvm
+from tvm.autotvm.task.space import FallbackConfigEntity
+import topi
+import topi.testing
+from tvm.contrib.pickle_memoize import memoize
+from topi.util import get_const_tuple
+
+from common import get_all_backend, NCHWcInt8Fallback
+
+
+def verify_group_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups, add_bias=False, add_relu=False):
+    print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d, %d)" %
+        (batch, in_channel, in_size, num_filter,
+         kernel, stride, padding, dilation, groups))
+
+    in_height = in_width = in_size
+
+    A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
+    W = tvm.placeholder((num_filter, in_channel // groups, kernel, kernel), name='W')
+    bias = tvm.placeholder((num_filter, 1, 1), name='bias')
+
+    a_shape = get_const_tuple(A.shape)
+    w_shape = get_const_tuple(W.shape)
+    bias_shape = get_const_tuple(bias.shape)
+    dtype = A.dtype
+
+    @memoize("topi.tests.test_topi_group_conv2d.verify_group_conv2d_nchw")
+    def get_ref_data():
+        a_np = np.random.uniform(size=a_shape).astype(dtype)
+        w_np = np.random.uniform(size=w_shape).astype(dtype)
+        b_np = np.random.uniform(size=bias_shape).astype(dtype)
+        dw_np = topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
+        c_np = topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding, groups).astype(dtype)
+
+        if add_bias:
+            b_np = np.random.uniform(size=bias_shape).astype(dtype)
+            c_np += b_np
+        if add_relu:
+            c_np = np.maximum(c_np, 0)
+
+        return a_np, w_np, b_np, c_np
+
+    a_np, w_np, b_np, c_np = get_ref_data()
+
+    def check_device(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+        if device == "cuda" and not tvm.contrib.nvcc.have_int8(ctx.compute_version):
+            print("Skip because int8 intrinsics are not available")
+            return
+
+        print("Running on target: %s" % device)
+        with tvm.target.create(device):
+            C = topi.nn.group_conv2d_nchw(A, W, stride, padding, dilation, groups, out_dtype=dtype)
+            if add_bias:
+                C = topi.add(C, bias)
+            if add_relu:
+                C = topi.nn.relu(C)
+            s = topi.generic.schedule_group_conv2d_nchw([C])
+
+        a = tvm.nd.array(a_np, ctx)
+        w = tvm.nd.array(w_np, ctx)
+        b = tvm.nd.array(b_np, ctx)
+        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
+        if add_bias:
+            func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" %\
+                (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups))
+            func(a, w, b, c)
+        else:
+            func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % \
+            (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups))
+            func(a, w, c)
+        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+
+    for device in ["llvm"]:
+        check_device(device)
+
+
+oc_block_factor = 4
+
+
+def verify_group_conv2d_NCHWc_int8(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups, add_bias=False, add_relu=False):
+    print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d, %d)" %
+        (batch, in_channel, in_size, num_filter,
+         kernel, stride, padding, dilation, groups))
+
+    in_height = in_width = in_size
+
+    A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A', dtype='int8')
+    W = tvm.placeholder((num_filter, in_channel // groups, kernel, kernel), name='W', dtype='int8')
+    bias = tvm.placeholder((num_filter // oc_block_factor, 1, 1, oc_block_factor), name='bias',
+                            dtype='int8')
+
+    a_shape = get_const_tuple(A.shape)
+    w_shape = get_const_tuple(W.shape)
+    bias_shape = get_const_tuple(bias.shape)
+    dtype = A.dtype
+
+    @memoize("topi.tests.test_topi_group_conv2d.verify_group_conv2d_NCHWc_int8")
+    def get_ref_data():
+        a_np = np.random.randint(low=-128, high=127, size=a_shape).astype(dtype)
+        w_np = np.random.randint(low=-128, high=128, size=w_shape).astype(dtype)
+        b_np = np.random.uniform(size=bias_shape).astype(dtype)
+        dw_np = topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
+        c_np = topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding, groups).astype(dtype)
+
+        # convert to NCHWc
+        _, _, out_height, out_width = c_np.shape
+        c_np = c_np.reshape((batch, num_filter // oc_block_factor, oc_block_factor, \
+                out_height, out_width)).transpose(0, 1, 3, 4, 2)
+
+        if add_bias:
+            b_np = np.random.uniform(size=bias_shape).astype(dtype)
+            c_np += b_np
+        if add_relu:
+            c_np = np.maximum(c_np, 0)
+
+        return a_np, w_np, b_np, c_np
+
+    a_np, w_np, b_np, c_np = get_ref_data()
+
+    def check_device(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+        if device == "cuda" and not tvm.contrib.nvcc.have_int8(ctx.compute_version):
+            print("Skip because int8 intrinsics are not available")
+            return
+
+        print("Running on target: %s" % device)
+        with tvm.target.create(device):
+            C = topi.nn.group_conv2d_nchw(A, W, stride, padding, dilation, groups, out_dtype=dtype)
+            if add_bias:
+                C = topi.add(C, bias)
+            if add_relu:
+                C = topi.nn.relu(C)
+            s = topi.generic.schedule_group_conv2d_nchw([C])
+
+        a = tvm.nd.array(a_np, ctx)
+        w = tvm.nd.array(w_np, ctx)
+        b = tvm.nd.array(b_np, ctx)
+        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
+        if add_bias:
+            func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" %\
+                (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups))
+            func(a, w, b, c)
+        else:
+            func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" % \
+            (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation, groups))
+            func(a, w, c)
+        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+
+    for device in ["cuda"]:
+        check_device(device)
+
+
+def test_group_conv2d_nchw():
+    # ResNeXt-50 workload
+    verify_group_conv2d_nchw(1, 128, 56, 128, 3, 1, 1, 1, 32)
+    verify_group_conv2d_nchw(1, 256, 56, 256, 3, 2, 1, 1, 32)
+    verify_group_conv2d_nchw(1, 256, 28, 256, 3, 1, 1, 1, 32)
+    verify_group_conv2d_nchw(1, 512, 28, 512, 3, 2, 1, 1, 32)
+    verify_group_conv2d_nchw(1, 512, 14, 512, 3, 1, 1, 1, 32)
+    verify_group_conv2d_nchw(1, 1024, 14, 1024, 3, 2, 1, 1, 32)
+    verify_group_conv2d_nchw(1, 1024, 7, 1024, 3, 1, 1, 1, 32)
+
+    # bias, relu
+    verify_group_conv2d_nchw(1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True)
+    verify_group_conv2d_nchw(1, 128, 56, 128, 3, 1, 1, 1, 32, add_bias=True)
+    verify_group_conv2d_nchw(1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True,
+                             add_bias=True)
+
+    # dilation
+    verify_group_conv2d_NCHWc_int8(1, 128, 56, 128, 3, 1, 1, 2, 32)
+
+    # batch size
+    verify_group_conv2d_nchw(2, 128, 56, 128, 3, 1, 1, 1, 32)
+    verify_group_conv2d_nchw(9, 128, 56, 128, 3, 1, 1, 1, 32)
+
+
+
+def test_group_conv2d_NCHWc_int8():
+    with NCHWcInt8Fallback():
+        # ResNeXt-50 workload
+        verify_group_conv2d_NCHWc_int8(1, 128, 56, 128, 3, 1, 1, 1, 32)
+        verify_group_conv2d_NCHWc_int8(1, 256, 56, 256, 3, 2, 1, 1, 32)
+        verify_group_conv2d_NCHWc_int8(1, 256, 28, 256, 3, 1, 1, 1, 32)
+        verify_group_conv2d_NCHWc_int8(1, 512, 28, 512, 3, 2, 1, 1, 32)
+        verify_group_conv2d_NCHWc_int8(1, 512, 14, 512, 3, 1, 1, 1, 32)
+        verify_group_conv2d_NCHWc_int8(1, 1024, 14, 1024, 3, 2, 1, 1, 32)
+        verify_group_conv2d_NCHWc_int8(1, 1024, 7, 1024, 3, 1, 1, 1, 32)
+
+        # bias, relu
+        verify_group_conv2d_NCHWc_int8(1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True)
+        verify_group_conv2d_NCHWc_int8(1, 128, 56, 128, 3, 1, 1, 1, 32, add_bias=True)
+        verify_group_conv2d_NCHWc_int8(1, 128, 56, 128, 3, 1, 1, 1, 32, add_relu=True,
+                                       add_bias=True)
+        # dilation
+        verify_group_conv2d_NCHWc_int8(1, 128, 56, 128, 3, 1, 1, 2, 32)
+
+        # batch size
+        verify_group_conv2d_NCHWc_int8(2, 128, 56, 128, 3, 1, 1, 1, 32)
+        verify_group_conv2d_NCHWc_int8(9, 128, 56, 128, 3, 1, 1, 1, 32)
+
+
+if __name__ == "__main__":
+    test_group_conv2d_nchw()
+    test_group_conv2d_NCHWc_int8()
diff --git a/topi/tests/python/test_topi_l2norm.py b/topi/tests/python/test_topi_l2norm.py
index 75dc57057893..2bf799407398 100644
--- a/topi/tests/python/test_topi_l2norm.py
+++ b/topi/tests/python/test_topi_l2norm.py
@@ -29,7 +29,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
         f = tvm.build(s, [A, B], device)
         f(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
         check_device(device)
diff --git a/topi/tests/python/test_topi_lrn.py b/topi/tests/python/test_topi_lrn.py
index 478054ddb134..2f96a86f164e 100644
--- a/topi/tests/python/test_topi_lrn.py
+++ b/topi/tests/python/test_topi_lrn.py
@@ -28,7 +28,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
         f = tvm.build(s, [A, B], device)
         f(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
         check_device(device)
diff --git a/topi/tests/python/test_topi_math.py b/topi/tests/python/test_topi_math.py
index 8d82dbe5bf82..22713aa6cfdd 100644
--- a/topi/tests/python/test_topi_math.py
+++ b/topi/tests/python/test_topi_math.py
@@ -37,9 +37,10 @@ def check_device(device):
             a = tvm.nd.array(a_np, ctx)
             b = tvm.nd.array(np.zeros_like(b_np), ctx)
             foo(a, b)
-            np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5)
+            tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5)
 
-        for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'llvm', 'nvptx', 'sdaccel']:
+        for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'llvm', 'nvptx', 'sdaccel',
+                       'aocl_sw_emu']:
             check_device(device)
 
 
diff --git a/topi/tests/python/test_topi_matmul.py b/topi/tests/python/test_topi_matmul.py
new file mode 100644
index 000000000000..2a8eaeb608dd
--- /dev/null
+++ b/topi/tests/python/test_topi_matmul.py
@@ -0,0 +1,61 @@
+import numpy as np
+import tvm
+import topi
+from topi.util import get_const_tuple
+
+def with_tvm(lam, *args):
+    """ Take numpy arrays as args, convert them to TVM tensors and call `lam`.
+    Result of lambda is converted back to numpy array and returned.
+    """
+    ctx = tvm.cpu(0)
+    pls = []     # placeholders
+    vals_nd = [] # initial values
+    for i,arg in enumerate(args):
+        pls.append(tvm.placeholder(arg.shape, name='pl'+str(i)))
+        vals_nd.append(tvm.nd.array(arg, ctx))
+
+    out = lam(*pls)
+    out_nd = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=out.dtype), ctx)
+    s = tvm.create_schedule([out.op])
+    m = tvm.build(s, pls + [out], "llvm")
+    m(*(vals_nd+[out_nd]))
+    return out_nd.asnumpy()
+
+def verify_matmul(sa, sb, transp_a, transp_b):
+    a = np.random.uniform(low=-1.0, high=1.0, size=sa).astype(np.float32)
+    b = np.random.uniform(low=-1.0, high=1.0, size=sb).astype(np.float32)
+    c1 = np.matmul(np.transpose(a) if transp_a else a,
+                   np.transpose(b) if transp_b else b)
+    c2 = with_tvm(lambda A,B: topi.matmul(A,B,transp_a,transp_b), a,b)
+    tvm.testing.assert_allclose(c1, c2, rtol=1e-5, atol=1e-5)
+
+def test_matmul():
+    verify_matmul((1,1),(1,1),False,False)
+    verify_matmul((1,1),(1,1),True,True)
+    verify_matmul((2,2),(2,2),False,False)
+    verify_matmul((2,2),(2,2),True,True)
+    verify_matmul((2,3),(3,5),False,False)
+    verify_matmul((5,3),(3,2),False,False)
+    verify_matmul((3,5),(3,2),True,False)
+    verify_matmul((3,5),(2,3),True,True)
+
+def verify_tensordot(sa, sb, axes):
+    a = np.random.uniform(low=-1.0, high=1.0, size=sa).astype(np.float32)
+    b = np.random.uniform(low=-1.0, high=1.0, size=sb).astype(np.float32)
+    c1 = np.tensordot(a, b, axes)
+    c2 = with_tvm(lambda A, B: topi.tensordot(A, B, axes), a, b)
+    tvm.testing.assert_allclose(c1, c2, rtol=1e-5, atol=1e-5)
+
+def test_tensordot():
+    verify_tensordot((3), (3), 0)
+    verify_tensordot((2, 3), (3, 5), 1)
+    verify_tensordot((2, 2, 3), (2, 3, 5), 2)
+    verify_tensordot((2, 2, 3, 4), (2, 3, 4, 5), 3)
+    verify_tensordot((3, 2, 2), (2, 3, 5), (1, 0))
+    verify_tensordot((3, 2, 2), (2, 3, 5), ((1, 0), (0, 1)))
+    verify_tensordot((4, 3, 2, 2), (2, 4, 3, 5), ((1, 2, 0), (2, 0, 1)))
+
+if __name__ == "__main__":
+    test_matmul()
+    test_tensordot()
+
diff --git a/topi/tests/python/test_topi_pooling.py b/topi/tests/python/test_topi_pooling.py
index c9f790146b4a..273320fce727 100644
--- a/topi/tests/python/test_topi_pooling.py
+++ b/topi/tests/python/test_topi_pooling.py
@@ -5,14 +5,18 @@
 import math
 from topi.util import get_const_tuple
 
+from common import get_all_backend
+
 def verify_pool(n, ic, ih, kh, sh, padding, pool_type, ceil_mode, count_include_pad=True):
     iw = ih
     kw = kh
     sw = sh
     pt, pl, pb, pr = padding
+    layout = "NCHW"
     A = tvm.placeholder((n, ic, ih, iw), name='A')
     B = topi.nn.pool(A, kernel=[kh, kw], stride=[sh, sw], padding=padding,
-                     pool_type=pool_type, ceil_mode=ceil_mode, count_include_pad=count_include_pad)
+                     pool_type=pool_type, ceil_mode=ceil_mode,
+                     layout="NCHW", count_include_pad=count_include_pad)
     B = topi.nn.relu(B)
     dtype = A.dtype
 
@@ -54,15 +58,15 @@ def check_device(device):
             return
         print("Running on target: %s" % device)
         with tvm.target.create(device):
-            s = topi.generic.schedule_pool(B)
+            s = topi.generic.schedule_pool(B, layout)
 
         a = tvm.nd.array(a_np, ctx)
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
         f = tvm.build(s, [A, B], device)
         f(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
-    for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
+    for device in get_all_backend():
         check_device(device)
 
 def test_pool():
@@ -105,9 +109,9 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         f = tvm.build(s, [A, B], device)
         f(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
-    for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
+    for device in get_all_backend():
         check_device(device)
 
 def test_global_pool():
diff --git a/topi/tests/python/test_topi_reduce.py b/topi/tests/python/test_topi_reduce.py
index 331498deb10c..77a33d86ed3e 100644
--- a/topi/tests/python/test_topi_reduce.py
+++ b/topi/tests/python/test_topi_reduce.py
@@ -4,6 +4,8 @@
 import tvm
 import topi
 
+from common import get_all_backend
+
 def _my_npy_argmax(arr, axis, keepdims):
     if not keepdims:
         return arr.argmax(axis=axis)
@@ -85,16 +87,20 @@ def check_device(device):
                 sel_indices = other_indices[0:axis] + (out_tvm_indices,) + other_indices[axis:]
                 out_tvm_val = in_npy_map[sel_indices]
             if type == "argmax":
-                np.testing.assert_allclose(out_tvm_val, in_npy_map.max(axis=axis), 1E-3, 1E-3)
+                tvm.testing.assert_allclose(out_tvm_val, in_npy_map.max(axis=axis), 1E-3, 1E-3)
             elif type == "argmin":
-                np.testing.assert_allclose(out_tvm_val, in_npy_map.min(axis=axis), 1E-3, 1E-3)
+                tvm.testing.assert_allclose(out_tvm_val, in_npy_map.min(axis=axis), 1E-3, 1E-3)
         else:
-            np.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3)
-    for device in ["cuda", "opencl", "metal", "llvm", "rocm", "vulkan", "nvptx"]:
+            tvm.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3)
+    for device in get_all_backend():
         check_device(device)
 
 
 def test_reduce_map():
+    verify_reduce_map_ele(in_shape=(32,),
+                          axis=0,
+                          keepdims=False,
+                          type="argmax")
     verify_reduce_map_ele(in_shape=(128, 24, 128, 24),
                         axis=(1, 2, 3),
                         keepdims=True,
diff --git a/topi/tests/python/test_topi_region.py b/topi/tests/python/test_topi_region.py
index a2835339e8eb..3357382b232e 100644
--- a/topi/tests/python/test_topi_region.py
+++ b/topi/tests/python/test_topi_region.py
@@ -37,7 +37,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         func = tvm.build(s, [A, B], device)
         func(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['llvm', 'cuda']:
         check_device(device)
diff --git a/topi/tests/python/test_topi_relu.py b/topi/tests/python/test_topi_relu.py
index 7c75a9b08975..a7ff64f0f759 100644
--- a/topi/tests/python/test_topi_relu.py
+++ b/topi/tests/python/test_topi_relu.py
@@ -5,6 +5,8 @@
 import topi
 from topi.util import get_const_tuple
 
+from common import get_all_backend
+
 def verify_relu(m, n):
     A = tvm.placeholder((m, n), name='A')
     B = topi.nn.relu(A)
@@ -25,9 +27,9 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         foo = tvm.build(s, [A, B], device, name="relu")
         foo(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
-    for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx', 'sdaccel']:
+    for device in get_all_backend():
         check_device(device)
 
 
@@ -43,7 +45,7 @@ def verify_leaky_relu(m, alpha):
     b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
     foo = tvm.build(s, [A, B], "llvm", name="leaky_relu")
     foo(a, b)
-    np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+    tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
 
 def verify_prelu(x, w, axis, weight_reshape):
@@ -66,7 +68,7 @@ def _prelu_numpy(x, W):
     foo = tvm.build(s, [X, W, B], "llvm", name="prelu")
     foo(x_tvm, w_tvm, b)
     out_np = _prelu_numpy(x_np, w_np)
-    np.testing.assert_allclose(b.asnumpy(), out_np, rtol=1e-5)
+    tvm.testing.assert_allclose(b.asnumpy(), out_np, rtol=1e-5)
 
 def test_relu():
     verify_relu(10, 128)
diff --git a/topi/tests/python/test_topi_reorg.py b/topi/tests/python/test_topi_reorg.py
index 5b15b9f6c5aa..339cafe3ba41 100644
--- a/topi/tests/python/test_topi_reorg.py
+++ b/topi/tests/python/test_topi_reorg.py
@@ -38,7 +38,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         func = tvm.build(s, [A, B], device)
         func(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['llvm', 'cuda']:
         check_device(device)
diff --git a/topi/tests/python/test_topi_resize.py b/topi/tests/python/test_topi_resize.py
index cb2a69caf22b..6926a3a2a73c 100644
--- a/topi/tests/python/test_topi_resize.py
+++ b/topi/tests/python/test_topi_resize.py
@@ -38,7 +38,7 @@ def check_device(device):
         f = tvm.build(s, [A, B], device)
         f(a, b)
 
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3)
 
     for device in ['llvm', 'cuda', 'vulkan', 'nvptx']:
         check_device(device)
diff --git a/topi/tests/python/test_topi_shortcut.py b/topi/tests/python/test_topi_shortcut.py
index b5840fe8e7b2..f89aa46a1e66 100644
--- a/topi/tests/python/test_topi_shortcut.py
+++ b/topi/tests/python/test_topi_shortcut.py
@@ -36,7 +36,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         func = tvm.build(s, [A1, A2, B], device)
         func(a1, a2, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['llvm', 'cuda']:
         check_device(device)
diff --git a/topi/tests/python/test_topi_softmax.py b/topi/tests/python/test_topi_softmax.py
index f12070695220..1990a9e99d65 100644
--- a/topi/tests/python/test_topi_softmax.py
+++ b/topi/tests/python/test_topi_softmax.py
@@ -7,6 +7,8 @@
 import logging
 from topi.util import get_const_tuple
 
+from common import get_all_backend
+
 def verify_softmax(m, n, dtype="float32"):
     A = tvm.placeholder((m, n), dtype=dtype, name='A')
     B = topi.nn.softmax(A)
@@ -30,7 +32,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         foo = tvm.build(s, [A, B], device, name="softmax")
         foo(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
         check_device(device)
@@ -61,9 +63,9 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         foo = tvm.build(s, [A, B], device, name="log_softmax")
         foo(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
-    for device in ["cuda", "opencl", "metal", "rocm", "vulkan", "nvptx"]:
+    for device in get_all_backend():
         check_device(device)
 
 
diff --git a/topi/tests/python/test_topi_sparse.py b/topi/tests/python/test_topi_sparse.py
new file mode 100644
index 000000000000..16a5ad33f201
--- /dev/null
+++ b/topi/tests/python/test_topi_sparse.py
@@ -0,0 +1,205 @@
+"""Test code for sparse operator"""
+import numpy as np
+import tvm
+import topi
+import topi.testing
+from topi.util import get_const_tuple
+import tvm.contrib.sparse as tvmsp
+from collections import namedtuple
+import time
+
+def verify_dynamic_csrmv(batch, in_dim, out_dim, use_bias=True):
+    nr, nc, n = tvm.var("nr"), tvm.var("nc"), tvm.var("n")
+    dtype = 'float32'
+    A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, dtype=dtype, name='A')
+    B = tvm.placeholder((in_dim, 1), name='B')
+    C = tvm.placeholder((nr,), name='C')
+    D = topi.sparse.csrmv(A, B, C if use_bias else None)
+    s = tvm.create_schedule(D.op)
+    dtype = A.dtype
+
+    # get the test data
+    def get_ref_data():
+        a_np = np.maximum(np.random.uniform(size=(batch, in_dim)).astype(dtype)-0.5, 0.)
+        b_np = np.random.uniform(size=(in_dim, 1)).astype(dtype)-0.5
+        c_np = np.random.uniform(size=(batch, )).astype(dtype)
+        if use_bias:
+            d_np = np.dot(a_np, b_np) + c_np.reshape((batch, 1))
+        else:
+            d_np = np.dot(a_np, b_np)
+        return (a_np, b_np, c_np, d_np)
+    a_np, b_np, c_np, d_np = get_ref_data()
+
+    def check_device(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+        print("Running on target: %s" % device)
+        a = tvmsp.array(a_np, ctx)
+        _nr, _nc, _n = a.shape[0], a.shape[1], a.data.shape[0]
+        assert a.shape[0] == a.indptr.shape[0]-1
+        b = tvm.nd.array(b_np, ctx)
+        c = tvm.nd.array(c_np, ctx)
+        d = tvm.nd.array(np.zeros((_nr, 1), dtype=dtype), ctx)
+        assert a.data.dtype == A.data.dtype
+        assert a.indices.dtype == A.indices.dtype
+        assert a.indptr.dtype == A.indptr.dtype
+        f = tvm.build(s, [nr, A.data, A.indices, A.indptr, B, C, D], device, name="csrmv")
+        f(_nr, a.data, a.indices, a.indptr, b, c, d)
+        tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-4, atol=1e-4)
+
+    for device in ["llvm"]:
+        check_device(device)
+
+def verify_dynamic_csrmm(batch, in_dim, out_dim, use_bias=True):
+    nr, nc, n = tvm.var("nr"), tvm.var("nc"), tvm.var("n")
+    dtype = 'float32'
+    A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, dtype=dtype, name='A')
+    B = tvm.placeholder((in_dim, out_dim), name='B')
+    C = tvm.placeholder((nr,), name='C')
+    D = topi.sparse.csrmm(A, B, C if use_bias else None)
+    s = tvm.create_schedule(D.op)
+    dtype = A.dtype
+
+    # get the test data
+    def get_ref_data():
+        a_np = np.maximum(np.random.uniform(size=(batch, in_dim)).astype(dtype)-0.5, 0.)
+        b_np = np.random.uniform(size=(in_dim, out_dim)).astype(dtype)-0.5
+        c_np = np.random.uniform(size=(batch, )).astype(dtype)
+        if use_bias:
+            d_np = np.dot(a_np, b_np) + c_np.reshape((batch, 1))
+        else:
+            d_np = np.dot(a_np, b_np)
+        return (a_np, b_np, c_np, d_np)
+    a_np, b_np, c_np, d_np = get_ref_data()
+
+    def check_device(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+        print("Running on target: %s" % device)
+        a = tvmsp.array(a_np, ctx)
+        _nr, _nc, _n = a.shape[0], a.shape[1], a.data.shape[0]
+        assert a.shape[0] == a.indptr.shape[0]-1
+        b = tvm.nd.array(b_np, ctx)
+        c = tvm.nd.array(c_np, ctx)
+        d = tvm.nd.array(np.zeros((_nr, out_dim), dtype=dtype), ctx)
+        f = tvm.build(s, [nr, A.data, A.indices, A.indptr, B, C, D], device, name="csrmm")
+
+        f(_nr, a.data, a.indices, a.indptr, b, c, d)
+        tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-2, atol=1e-2)
+
+    for device in ["llvm"]:
+        check_device(device)
+
+def verify_dense_si(batch, in_dim, out_dim, use_bias=True, dtype='float32'):
+    nonzeros = tvm.var('nonzeros')
+    A = tvmsp.placeholder(shape=(batch, in_dim), nonzeros=nonzeros, dtype=dtype, name='A')
+    B = tvm.placeholder((out_dim, in_dim), dtype=dtype, name='B')
+    C = tvm.placeholder((out_dim,), dtype=dtype, name='C')
+    D = topi.sparse.dense(A, B, C if use_bias else None)
+    s = tvm.create_schedule(D.op)
+
+    # get the test data
+    def get_ref_data():
+        mag = 10.
+        a_np = np.maximum(mag*(np.random.uniform(size=(batch, in_dim)).astype('float32')-0.5), 0.).astype(dtype)
+        b_np = (mag*(np.random.uniform(size=(out_dim, in_dim)).astype('float32')-.5)).astype(dtype)
+        c_np = (mag*(np.random.uniform(size=(out_dim,)).astype('float32')-.5)).astype(dtype)
+        if use_bias:
+            d_np = np.dot(a_np, b_np.T) + c_np
+        else:
+            d_np = np.dot(a_np, b_np.T)
+        return (a_np, b_np, c_np, d_np)
+    a_np, b_np, c_np, d_np = get_ref_data()
+
+    def check_device(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+        print("Running on target: %s" % device)
+        a = tvmsp.array(a_np, ctx)
+        b = tvm.nd.array(b_np, ctx)
+        c = tvm.nd.array(c_np, ctx)
+        d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx)
+        f = tvm.build(s, [A.data, A.indices, A.indptr, B, C, D], device, name="dense")
+        f(a.data, a.indices, a.indptr, b, c, d)
+        tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-4, atol=1e-4)
+
+    check_device('llvm')
+
+def verify_dense_sw(batch, in_dim, out_dim, use_bias=True, dtype='float32'):
+    nonzeros = tvm.var('nonzeros')
+    A = tvm.placeholder((batch, in_dim), dtype=dtype, name='A')
+    B = tvmsp.placeholder(shape=(out_dim, in_dim), nonzeros=nonzeros, dtype=dtype, name='B')
+    C = tvm.placeholder((out_dim,), dtype=dtype, name='C')
+    D = topi.sparse.dense(A, B, C if use_bias else None)
+    s = tvm.create_schedule(D.op)
+
+    # get the test data
+    def get_ref_data():
+        mag = 10.
+        a_np = (mag*(np.random.uniform(size=(batch, in_dim)).astype('float32')-.5)).astype(dtype)
+        b_np = np.maximum(mag*(np.random.uniform(size=(out_dim, in_dim)).astype('float32')-0.5), 0.).astype(dtype)
+        c_np = (mag*(np.random.uniform(size=(out_dim,)).astype('float32')-.5)).astype(dtype)
+        if use_bias:
+            d_np = np.dot(a_np, b_np.T) + c_np
+        else:
+            d_np = np.dot(a_np, b_np.T)
+        return (a_np, b_np, c_np, d_np)
+    a_np, b_np, c_np, d_np = get_ref_data()
+
+    def check_device(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+        print("Running on target: %s" % device)
+        a = tvm.nd.array(a_np, ctx)
+        b = tvmsp.array(b_np, ctx)
+        c = tvm.nd.array(c_np, ctx)
+        d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx)
+        f = tvm.build(s, [A, B.data, B.indices, B.indptr, C, D], device, name="dense")
+        f(a, b.data, b.indices, b.indptr, c, d)
+        tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-4, atol=1e-4)
+
+    check_device('llvm')
+
+def test_csrmv():
+    verify_dynamic_csrmv(batch=5, in_dim=7, out_dim=1, use_bias=False)
+    verify_dynamic_csrmv(batch=5, in_dim=7, out_dim=1, use_bias=True)
+
+def test_csrmm():
+    M, K, N = 5, 7, 2
+    verify_dynamic_csrmm(batch=M, in_dim=K, out_dim=N, use_bias=False)
+    verify_dynamic_csrmm(batch=M, in_dim=K, out_dim=N, use_bias=True)
+
+def test_dense_si():
+    M, K, N = 3, 5, 2
+    verify_dense_si(batch=M, in_dim=K, out_dim=N, use_bias=False, dtype='float32')
+    verify_dense_si(batch=M, in_dim=K, out_dim=N, use_bias=True, dtype='float32')
+    verify_dense_si(batch=M, in_dim=K, out_dim=N, use_bias=False, dtype='int32')
+    verify_dense_si(batch=M, in_dim=K, out_dim=N, use_bias=True, dtype='int32')
+    verify_dense_si(batch=M, in_dim=K, out_dim=N, use_bias=False, dtype='int16')
+    verify_dense_si(batch=M, in_dim=K, out_dim=N, use_bias=True, dtype='int16')
+
+def test_dense_sw():
+    M, K, N = 3, 5, 2
+    verify_dense_sw(batch=M, in_dim=K, out_dim=N, use_bias=False, dtype='float32')
+    verify_dense_sw(batch=M, in_dim=K, out_dim=N, use_bias=True, dtype='float32')
+    verify_dense_sw(batch=M, in_dim=K, out_dim=N, use_bias=False, dtype='int32')
+    verify_dense_sw(batch=M, in_dim=K, out_dim=N, use_bias=True, dtype='int32')
+    verify_dense_sw(batch=M, in_dim=K, out_dim=N, use_bias=False, dtype='int16')
+    verify_dense_sw(batch=M, in_dim=K, out_dim=N, use_bias=True, dtype='int16')
+
+def test_dense():
+    test_dense_si()
+    test_dense_sw()
+
+if __name__ == "__main__":
+    test_csrmv()
+    test_csrmm()
+    test_dense()
diff --git a/topi/tests/python/test_topi_tensor.py b/topi/tests/python/test_topi_tensor.py
index 3d563c21b5c4..f54472716521 100644
--- a/topi/tests/python/test_topi_tensor.py
+++ b/topi/tests/python/test_topi_tensor.py
@@ -32,7 +32,7 @@ def check_device(device):
         tvm_nd = [tvm.nd.array(nd, ctx) for nd in np_nd] + [out]
         f(*tvm_nd)
         np_out = np.sum(np.array(np_nd), axis=0)
-        np.testing.assert_allclose(out.asnumpy(), np_out, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), np_out, rtol=1e-5)
 
     for device in ["llvm"]:
         check_device(device)
@@ -59,11 +59,11 @@ def check_device(device):
         out = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx)
         f = tvm.build(s1, [A, B], device, name="full_like")
         f(tvm.nd.array(np.zeros(shape, dtype), ctx), out)
-        np.testing.assert_allclose(out.asnumpy(), np_nd, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), np_nd, rtol=1e-5)
 
         f = tvm.build(s2, [C], device, name="full")
         f(out)
-        np.testing.assert_allclose(out.asnumpy(), np_nd, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), np_nd, rtol=1e-5)
 
     for device in ["llvm"]:
         check_device(device)
diff --git a/topi/tests/python/test_topi_transform.py b/topi/tests/python/test_topi_transform.py
index 4788d758cf45..84d4aa6dc952 100644
--- a/topi/tests/python/test_topi_transform.py
+++ b/topi/tests/python/test_topi_transform.py
@@ -2,6 +2,9 @@
 import numpy as np
 import tvm
 import topi
+import topi.testing
+
+from common import get_all_backend
 
 def verify_expand_dims(in_shape, out_shape, axis, num_newaxis):
     A = tvm.placeholder(shape=in_shape, name="A")
@@ -20,9 +23,9 @@ def check_device(device):
         data_nd = tvm.nd.array(data_npy, ctx)
         out_nd = tvm.nd.array(np.empty(out_shape).astype(B.dtype), ctx)
         foo(data_nd, out_nd)
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan", "sdaccel"]:
+    for device in get_all_backend():
         check_device(device)
 
 
@@ -43,9 +46,9 @@ def check_device(device):
         data_nd = tvm.nd.array(data_npy, ctx)
         out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=B.dtype)
         foo(data_nd, out_nd)
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan", "sdaccel"]:
+    for device in get_all_backend():
         check_device(device)
 
 
@@ -66,9 +69,9 @@ def check_device(device):
         data_nd = tvm.nd.array(data_npy, ctx)
         out_nd = tvm.nd.empty(dst_shape, ctx=ctx, dtype=B.dtype)
         foo(data_nd, out_nd)
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan", "sdaccel"]:
+    for device in get_all_backend():
         check_device(device)
 
 
@@ -88,15 +91,12 @@ def check_device(device):
         data_npy = np.random.normal(size=src_shape).astype(A.dtype)
         out_npy = np.squeeze(data_npy, axis=axis)
         data_nd = tvm.nd.array(data_npy, ctx)
-        if out_npy.shape == ():
-            out_nd_shape = (1,)
-        else:
-            out_nd_shape = out_npy.shape
+        out_nd_shape = out_npy.shape
         out_nd = tvm.nd.empty(out_nd_shape, ctx=ctx, dtype=B.dtype)
         foo(data_nd, out_nd)
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan", "sdaccel"]:
+    for device in get_all_backend():
         check_device(device)
 
 def verify_concatenate(shapes, axis):
@@ -119,9 +119,9 @@ def check_device(device):
         data_nds = [tvm.nd.array(data_npy, ctx) for data_npy in data_npys]
         out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=out_tensor.dtype)
         foo(*(data_nds + [out_nd]))
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan", "sdaccel"]:
+    for device in get_all_backend():
         check_device(device)
 
 
@@ -137,16 +137,16 @@ def check_device(device):
         with tvm.target.create(device):
             s = topi.generic.schedule_injective(tensor_l)
 
-        foo = tvm.build(s, [A] + tensor_l, device, name="split")
+        foo = tvm.build(s, [A] + list(tensor_l), device, name="split")
         data_npy = np.random.normal(size=src_shape).astype(A.dtype)
         out_npys = np.split(data_npy, indices_or_sections, axis=axis)
         data_nd = tvm.nd.array(data_npy, ctx)
         out_nds = [tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=tensor_l[0].dtype) for out_npy in out_npys]
         foo(*([data_nd] + out_nds))
         for out_nd, out_npy in zip(out_nds, out_npys):
-            np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+            tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan", "sdaccel"]:
+    for device in get_all_backend():
         check_device(device)
 
 
@@ -179,7 +179,7 @@ def check_device(device):
         tvm_shape_like = tvm.nd.array(np.zeros(out_shape).astype(B.dtype), ctx)
         out = tvm.nd.array(np.zeros(out_shape).astype(A.dtype), ctx)
         f(tvm_input, tvm_shape_like, out)
-        np.testing.assert_allclose(out.asnumpy(), input)
+        tvm.testing.assert_allclose(out.asnumpy(), input)
 
     for device in ["llvm"]:
         check_device(device)
@@ -202,9 +202,9 @@ def check_device(device):
         data_nd = tvm.nd.array(x_np, ctx)
         out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=A.dtype)
         foo(data_nd, out_nd)
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "cuda", "opencl", "sdaccel"]:
+    for device in ["llvm", "cuda", "opencl", "sdaccel", "aocl_sw_emu"]:
         check_device(device)
 
 def verify_take(src_shape, indices_src, axis=None):
@@ -241,18 +241,16 @@ def check_device(device):
         indices_nd = tvm.nd.array(indices_src, ctx)
         out_nd = tvm.nd.empty(out_npys.shape, ctx=ctx, dtype=src_dtype)
         foo(data_nd, indices_nd, out_nd)
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npys)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npys)
 
-    for device in ["llvm", "opencl", "sdaccel"]:
+    for device in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]:
         check_device(device)
 
-def verify_strided_slice(in_shape, begin, end, stride=None):
-    stride = stride if stride else [1, 1, 1]
+def verify_strided_slice(in_shape, begin, end, strides=None):
     A = tvm.placeholder(shape=in_shape, name="A")
-    B = topi.strided_slice(A, begin, end, stride) + 1
-    def test_forward(x, begin, end, stride):
-        return x[begin[0]:end[0]:stride[0],
-                    begin[1]:end[1]:stride[1], begin[2]:end[2]:stride[2]] + 1
+    strides = [1,1,1] if strides is None else strides
+    B = topi.strided_slice(A, begin, end, strides) + 1
+
     def check_device(device):
         ctx = tvm.context(device, 0)
         if not ctx.exist:
@@ -264,13 +262,46 @@ def check_device(device):
 
         foo = tvm.build(s, [A, B], device, name="stride_slice")
         x_np = np.random.uniform(size=in_shape).astype(A.dtype)
-        out_npy = test_forward(x_np, begin, end, stride)
+        out_npy = topi.testing.strided_slice_python(
+            x_np, begin, end, strides) + 1
         data_nd = tvm.nd.array(x_np, ctx)
         out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=A.dtype)
         foo(data_nd, out_nd)
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "opencl", "sdaccel"]:
+    for device in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]:
+        check_device(device)
+
+def verify_gather_nd(src_shape, indices_src, indices_dtype):
+    src_dtype = "float32"
+    indices_src = np.array(indices_src, dtype=indices_dtype)
+    A = tvm.placeholder(shape=src_shape, dtype=src_dtype, name="A")
+    indices = tvm.placeholder(shape=indices_src.shape, dtype=indices_dtype, name="indices")
+    out_tensor = topi.gather_nd(a=A, indices=indices)
+
+    def check_device(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+        print("Running on target: %s" % device)
+        with tvm.target.create(device):
+            s = topi.generic.schedule_injective(out_tensor)
+
+        func = tvm.build(s, [A, indices, out_tensor] , device, name="take")
+        shape_size = 1
+        for i in range(len(src_shape)):
+            shape_size = shape_size * src_shape[i]
+        data_npy = np.arange(shape_size, dtype=src_dtype).reshape((src_shape))
+        out_npys = topi.testing.gather_nd_python(data_npy, indices_src)
+
+        data_nd = tvm.nd.array(data_npy, ctx)
+        indices_nd = tvm.nd.array(indices_src, ctx)
+        out_nd = tvm.nd.empty(out_npys.shape, ctx=ctx, dtype=src_dtype)
+        func(data_nd, indices_nd, out_nd)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npys)
+
+    for device in get_all_backend():
         check_device(device)
 
 def test_strided_slice():
@@ -305,6 +336,21 @@ def test_squeeze():
     verify_squeeze((1, 1, 1, 4), (1, 2))
     verify_squeeze((1, 1, 1, 1), None)
 
+    # a special case to trigger inline let expression
+    A = tvm.placeholder((2,), 'float32', 'A')
+    E = topi.squeeze(A)
+    C = tvm.compute((1,), lambda i: E[(2 * A[0] - 1).astype('int32')])
+    for device in ['cuda', 'opencl']:
+        ctx = tvm.context(device, 0)
+        if ctx.exist:
+            with tvm.target.create(device):
+                s = topi.generic.schedule_injective(C)
+                func = tvm.build(s, [A, C])
+            a = tvm.nd.array(np.array((1, 2)).astype('float32'), ctx=ctx)
+            c = tvm.nd.empty((1,), dtype='float32', ctx=ctx)
+            func(a, c)
+            assert c.asnumpy()[0] == 2
+
 
 def test_concatenate():
     verify_concatenate([(2,), (2,), (2,)], 0)
@@ -346,7 +392,23 @@ def test_take():
     verify_take((2,2), [[[1,0],[0,1]]], 1)
     verify_take((4,3,5,6), [[2,1,0,0]], -2)
 
+def test_gather_nd():
+    for indices_dtype in ['int32', 'float32']:
+        verify_gather_nd((4,), [[1.8]], indices_dtype)
+        verify_gather_nd((4,), [[1, 3, 2]], indices_dtype)
+        verify_gather_nd((2, 3), [[1]], indices_dtype)
+        verify_gather_nd((2, 3), [[1], [0]], indices_dtype)
+        verify_gather_nd((2, 3), [[1, 0], [0, 2]], indices_dtype)
+        verify_gather_nd((2, 3, 4), [[1, 0], [0, 2]], indices_dtype)
+        verify_gather_nd((2, 3, 4), [[1, 0], [0, 2], [3, 1]], indices_dtype)
+        verify_gather_nd((2, 3, 4), [[[1, 0], [0, 1]], [[0, 2], [1, 2]],
+                                     [[3, 1], [0, 2]]], indices_dtype)
+        verify_gather_nd((2, 3, 4, 5), [[1, 0], [0, 2]], indices_dtype)
+        verify_gather_nd((2, 3, 4, 5), [[1, 0], [2, 1], [3, 2], [4, 2]],
+                         indices_dtype)
+
 if __name__ == "__main__":
+    test_strided_slice()
     test_concatenate()
     test_tranpose()
     test_expand_dims()
@@ -356,4 +418,4 @@ def test_take():
     test_flip()
     test_expand_like()
     test_take()
-    test_strided_slice()
+    test_gather_nd()
diff --git a/topi/tests/python/test_topi_upsampling.py b/topi/tests/python/test_topi_upsampling.py
index 3affc30a0722..c10ce6e61b5a 100644
--- a/topi/tests/python/test_topi_upsampling.py
+++ b/topi/tests/python/test_topi_upsampling.py
@@ -5,7 +5,7 @@
 import topi.testing
 import math
 
-def verify_upsampling(batch, in_channel, in_height, in_width, scale, layout='NCHW'):
+def verify_upsampling(batch, in_channel, in_height, in_width, scale, layout='NCHW', method="NEAREST_NEIGHBOR"):
 
 
     if layout == 'NCHW':
@@ -22,9 +22,13 @@ def verify_upsampling(batch, in_channel, in_height, in_width, scale, layout='NCH
         raise NotImplementedError(
             'Layout not supported {} '.format(layout))
 
-    B = topi.nn.upsampling(A, scale, layout=layout)
+    B = topi.nn.upsampling(A, scale, layout=layout, method=method)
 
-    b_np = topi.testing.upsampling_python(a_np, scale, layout)
+    if method == "BILINEAR":
+        out_size = (in_height*scale, in_width*scale)
+        b_np = topi.testing.bilinear_resize_python(a_np, out_size, layout)
+    else:
+        b_np = topi.testing.upsampling_python(a_np, scale, layout)
 
     def check_device(device):
         ctx = tvm.context(device, 0)
@@ -39,18 +43,27 @@ def check_device(device):
         f = tvm.build(s, [A, B], device)
         f(a, b)
 
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5)
 
     for device in ['llvm', 'cuda', 'vulkan', 'nvptx']:
         check_device(device)
 
 def test_upsampling():
-    # NCHW
+    # NEAREST_NEIGHBOR - NCHW
     verify_upsampling(8, 16, 32, 32, 2)
     verify_upsampling(12, 32, 64, 64, 3)
-    # NHWC
-    verify_upsampling(8, 16, 32, 32, 2, "NHWC")
-    verify_upsampling(12, 32, 64, 64, 3, "NHWC")
+
+    # NEAREST_NEIGHBOR - NHWC
+    verify_upsampling(8, 16, 32, 32, 2, layout="NHWC")
+    verify_upsampling(12, 32, 64, 64, 3, layout="NHWC")
+
+    # BILINEAR - NCHW
+    verify_upsampling(2, 2, 32, 32, 2, method="BILINEAR")
+    verify_upsampling(2, 2, 32, 32, 3, method="BILINEAR")
+
+    # BILINEAR - NHWC
+    verify_upsampling(2, 2, 32, 32, 2, layout="NHWC", method="BILINEAR")
+    verify_upsampling(2, 2, 32, 32, 3, layout="NHWC", method="BILINEAR")
 
 if __name__ == "__main__":
     test_upsampling()
diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py
index 959b10f82ca5..547d7bdcfbf6 100644
--- a/topi/tests/python/test_topi_vision.py
+++ b/topi/tests/python/test_topi_vision.py
@@ -41,7 +41,7 @@ def check_device(device):
         tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx)
         f = tvm.build(s, [data, valid_count, out], device)
         f(tvm_data, tvm_valid_count, tvm_out)
-        np.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-4)
+        tvm.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-4)
 
     for device in ['llvm', 'opencl']:
         check_device(device)
@@ -100,7 +100,7 @@ def check_device(device):
         tvm_out = tvm.nd.array(np.zeros(oshape, dtype=dtype), ctx)
         f = tvm.build(s, [data, out], device)
         f(tvm_input_data, tvm_out)
-        np.testing.assert_allclose(tvm_out.asnumpy(), np_out, rtol=1e-3)
+        tvm.testing.assert_allclose(tvm_out.asnumpy(), np_out, rtol=1e-3)
 
     for device in ['llvm', 'opencl']:
         check_device(device)
@@ -148,7 +148,7 @@ def check_device(device):
         tvm_out = tvm.nd.array(np.zeros((batch_size, num_anchors, 6)).astype(out.dtype), ctx)
         f = tvm.build(s, [cls_prob, loc_preds, anchors, out], device)
         f(tvm_cls_prob, tvm_loc_preds, tvm_anchors, tvm_out)
-        np.testing.assert_allclose(tvm_out.asnumpy(), expected_np_out, rtol=1e-4)
+        tvm.testing.assert_allclose(tvm_out.asnumpy(), expected_np_out, rtol=1e-4)
 
     for device in ['llvm', 'opencl']:
         check_device(device)
diff --git a/topi/tests/python_cpp/test_topi_bnn.py b/topi/tests/python_cpp/test_topi_bnn.py
index 3fa5cfc4a0a7..83d880311eff 100644
--- a/topi/tests/python_cpp/test_topi_bnn.py
+++ b/topi/tests/python_cpp/test_topi_bnn.py
@@ -44,7 +44,7 @@ def get_ref_data():
     f1(a, bnn_a)
     f2(b, bnn_b)
     f3(bnn_a, bnn_b, bnn_c)
-    np.testing.assert_allclose(bnn_c.asnumpy(), c_np, rtol=1e-5)
+    tvm.testing.assert_allclose(bnn_c.asnumpy(), c_np, rtol=1e-5)
 
 def test_binary_dense():
     verify_binary_dense(1, 4096, 1024)
diff --git a/topi/tests/python_cpp/test_topi_clip.py b/topi/tests/python_cpp/test_topi_clip.py
index fe00408642f5..d1aca4cb904c 100644
--- a/topi/tests/python_cpp/test_topi_clip.py
+++ b/topi/tests/python_cpp/test_topi_clip.py
@@ -29,7 +29,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
         f = tvm.build(s, [A, B], device, name="clip")
         f(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['llvm']:
         check_device(device)
diff --git a/topi/tests/python_cpp/test_topi_dense.py b/topi/tests/python_cpp/test_topi_dense.py
index f2369af4319a..636257de7919 100644
--- a/topi/tests/python_cpp/test_topi_dense.py
+++ b/topi/tests/python_cpp/test_topi_dense.py
@@ -47,7 +47,7 @@ def check_device(device):
         d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx)
         f = tvm.build(s, [A, B, C, D], device, name="dense")
         f(a, b, c, d)
-        np.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5)
+        tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5)
 
     for device in ['cuda', 'opencl', 'metal', 'rocm']:
         check_device(device)
diff --git a/topi/tests/python_cpp/test_topi_dilate.py b/topi/tests/python_cpp/test_topi_dilate.py
index f1924239cc77..1f7f1d8bceeb 100644
--- a/topi/tests/python_cpp/test_topi_dilate.py
+++ b/topi/tests/python_cpp/test_topi_dilate.py
@@ -19,7 +19,7 @@ def _test_dilate(input_size, strides):
         output_tvm = tvm.nd.array(np.zeros(shape=output_size).astype(Output.dtype), ctx=ctx)
         f = tvm.build(schedule, [Input, Output], target)
         f(input_tvm, output_tvm)
-        np.testing.assert_allclose(output_tvm.asnumpy(), output_np, rtol=1e-5)
+        tvm.testing.assert_allclose(output_tvm.asnumpy(), output_np, rtol=1e-5)
 
     _test_dilate((32,), (2,))
     _test_dilate((32,32), (2,2))
diff --git a/topi/tests/python_cpp/test_topi_l2norm.py b/topi/tests/python_cpp/test_topi_l2norm.py
index 08799f76c5c3..fef2710b8d79 100644
--- a/topi/tests/python_cpp/test_topi_l2norm.py
+++ b/topi/tests/python_cpp/test_topi_l2norm.py
@@ -30,7 +30,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         func = tvm.build(s, [A, B], device, name="l2_normalize")
         func(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['cuda', 'opencl', 'metal', 'rocm', 'llvm']:
         check_device(device)
diff --git a/topi/tests/python_cpp/test_topi_lrn.py b/topi/tests/python_cpp/test_topi_lrn.py
index d685643a9406..14a0eaa27781 100644
--- a/topi/tests/python_cpp/test_topi_lrn.py
+++ b/topi/tests/python_cpp/test_topi_lrn.py
@@ -29,7 +29,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
         f = tvm.build(s, [A, B], device)
         f(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-1)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-1)
 
     for device in ['cuda', 'opencl', 'metal', 'rocm', 'llvm']:
         check_device(device)
diff --git a/topi/tests/python_cpp/test_topi_pooling.py b/topi/tests/python_cpp/test_topi_pooling.py
index 42232c8e4848..9997fb6738c2 100644
--- a/topi/tests/python_cpp/test_topi_pooling.py
+++ b/topi/tests/python_cpp/test_topi_pooling.py
@@ -67,7 +67,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)
         f = tvm.build(s, [A, B], device)
         f(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['cuda', 'opencl', 'metal', 'rocm']:
         check_device(device)
@@ -115,7 +115,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         f = tvm.build(s, [A, B], device)
         f(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['cuda', 'opencl', 'metal', 'rocm']:
         check_device(device)
diff --git a/topi/tests/python_cpp/test_topi_reduce.py b/topi/tests/python_cpp/test_topi_reduce.py
index 7bf369c7f1ff..dbfa3683fa66 100644
--- a/topi/tests/python_cpp/test_topi_reduce.py
+++ b/topi/tests/python_cpp/test_topi_reduce.py
@@ -42,6 +42,8 @@ def verify_reduce_map_ele(in_shape, axis, keepdims, type="sum"):
     elif type == "argmin":
         B = topi.cpp.argmin(A1, axis, keepdims)
         out_dtype = "int32"
+    elif type == "prod":
+        B = topi.cpp.prod(A1, axis, keepdims)
     else:
         raise NotImplementedError
 
@@ -57,7 +59,7 @@ def check_device(device):
         else:
             s = topi.cpp.cuda.schedule_reduce(target, [B])
 
-        foo = tvm.build(s, [A, B], device, name="sum")
+        foo = tvm.build(s, [A, B], device, name=type)
         # Test
         in_npy = np.random.uniform(size=in_shape).astype(np.float32)
         in_npy_map = np.sqrt(np.exp(in_npy)).astype(np.float32)
@@ -71,9 +73,10 @@ def check_device(device):
             out_npy = _my_npy_argmax(in_npy_map, axis=axis, keepdims=keepdims)
         elif type == "argmin":
             out_npy = _my_npy_argmin(in_npy_map, axis=axis, keepdims=keepdims)
+        elif type == "prod":
+            out_npy = in_npy_map.prod(axis=axis, keepdims=keepdims)
         else:
             raise NotImplementedError
-        out_npy = np.atleast_1d(out_npy)
         data_tvm = tvm.nd.array(in_npy, ctx=ctx)
         out_tvm = tvm.nd.empty(shape=out_npy.shape, ctx=ctx, dtype=out_dtype)
         for _ in range(1):
@@ -89,32 +92,40 @@ def check_device(device):
                 sel_indices = other_indices[0:axis] + (out_tvm_indices,) + other_indices[axis:]
                 out_tvm_val = in_npy_map[sel_indices]
             if type == "argmax":
-                np.testing.assert_allclose(out_tvm_val, in_npy_map.max(axis=axis), 1E-3, 1E-3)
+                tvm.testing.assert_allclose(out_tvm_val, in_npy_map.max(axis=axis), 1E-3, 1E-3)
             elif type == "argmin":
-                np.testing.assert_allclose(out_tvm_val, in_npy_map.min(axis=axis), 1E-3, 1E-3)
+                tvm.testing.assert_allclose(out_tvm_val, in_npy_map.min(axis=axis), 1E-3, 1E-3)
         else:
-            np.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3)
+            tvm.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3)
     for device in ["cuda", "opencl", "metal", "llvm", "rocm"]:
         check_device(device)
 
 
 def test_reduce_map():
     verify_reduce_map_ele(in_shape=(128, 24, 128, 24),
-                        axis=(1, 2, 3),
-                        keepdims=True,
-                        type="sum")
+                          axis=(1, 2, 3),
+                          keepdims=True,
+                          type="sum")
     verify_reduce_map_ele(in_shape=(128, 24 * 128 * 24),
-                        axis=(1,),
-                        keepdims=False,
-                        type="max")
+                          axis=(1,),
+                          keepdims=False,
+                          type="max")
     verify_reduce_map_ele(in_shape=(32, 128, 24),
-                        axis=None,
-                        keepdims=True,
-                        type="sum")
+                          axis=None,
+                          keepdims=True,
+                          type="sum")
     verify_reduce_map_ele(in_shape=(128, 24, 128, 24),
-                        axis=(0, 2),
-                        keepdims=False,
-                        type="min")
+                          axis=(0, 2),
+                          keepdims=False,
+                          type="min")
+    verify_reduce_map_ele(in_shape=(128, 4, 4, 128),
+                          axis=(1, ),
+                          keepdims=True,
+                          type="prod")
+    verify_reduce_map_ele(in_shape=(4, 4),
+                          axis=(0, 1),
+                          keepdims=False,
+                          type="prod")
     verify_reduce_map_ele(in_shape=(32, 128),
                           axis=1,
                           keepdims=True,
diff --git a/topi/tests/python_cpp/test_topi_region.py b/topi/tests/python_cpp/test_topi_region.py
index a37cf6610a0f..28e984b70244 100644
--- a/topi/tests/python_cpp/test_topi_region.py
+++ b/topi/tests/python_cpp/test_topi_region.py
@@ -39,7 +39,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         func = tvm.build(s, [A, B], device, name="region")
         func(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['cuda', 'opencl', 'metal', 'rocm', 'llvm', 'vulkan']:
         check_device(device)
diff --git a/topi/tests/python_cpp/test_topi_relu.py b/topi/tests/python_cpp/test_topi_relu.py
index 6677c1bf5551..3b1b00ec8f67 100644
--- a/topi/tests/python_cpp/test_topi_relu.py
+++ b/topi/tests/python_cpp/test_topi_relu.py
@@ -28,7 +28,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         foo = tvm.build(s, [A, B], device, name="relu")
         foo(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['cuda', 'opencl', 'metal', 'rocm']:
         check_device(device)
@@ -48,7 +48,7 @@ def verify_leaky_relu(m, alpha):
     b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
     foo = tvm.build(s, [A, B], device, name="leaky_relu")
     foo(a, b)
-    np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+    tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
 def verify_prelu(x, w, axis, weight_reshape):
     X = tvm.placeholder((x), name='X')
@@ -71,7 +71,7 @@ def _prelu_numpy(x, W):
     b = tvm.nd.array(np.zeros(get_const_tuple(X.shape), dtype=B.dtype), ctx)
     foo = tvm.build(s, [X, W, B], "llvm", name="prelu")
     foo(x_tvm, w_tvm, b)
-    np.testing.assert_allclose(b.asnumpy(), out_np, rtol=1e-5)
+    tvm.testing.assert_allclose(b.asnumpy(), out_np, rtol=1e-5)
 
 def test_relu():
     for dtype in ['float32', 'float64', 'int32', 'int16', 'int8', 'int64']:
diff --git a/topi/tests/python_cpp/test_topi_reorg.py b/topi/tests/python_cpp/test_topi_reorg.py
index e5b8aa7f8b31..f7767967c699 100644
--- a/topi/tests/python_cpp/test_topi_reorg.py
+++ b/topi/tests/python_cpp/test_topi_reorg.py
@@ -39,7 +39,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         func = tvm.build(s, [A, B], device, name="reorg")
         func(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['cuda', 'opencl', 'metal', 'rocm', 'llvm', 'vulkan']:
         check_device(device)
diff --git a/topi/tests/python_cpp/test_topi_softmax.py b/topi/tests/python_cpp/test_topi_softmax.py
index 4d4ac387bccf..09f838ef57ec 100644
--- a/topi/tests/python_cpp/test_topi_softmax.py
+++ b/topi/tests/python_cpp/test_topi_softmax.py
@@ -32,7 +32,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         foo = tvm.build(s, [A, B], device, name="softmax")
         foo(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ['cuda', 'opencl', 'metal', 'rocm']:
         check_device(device)
@@ -66,7 +66,7 @@ def check_device(device):
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         foo = tvm.build(s, [A, B], device, name="log_softmax")
         foo(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
     for device in ["cuda", "opencl", "metal", "rocm"]:
         check_device(device)
diff --git a/topi/tests/python_cpp/test_topi_tensor.py b/topi/tests/python_cpp/test_topi_tensor.py
index 1a0a7c92db7e..762ee045e38a 100644
--- a/topi/tests/python_cpp/test_topi_tensor.py
+++ b/topi/tests/python_cpp/test_topi_tensor.py
@@ -30,7 +30,7 @@ def check_device(device):
         tvm_nd = [tvm.nd.array(nd, ctx) for nd in np_nd] + [out]
         f(*tvm_nd)
         np_out = np.sum(np.array(np_nd), axis=0)
-        np.testing.assert_allclose(out.asnumpy(), np_out, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), np_out, rtol=1e-5)
 
     for device in ["llvm"]:
         check_device(device)
@@ -56,11 +56,11 @@ def check_device(device):
         out = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx)
         f = tvm.build(s1, [A, B], device, name="full_like")
         f(tvm.nd.array(np.zeros(shape, dtype), ctx), out)
-        np.testing.assert_allclose(out.asnumpy(), np_nd, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), np_nd, rtol=1e-5)
 
         f = tvm.build(s2, [C], device, name="full")
         f(out)
-        np.testing.assert_allclose(out.asnumpy(), np_nd, rtol=1e-5)
+        tvm.testing.assert_allclose(out.asnumpy(), np_nd, rtol=1e-5)
 
     for device in ["llvm"]:
         check_device(device)
diff --git a/topi/tests/python_cpp/test_topi_transform.py b/topi/tests/python_cpp/test_topi_transform.py
index c8b7c3906caa..b411375b333e 100644
--- a/topi/tests/python_cpp/test_topi_transform.py
+++ b/topi/tests/python_cpp/test_topi_transform.py
@@ -23,7 +23,7 @@ def check_device(device):
         data_nd = tvm.nd.array(data_npy, ctx)
         out_nd = tvm.nd.array(np.empty(out_shape).astype(B.dtype), ctx)
         foo(data_nd, out_nd)
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
     for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm"]:
         check_device(device)
@@ -50,7 +50,7 @@ def check_device(device):
         data_nd = tvm.nd.array(data_npy, ctx)
         out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=B.dtype)
         foo(data_nd, out_nd)
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
     for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm"]:
         check_device(device)
@@ -76,7 +76,7 @@ def check_device(device):
         data_nd = tvm.nd.array(data_npy, ctx)
         out_nd = tvm.nd.empty(dst_shape, ctx=ctx, dtype=B.dtype)
         foo(data_nd, out_nd)
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
     for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm"]:
         check_device(device)
@@ -100,13 +100,10 @@ def check_device(device):
         data_npy = np.random.normal(size=src_shape).astype(A.dtype)
         out_npy = np.squeeze(data_npy, axis=axis)
         data_nd = tvm.nd.array(data_npy, ctx)
-        if out_npy.shape == ():
-            out_nd_shape = (1,)
-        else:
-            out_nd_shape = out_npy.shape
+        out_nd_shape = out_npy.shape
         out_nd = tvm.nd.empty(out_nd_shape, ctx=ctx, dtype=B.dtype)
         foo(data_nd, out_nd)
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
     for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm"]:
         check_device(device)
@@ -133,7 +130,7 @@ def check_device(device):
         data_nds = [tvm.nd.array(data_npy, ctx) for data_npy in data_npys]
         out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=out_tensor.dtype)
         foo(*(data_nds + [out_nd]))
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
     for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm"]:
         check_device(device)
@@ -162,7 +159,7 @@ def check_device(device):
         out_nds = [tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=tensor_l[0].dtype) for out_npy in out_npys]
         foo(*([data_nd] + out_nds))
         for out_nd, out_npy in zip(out_nds, out_npys):
-            np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+            tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
     for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm"]:
         check_device(device)
@@ -201,7 +198,7 @@ def check_device(device):
         indices_nd = tvm.nd.array(indices_src, ctx)
         out_nd = tvm.nd.empty(out_npys.shape, ctx=ctx, dtype=src_dtype)
         foo(data_nd, indices_nd, out_nd)
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npys)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npys)
 
     for device in ["llvm", "opencl"]:
         check_device(device)
@@ -230,7 +227,7 @@ def check_device(device):
         tvm_out = tvm.nd.empty(x.shape, ctx=ctx, dtype=dtype)
         foo(tvm.nd.array(condition, ctx), tvm.nd.array(x, ctx),
             tvm.nd.array(y, ctx), tvm_out)
-        np.testing.assert_allclose(tvm_out.asnumpy(), np_out)
+        tvm.testing.assert_allclose(tvm_out.asnumpy(), np_out)
 
     for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm"]:
         check_device(device)
@@ -261,7 +258,7 @@ def check_device(device):
         out_nds = [tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=tensor_l[0].dtype) for out_npy in out_npys_split]
         foo(*(data_nds + out_nds))
         for out_nd, out_npy in zip(out_nds, out_npys_split):
-            np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
+            tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
     for device in ["llvm", "cuda", "opencl", "metal", "rocm"]:
         check_device(device)
@@ -295,7 +292,7 @@ def check_device(device):
         out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), ctx)
         for _ in range(1):
             foo(*(data_nds + [rhs_nd] + [out_nd]))
-        np.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4)
+        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4)
 
     for device in ["llvm", "cuda", "opencl", "metal", "rocm"]:
         check_device(device)
@@ -340,6 +337,7 @@ def test_concatenate():
 
 def test_split():
     verify_split((2, 12, 3), 3, 1)
+    verify_split((2, 12, 3), 3, -1)
     verify_split((2, 12, 3), [2, 4], 1)
     verify_split((10, 12, 24), [5, 7, 9], -1)
 
diff --git a/topi/tests/python_cpp/test_topi_yolo.py b/topi/tests/python_cpp/test_topi_yolo.py
deleted file mode 100644
index ed234b7bd134..000000000000
--- a/topi/tests/python_cpp/test_topi_yolo.py
+++ /dev/null
@@ -1,49 +0,0 @@
-"""Test code for yolo op"""
-import logging
-import numpy as np
-import tvm
-import topi
-import topi.testing
-from topi.util import get_const_tuple
-
-def verify_yolo(ishape, n, classes):
-    '''Verify yolo operator by comparing outputs from tvm and numpy implementation'''
-    
-    A = tvm.placeholder(ishape, name='A')
-    B = topi.cpp.yolo.yolo(A, n, classes)
-    dtype = A.dtype
-
-    def get_ref_data_yolo():
-        '''Randomly initialize the data variables and get refernce output for the yolo operation'''
-        a_np = np.random.uniform(size=ishape).astype(dtype)
-        b_np = topi.testing.yolo_python(a_np, n, classes)
-        return a_np, b_np
-
-    a_np, b_np = get_ref_data_yolo()
-    def check_device(device):
-        '''Check the device is available and if so, build and run the program'''
-        if not tvm.module.enabled(device):
-            print("Skip because %s is not enabled" % device)
-            return
-        print("Running on target: %s" % device)
-        target = topi.cpp.TEST_create_target(device)
-        if device == "llvm":
-            s = topi.cpp.generic.default_schedule(target, [B], False)
-        else:
-            s = topi.cpp.cuda.schedule_injective(target, [B])
-        ctx = tvm.context(device, 0)
-        a = tvm.nd.array(a_np, ctx)
-        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
-        func = tvm.build(s, [A, B], device, name="yolo")
-        func(a, b)
-        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
-
-    for device in ['cuda', 'opencl', 'metal', 'rocm', 'llvm', 'vulkan']:
-        check_device(device)
-
-def test_yolo():
-    verify_yolo((1, 425, 19, 19), 5, 80)
-
-if __name__ == "__main__":
-    logging.basicConfig(level=logging.DEBUG)
-    test_yolo()
diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/tutorials/autotvm/tune_conv2d_cuda.py
index 179ac811ab70..347aa4207c9b 100644
--- a/tutorials/autotvm/tune_conv2d_cuda.py
+++ b/tutorials/autotvm/tune_conv2d_cuda.py
@@ -10,7 +10,7 @@
 
 ######################################################################
 # Install dependencies
-# ----------------------------------------
+# --------------------
 # To use autotvm package in tvm, we need to install some extra dependencies.
 # (change "3" to "2" if you use python2):
 #
@@ -20,7 +20,6 @@
 #
 # To make tvm run faster in tuning, it is recommended to use cython
 # as FFI of tvm. In the root directory of tvm, execute
-# (change "3" to "2" if you use python2):
 #
 # .. code-block:: bash
 #
@@ -41,7 +40,7 @@
 
 ######################################################################
 # Step 1:  Define the search space
-# ---------------------------------
+# --------------------------------
 # There are plenty of useful schedule primitives in tvm. You can also find 
 # some tutorials that describe them in more details, such as 
 # (1). :ref:`opt-conv-gpu`
@@ -64,14 +63,29 @@
 #
 
 @autotvm.template
-def conv2d_no_batching(N, H, W, CI, CO, KH, KW, stride, padding):
+def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding):
     assert N == 1, "Only consider batch_size = 1 in this template"
 
     data = tvm.placeholder((N, CI, H, W), name='data')
     kernel = tvm.placeholder((CO, CI, KH, KW), name='kernel')
-    conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, 'float32')
+    conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=1, out_dtype='float32')
     s = tvm.create_schedule([conv.op])
 
+    ##### space definition begin #####
+    n, f, y, x = s[conv].op.axis
+    rc, ry, rx = s[conv].op.reduce_axis
+
+    cfg = autotvm.get_config()
+    cfg.define_split("tile_f", f, num_outputs=4)
+    cfg.define_split("tile_y", y, num_outputs=4)
+    cfg.define_split("tile_x", x, num_outputs=4)
+    cfg.define_split("tile_rc", rc, num_outputs=3)
+    cfg.define_split("tile_ry", ry, num_outputs=3)
+    cfg.define_split("tile_rx", rx, num_outputs=3)
+    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
+    cfg.define_knob("unroll_explicit", [0, 1])
+    ##### space definition end #####
+
     # inline padding
     pad_data = s[conv].op.input_tensors[0]
     s[pad_data].compute_inline()
@@ -88,10 +102,6 @@ def conv2d_no_batching(N, H, W, CI, CO, KH, KW, stride, padding):
 
     # tile and bind spatial axes
     n, f, y, x = s[output].op.axis
-    cfg = autotvm.get_config()
-    cfg.define_split("tile_f", cfg.axis(f), num_outputs=4)
-    cfg.define_split("tile_y", cfg.axis(y), num_outputs=4)
-    cfg.define_split("tile_x", cfg.axis(x), num_outputs=4)
     bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
     by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
     bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)
@@ -109,12 +119,9 @@ def conv2d_no_batching(N, H, W, CI, CO, KH, KW, stride, padding):
     s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi)
     s[OL].compute_at(s[output], tx)
 
-    # tile and bind reduction axes
+    # tile reduction axes
     n, f, y, x = s[OL].op.axis
     rc, ry, rx = s[OL].op.reduce_axis
-    cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=3)
-    cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=3)
-    cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=3)
     rco, rcm, rci = cfg['tile_rc'].apply(s, OL, rc)
     ryo, rym, ryi = cfg['tile_rx'].apply(s, OL, ry)
     rxo, rxm, rxi = cfg['tile_ry'].apply(s, OL, rx)
@@ -137,8 +144,6 @@ def conv2d_no_batching(N, H, W, CI, CO, KH, KW, stride, padding):
         s[load].bind(tx, tvm.thread_axis("threadIdx.x"))
 
     # tune unroll
-    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
-    cfg.define_knob("unroll_explicit", [0, 1])
     s[output].pragma(kernel_scope, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
     s[output].pragma(kernel_scope, 'unroll_explicit', cfg['unroll_explicit'].val)
 
@@ -164,14 +169,16 @@ def conv2d_no_batching(N, H, W, CI, CO, KH, KW, stride, padding):
                            target='cuda')
 print(task.config_space)
 
-# use local gpu, measure 5 times for every config to reduce variance
-# run 8 parallel threads for compilation
-measure_option = autotvm.measure_option('local',
-                                        number=5,
-                                        parallel_num=8,
-                                        timeout=20)
+# Use local gpu, measure 10 times for every config to reduce variance
+# The timeout of compiling a program is 10 seconds, the timeout for running is 4 seconds
+measure_option = autotvm.measure_option(
+    builder=autotvm.LocalBuilder(),
+    runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4)
+)
 
-# begin tuning, log records to file `conv2d.log`
+# Begin tuning, log records to file `conv2d.log`
+# During tuning we will also try many invalid configs, so you are expected to
+# see many error reports. As long as you can see non-zero GFLOPS, it is okay.
 tuner = autotvm.tuner.XGBTuner(task)
 tuner.tune(n_trial=20,
            measure_option=measure_option,
@@ -204,10 +211,10 @@ def conv2d_no_batching(N, H, W, CI, CO, KH, KW, stride, padding):
 c_tvm = tvm.nd.empty(c_np.shape, ctx=ctx)
 func(a_tvm, w_tvm, c_tvm)
 
-np.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)
+tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)
 
-# Evaluate running time. Here we choose a large repeat number (200) to reduce the noise
+# Evaluate running time. Here we choose a large repeat number (400) to reduce the noise
 # and the overhead of kernel launch. You can also use nvprof to validate the result.
-evaluator = func.time_evaluator(func.entry_name, ctx, number=200)
+evaluator = func.time_evaluator(func.entry_name, ctx, number=400)
 print('Time cost of this operator: %f' % evaluator(a_tvm, w_tvm, c_tvm).mean)
 
diff --git a/tutorials/autotvm/tune_nnvm_arm.py b/tutorials/autotvm/tune_nnvm_arm.py
index d11823f204e1..c21273ed25a3 100644
--- a/tutorials/autotvm/tune_nnvm_arm.py
+++ b/tutorials/autotvm/tune_nnvm_arm.py
@@ -8,9 +8,9 @@
 network.
 
 The operator implementation for ARM CPU in TVM is written in template form.
-It has many tunable knobs (tile factor, vectorization, unrolling, etc).
-We will do tuning for all convolution and depthwise convolution operators
-in the neural network. After the tuning, we can get a log file which stores
+The template has many tunable knobs (tile factor, vectorization, unrolling, etc).
+We will tune all convolution and depthwise convolution operators
+in the neural network. After tuning, we produce a log file which stores
 the best knob values for all required operators. When the tvm compiler compiles
 these operators, it will query this log file to get the best knob values.
 
@@ -21,15 +21,15 @@
 
 ######################################################################
 # Install dependencies
-# ----------------------------------------
-# To use autotvm package in tvm, we need to install some extra dependencies.
+# --------------------
+# To use the autotvm package in tvm, we need to install some extra dependencies.
 # (change "3" to "2" if you use python2):
 #
 # .. code-block:: bash
 #
 #   pip3 install --user psutil xgboost tornado
 #
-# To make tvm run faster in tuning, it is recommended to use cython
+# To make tvm run faster during tuning, it is recommended to use cython
 # as FFI of tvm. In the root directory of tvm, execute
 # (change "3" to "2" if you use python2):
 #
@@ -65,15 +65,20 @@ def get_network(name, batch_size):
     input_shape = (batch_size, 3, 224, 224)
     output_shape = (batch_size, 1000)
 
-    if name =='resnet-18':
-        net, params = nnvm.testing.resnet.get_workload(num_layers=18, batch_size=batch_size)
-    elif name =='mobilenet':
+    if "resnet" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size)
+    elif "vgg" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size)
+    elif name == 'mobilenet':
         net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size)
-    elif name =='squeezenet v1.1':
+    elif name == 'squeezenet_v1.1':
         net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1')
-    elif name =='vgg-16':
-        net, params = nnvm.testing.vgg.get_workload(num_layers=16, batch_size=batch_size)
-    elif name =='custom':
+    elif name == 'inception_v3':
+        input_shape = (1, 3, 299, 299)
+        net, params = nnvm.testing.inception_v3.get_workload(batch_size=batch_size)
+    elif name == 'custom':
         # an example for custom network
         from nnvm.testing import utils
         net = nnvm.sym.Variable('data')
@@ -92,6 +97,7 @@ def get_network(name, batch_size):
 
     return net, params, input_shape, output_shape
 
+
 #################################################################
 # Start RPC Tracker
 # -----------------
@@ -102,10 +108,9 @@ def get_network(name, batch_size):
 # To scale up the tuning, TVM uses RPC Tracker to manage distributed devices.
 # The RPC Tracker is a centralized master node. We can register all devices to
 # the tracker. For example, if we have 10 phones, we can register all of them
-# to the tracker, then we can run 10 measurements in parallel, which accelerates
-# the tuning process.
+# to the tracker, and run 10 measurements in parallel, accelerating the tuning process.
 #
-# To start an RPC tracker, run this command in the host machine. The tracker is
+# To start an RPC tracker, run this command on the host machine. The tracker is
 # required during the whole tuning process, so we need to open a new terminal for
 # this command:
 #
@@ -138,6 +143,8 @@ def get_network(name, batch_size):
 # * For Android:
 #   Follow this `readme page <https://github.com/dmlc/tvm/tree/master/apps/android_rpc>`_ to
 #   install tvm rpc apk on the android device. Make sure you can pass the android rpc test.
+#   Then you have already registred your device. During tuning, you have to go to developer option
+#   and enable "Keep screen awake during changing" and charge your phone to make it stable.
 #
 # After registering devices, we can confirm it by querying rpc_tracker
 #
@@ -151,18 +158,20 @@ def get_network(name, batch_size):
 # .. code-block:: bash
 #
 #    Queue Status
-#    ----------------------------
-#    key          free    pending
-#    ----------------------------
-#    mate10pro    2       0
-#    rk3399       2       0
-#    rpi3b        11      0
-#    ----------------------------
+#    ----------------------------------
+#    key          total  free  pending
+#    ----------------------------------
+#    mate10pro    2      2     0
+#    rk3399       2      2     0
+#    rpi3b        11     11    0
+#    ----------------------------------
+#
+# You can register multiple devices to the tracker to accelerate the measurement in tuning.
 
 ###########################################
 # Set Tuning Options
 # ------------------
-# Before tuning, we should do some configurations. Here I use an RK3399 board
+# Before tuning, we should apply some configurations. Here I use an RK3399 board
 # as example. In your setting, you should modify the target and device_key accordingly.
 # set :code:`use_android` to True if you use android phone.
 
@@ -184,43 +193,41 @@ def get_network(name, batch_size):
 dtype = 'float32'
 
 tuning_option = {
-   'log_filename': log_file,
-
-   'tuner': 'xgb',
-   'n_trial': 1000,
-   'early_stopping': 250,
-
-   'measure_option': autotvm.measure_option(
-       autotvm.use_rpc(device_key, host='localhost', port=9190),
-       number=4,
-       parallel_num=1,
-       timeout=10,
-       build_func='ndk' if use_android else 'default',
-   ),
+    'log_filename': log_file,
+
+    'tuner': 'xgb',
+    'n_trial': 1000,
+    'early_stopping': 400,
+
+    'measure_option': autotvm.measure_option(
+        builder=autotvm.LocalBuilder(
+            build_func='ndk' if use_android else 'default'),
+        runner=autotvm.RPCRunner(
+            device_key, host='localhost', port=9190,
+            number=5,
+            timeout=4,
+        ),
+    ),
 }
 
 ####################################################################
 #
 # .. note:: How to set tuning options
 #
-#   In general, the default value provided here works well. It is the same
-#   value that we used to generate pre-tuned parameters.
-#   If you have multiple devices, you can set :code:`parallel_num` to
-#   the number of devices you have. (e.g. set it to 3 if you register 3 rk3399
-#   boards to the tracker).
-#   If you have large time budget, you can set :code:`n_trial`, :code:`early_stopping` larger,
+#   In general, the default values provided here work well.
+#   If you have enough time budget, you can set :code:`n_trial`, :code:`early_stopping` larger,
 #   which makes the tuning run longer.
-#   If your device is very slow or a single conv2d operator in your network has large FLOPs,
-#   consider setting timeout larger.
+#   If your device runs very slow or your conv2d operators have many GFLOPs, considering to
+#   set timeout larger.
 #
 
 ###################################################################
 # Begin Tuning
 # ------------
 # Now we can extract tuning tasks from the network and begin tuning.
-# Here we provide a simple utility function to tune a list of tasks.
-# This function is just an initial implementation which tune them in sequential order.
-# Later we will bring more sophisticated tuner scheduler.
+# Here, we provide a simple utility function to tune a list of tasks.
+# This function is just an initial implementation which tunes them in sequential order.
+# We will introduce a more sophisticated tuning scheduler in the future.
 
 # You can skip the implementation of this function for this tutorial.
 def tune_tasks(tasks,
@@ -236,7 +243,9 @@ def tune_tasks(tasks,
             try:  # try winograd template
                 tsk = autotvm.task.create(tasks[i].name, tasks[i].args,
                                           tasks[i].target, tasks[i].target_host, 'winograd')
-                tasks.append(tsk)
+                input_channel = tsk.workload[1][1]
+                if input_channel >= 64:
+                    tasks[i] = tsk
             except Exception:
                 pass
 
@@ -245,8 +254,8 @@ def tune_tasks(tasks,
     if os.path.exists(tmp_log_file):
         os.remove(tmp_log_file)
 
-    for i, tsk in enumerate(tasks):
-        prefix = "[Task %2d/%2d] " %(i+1, len(tasks))
+    for i, tsk in enumerate(reversed(tasks)):
+        prefix = "[Task %2d/%2d] " % (i+1, len(tasks))
 
         # create tuner
         if tuner == 'xgb' or tuner == 'xgb-rank':
@@ -278,9 +287,9 @@ def tune_tasks(tasks,
 
 
 ########################################################################
-# Finally we launch tuning jobs and evaluate the end-to-end performance.
+# Finally, we launch tuning jobs and evaluate the end-to-end performance.
 
-def tune_and_evaluate():
+def tune_and_evaluate(tuning_opt):
     # extract workloads from nnvm graph
     print("Extract tasks...")
     net, params, input_shape, out_shape = get_network(network, batch_size=1)
@@ -290,19 +299,18 @@ def tune_and_evaluate():
 
     # run tuning tasks
     print("Tuning...")
-    tune_tasks(tasks, **tuning_option)
+    tune_tasks(tasks, **tuning_opt)
 
     # compile kernels with history best records
     with autotvm.apply_history_best(log_file):
         print("Compile...")
-        with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']):
+        with nnvm.compiler.build_config(opt_level=3):
             graph, lib, params = nnvm.compiler.build(
-                net, target=target,
-                shape={'data': input_shape}, params=params, dtype=dtype)
+                net, target=target, shape={'data': input_shape}, params=params, dtype=dtype)
 
         # export library
         tmp = tempdir()
-        if tuning_option['measure_option']['build_func'] == 'ndk': # for android
+        if use_android:
             from tvm.contrib import ndk
             filename = "net.so"
             lib.export_library(tmp.relpath(filename), ndk.create_shared)
@@ -312,68 +320,64 @@ def tune_and_evaluate():
 
         # upload module to device
         print("Upload...")
-        remote = autotvm.measure.request_remote(device_key, timeout=10000)
+        remote = autotvm.measure.request_remote(device_key, 'localhost', 9190,
+                                                timeout=10000)
         remote.upload(tmp.relpath(filename))
         rlib = remote.load_module(filename)
 
         # upload parameters to device
         ctx = remote.context(str(target), 0)
-        rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
-        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
         module = runtime.create(graph, rlib, ctx)
+        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
         module.set_input('data', data_tvm)
-        module.set_input(**rparams)
+        module.set_input(**params)
 
         # evaluate
         print("Evaluate inference time cost...")
         ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=10)
-        prof_res = np.array(ftimer().results) * 1000 # convert to millisecond
+        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
         print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
               (np.mean(prof_res), np.std(prof_res)))
 
 # We do not run the tuning in our webpage server since it takes too long.
-# Uncomment the following line to run by yourself.
+# Uncomment the following line to run it by yourself.
 
-# tune_and_evaluate()
+# tune_and_evaluate(tuning_option)
 
 ######################################################################
 # Sample Output
 # -------------
-# The tuning needs to train xgboost models and use them for prediction.
+# The tuning needs to compile many programs and extract feature from them.
 # So a high performance CPU is recommended.
-# It takes about 2 hours on a 32T AMD Ryzen CPU.
-# One sample output is
+# One sample output is listed below.
+# It takes about 2 hours on a 32T AMD Ryzen Threadripper.
 #
 # .. code-block:: bash
 #
 #    Extract tasks...
 #    Tuning...
-#    [Task  1/16]  Current/Best:   18.85/  19.67 GFLOPS | Progress: (353/1000) | 387.05 s Done.
-#    [Task  2/16]  Current/Best:   16.10/  23.50 GFLOPS | Progress: (444/1000) | 379.99 s Done.
-#    [Task  3/16]  Current/Best:    5.49/  13.96 GFLOPS | Progress: (610/1000) | 485.87 s Done.
-#    [Task  4/16]  Current/Best:   10.07/  20.48 GFLOPS | Progress: (430/1000) | 391.66 s Done.
-#    [Task  5/16]  Current/Best:   11.50/  15.50 GFLOPS | Progress: (374/1000) | 356.03 s Done.
-#    [Task  6/16]  Current/Best:   10.76/  23.77 GFLOPS | Progress: (526/1000) | 526.42 s Done.
-#    [Task  7/16]  Current/Best:   12.71/  22.03 GFLOPS | Progress: (341/1000) | 322.96 s Done.
-#    [Task  8/16]  Current/Best:    8.60/  17.91 GFLOPS | Progress: (272/1000) | 236.08 s Done.
-#    [Task  9/16]  Current/Best:   15.37/  23.62 GFLOPS | Progress: (275/1000) | 275.18 s Done.
-#    [Task 10/16]  Current/Best:    6.62/  23.01 GFLOPS | Progress: (330/1000) | 315.02 s Done.
-#    [Task 11/16]  Current/Best:    1.85/  21.39 GFLOPS | Progress: (281/1000) | 239.19 s Done.
-#    [Task 12/16]  Current/Best:   15.41/  24.02 GFLOPS | Progress: (258/1000) | 270.82 s Done.
-#    [Task 13/16]  Current/Best:   17.96/  25.79 GFLOPS | Progress: (380/1000) | 738.29 s Done.
-#    [Task 14/16]  Current/Best:   14.81/  31.17 GFLOPS | Progress: (413/1000) | 799.21 s Done.
-#    [Task 15/16]  Current/Best:   24.39/  40.97 GFLOPS | Progress: (355/1000) | 700.25 s Done.
-#    [Task 16/16]  Current/Best:    9.42/  49.90 GFLOPS | Progress: (348/1000) | 603.84 s Done.
+#    [Task  1/12]  Current/Best:   22.37/  52.19 GFLOPS | Progress: (544/1000) | 406.59 s Done.
+#    [Task  2/12]  Current/Best:    6.51/  18.77 GFLOPS | Progress: (608/1000) | 325.05 s Done.
+#    [Task  3/12]  Current/Best:    4.67/  24.87 GFLOPS | Progress: (480/1000) | 372.31 s Done.
+#    [Task  4/12]  Current/Best:   11.35/  46.83 GFLOPS | Progress: (736/1000) | 602.39 s Done.
+#    [Task  5/12]  Current/Best:    1.01/  19.80 GFLOPS | Progress: (448/1000) | 262.16 s Done.
+#    [Task  6/12]  Current/Best:    2.47/  23.76 GFLOPS | Progress: (672/1000) | 563.85 s Done.
+#    [Task  7/12]  Current/Best:   14.57/  33.97 GFLOPS | Progress: (544/1000) | 465.15 s Done.
+#    [Task  8/12]  Current/Best:    1.13/  17.65 GFLOPS | Progress: (576/1000) | 365.08 s Done.
+#    [Task  9/12]  Current/Best:   14.45/  22.66 GFLOPS | Progress: (928/1000) | 724.25 s Done.
+#    [Task 10/12]  Current/Best:    3.22/  15.36 GFLOPS | Progress: (864/1000) | 564.27 s Done.
+#    [Task 11/12]  Current/Best:   11.03/  32.23 GFLOPS | Progress: (736/1000) | 635.15 s Done.
+#    [Task 12/12]  Current/Best:    8.00/  21.65 GFLOPS | Progress: (1000/1000) | 1111.81 s Done.
 #    Compile...
 #    Upload...
 #    Evaluate inference time cost...
-#    Mean inference time (std dev): 157.29 ms (1.74 ms)
+#    Mean inference time (std dev): 162.59 ms (0.06 ms)
 
 ######################################################################
 #
-# .. note:: **Meet some problems?**
+# .. note:: **Experiencing Difficulties?**
 #
-#   The auto tuning module is error prone. If you always see " 0.00/ 0.00 GFLOPS",
+#   The auto tuning module is error-prone. If you always see " 0.00/ 0.00 GFLOPS",
 #   then there must be something wrong.
 #
 #   First, make sure you set the correct configuration of your device.
diff --git a/tutorials/autotvm/tune_nnvm_cuda.py b/tutorials/autotvm/tune_nnvm_cuda.py
new file mode 100644
index 000000000000..6e0ace462d6f
--- /dev/null
+++ b/tutorials/autotvm/tune_nnvm_cuda.py
@@ -0,0 +1,374 @@
+"""
+Auto-tuning a convolutional network for NVIDIA GPU
+====================================================
+**Author**: `Lianmin Zheng <https://https://github.com/merrymercy>`_
+
+Auto-tuning for specific devices and workloads is critical for getting the
+best performance. This is a tutorial on how to tune a whole convolutional
+network for NVIDIA GPU.
+
+The operator implementation for NVIDIA GPU in TVM is written in template form.
+The template has many tunable knobs (tile factor, unrolling, etc).
+We will tune all convolution and depthwise convolution operators
+in the neural network. After tuning, we produce a log file which stores
+the best knob values for all required operators. When the tvm compiler compiles
+these operators, it will query this log file to get the best knob values.
+
+We also released pre-tuned parameters for some NVIDIA GPUs. You can go to
+`NVIDIA GPU Benchmark <https://github.com/dmlc/tvm/wiki/Benchmark#nvidia-gpu>`_
+to see the results.
+"""
+
+######################################################################
+# Install dependencies
+# --------------------
+# To use the autotvm package in tvm, we need to install some extra dependencies.
+# (change "3" to "2" if you use python2):
+#
+# .. code-block:: bash
+#
+#   pip3 install --user psutil xgboost tornado
+#
+# To make tvm run faster during tuning, it is recommended to use cython
+# as FFI of tvm. In the root directory of tvm, execute:
+#
+# .. code-block:: bash
+#
+#   pip3 install --user cython
+#   sudo make cython3
+#
+# Now return to python code. Import packages.
+
+import os
+
+import numpy as np
+
+import nnvm.testing
+import nnvm.compiler
+import tvm
+from tvm import autotvm
+from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
+from tvm.contrib.util import tempdir
+import tvm.contrib.graph_runtime as runtime
+
+#################################################################
+# Define Network
+# --------------
+# First we need to define the network in nnvm symbol API.
+# We can load some pre-defined network from :code:`nnvm.testing`.
+# We can also load models from MXNet, ONNX and TensorFlow (see NNVM
+# tutorials :ref:`tutorial-nnvm` for more details).
+
+def get_network(name, batch_size):
+    """Get the symbol definition and random weight of a network"""
+    input_shape = (batch_size, 3, 224, 224)
+    output_shape = (batch_size, 1000)
+
+    if "resnet" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size)
+    elif "vgg" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size)
+    elif name == 'mobilenet':
+        net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size)
+    elif name == 'squeezenet_v1.1':
+        net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1')
+    elif name == 'inception_v3':
+        input_shape = (1, 3, 299, 299)
+        net, params = nnvm.testing.inception_v3.get_workload(batch_size=batch_size)
+    elif name == 'custom':
+        # an example for custom network
+        from nnvm.testing import utils
+        net = nnvm.sym.Variable('data')
+        net = nnvm.sym.conv2d(net, channels=4, kernel_size=(3,3), padding=(1,1))
+        net = nnvm.sym.flatten(net)
+        net = nnvm.sym.dense(net, units=1000)
+        net, params = utils.create_workload(net, batch_size, (3, 224, 224))
+    elif name == 'mxnet':
+        # an example for mxnet model
+        from mxnet.gluon.model_zoo.vision import get_model
+        block = get_model('resnet18_v1', pretrained=True)
+        net, params = nnvm.frontend.from_mxnet(block)
+        net = nnvm.sym.softmax(net)
+    else:
+        raise ValueError("Unsupported network: " + name)
+
+    return net, params, input_shape, output_shape
+
+###########################################
+# Set Tuning Options
+# ------------------
+# Before tuning, we apply some configurations.
+
+#### DEVICE CONFIG ####
+target = tvm.target.cuda()
+
+#### TUNING OPTION ####
+network = 'resnet-18'
+log_file = "%s.log" % network
+dtype = 'float32'
+
+tuning_option = {
+    'log_filename': log_file,
+
+    'tuner': 'xgb',
+    'n_trial': 2000,
+    'early_stopping': 600,
+
+    'measure_option': autotvm.measure_option(
+        builder=autotvm.LocalBuilder(timeout=10),
+        runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4),
+    ),
+}
+
+####################################################################
+#
+# .. note:: How to set tuning options
+#
+#   In general, the default value provided here works well.
+#
+#   If you have large time budget, you can set :code:`n_trial`, :code:`early_stopping` larger,
+#   which makes the tuning runs longer.
+#
+#   If you have multiple devices, you can use all of them for measurement to
+#   accelerate the tuning process. (see the 'Scale up measurement` section below).
+#
+
+###################################################################
+# Begin Tuning
+# ------------
+# Now we can extract tuning tasks from the network and begin tuning.
+# Here, we provide a simple utility function to tune a list of tasks.
+# This function is just an initial implementation which tunes them in sequential order.
+# We will introduce a more sophisticated tuning scheduler in the future.
+
+# You can skip the implementation of this function for this tutorial.
+def tune_tasks(tasks,
+               measure_option,
+               tuner='xgb',
+               n_trial=1000,
+               early_stopping=None,
+               log_filename='tuning.log',
+               use_transfer_learning=True,
+               try_winograd=True):
+    if try_winograd:
+        for i in range(len(tasks)):
+            try:  # try winograd template
+                tsk = autotvm.task.create(tasks[i].name, tasks[i].args,
+                                          tasks[i].target, tasks[i].target_host, 'winograd')
+                input_channel = tsk.workload[1][1]
+                if input_channel >= 64:
+                    tasks[i] = tsk
+            except Exception:
+                pass
+
+    # create tmp log file
+    tmp_log_file = log_filename + ".tmp"
+    if os.path.exists(tmp_log_file):
+        os.remove(tmp_log_file)
+
+    for i, tsk in enumerate(reversed(tasks)):
+        prefix = "[Task %2d/%2d] " %(i+1, len(tasks))
+
+        # create tuner
+        if tuner == 'xgb' or tuner == 'xgb-rank':
+            tuner_obj = XGBTuner(tsk, loss_type='rank')
+        elif tuner == 'ga':
+            tuner_obj = GATuner(tsk, pop_size=100)
+        elif tuner == 'random':
+            tuner_obj = RandomTuner(tsk)
+        elif tuner == 'gridsearch':
+            tuner_obj = GridSearchTuner(tsk)
+        else:
+            raise ValueError("Invalid tuner: " + tuner)
+
+        if use_transfer_learning:
+            if os.path.isfile(tmp_log_file):
+                tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))
+
+        # do tuning
+        tuner_obj.tune(n_trial=min(n_trial, len(tsk.config_space)),
+                       early_stopping=early_stopping,
+                       measure_option=measure_option,
+                       callbacks=[
+                           autotvm.callback.progress_bar(n_trial, prefix=prefix),
+                           autotvm.callback.log_to_file(tmp_log_file)])
+
+    # pick best records to a cache file
+    autotvm.record.pick_best(tmp_log_file, log_filename)
+    os.remove(tmp_log_file)
+
+
+########################################################################
+# Finally, we launch tuning jobs and evaluate the end-to-end performance.
+
+def tune_and_evaluate(tuning_opt):
+    # extract workloads from nnvm graph
+    print("Extract tasks...")
+    net, params, input_shape, out_shape = get_network(network, batch_size=1)
+    tasks = autotvm.task.extract_from_graph(net, target=target,
+                                            shape={'data': input_shape}, dtype=dtype,
+                                            symbols=(nnvm.sym.conv2d,))
+
+    # run tuning tasks
+    print("Tuning...")
+    tune_tasks(tasks, **tuning_opt)
+
+    # compile kernels with history best records
+    with autotvm.apply_history_best(log_file):
+        print("Compile...")
+        with nnvm.compiler.build_config(opt_level=3):
+            graph, lib, params = nnvm.compiler.build(
+                net, target=target, shape={'data': input_shape}, params=params, dtype=dtype)
+
+        # export library
+        tmp = tempdir()
+        filename = "net.tar"
+        lib.export_library(tmp.relpath(filename))
+
+        # load parameters
+        ctx = tvm.context(str(target), 0)
+        module = runtime.create(graph, lib, ctx)
+        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
+        module.set_input('data', data_tvm)
+        module.set_input(**params)
+
+        # evaluate
+        print("Evaluate inference time cost...")
+        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=600)
+        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
+        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
+              (np.mean(prof_res), np.std(prof_res)))
+
+# We do not run the tuning in our webpage server since it takes too long.
+# Uncomment the following line to run it by yourself.
+
+# tune_and_evaluate(tuning_option)
+
+######################################################################
+# Sample Output
+# -------------
+# The tuning needs to compile many programs and extract feature from them.
+# So a high performance CPU is recommended. One sample output is listed below.
+# It takes about 4 hours to get the following output on a 32T AMD Ryzen Threadripper.
+# The tuning target is NVIDIA 1080 Ti.
+# (You can see some errors during compilation. If the tuning is not stuck, it is okay.)
+#
+# .. code-block:: bash
+#
+#    Extract tasks...
+#    Tuning...
+#    [Task  1/12]  Current/Best:  541.83/3570.66 GFLOPS | Progress: (960/2000) | 1001.31 s Done.
+#    [Task  2/12]  Current/Best:    0.56/ 803.33 GFLOPS | Progress: (704/2000) | 608.08 s Done.
+#    [Task  3/12]  Current/Best:  103.69/1141.25 GFLOPS | Progress: (768/2000) | 702.13 s Done.
+#    [Task  4/12]  Current/Best: 2905.03/3925.15 GFLOPS | Progress: (864/2000) | 745.94 sterminate called without an active exception
+#    [Task  4/12]  Current/Best: 2789.36/3925.15 GFLOPS | Progress: (1056/2000) | 929.40 s Done.
+#    [Task  5/12]  Current/Best:   89.06/1076.24 GFLOPS | Progress: (704/2000) | 601.73 s Done.
+#    [Task  6/12]  Current/Best:   40.39/2129.02 GFLOPS | Progress: (1088/2000) | 1125.76 s Done.
+#    [Task  7/12]  Current/Best: 4090.53/5007.02 GFLOPS | Progress: (800/2000) | 903.90 s Done.
+#    [Task  8/12]  Current/Best:    4.78/1272.28 GFLOPS | Progress: (768/2000) | 749.14 s Done.
+#    [Task  9/12]  Current/Best: 1391.45/2325.08 GFLOPS | Progress: (992/2000) | 1084.87 s Done.
+#    [Task 10/12]  Current/Best: 1995.44/2383.59 GFLOPS | Progress: (864/2000) | 862.60 s Done.
+#    [Task 11/12]  Current/Best: 4093.94/4899.80 GFLOPS | Progress: (224/2000) | 240.92 sterminate called without an active exception
+#    [Task 11/12]  Current/Best: 3487.98/4909.91 GFLOPS | Progress: (480/2000) | 534.96 sterminate called without an active exception
+#    [Task 11/12]  Current/Best: 4636.84/4912.17 GFLOPS | Progress: (1184/2000) | 1381.16 sterminate called without an active exception
+#    [Task 11/12]  Current/Best:   50.12/4912.17 GFLOPS | Progress: (1344/2000) | 1602.81 s Done.
+#    [Task 12/12]  Current/Best: 3581.31/4286.30 GFLOPS | Progress: (736/2000) | 943.52 s Done.
+#    Compile...
+#    Evaluate inference time cost...
+#    Mean inference time (std dev): 1.07 ms (0.05 ms)
+#
+# As a reference baseline, the time cost of MXNet + TensorRT on resnet-18 is 1.30ms. So we are a little faster.
+
+######################################################################
+#
+# .. note:: **Experiencing Difficulties?**
+#
+#   The auto tuning module is error-prone. If you always see " 0.00/ 0.00 GFLOPS",
+#   then there must be something wrong.
+#
+#   First, make sure you set the correct configuration of your device.
+#   Then, you can print debug information by adding these lines in the beginning
+#   of the script. It will print every measurement result, where you can find useful
+#   error messages.
+#
+#   .. code-block:: python
+#
+#      import logging
+#      logging.getLogger('autotvm').setLevel(logging.DEBUG)
+#
+#   Finally, always feel free to ask our community for help on https://discuss.tvm.ai
+
+
+#################################################################
+# Scale up measurement by using multiple devices
+# ----------------------------------------------
+#
+# If you have multiple devices, you can use all of them for measurement.
+# TVM uses the RPC Tracker to manage distributed devices.
+# The RPC Tracker is a centralized master node. We can register all devices to
+# the tracker. For example, if we have 10 GPU cards, we can register all of them
+# to the tracker, and run 10 measurements in parallel, accelerating the tuning process.
+#
+# To start an RPC tracker, run this command on the host machine. The tracker is
+# required during the whole tuning process, so we need to open a new terminal for
+# this command:
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190
+#
+# The expected output is
+#
+# .. code-block:: bash
+#
+#   INFO:RPCTracker:bind to 0.0.0.0:9190
+#
+# Then open another new terminal for the RPC server. We need to start one server
+# for each dedicated device. We use a string key to distinguish the types of devices.
+# You can pick a name you like.
+# (Note: For rocm backend, there are some internal errors with the compiler,
+# we need to add `--no-fork` to the argument list.)
+#
+# .. code-block:: bash
+#
+#     python -m tvm.exec.rpc_server --tracker=localhost:9190 --key=1080ti
+#
+# After registering devices, we can confirm it by querying rpc_tracker
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.query_rpc_tracker --host=localhost --port=9190
+#
+# For example, if we have four 1080ti, two titanx and one gfx900, the output can be
+#
+# .. code-block:: bash
+#
+#    Queue Status
+#    ----------------------------------
+#    key          total  free  pending
+#    ----------------------------------
+#    1080ti       4      4     0
+#    titanx       2      2     0
+#    gfx900       1      1     0
+#    ----------------------------------
+#
+# Finally, we need to change the tuning option to use RPCRunner. Use the code below
+# to replace the corresponding part above.
+
+tuning_option = {
+    'log_filename': log_file,
+
+    'tuner': 'xgb',
+    'n_trial': 2000,
+    'early_stopping': 600,
+
+    'measure_option': autotvm.measure_option(
+        builder=autotvm.LocalBuilder(timeout=10),
+        runner=autotvm.RPCRunner(
+            '1080ti',  # change the device key to your key
+            'localhost', 9190,
+            number=20, repeat=3, timeout=4),
+    ),
+}
diff --git a/tutorials/autotvm/tune_nnvm_mobile_gpu.py b/tutorials/autotvm/tune_nnvm_mobile_gpu.py
new file mode 100644
index 000000000000..4bd6a11ca2f1
--- /dev/null
+++ b/tutorials/autotvm/tune_nnvm_mobile_gpu.py
@@ -0,0 +1,399 @@
+"""
+Auto-tuning a convolutional network for Mobile GPU
+====================================================
+**Author**: `Lianmin Zheng <https://https://github.com/merrymercy>`_
+
+Auto-tuning for a specific device is critical for getting the best
+performance. This is a tutorial about how to tune a whole convolutional
+network.
+
+The operator implementation for Mobile GPU in TVM is written in template form.
+The template has many tunable knobs (tile factor, vectorization, unrolling, etc).
+We will tune all convolution, depthwise convolution and dense operators
+in the neural network. After tuning, we produce a log file which stores
+the best knob values for all required operators. When the tvm compiler compiles
+these operators, it will query this log file to get the best knob values.
+
+We also released pre-tuned parameters for some arm devices. You can go to
+`Mobile GPU Benchmark <https://github.com/dmlc/tvm/wiki/Benchmark#mobile-gpu>`_
+to see the results.
+"""
+
+######################################################################
+# Install dependencies
+# --------------------
+# To use the autotvm package in tvm, we need to install some extra dependencies.
+# (change "3" to "2" if you use python2):
+#
+# .. code-block:: bash
+#
+#   pip3 install --user psutil xgboost tornado
+#
+# To make tvm run faster during tuning, it is recommended to use cython
+# as FFI of tvm. In the root directory of tvm, execute
+# (change "3" to "2" if you use python2):
+#
+# .. code-block:: bash
+#
+#   pip3 install --user cython
+#   sudo make cython3
+#
+# Now return to python code. Import packages.
+
+import os
+
+import numpy as np
+
+import nnvm.testing
+import nnvm.compiler
+import tvm
+from tvm import autotvm
+from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
+from tvm.contrib.util import tempdir
+import tvm.contrib.graph_runtime as runtime
+
+#################################################################
+# Define network
+# --------------
+# First we need to define the network in nnvm symbol API.
+# We can load some pre-defined network from :code:`nnvm.testing`.
+# We can also load models from MXNet, ONNX and TensorFlow (see NNVM
+# tutorials :ref:`tutorial-nnvm` for more details).
+
+def get_network(name, batch_size):
+    """Get the symbol definition and random weight of a network"""
+    input_shape = (batch_size, 3, 224, 224)
+    output_shape = (batch_size, 1000)
+
+    if "resnet" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size)
+    elif "vgg" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size)
+    elif name == 'mobilenet':
+        net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size)
+    elif name == 'squeezenet_v1.1':
+        net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1')
+    elif name == 'inception_v3':
+        input_shape = (1, 3, 299, 299)
+        net, params = nnvm.testing.inception_v3.get_workload(batch_size=batch_size)
+    elif name == 'custom':
+        # an example for custom network
+        from nnvm.testing import utils
+        net = nnvm.sym.Variable('data')
+        net = nnvm.sym.conv2d(net, channels=4, kernel_size=(3,3), padding=(1,1))
+        net = nnvm.sym.flatten(net)
+        net = nnvm.sym.dense(net, units=1000)
+        net, params = utils.create_workload(net, batch_size, (3, 224, 224))
+    elif name == 'mxnet':
+        # an example for mxnet model
+        from mxnet.gluon.model_zoo.vision import get_model
+        block = get_model('resnet18_v1', pretrained=True)
+        net, params = nnvm.frontend.from_mxnet(block)
+        net = nnvm.sym.softmax(net)
+    else:
+        raise ValueError("Unsupported network: " + name)
+
+    return net, params, input_shape, output_shape
+
+
+#################################################################
+# Start RPC Tracker
+# -----------------
+# TVM uses RPC session to communicate with ARM boards.
+# During tuning, the tuner will send the generated code to the board and
+# measure the speed of code on the board.
+#
+# To scale up the tuning, TVM uses RPC Tracker to manage distributed devices.
+# The RPC Tracker is a centralized master node. We can register all devices to
+# the tracker. For example, if we have 10 phones, we can register all of them
+# to the tracker, and run 10 measurements in parallel, accelerating the tuning process.
+#
+# To start an RPC tracker, run this command on the host machine. The tracker is
+# required during the whole tuning process, so we need to open a new terminal for
+# this command:
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190
+#
+# The expected output is
+#
+# .. code-block:: bash
+#
+#   INFO:RPCTracker:bind to 0.0.0.0:9190
+
+#################################################################
+# Register devices to RPC Tracker
+# -----------------------------------
+# Now we can register our devices to the tracker. The first step is to
+# build tvm runtime for the ARM devices.
+#
+# * For Linux:
+#   Follow this section :ref:`build-tvm-runtime-on-device` to build
+#   tvm runtime on the device. Then register the device to tracker by
+#
+#   .. code-block:: bash
+#
+#     python -m tvm.exec.rpc_server --tracker=[HOST_IP]:9190 --key=rk3399
+#
+#   (replace :code:`[HOST_IP]` with the IP address of your host machine)
+#
+# * For Android:
+#   Follow this `readme page <https://github.com/dmlc/tvm/tree/master/apps/android_rpc>`_ to
+#   install tvm rpc apk on the android device. Make sure you can pass the android rpc test.
+#   Then you have already registred your device. During tuning, you have to go to developer option
+#   and enable "Keep screen awake during changing" and charge your phone to make it stable.
+#
+# After registering devices, we can confirm it by querying rpc_tracker
+#
+# .. code-block:: bash
+#
+#   python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190
+#
+# For example, if we have 2 Huawei mate10 pro, 11 Raspberry Pi 3B and 2 rk3399,
+# the output can be
+#
+# .. code-block:: bash
+#
+#    Queue Status
+#    ----------------------------------
+#    key          total  free  pending
+#    ----------------------------------
+#    mate10pro    2      2     0
+#    rk3399       2      2     0
+#    rpi3b        11     11    0
+#    ----------------------------------
+#
+# You can register multiple devices to the tracker to accelerate the measurement in tuning.
+
+###########################################
+# Set Tuning Options
+# ------------------
+# Before tuning, we should apply some configurations. Here I use an RK3399 board
+# as example. In your setting, you should modify the target and device_key accordingly.
+# set :code:`use_android` to True if you use android phone.
+
+#### DEVICE CONFIG ####
+
+target = tvm.target.create('opencl -device=mali')
+
+# Replace "aarch64-linux-gnu" with the correct target of your board.
+# This target host is used for cross compilation. You can query it by :code:`gcc -v` on your device.
+target_host = 'llvm -target=aarch64-linux-gnu'
+
+# Also replace this with the device key in your tracker
+device_key = 'rk3399'
+
+# Set this to True if you use android phone
+use_android = False
+
+#### TUNING OPTION ####
+network = 'resnet-18'
+log_file = "%s.%s.log" % (device_key, network)
+dtype = 'float32'
+
+tuning_option = {
+    'log_filename': log_file,
+
+    'tuner': 'xgb',
+    'n_trial': 1000,
+    'early_stopping': 450,
+
+    'measure_option': autotvm.measure_option(
+        builder=autotvm.LocalBuilder(
+            build_func='ndk' if use_android else 'default'),
+        runner=autotvm.RPCRunner(
+            device_key, host='localhost', port=9190,
+            number=10,
+            timeout=5,
+        ),
+    ),
+}
+
+####################################################################
+#
+# .. note:: How to set tuning options
+#
+#   In general, the default values provided here work well.
+#   If you have enough time budget, you can set :code:`n_trial`, :code:`early_stopping` larger,
+#   which makes the tuning run longer.
+#   If your device runs very slow or your conv2d operators have many GFLOPs, considering to
+#   set timeout larger.
+#
+
+###################################################################
+# Begin Tuning
+# ------------
+# Now we can extract tuning tasks from the network and begin tuning.
+# Here, we provide a simple utility function to tune a list of tasks.
+# This function is just an initial implementation which tunes them in sequential order.
+# We will introduce a more sophisticated tuning scheduler in the future.
+
+# You can skip the implementation of this function for this tutorial.
+def tune_tasks(tasks,
+               measure_option,
+               tuner='xgb',
+               n_trial=1000,
+               early_stopping=None,
+               log_filename='tuning.log',
+               use_transfer_learning=True,
+               try_winograd=True):
+    if try_winograd:
+        for i in range(len(tasks)):
+            try:  # try winograd template
+                tsk = autotvm.task.create(tasks[i].name, tasks[i].args,
+                                          tasks[i].target, tasks[i].target_host, 'winograd')
+                tasks.append(tsk)
+            except Exception:
+                pass
+
+    # create tmp log file
+    tmp_log_file = log_filename + ".tmp"
+    if os.path.exists(tmp_log_file):
+        os.remove(tmp_log_file)
+
+    for i, tsk in enumerate(reversed(tasks)):
+        prefix = "[Task %2d/%2d] " % (i+1, len(tasks))
+
+        # create tuner
+        if tuner == 'xgb' or tuner == 'xgb-rank':
+            tuner_obj = XGBTuner(tsk, loss_type='rank')
+        elif tuner == 'ga':
+            tuner_obj = GATuner(tsk, pop_size=50)
+        elif tuner == 'random':
+            tuner_obj = RandomTuner(tsk)
+        elif tuner == 'gridsearch':
+            tuner_obj = GridSearchTuner(tsk)
+        else:
+            raise ValueError("Invalid tuner: " + tuner)
+
+        if use_transfer_learning:
+            if os.path.isfile(tmp_log_file):
+                tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))
+
+        # do tuning
+        tuner_obj.tune(n_trial=min(n_trial, len(tsk.config_space)),
+                       early_stopping=early_stopping,
+                       measure_option=measure_option,
+                       callbacks=[
+                           autotvm.callback.progress_bar(n_trial, prefix=prefix),
+                           autotvm.callback.log_to_file(tmp_log_file)])
+
+    # pick best records to a cache file
+    autotvm.record.pick_best(tmp_log_file, log_filename)
+    os.remove(tmp_log_file)
+
+
+########################################################################
+# Finally, we launch tuning jobs and evaluate the end-to-end performance.
+
+def tune_and_evaluate(tuning_opt):
+    # extract workloads from nnvm graph
+    print("Extract tasks...")
+    net, params, input_shape, out_shape = get_network(network, batch_size=1)
+    tasks = autotvm.task.extract_from_graph(net, target=target, target_host=target_host,
+                                            shape={'data': input_shape}, dtype=dtype,
+                                            symbols=(nnvm.sym.conv2d, nnvm.sym.dense))
+
+    # run tuning tasks
+    print("Tuning...")
+    tune_tasks(tasks, **tuning_opt)
+
+    # compile kernels with history best records
+    with autotvm.apply_history_best(log_file):
+        print("Compile...")
+        with nnvm.compiler.build_config(opt_level=3):
+            graph, lib, params = nnvm.compiler.build(
+                net, target=target, target_host=target_host,
+                shape={'data': input_shape}, params=params, dtype=dtype)
+
+        # export library
+        tmp = tempdir()
+        if use_android:
+            from tvm.contrib import ndk
+            filename = "net.so"
+            lib.export_library(tmp.relpath(filename), ndk.create_shared)
+        else:
+            filename = "net.tar"
+            lib.export_library(tmp.relpath(filename))
+
+        # upload module to device
+        print("Upload...")
+        remote = autotvm.measure.request_remote(device_key, 'localhost', 9190,
+                                                timeout=10000)
+        remote.upload(tmp.relpath(filename))
+        rlib = remote.load_module(filename)
+
+        # upload parameters to device
+        ctx = remote.context(str(target), 0)
+        module = runtime.create(graph, rlib, ctx)
+        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
+        module.set_input('data', data_tvm)
+        module.set_input(**params)
+
+        # evaluate
+        print("Evaluate inference time cost...")
+        ftimer = module.module.time_evaluator("run", ctx, number==1, repeat=30)
+        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
+        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
+              (np.mean(prof_res), np.std(prof_res)))
+
+# We do not run the tuning in our webpage server since it takes too long.
+# Uncomment the following line to run it by yourself.
+
+# tune_and_evaluate(tuning_option)
+
+######################################################################
+# Sample Output
+# -------------
+# The tuning needs to compile many programs and extract feature from them.
+# So a high performance CPU is recommended.
+# One sample output is listed below. It takes about 3 hours on a 32T AMD Ryzen Threadripper.
+#
+# .. code-block:: bash
+#
+#    Extract tasks...
+#    Tuning...
+#    [Task  1/17]  Current/Best:   25.30/  39.12 GFLOPS | Progress: (992/1000) | 751.22 s Done.
+#    [Task  2/17]  Current/Best:   40.70/  45.50 GFLOPS | Progress: (736/1000) | 545.46 s Done.
+#    [Task  3/17]  Current/Best:   38.83/  42.35 GFLOPS | Progress: (992/1000) | 1549.85 s Done.
+#    [Task  4/17]  Current/Best:   23.31/  31.02 GFLOPS | Progress: (640/1000) | 1059.31 s Done.
+#    [Task  5/17]  Current/Best:    0.06/   2.34 GFLOPS | Progress: (544/1000) | 305.45 s Done.
+#    [Task  6/17]  Current/Best:   10.97/  17.20 GFLOPS | Progress: (992/1000) | 1050.00 s Done.
+#    [Task  7/17]  Current/Best:    8.98/  10.94 GFLOPS | Progress: (928/1000) | 421.36 s Done.
+#    [Task  8/17]  Current/Best:    4.48/  14.86 GFLOPS | Progress: (704/1000) | 582.60 s Done.
+#    [Task  9/17]  Current/Best:   10.30/  25.99 GFLOPS | Progress: (864/1000) | 899.85 s Done.
+#    [Task 10/17]  Current/Best:   11.73/  12.52 GFLOPS | Progress: (608/1000) | 304.85 s Done.
+#    [Task 11/17]  Current/Best:   15.26/  18.68 GFLOPS | Progress: (800/1000) | 747.52 s Done.
+#    [Task 12/17]  Current/Best:   17.48/  26.71 GFLOPS | Progress: (1000/1000) | 1166.40 s Done.
+#    [Task 13/17]  Current/Best:    0.96/  11.43 GFLOPS | Progress: (960/1000) | 611.65 s Done.
+#    [Task 14/17]  Current/Best:   17.88/  20.22 GFLOPS | Progress: (672/1000) | 670.29 s Done.
+#    [Task 15/17]  Current/Best:   11.62/  13.98 GFLOPS | Progress: (736/1000) | 449.25 s Done.
+#    [Task 16/17]  Current/Best:   19.90/  23.83 GFLOPS | Progress: (608/1000) | 708.64 s Done.
+#    [Task 17/17]  Current/Best:   17.98/  22.75 GFLOPS | Progress: (736/1000) | 1122.60 s Done.
+#    Compile...
+#    Upload...
+#    Evaluate inference time cost...
+#    Mean inference time (std dev): 128.05 ms (7.74 ms)
+#
+
+######################################################################
+#
+# .. note:: **Experiencing Difficulties?**
+#
+#   The auto tuning module is error-prone. If you always see " 0.00/ 0.00 GFLOPS",
+#   then there must be something wrong.
+#
+#   First, make sure you set the correct configuration of your device.
+#   Then, you can print debug information by adding these lines in the beginning
+#   of the script. It will print every measurement result, where you can find useful
+#   error messages.
+#
+#   .. code-block:: python
+#
+#      import logging
+#      logging.getLogger('autotvm').setLevel(logging.DEBUG)
+#
+#   Finally, always feel free to ask our community for help on https://discuss.tvm.ai
diff --git a/tutorials/autotvm/tune_nnvm_x86.py b/tutorials/autotvm/tune_nnvm_x86.py
new file mode 100644
index 000000000000..9f8692c3981e
--- /dev/null
+++ b/tutorials/autotvm/tune_nnvm_x86.py
@@ -0,0 +1,220 @@
+"""
+Auto-tuning a convolutional network for x86 CPU
+====================================================
+**Author**: `Yao Wang <https://github.com/kevinthesun>`_
+
+This is a tutorial about how to tune convolution neural network
+for x86 cpu.
+"""
+import os
+import numpy as np
+
+import nnvm.testing
+import nnvm.compiler
+import tvm
+from tvm import autotvm
+from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
+import tvm.contrib.graph_runtime as runtime
+
+#################################################################
+# Define network
+# --------------
+# First we need to define the network in nnvm symbol API.
+# We can load some pre-defined network from :code:`nnvm.testing`.
+# We can also load models from MXNet, ONNX and TensorFlow (see NNVM
+# tutorials :ref:`tutorial-nnvm` for more details).
+#
+# In this tutorial, we choose resnet-18 as tuning example.
+
+def get_network(name, batch_size):
+    """Get the symbol definition and random weight of a network"""
+    input_shape = (batch_size, 3, 224, 224)
+    output_shape = (batch_size, 1000)
+
+    if "resnet" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size)
+    elif "vgg" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size)
+    elif name == 'mobilenet':
+        net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size)
+    elif name == 'squeezenet_v1.1':
+        net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1')
+    elif name == 'inception_v3':
+        input_shape = (1, 3, 299, 299)
+        net, params = nnvm.testing.inception_v3.get_workload(batch_size=batch_size)
+    elif name == 'custom':
+        # an example for custom network
+        from nnvm.testing import utils
+        net = nnvm.sym.Variable('data')
+        net = nnvm.sym.conv2d(net, channels=4, kernel_size=(3,3), padding=(1,1))
+        net = nnvm.sym.flatten(net)
+        net = nnvm.sym.dense(net, units=1000)
+        net, params = utils.create_workload(net, batch_size, (3, 224, 224))
+    elif name == 'mxnet':
+        # an example for mxnet model
+        from mxnet.gluon.model_zoo.vision import get_model
+        block = get_model('resnet18_v1', pretrained=True)
+        net, params = nnvm.frontend.from_mxnet(block)
+        net = nnvm.sym.softmax(net)
+    else:
+        raise ValueError("Unsupported network: " + name)
+
+    return net, params, input_shape, output_shape
+
+# Replace "llvm" with the correct target of your cpu.
+# For example, for AWS EC2 c5 instance with Intel Xeon
+# Platinum 8000 series, the target should be "llvm -mcpu=skylake-avx512".
+# For AWS EC2 c4 instance with Intel Xeon E5-2666 v3, it should be
+# "llvm -mcpu=core-avx2".
+target = "llvm"
+
+batch_size = 1
+dtype = "float32"
+model_name = "resnet-18"
+log_file = "%s.log" % model_name
+
+# Set number of threads used for tuning based on the number of
+# physical cpu cores on your machine.
+num_threads = 1
+os.environ["TVM_NUM_THREADS"] = str(num_threads)
+
+
+#################################################################
+# Configure tensor tuning settings and create tasks
+# -------------------------------------------------
+# To get better kernel execution performance on x86 cpu,
+# we need to change data layout of convolution kernel from
+# "NCHW" to "NCHWc". To deal with this situation, we define
+# conv2d_NCHWc operator in topi. We will tune this operator
+# instead of plain conv2d.
+#
+# We will use local mode for tuning configuration. RPC tracker
+# mode can be setup similarly to the approach in
+# :ref:`tune_nnvm_arm` tutorial.
+
+tuning_option = {
+    'log_filename': log_file,
+    'tuner': 'random',
+    'early_stopping': None,
+
+    'measure_option': autotvm.measure_option(
+        builder=autotvm.LocalBuilder(),
+        runner=autotvm.LocalRunner(number=10, repeat=1,
+                                   min_repeat_ms=1000),
+    ),
+}
+
+# You can skip the implementation of this function for this tutorial.
+def tune_kernels(tasks,
+                 measure_option,
+                 tuner='gridsearch',
+                 early_stopping=None,
+                 log_filename='tuning.log'):
+
+    for i, tsk in enumerate(tasks):
+        prefix = "[Task %2d/%2d] " % (i+1, len(tasks))
+
+        # converting conv2d tasks to conv2d_NCHWc tasks
+        op_name = tsk.workload[0]
+        if op_name == 'conv2d':
+            func_create = 'topi_x86_conv2d_NCHWc'
+        elif op_name == 'depthwise_conv2d_nchw':
+            func_create = 'topi_x86_depthwise_conv2d_NCHWc_from_nchw'
+        else:
+            raise ValueError("Tuning {} is not supported on x86".format(op_name))
+
+        task = autotvm.task.create(func_create, args=tsk.args,
+                                   target=target, template_key='direct')
+        task.workload = tsk.workload
+
+        # create tuner
+        if tuner == 'xgb' or tuner == 'xgb-rank':
+            tuner_obj = XGBTuner(task, loss_type='rank')
+        elif tuner == 'ga':
+            tuner_obj = GATuner(task, pop_size=50)
+        elif tuner == 'random':
+            tuner_obj = RandomTuner(task)
+        elif tuner == 'gridsearch':
+            tuner_obj = GridSearchTuner(task)
+        else:
+            raise ValueError("Invalid tuner: " + tuner)
+
+        # do tuning
+        n_trial=len(task.config_space)
+        tuner_obj.tune(n_trial=n_trial,
+                       early_stopping=early_stopping,
+                       measure_option=measure_option,
+                       callbacks=[
+                           autotvm.callback.progress_bar(n_trial, prefix=prefix),
+                           autotvm.callback.log_to_file(log_filename)])
+
+
+########################################################################
+# Finally, we launch tuning jobs and evaluate the end-to-end performance.
+
+def tune_and_evaluate(tuning_opt):
+    # extract workloads from nnvm graph
+    print("Extract tasks...")
+    net, params, data_shape, out_shape = get_network(model_name, batch_size)
+    tasks = autotvm.task.extract_from_graph(net, target=target,
+                                            shape={'data': data_shape}, dtype=dtype,
+                                            symbols=(nnvm.sym.conv2d,))
+
+    # run tuning tasks
+    print("Tuning...")
+    tune_kernels(tasks, **tuning_opt)
+
+    # compile kernels with history best records
+    with autotvm.apply_history_best(log_file):
+        print("Compile...")
+        with nnvm.compiler.build_config(opt_level=3):
+            graph, lib, params = nnvm.compiler.build(
+                net, target=target, shape={'data': data_shape}, params=params, dtype=dtype)
+
+        # upload parameters to device
+        ctx = tvm.cpu()
+        data_tvm = tvm.nd.array((np.random.uniform(size=data_shape)).astype(dtype))
+        module = runtime.create(graph, lib, ctx)
+        module.set_input('data', data_tvm)
+        module.set_input(**params)
+
+        # evaluate
+        print("Evaluate inference time cost...")
+        ftimer = module.module.time_evaluator("run", ctx, number=100, repeat=3)
+        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
+        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
+              (np.mean(prof_res), np.std(prof_res)))
+
+# We do not run the tuning in our webpage server since it takes too long.
+# Uncomment the following line to run it by yourself.
+
+# tune_and_evaluate(tuning_option)
+
+######################################################################
+# Sample Output
+# -------------
+# The tuning needs to compile many programs and extract feature from them.
+# So a high performance CPU is recommended.
+# One sample output is listed below.
+#
+# .. code-block:: bash
+#
+#    Extract tasks...
+#    Tuning...
+#    [Task  1/12]  Current/Best:  598.05/2497.63 GFLOPS | Progress: (252/252) | 1357.95 s Done.
+#    [Task  2/12]  Current/Best:  522.63/2279.24 GFLOPS | Progress: (784/784) | 3989.60 s Done.
+#    [Task  3/12]  Current/Best:  447.33/1927.69 GFLOPS | Progress: (784/784) | 3869.14 s Done.
+#    [Task  4/12]  Current/Best:  481.11/1912.34 GFLOPS | Progress: (672/672) | 3274.25 s Done.
+#    [Task  5/12]  Current/Best:  414.09/1598.45 GFLOPS | Progress: (672/672) | 2720.78 s Done.
+#    [Task  6/12]  Current/Best:  508.96/2273.20 GFLOPS | Progress: (768/768) | 3718.75 s Done.
+#    [Task  7/12]  Current/Best:  469.14/1955.79 GFLOPS | Progress: (576/576) | 2665.67 s Done.
+#    [Task  8/12]  Current/Best:  230.91/1658.97 GFLOPS | Progress: (576/576) | 2435.01 s Done.
+#    [Task  9/12]  Current/Best:  487.75/2295.19 GFLOPS | Progress: (648/648) | 3009.95 s Done.
+#    [Task 10/12]  Current/Best:  182.33/1734.45 GFLOPS | Progress: (360/360) | 1755.06 s Done.
+#    [Task 11/12]  Current/Best:  372.18/1745.15 GFLOPS | Progress: (360/360) | 1684.50 s Done.
+#    [Task 12/12]  Current/Best:  215.34/2271.11 GFLOPS | Progress: (400/400) | 2128.74 s Done.
+#    Compile...
+#    Evaluate inference time cost...
+#    Mean inference time (std dev): 3.16 ms (0.03 ms)
diff --git a/tutorials/autotvm/tune_simple_template.py b/tutorials/autotvm/tune_simple_template.py
index 8d4aab0b2c27..15b883dcbd73 100644
--- a/tutorials/autotvm/tune_simple_template.py
+++ b/tutorials/autotvm/tune_simple_template.py
@@ -14,7 +14,7 @@
 
 ######################################################################
 # Install dependencies
-# ----------------------------------------
+# --------------------
 # To use autotvm package in tvm, we need to install some extra dependencies.
 # (change "3" to "2" if you use python2):
 #
@@ -44,7 +44,7 @@
 
 ######################################################################
 # Step 1:  Define the search space
-# ---------------------------------
+# --------------------------------
 # In this section, we will rewrite a deterministic tvm schedule code to a
 # tunable schedule template. You can regard the process of search space definition
 # as the parametrization of our exiting schedule code.
@@ -73,7 +73,7 @@ def matmul_v0(N, L, M, dtype):
 
 #####################################################################
 # Parametrize the schedule
-# ^^^^^^^^^^^^^^^^^^^^^^^^^
+# ^^^^^^^^^^^^^^^^^^^^^^^^
 # In the previous schedule code, we use a constant "8" as tiling factor.
 # However, it might not be the best one because the best tiling factor depends
 # on real hardware environment and input shape.
@@ -271,9 +271,12 @@ def matmul(N, L, M, dtype):
 logging.getLogger('autotvm').setLevel(logging.DEBUG)
 logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))
 
-# use local cpu, measure 5 times for every config to reduce variance
-measure_option = autotvm.measure_option('local',
-                                        number=5)
+# There are two steps for measuring a config: build and run.
+# By default, we use all cpu cores to compile program. Then measure them sequentially.
+# We measure 5 times and take average to reduce variance.
+measure_option = autotvm.measure_option(
+    builder='local',
+    runner=autotvm.LocalRunner(number=5))
 
 # begin tuning, log records to file `matmul.log`
 tuner = autotvm.tuner.RandomTuner(task)
@@ -302,4 +305,4 @@ def matmul(N, L, M, dtype):
 c_tvm = tvm.nd.empty(c_np.shape)
 func(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm)
 
-np.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)
+tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)
diff --git a/tutorials/cross_compilation_and_rpc.py b/tutorials/cross_compilation_and_rpc.py
index e0967d54b09c..6c23368f840a 100644
--- a/tutorials/cross_compilation_and_rpc.py
+++ b/tutorials/cross_compilation_and_rpc.py
@@ -211,7 +211,7 @@ def run_opencl():
     opencl_device_host = '10.77.1.145'
     opencl_device_port = 9090
 
-    # create scheule for the above "add one" compute decleration
+    # create schedule for the above "add one" compute declaration
     s = tvm.create_schedule(B.op)
     xo, xi = s[B].split(B.op.axis[0], factor=32)
     s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
diff --git a/tutorials/dev/README.txt b/tutorials/dev/README.txt
new file mode 100644
index 000000000000..a358280640de
--- /dev/null
+++ b/tutorials/dev/README.txt
@@ -0,0 +1,3 @@
+Developer Tutorials
+-------------------
+
diff --git a/tutorials/dev/low_level_custom_pass.py b/tutorials/dev/low_level_custom_pass.py
new file mode 100644
index 000000000000..f2f38207931f
--- /dev/null
+++ b/tutorials/dev/low_level_custom_pass.py
@@ -0,0 +1,156 @@
+"""
+Writing a Customized Pass
+=========================
+**Author**: `Jian Weng <https://were.github.io>`_
+
+TVM is a framework that abstracts away the heterogenity of machine learning accelerators.
+Sometimes users may want customize some analysis and IR transformations
+to adapt TVM to their own specialized hardware. This tutorial helps users write
+a customized pass in TVM.
+
+Prerequisites
+-------------
+
+Before reading this tutorial, we assume readers have already known these topics well:
+
+- Writing an algorithm in TVM and schedule it. Otherwise, see example tutorials like
+  :ref:`opt-gemm`.
+- The basic structure of HalideIR. Otherwise, see ``HalideIR/src/ir/IR.h`` to learn what
+  attributes of IR nodes are defined.
+- Visitor design pattern. Otherwise, check the
+  `Python AST module <https://docs.python.org/3/library/ast.html>`_ to see how an AST
+  visitor is implemented.
+- How a HalideIR/Schedule is lowered to either a LoweredFunc class or a LLVM module. Otherwise,
+  take a look at ``python/tvm/build_module.py`` to get some basics.
+
+"""
+
+from __future__ import absolute_import, print_function
+import tvm
+import numpy as np
+
+######################################################################
+# We first write a very simple vector add and build it with the default schedule. Then, we use
+# our customized lowering pass to manipulate the IR directly instead of using schedule premitives.
+#
+
+n = tvm.const(128)
+a = tvm.placeholder((n, ), name="a")
+b = tvm.placeholder((n, ), name="b")
+c = tvm.compute((n, ), lambda i: a[i] + b[i], name='c')
+
+sch = tvm.create_schedule(c.op)
+ir  = tvm.lower(sch, [a, b, c], simple_mode=True)
+print(ir)
+
+######################################################################
+# Writing a Pass
+# --------------
+# Essentially, an "IR transformation pass" is a function which maps a statement to a new statement.
+# Thus, we define this vectorize function and implement it step by step.
+#
+
+######################################################################
+# TVM already provides two class for users to both analyze and transform IR.
+#
+# IR Visitor
+# ~~~~~~~~~~
+# We can use ``tvm.ir_pass.PostOrderVisit(stmt, func)`` to gather information from the Halide IR.
+# ``func`` is a function callback. This function will be called before exiting the current IR node,
+# i.e. post-order visit. Then we leverage side effects to store the result of IR visit, because the
+# return value of ``func`` will be ignored.
+#
+# .. note::
+#
+#     You MUST use some array to store the result of IR visit. Even the value is a single variable.
+#     This is mainly due to the constraints in the Python-C runtime. The variable values will be
+#     refreshed every recursion but the array values will be preserved.
+#
+
+loops = []
+def find_width8(op):
+    """ Find all the 'For' nodes whose extent can be divided by 8. """
+    if isinstance(op, tvm.stmt.For):
+        if isinstance(op.extent, tvm.expr.IntImm):
+            if op.extent.value % 8 == 0:
+                loops.append(op)
+
+#####################################################################
+# IR Transformation
+# ~~~~~~~~~~~~~~~~~
+# The transformation interface is slightly different from the visitor interface. There is only a
+# post-order callback in the visitor, but transformation visitor supports both a pre-order and a
+# post-order callback. If you want to keep the origin IR node, just return None. If you want to
+# change the current node to some node, use TVM IR maker interface to build it and return
+# this value.
+#
+# .. note::
+#
+#     If the pre-order function is called and returns a value which is not None, the post-order
+#     function will be skipped.
+#
+
+def vectorize8(op):
+    """ Split can vectorize the loops found in `find_width8`. """
+    if op in loops:
+        extent = op.extent.value
+        name = op.loop_var.name
+        lo, li = tvm.var(name + '.outer'), tvm.var(name + '.inner')
+        body = tvm.ir_pass.Substitute(op.body, {op.loop_var: lo * 8 + li})
+        body = tvm.make.For(li, 0, 8, tvm.stmt.For.Vectorized, 0, body)
+        body = tvm.make.For(lo, 0, extent // 8, tvm.stmt.For.Serial, 0, body)
+        return body
+    return None
+
+def vectorize(stmt):
+    global loops
+
+    tvm.ir_pass.PostOrderVisit(stmt, find_width8)
+
+    if not loops:
+        return stmt
+
+    # The last list arugment indicates what kinds of nodes will be transformed.
+    # Thus, in this case only `For` nodes will call `vectorize8`
+    stmt = tvm.ir_pass.IRTransform(stmt, None, vectorize8, ['For'])
+
+    return stmt
+
+#####################################################################
+# Glue to Lowering
+# ----------------
+# So far, we are done with writing this IR transformation pass. What we need to do next is to glue
+# this pass to TVM's lower pass. We can first call this function directly as a sanity check.
+#
+
+print(vectorize(ir))
+
+#####################################################################
+# In TVM, there is a property called ``BuildConfig``. You can use this property to customize your
+# own lowering options. In this case, we inject the pass written above into the TVM standard lowering
+# pass by feeding **a list of tuple** as argument to ``add_lower_pass``. "Tuple" indicates different
+# phases of lowering. In TVM, there are four phases of lowering and user-customized ones will be
+# called after each phase is done.
+#
+# .. note::
+#     Here are the essential transformations done by each phase:
+#       - Phase 0 generates the raw IR and loop levels.
+#       - Phase 1 flattens the array storage.
+#       - Phase 2 transforms loops, like unroll, vectorization and thread-binding.
+#       - Phase 3 does some cleanup work.
+#
+# Thus, a good place to put this transformation pass is just after Phase 1.
+#
+
+with tvm.build_config(add_lower_pass=[(1, vectorize)]) as cfg:
+    print(tvm.lower(sch, [a, b, c], simple_mode=True))
+
+#####################################################################
+# Quick View
+# ----------
+# This tutorial gives a quick view of writing a customized IR transformation pass:
+# - Use ``tvm.ir_pass.PostOrderVisit`` to gather information on each IR nodes.
+# - Use ``tvm.ir_pass.IRTransform`` to transform IR nodes.
+# - Wrap up two above to write an IR-transformation function.
+# - Use ``tvm.build_config`` to put this function to TVM lowering pass
+#
diff --git a/tutorials/get_started.py b/tutorials/get_started.py
index de94827ab1e9..022d087d4d9b 100644
--- a/tutorials/get_started.py
+++ b/tutorials/get_started.py
@@ -138,7 +138,7 @@
 b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
 c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
 fadd(a, b, c)
-np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
+tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
 
 ######################################################################
 # Inspect the Generated Code
@@ -217,7 +217,7 @@
     fadd1_dev = tvm.module.load(temp.relpath("myadd.ptx"))
     fadd1.import_module(fadd1_dev)
 fadd1(a, b, c)
-np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
+tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
 
 ######################################################################
 # Pack Everything into One Library
@@ -231,7 +231,7 @@
 fadd.export_library(temp.relpath("myadd_pack.so"))
 fadd2 = tvm.module.load(temp.relpath("myadd_pack.so"))
 fadd2(a, b, c)
-np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
+tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
 
 ######################################################################
 # .. note:: Runtime API and Thread-Safety
@@ -264,7 +264,7 @@
     b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
     c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
     fadd_cl(a, b, c)
-    np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
+    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
 
 ######################################################################
 # Summary
diff --git a/tutorials/language/extern_op.py b/tutorials/language/extern_op.py
index 298a2cc7dd8b..59efe5000f03 100644
--- a/tutorials/language/extern_op.py
+++ b/tutorials/language/extern_op.py
@@ -59,7 +59,7 @@
 d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
 bb = 10.0
 f(a, b, d, bb)
-np.testing.assert_allclose(
+tvm.testing.assert_allclose(
     d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + 10, rtol=1e-5)
 
 ######################################################################
@@ -98,7 +98,7 @@ def my_tvm_addone(x, y):
 a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx)
 b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), ctx)
 f(a, b)
-np.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1, rtol=1e-5)
+tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1, rtol=1e-5)
 
 ######################################################################
 # Summary
diff --git a/tutorials/language/reduction.py b/tutorials/language/reduction.py
index 531283e15213..8be614b2f6ea 100644
--- a/tutorials/language/reduction.py
+++ b/tutorials/language/reduction.py
@@ -123,7 +123,7 @@
 a = tvm.nd.array(np.random.uniform(size=(nn, nn)).astype(A.dtype), ctx)
 b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), ctx)
 fcuda(a, b)
-np.testing.assert_allclose(
+tvm.testing.assert_allclose(
     b.asnumpy(),  np.sum(a.asnumpy(), axis=1), rtol=1e-4)
 
 ######################################################################
diff --git a/tutorials/language/scan.py b/tutorials/language/scan.py
index 6cdb0a0ff38e..8b8f848ffa13 100644
--- a/tutorials/language/scan.py
+++ b/tutorials/language/scan.py
@@ -72,7 +72,7 @@
 a = tvm.nd.array(a_np, ctx)
 b = tvm.nd.array(np.zeros((m, n), dtype=s_scan.dtype), ctx)
 fscan(a, b)
-np.testing.assert_allclose(b.asnumpy(), np.cumsum(a_np, axis=0))
+tvm.testing.assert_allclose(b.asnumpy(), np.cumsum(a_np, axis=0))
 
 ######################################################################
 # Multi-Stage Scan Cell
diff --git a/tutorials/language/tensorize.py b/tutorials/language/tensorize.py
new file mode 100644
index 000000000000..4115de1b2eb4
--- /dev/null
+++ b/tutorials/language/tensorize.py
@@ -0,0 +1,291 @@
+"""
+Use Tensorize to Leverage Hardware Intrinsics
+=============================================
+**Author**: `Yizhi Liu <https://github.com/yzhliu>`_
+
+This is an introduction material on how to perform tensorization in TVM.
+
+By using schedule primitive :code:`tensorize`,
+people can replace a unit of computation with the corresponding intrinsics,
+making it easy to leverage handcrafted micro-kernels,
+as well as extend TVM to support new hardware architectures.
+
+The purpose of this tutorial is to show the functionality
+and usage of tensorize instead of providing an efficient solution.
+
+"""
+from __future__ import absolute_import, print_function
+
+import tvm
+import numpy as np
+
+######################################################################
+# Define Matrix Multiplication
+# ----------------------------
+# Take matrix multiplication as our example.
+# Matmul first multiply the corresponding elements between two matrix,
+# then accumulate across a certain axis.
+# The following lines describe the computation :code:`A * B^T` in TVM.
+#
+N, M, L = 1024, 512, 64
+A = tvm.placeholder((N, L), name='A')
+B = tvm.placeholder((M, L), name='B')
+k = tvm.reduce_axis((0, L), name='k')
+C = tvm.compute((N, M), lambda i, j:
+                tvm.sum(A[i, k] * B[j, k], axis=k), name='C')
+s = tvm.create_schedule(C.op)
+print(tvm.lower(s, [A, B, C], simple_mode=True))
+
+######################################################################
+# Schedule the Matmul
+# -------------------
+# Now, suppose we have an accelerator that supports
+# matrix-vector multiplication (GEMV) as a hardware primitive,
+# which can take arbitrary size of reduce axis,
+# but another axis needs to be no larger than 16.
+# Thus we break down the matmul loops to make the innermost loops a (16x64) GEMV.
+#
+factor = 16
+x, y = C.op.axis
+z, = C.op.reduce_axis
+yo, yi = s[C].split(y, factor=factor)
+s[C].reorder(x, yo, yi, z)
+print(tvm.lower(s, [A, B, C], simple_mode=True))
+
+######################################################################
+# As showed in the IR printed above,
+# the inner loops :code:`j.inner` along with :code:`k` together form a computation of GEMV
+# - within the inner most two loops, the index :code:`i` is fixed,
+# the access to the matrix :code:`A` only varies by :code:`k`,
+# which makes the access pattern of :code:`A` a "vector".
+# In order to leverage our hypothetical hardware's GEMV instruction,
+# we can tensorize over :code:`j.inner`.
+#
+# Define GEMV Tensorization Intrinsic
+# -----------------------------------
+# Before scheduling the tensorization, we need to first define the intrinsic function for GEMV.
+# It includes two parts, the first is a compute definition of GEMV.
+# TVM uses it to match the computing pattern in the original Matmul schedule.
+# The second is to specify how to execute GEMV on the device,
+# which is done in :code:`intrin_func` below.
+#
+def intrin_gemv(m, l):
+    a = tvm.placeholder((l,), name='a')
+    b = tvm.placeholder((m, l), name='b')
+    k = tvm.reduce_axis((0, l), name='k')
+    c = tvm.compute((m,), lambda i: tvm.sum(a[k] * b[i, k], axis=k), name='c')
+    Ab = tvm.decl_buffer(a.shape, a.dtype,
+                         name="A",
+                         offset_factor=1,
+                         strides=[1])
+    Bb = tvm.decl_buffer(b.shape, b.dtype,
+                         name="B",
+                         offset_factor=1,
+                         strides=[tvm.var("s1"), 1])
+    Cb = tvm.decl_buffer(c.shape, c.dtype,
+                         name="C",
+                         offset_factor=1,
+                         strides=[1])
+    def intrin_func(ins, outs):
+        ib = tvm.ir_builder.create()
+        aa, bb = ins
+        cc = outs[0]
+        ib.emit(tvm.call_extern("int32", "gemv_update",
+                                cc.access_ptr("w"),
+                                aa.access_ptr("r"),
+                                bb.access_ptr("r"),
+                                m, l, bb.strides[0]))
+        return ib.get()
+    with tvm.build_config(offset_factor=1):
+        return tvm.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, b: Bb, c: Cb})
+
+######################################################################
+# Here :code:`tvm.decl_tensor_intrin` declares how to execute the computation :code:`c.op`.
+# Our implementation simply takes the inputs and outputs,
+# converts them to pointers and emit an external function call.
+# Note that tensorization requires user to specify :code:`offset_factor`,
+# with this information, TVM has knowledge of whether the data is aligned
+# between the start address of the original data structure
+# and the offset being passed to tensorize,
+# so that it has chance to optimize with vectorized loading.
+# We set the factor to 1 for simplification.
+#
+# Buffers are also declared for inputs and outputs, though this is not required,
+# we benefit from the extra information provided by buffers. For example, we pass
+# :code:`bb.strides[0]` as an argument to the external function :code:`gemv_update`.
+# For now :code:`bb.strides[0] == l`,
+# but later we will see how they can differ with more complicated schedules.
+#
+# Note that we use :code:`tvm.var("s1")` as the first stride dimension for :code:`B`.
+# If the strides can be inferred
+# - in this case, TVM knows tensor B is compact thus the strides are :code:`[L, 1]` -
+# such placeholder can be put to let TVM automatically bind the inferred value for us.
+#
+gemv = intrin_gemv(factor, L)
+s[C].tensorize(yi, gemv)
+print(tvm.lower(s, [A, B, C], simple_mode=True))
+
+######################################################################
+# By tensorizing over :code:`yi`, the inner most two loops are
+# now replaced by the intrinsic function we defined before.
+# In order to build and run the module, let's define the external function :code:`gemv_update`,
+# it is a naive implementation of GEMV, just for demonstration.
+#
+def gemv_impl():
+    cc_code = """
+      extern "C" int gemv_update(float *cc, float *aa, float *bb, int m, int l, int stride) {
+        for (int i = 0; i < m; ++i) {
+            for (int j = 0; j < l; ++j) {
+                cc[i] += aa[j] * bb[i * stride + j];
+            }
+        }
+        return 0;
+      }
+    """
+    from tvm.contrib import util, clang
+    temp = util.tempdir()
+    ll_path = temp.relpath("temp.ll")
+    # Create LLVM ir from c source code
+    ll_code = clang.create_llvm(cc_code, output=ll_path)
+    return ll_code
+
+######################################################################
+# Now we leverage the pragma attribute :code:`import_llvm` to import llvm asm inline.
+# The importing needs to happen before the tensorized GEMV being executed.
+#
+s[C].pragma(x, "import_llvm", gemv_impl())
+print(tvm.lower(s, [A, B, C], simple_mode=True))
+
+######################################################################
+# Finally we compare the tensorize version with that :code:`numpy.dot` produces,
+# ensure our implementation is correct.
+#
+func = tvm.build(s, [A, B, C], target="llvm", name="gemv")
+
+from topi.util import get_const_tuple
+dtype = A.dtype
+ctx = tvm.context("cpu", 0)
+a = np.random.uniform(size=get_const_tuple(A.shape)).astype(dtype)
+b = np.random.uniform(size=get_const_tuple(B.shape)).astype(dtype)
+c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=dtype), ctx)
+func(tvm.nd.array(a, ctx), tvm.nd.array(b, ctx), c)
+tvm.testing.assert_allclose(c.asnumpy(), np.dot(a, b.T), rtol=1e-3)
+
+######################################################################
+# Reduce-update for Tensorize
+# ---------------------------
+# So far you have learned the basic idea of tensorize,
+# now let's move one step forward to a more complicated case.
+#
+# Assume our accelerator could only multiply a vector by a square matrix,
+# in which the vector size needs to be no larger than 16.
+# Given such hardware constrain, now we need to split the reduce axis as following,
+#
+zo, zi = s[C].split(z, factor=factor)
+s[C].reorder(x, yo, zo, yi, zi)
+
+######################################################################
+# However, since the tensorize intrinsic now only covers a part of the reduce axis,
+# instead of using one "body" function, TVM requires a :code:`reduce_reset` function,
+# which will be invoked before the reduce for-loop, and a :code:`reduce_update` function,
+# which defines the "update" computing strategy.
+#
+def gemv_impl():
+    cc_code = """
+      extern "C" int gemv_update(float *cc, float *aa, float *bb, int m, int l, int stride) {
+        for (int i = 0; i < m; ++i) {
+            for (int j = 0; j < l; ++j) {
+                cc[i] += aa[j] * bb[i * stride + j];
+            }
+        }
+        return 0;
+      }
+      extern "C" int gemv_reset(float *cc, int m) {
+        for (int i = 0; i < m; ++i) {
+            cc[i] = 0.0;
+        }
+        return 0;
+      }
+    """
+    from tvm.contrib import util, clang
+    temp = util.tempdir()
+    ll_path = temp.relpath("temp.ll")
+    # Create LLVM ir from c source code
+    ll_code = clang.create_llvm(cc_code, output=ll_path)
+    return ll_code
+
+def intrin_gemv(m, l):
+    a = tvm.placeholder((l,), name='a')
+    b = tvm.placeholder((m, l), name='b')
+    k = tvm.reduce_axis((0, l), name='k')
+    c = tvm.compute((m,), lambda i:
+    tvm.sum(a[k] * b[i, k], axis=k), name='c')
+    Ab = tvm.decl_buffer(a.shape, a.dtype,
+                         name="A",
+                         offset_factor=1,
+                         strides=[1])
+    Bb = tvm.decl_buffer(b.shape, b.dtype,
+                         name="B",
+                         offset_factor=1,
+                         strides=[tvm.var("s1"), 1])
+    Cb = tvm.decl_buffer(c.shape, c.dtype,
+                         name="C",
+                         offset_factor=1,
+                         strides=[1])
+    def intrin_func(ins, outs):
+        aa, bb = ins
+        cc = outs[0]
+        def _body():
+            ib = tvm.ir_builder.create()
+            ib.emit(tvm.call_extern("int32", "gemv_update",
+                                    cc.access_ptr("w"),
+                                    aa.access_ptr("r"),
+                                    bb.access_ptr("r"),
+                                    m, l, bb.strides[0]))
+            return ib.get()
+        def _reduce_reset():
+            ib = tvm.ir_builder.create()
+            ib.emit(tvm.call_extern("int32", "gemv_reset", cc.access_ptr("w"), m))
+            return ib.get()
+        def _reduce_update():
+            return _body()
+        return _body(), _reduce_reset(), _reduce_update()
+    with tvm.build_config(offset_factor=1):
+        return tvm.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, b: Bb, c: Cb})
+
+######################################################################
+# Note that :code:`intrin_func` now returns a triplet:
+# :code:`(body, reduce_reset, reduce_update)`.
+# If tensorization includes all the reduce axes, function :code:`body()` will be invoked,
+# otherwise :code:`reduce_reset()` and :code:`reduce_update()` together will be used.
+# In our example :code:`body()` and :code:`reduce_update()`
+# share the same implementation,
+# while in other cases, hardware may have different instructions for these two functions.
+# Moreover, we can see now :code:`bb.strides[0]` is different from :code:`l`
+# due to the tiling.
+#
+# Tensorize for squared GEMV, build and check the results,
+#
+gemv = intrin_gemv(factor, factor)
+s[C].tensorize(yi, gemv)
+s[C].pragma(yo, "import_llvm", gemv_impl())
+
+func = tvm.build(s, [A, B, C], target="llvm", name="gemv")
+a = np.random.uniform(size=get_const_tuple(A.shape)).astype(dtype)
+b = np.random.uniform(size=get_const_tuple(B.shape)).astype(dtype)
+c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=dtype), ctx)
+func(tvm.nd.array(a, ctx), tvm.nd.array(b, ctx), c)
+tvm.testing.assert_allclose(c.asnumpy(), np.dot(a, b.T), rtol=1e-3)
+
+######################################################################
+# Summary
+# -------
+# This tutorial demonstrates the usage of tensorize intrinsic in TVM.
+# Tensorize provides a way for users to get fully optimized schedule via micro-kernels.
+# For example, INT8 quantization on Intel CPUs uses tensorization
+# to invoke AVX instruction directly.
+# It also enables TVM to compile to ASICs -
+# checkout `VTA <https://docs.tvm.ai/vta/index.html>`_ for details.
+# We also demonstrates how to use inline assembly importing,
+# which helps users inject asm easily into the schedule.
+#
diff --git a/tutorials/nnvm/.gitignore b/tutorials/nnvm/.gitignore
new file mode 100644
index 000000000000..5f8a03c46b89
--- /dev/null
+++ b/tutorials/nnvm/.gitignore
@@ -0,0 +1,11 @@
+*.pb
+*.mlmodel
+*.ttf
+*.txt
+*synset*txt
+*.cfg
+ssd_model
+*.names
+*.jpg
+*.pbtxt
+*.weights
diff --git a/tutorials/nnvm/deploy_model_on_mali_gpu.py b/tutorials/nnvm/deploy_model_on_mali_gpu.py
index 8aacb8433d3d..6e3962a6609f 100644
--- a/tutorials/nnvm/deploy_model_on_mali_gpu.py
+++ b/tutorials/nnvm/deploy_model_on_mali_gpu.py
@@ -91,7 +91,7 @@
 ######################################################################
 # In order to test our model, here we download an image of cat and
 # transform its format.
-img_name = 'cat.jpg'
+img_name = 'cat.png'
 download('https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true', img_name)
 image = Image.open(img_name).resize((224, 224))
 
@@ -132,7 +132,6 @@ def transform_image(image):
 num_classes = 1000
 image_shape = (3, 224, 224)
 data_shape = (batch_size,) + image_shape
-out_shape = (batch_size, num_classes)
 
 ######################################################################
 # Compile The Graph
@@ -165,7 +164,7 @@ def transform_image(image):
     # optimization for mali
     target = tvm.target.mali()
 
-with nnvm.compiler.build_config(opt_level=2):
+with nnvm.compiler.build_config(opt_level=3):
     graph, lib, params = nnvm.compiler.build(net, target=target,
             shape={"data": data_shape}, params=params, target_host=target_host)
 
@@ -197,20 +196,17 @@ def transform_image(image):
 remote.upload(lib_fname)
 rlib = remote.load_module('net.tar')
 
-ctx = remote.cpu(0) if local_demo else remote.cl(0)
-# upload the parameter
-rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
-
 # create the remote runtime module
+ctx = remote.cl(0) if not local_demo else remote.cpu(0)
 module = runtime.create(graph, rlib, ctx)
-# set parameter
-module.set_input(**rparams)
+# set parameter (upload params to the remote device. This may take a while)
+module.set_input(**params)
 # set input data
 module.set_input('data', tvm.nd.array(x.astype('float32')))
 # run
 module.run()
 # get output
-out = module.get_output(0, tvm.nd.empty(out_shape, ctx=ctx))
+out = module.get_output(0)
 # get top1 result
 top1 = np.argmax(out.asnumpy())
 print('TVM prediction top-1: {}'.format(synset[top1]))
diff --git a/tutorials/nnvm/deploy_model_on_rasp.py b/tutorials/nnvm/deploy_model_on_rasp.py
index c11f202c1251..fa5fd2b0952f 100644
--- a/tutorials/nnvm/deploy_model_on_rasp.py
+++ b/tutorials/nnvm/deploy_model_on_rasp.py
@@ -88,7 +88,7 @@
 ######################################################################
 # In order to test our model, here we download an image of cat and
 # transform its format.
-img_name = 'cat.jpg'
+img_name = 'cat.png'
 download('https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true', img_name)
 image = Image.open(img_name).resize((224, 224))
 
@@ -128,7 +128,6 @@ def transform_image(image):
 num_classes = 1000
 image_shape = (3, 224, 224)
 data_shape = (batch_size,) + image_shape
-out_shape = (batch_size, num_classes)
 
 ######################################################################
 # Compile The Graph
@@ -154,9 +153,9 @@ def transform_image(image):
 else:
     target = tvm.target.arm_cpu('rasp3b')
     # The above line is a simple form of
-    # target = tvm.target.create('llvm -devcie=arm_cpu -target=armv7l-linux-gnueabihf')
+    # target = tvm.target.create('llvm -devcie=arm_cpu -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon')
 
-with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']):
+with nnvm.compiler.build_config(opt_level=3):
     graph, lib, params = nnvm.compiler.build(
         net, target, shape={"data": data_shape}, params=params)
 
@@ -188,20 +187,17 @@ def transform_image(image):
 remote.upload(lib_fname)
 rlib = remote.load_module('net.tar')
 
-# upload the parameter (this may take a while)
-ctx = remote.cpu(0)
-rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
-
 # create the remote runtime module
+ctx = remote.cpu(0)
 module = runtime.create(graph, rlib, ctx)
-# set parameter
-module.set_input(**rparams)
+# set parameter (upload params to the remote device. This may take a while)
+module.set_input(**params)
 # set input data
 module.set_input('data', tvm.nd.array(x.astype('float32')))
 # run
 module.run()
 # get output
-out = module.get_output(0, tvm.nd.empty(out_shape, ctx=ctx))
+out = module.get_output(0)
 # get top1 result
 top1 = np.argmax(out.asnumpy())
 print('TVM prediction top-1: {}'.format(synset[top1]))
diff --git a/tutorials/nnvm/deploy_ssd.py b/tutorials/nnvm/deploy_ssd.py
index 58725a3c92d3..9afa113959f0 100644
--- a/tutorials/nnvm/deploy_ssd.py
+++ b/tutorials/nnvm/deploy_ssd.py
@@ -5,7 +5,7 @@
 
 This article is an introductory tutorial to deploy SSD models with TVM.
 We will use mxnet pretrained SSD model with Resnet50 as body network and
-convert it to NNVM graph.
+convert it to NNVM graph;
 """
 import os
 import zipfile
@@ -16,18 +16,27 @@
 
 from nnvm import compiler
 from nnvm.frontend import from_mxnet
+from tvm import relay
 from tvm.contrib.download import download
 from tvm.contrib import graph_runtime
 from mxnet.model import load_checkpoint
 
 
 ######################################################################
-# Set the parameters here
-# -----------------------
+# Preliminary and Set parameters
+# ------------------------------
+# We should build TVM with sort support, in TVM root directory
+#
+# .. code-block:: bash
+#
+#   echo "set(USE_SORT ON)" > config.mk
+#   make -j8
+#
 # .. note::
 #
 #   Currently we support compiling SSD on CPU only.
 #   GPU support is in progress.
+#
 
 model_name = "ssd_resnet50_512"
 model_file = "%s.zip" % model_name
@@ -50,7 +59,7 @@
 inference_symbol_folder = "c1904e900848df4548ce5dfb18c719c7-a28c4856c827fe766aa3da0e35bad41d44f0fb26"
 inference_symbol_url = "https://gist.github.com/kevinthesun/c1904e900848df4548ce5dfb18c719c7/" \
                        "archive/a28c4856c827fe766aa3da0e35bad41d44f0fb26.zip"
-            
+
 dir = "ssd_model"
 if not os.path.exists(dir):
     os.makedirs(dir)
@@ -69,13 +78,31 @@
 zip_ref.close()
 
 ######################################################################
-# Convert and compile model with NNVM for CPU.
+# Convert and compile model with NNVM or Relay for CPU.
 
 sym = mx.sym.load("%s/%s/ssd_resnet50_inference.json" % (dir, inference_symbol_folder))
 _, arg_params, aux_params = load_checkpoint("%s/%s" % (dir, model_name), 0)
-net, params = from_mxnet(sym, arg_params, aux_params)
-with compiler.build_config(opt_level=3):
-    graph, lib, params = compiler.build(net, target, {"data": dshape}, params=params)
+
+import argparse
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "-f", "--frontend",
+    help="Frontend for compilation, nnvm or relay",
+    type=str,
+    default="nnvm")
+args = parser.parse_args()
+if args.frontend == "relay":
+    net, params = relay.frontend.from_mxnet(sym, {"data": dshape}, arg_params=arg_params, aux_params=aux_params)
+    with relay.build_config(opt_level=3):
+        graph, lib, params = relay.build(net, target, params=params)
+elif args.frontend == "nnvm":
+    net, params = from_mxnet(sym, arg_params, aux_params)
+    with compiler.build_config(opt_level=3):
+        graph, lib, params = compiler.build(
+            net, target, {"data": dshape}, params=params)
+else:
+    parser.print_help()
+    parser.exit()
 
 ######################################################################
 # Create TVM runtime and do inference
@@ -94,8 +121,7 @@
 # execute
 m.run()
 # get outputs
-_, oshape = compiler.graph_util.infer_shape(graph, shape={"data": dshape})
-tvm_output = m.get_output(0, tvm.nd.empty(tuple(oshape[0]), dtype))
+tvm_output = m.get_output(0)
 
 
 ######################################################################
@@ -134,4 +160,3 @@ def display(img, out, thresh=0.5):
 
 image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
 display(image, tvm_output.asnumpy()[0], thresh=0.45)
-
diff --git a/tutorials/nnvm/from_coreml.py b/tutorials/nnvm/from_coreml.py
index 3cf8babe6418..1c958746247b 100644
--- a/tutorials/nnvm/from_coreml.py
+++ b/tutorials/nnvm/from_coreml.py
@@ -8,9 +8,11 @@
 For us to begin with, coremltools module is required to be installed.
 
 A quick solution is to install via pip
-```bash
-pip install -U coremltools --user
-```
+
+.. code-block:: bash
+
+    pip install -U coremltools --user
+
 or please refer to official site
 https://github.com/apple/coremltools
 """
@@ -65,7 +67,8 @@ def download(url, path, overwrite=False):
 import nnvm.compiler
 target = 'cuda'
 shape_dict = {'image': x.shape}
-graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, params=params)
+with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']):
+    graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, params=params)
 
 ######################################################################
 # Execute on TVM
@@ -81,14 +84,13 @@ def download(url, path, overwrite=False):
 # execute
 m.run()
 # get outputs
-output_shape = (1000,)
-tvm_output = m.get_output(0, tvm.nd.empty(output_shape, dtype)).asnumpy()
-top1 = np.argmax(tvm_output)
+tvm_output = m.get_output(0)
+top1 = np.argmax(tvm_output.asnumpy()[0])
 
 #####################################################################
 # Look up synset name
 # -------------------
-# Look up prdiction top 1 index in 1000 class synset.
+# Look up prediction top 1 index in 1000 class synset.
 synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/',
                       '4d0b62f3d01426887599d4f7ede23ee5/raw/',
                       '596b27d23537e5a1b5751d2b0481ef172f58b539/',
diff --git a/tutorials/nnvm/from_darknet.py b/tutorials/nnvm/from_darknet.py
index 883026f2af98..f0eec98c00ea 100644
--- a/tutorials/nnvm/from_darknet.py
+++ b/tutorials/nnvm/from_darknet.py
@@ -1,11 +1,11 @@
 """
-Compile YOLO-V2 in DarkNet Models
+Compile YOLO-V2 and YOLO-V3 in DarkNet Models
 =================================
 **Author**: `Siju Samuel <https://siju-samuel.github.io/>`_
 
 This article is an introductory tutorial to deploy darknet models with NNVM.
 All the required models and libraries will be downloaded from the internet by the script.
-This script runs the YOLO-V2 Model with the bounding boxes
+This script runs the YOLO-V2 and YOLO-V3 Model with the bounding boxes
 Darknet parsing have dependancy with CFFI and CV2 library
 Please install CFFI and CV2 before executing this script
 
@@ -17,59 +17,53 @@
 
 import nnvm
 import nnvm.frontend.darknet
+import nnvm.testing.yolo_detection
 import nnvm.testing.darknet
 import matplotlib.pyplot as plt
 import numpy as np
 import tvm
-import os
+import sys
 
 from ctypes import *
 from tvm.contrib.download import download
 from nnvm.testing.darknet import __darknetffi__
 
-######################################################################
-# Set the parameters here.
-# Supported models alexnet, resnet50, resnet152, extraction, yolo
-#
-model_name = 'yolo'
-test_image = 'dog.jpg'
-target = 'llvm'
-ctx = tvm.cpu(0)
+# Model name
+MODEL_NAME = 'yolov3'
 
 ######################################################################
-# Prepare cfg and weights file
-# ----------------------------
-# Pretrained model available https://pjreddie.com/darknet/imagenet/
-# Download cfg and weights file first time.
-
-cfg_name = model_name + '.cfg'
-weights_name = model_name + '.weights'
-cfg_url = 'https://github.com/siju-samuel/darknet/blob/master/cfg/' + \
-            cfg_name + '?raw=true'
-weights_url = 'http://pjreddie.com/media/files/' + weights_name + '?raw=true'
+# Download required files
+# -----------------------
+# Download cfg and weights file if first time.
+CFG_NAME = MODEL_NAME + '.cfg'
+WEIGHTS_NAME = MODEL_NAME + '.weights'
+REPO_URL = 'https://github.com/siju-samuel/darknet/blob/master/'
+CFG_URL = REPO_URL + 'cfg/' + CFG_NAME + '?raw=true'
+WEIGHTS_URL = 'https://pjreddie.com/media/files/' + WEIGHTS_NAME
+
+download(CFG_URL, CFG_NAME)
+download(WEIGHTS_URL, WEIGHTS_NAME)
 
-download(cfg_url, cfg_name)
-download(weights_url, weights_name)
-
-######################################################################
 # Download and Load darknet library
-# ---------------------------------
-
-darknet_lib = 'libdarknet.so'
-darknetlib_url = 'https://github.com/siju-samuel/darknet/blob/master/lib/' + \
-                        darknet_lib + '?raw=true'
-download(darknetlib_url, darknet_lib)
-
-#if the file doesnt exist, then exit normally.
-if os.path.isfile('./' + darknet_lib) is False:
-    exit(0)
-
-darknet_lib = __darknetffi__.dlopen('./' + darknet_lib)
-cfg = "./" + str(cfg_name)
-weights = "./" + str(weights_name)
-net = darknet_lib.load_network(cfg.encode('utf-8'), weights.encode('utf-8'), 0)
+if sys.platform in ['linux', 'linux2']:
+    DARKNET_LIB = 'libdarknet2.0.so'
+    DARKNET_URL = REPO_URL + 'lib/' + DARKNET_LIB + '?raw=true'
+elif sys.platform == 'darwin':
+    DARKNET_LIB = 'libdarknet_mac2.0.so'
+    DARKNET_URL = REPO_URL + 'lib_osx/' + DARKNET_LIB + '?raw=true'
+else:
+    err = "Darknet lib is not supported on {} platform".format(sys.platform)
+    raise NotImplementedError(err)
+
+download(DARKNET_URL, DARKNET_LIB)
+
+DARKNET_LIB = __darknetffi__.dlopen('./' + DARKNET_LIB)
+cfg = "./" + str(CFG_NAME)
+weights = "./" + str(WEIGHTS_NAME)
+net = DARKNET_LIB.load_network(cfg.encode('utf-8'), weights.encode('utf-8'), 0)
 dtype = 'float32'
 batch_size = 1
+
 print("Converting darknet to nnvm symbols...")
 sym, params = nnvm.frontend.darknet.from_darknet(net, dtype)
 
@@ -77,39 +71,26 @@
 # Compile the model on NNVM
 # -------------------------
 # compile the model
-data = np.empty([batch_size, net.c ,net.h, net.w], dtype);
+target = 'llvm'
+ctx = tvm.cpu(0)
+data = np.empty([batch_size, net.c, net.h, net.w], dtype)
 shape = {'data': data.shape}
 print("Compiling the model...")
+dtype_dict = {}
 with nnvm.compiler.build_config(opt_level=2):
-    graph, lib, params = nnvm.compiler.build(sym, target, shape, dtype, params)
-
-#####################################################################
-# Save the JSON
-# -------------
-def save_lib():
-    #Save the graph, params and .so to the current directory
-    print("Saving the compiled output...")
-    path_name = 'nnvm_darknet_' + model_name
-    path_lib = path_name + '_deploy_lib.so'
-    lib.export_library(path_lib)
-    with open(path_name
-+ "deploy_graph.json", "w") as fo:
-        fo.write(graph.json())
-    with open(path_name
-+ "deploy_param.params", "wb") as fo:
-        fo.write(nnvm.compiler.save_param_dict(params))
-#save_lib()
+    graph, lib, params = nnvm.compiler.build(sym, target, shape, dtype_dict, params)
 
+[neth, netw] = shape['data'][2:] # Current image shape is 608x608
 ######################################################################
 # Load a test image
 # --------------------------------------------------------------------
+test_image = 'dog.jpg'
 print("Loading the test image...")
 img_url = 'https://github.com/siju-samuel/darknet/blob/master/data/' + \
-            test_image   +'?raw=true'
+          test_image + '?raw=true'
 download(img_url, test_image)
 
-data = nnvm.testing.darknet.load_image(test_image, net.w, net.h)
-
+data = nnvm.testing.darknet.load_image(test_image, netw, neth)
 ######################################################################
 # Execute on TVM Runtime
 # ----------------------
@@ -126,27 +107,49 @@ def save_lib():
 
 m.run()
 # get outputs
-out_shape = (net.outputs,)
-tvm_out = m.get_output(0, tvm.nd.empty(out_shape, dtype)).asnumpy()
-
-#do the detection and bring up the bounding boxes
-thresh = 0.24
-hier_thresh = 0.5
+tvm_out = []
+if MODEL_NAME == 'yolov2':
+    layer_out = {}
+    layer_out['type'] = 'Region'
+    # Get the region layer attributes (n, out_c, out_h, out_w, classes, coords, background)
+    layer_attr = m.get_output(2).asnumpy()
+    layer_out['biases'] = m.get_output(1).asnumpy()
+    out_shape = (layer_attr[0], layer_attr[1]//layer_attr[0],
+                 layer_attr[2], layer_attr[3])
+    layer_out['output'] = m.get_output(0).asnumpy().reshape(out_shape)
+    layer_out['classes'] = layer_attr[4]
+    layer_out['coords'] = layer_attr[5]
+    layer_out['background'] = layer_attr[6]
+    tvm_out.append(layer_out)
+
+elif MODEL_NAME == 'yolov3':
+    for i in range(3):
+        layer_out = {}
+        layer_out['type'] = 'Yolo'
+        # Get the yolo layer attributes (n, out_c, out_h, out_w, classes, total)
+        layer_attr = m.get_output(i*4+3).asnumpy()
+        layer_out['biases'] = m.get_output(i*4+2).asnumpy()
+        layer_out['mask'] = m.get_output(i*4+1).asnumpy()
+        out_shape = (layer_attr[0], layer_attr[1]//layer_attr[0],
+                     layer_attr[2], layer_attr[3])
+        layer_out['output'] = m.get_output(i*4).asnumpy().reshape(out_shape)
+        layer_out['classes'] = layer_attr[4]
+        tvm_out.append(layer_out)
+
+# do the detection and bring up the bounding boxes
+thresh = 0.5
+nms_thresh = 0.45
 img = nnvm.testing.darknet.load_image_color(test_image)
 _, im_h, im_w = img.shape
-probs= []
-boxes = []
-region_layer = net.layers[net.n - 1]
-boxes, probs = nnvm.testing.yolo2_detection.get_region_boxes(region_layer, im_w, im_h, net.w, net.h,
-                       thresh, probs, boxes, 1, tvm_out)
-
-boxes, probs = nnvm.testing.yolo2_detection.do_nms_sort(boxes, probs,
-                       region_layer.w*region_layer.h*region_layer.n, region_layer.classes, 0.3)
+dets = nnvm.testing.yolo_detection.fill_network_boxes((netw, neth), (im_w, im_h), thresh,
+                                                      1, tvm_out)
+last_layer = net.layers[net.n - 1]
+nnvm.testing.yolo_detection.do_nms_sort(dets, last_layer.classes, nms_thresh)
 
 coco_name = 'coco.names'
-coco_url = 'https://github.com/siju-samuel/darknet/blob/master/data/' + coco_name   +'?raw=true'
+coco_url = 'https://github.com/siju-samuel/darknet/blob/master/data/' + coco_name + '?raw=true'
 font_name = 'arial.ttf'
-font_url = 'https://github.com/siju-samuel/darknet/blob/master/data/' + font_name   +'?raw=true'
+font_url = 'https://github.com/siju-samuel/darknet/blob/master/data/' + font_name + '?raw=true'
 download(coco_url, coco_name)
 download(font_url, font_name)
 
@@ -155,7 +158,6 @@ def save_lib():
 
 names = [x.strip() for x in content]
 
-nnvm.testing.yolo2_detection.draw_detections(img, region_layer.w*region_layer.h*region_layer.n,
-                 thresh, boxes, probs, names, region_layer.classes)
-plt.imshow(img.transpose(1,2,0))
+nnvm.testing.yolo_detection.draw_detections(img, dets, thresh, names, last_layer.classes)
+plt.imshow(img.transpose(1, 2, 0))
 plt.show()
diff --git a/tutorials/nnvm/from_keras.py b/tutorials/nnvm/from_keras.py
index 402010b98634..fcac3adc79e1 100644
--- a/tutorials/nnvm/from_keras.py
+++ b/tutorials/nnvm/from_keras.py
@@ -9,12 +9,12 @@
 Tensorflow is also required since it's used as the default backend of keras.
 
 A quick solution is to install via pip
-```
-pip install -U keras --user
-```
-```
-pip install -U tensorflow --user
-```
+
+.. code-block:: bash
+
+    pip install -U keras --user
+    pip install -U tensorflow --user
+
 or please refer to official site
 https://keras.io/#installation
 """
@@ -45,7 +45,7 @@ def download(url, path, overwrite=False):
 weights_file = 'resnet50_weights.h5'
 download(weights_url, weights_file)
 keras_resnet50 = keras.applications.resnet50.ResNet50(include_top=True, weights=None,
-	input_shape=(224,224,3), classes=1000)
+                                                      input_shape=(224, 224, 3), classes=1000)
 keras_resnet50.load_weights('resnet50_weights.h5')
 
 ######################################################################
@@ -56,8 +56,8 @@ def download(url, path, overwrite=False):
 from matplotlib import pyplot as plt
 from keras.applications.resnet50 import preprocess_input
 img_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true'
-download(img_url, 'cat.jpg')
-img = Image.open('cat.jpg').resize((224, 224))
+download(img_url, 'cat.png')
+img = Image.open('cat.png').resize((224, 224))
 plt.imshow(img)
 plt.show()
 # input preprocess
@@ -75,8 +75,8 @@ def download(url, path, overwrite=False):
 # compile the model
 target = 'cuda'
 shape_dict = {'input_1': data.shape}
-with nnvm.compiler.build_config(opt_level=2):
-	graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, params=params)
+with nnvm.compiler.build_config(opt_level=3):
+    graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, params=params)
 
 ######################################################################
 # Execute on TVM
@@ -91,14 +91,13 @@ def download(url, path, overwrite=False):
 # execute
 m.run()
 # get outputs
-out_shape = (1000,)
-tvm_out = m.get_output(0, tvm.nd.empty(out_shape, 'float32')).asnumpy()
-top1_tvm = np.argmax(tvm_out)
+tvm_out = m.get_output(0)
+top1_tvm = np.argmax(tvm_out.asnumpy()[0])
 
 #####################################################################
 # Look up synset name
 # -------------------
-# Look up prdiction top 1 index in 1000 class synset.
+# Look up prediction top 1 index in 1000 class synset.
 synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/',
                       '4d0b62f3d01426887599d4f7ede23ee5/raw/',
                       '596b27d23537e5a1b5751d2b0481ef172f58b539/',
diff --git a/tutorials/nnvm/from_mxnet.py b/tutorials/nnvm/from_mxnet.py
index cce3bc37126a..b4c2c5b7dfbd 100644
--- a/tutorials/nnvm/from_mxnet.py
+++ b/tutorials/nnvm/from_mxnet.py
@@ -10,9 +10,11 @@
 For us to begin with, mxnet module is required to be installed.
 
 A quick solution is
-```
-pip install mxnet --user
-```
+
+.. code-block:: bash
+
+    pip install mxnet --user
+
 or please refer to offical installation guide.
 https://mxnet.incubator.apache.org/versions/master/install/index.html
 """
@@ -31,7 +33,7 @@
 from PIL import Image
 from matplotlib import pyplot as plt
 block = get_model('resnet18_v1', pretrained=True)
-img_name = 'cat.jpg'
+img_name = 'cat.png'
 synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/',
                       '4d0b62f3d01426887599d4f7ede23ee5/raw/',
                       '596b27d23537e5a1b5751d2b0481ef172f58b539/',
@@ -70,7 +72,8 @@ def transform_image(image):
 import nnvm.compiler
 target = 'cuda'
 shape_dict = {'data': x.shape}
-graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, params=params)
+with nnvm.compiler.build_config(opt_level=3):
+    graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, params=params)
 
 ######################################################################
 # Execute the portable graph on TVM
@@ -86,14 +89,14 @@ def transform_image(image):
 # execute
 m.run()
 # get outputs
-tvm_output = m.get_output(0, tvm.nd.empty((1000,), dtype))
-top1 = np.argmax(tvm_output.asnumpy())
+tvm_output = m.get_output(0)
+top1 = np.argmax(tvm_output.asnumpy()[0])
 print('TVM prediction top-1:', top1, synset[top1])
 
 ######################################################################
 # Use MXNet symbol with pretrained weights
 # ----------------------------------------
-# MXNet often use `arg_prams` and `aux_params` to store network parameters
+# MXNet often use `arg_params` and `aux_params` to store network parameters
 # separately, here we show how to use these weights with existing API
 def block2symbol(block):
     data = mx.sym.Variable('data')
diff --git a/tutorials/nnvm/from_mxnet_to_webgl.py b/tutorials/nnvm/from_mxnet_to_webgl.py
index 75279839bfb3..4e7b57706de6 100644
--- a/tutorials/nnvm/from_mxnet_to_webgl.py
+++ b/tutorials/nnvm/from_mxnet_to_webgl.py
@@ -148,7 +148,7 @@ def download_image():
     from PIL import Image
 
     url = "https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true"
-    img_name = "cat.jpg"
+    img_name = "cat.png"
 
     gluon.utils.download(url, img_name)
     image = Image.open(img_name).resize((224, 224))
diff --git a/tutorials/nnvm/from_onnx.py b/tutorials/nnvm/from_onnx.py
index 8fb5a1048569..0fdef8afa98c 100644
--- a/tutorials/nnvm/from_onnx.py
+++ b/tutorials/nnvm/from_onnx.py
@@ -8,9 +8,11 @@
 For us to begin with, onnx module is required to be installed.
 
 A quick solution is to install protobuf compiler, and
-```bash
-pip install onnx --user
-```
+
+.. code-block:: bash
+
+    pip install onnx --user
+
 or please refer to offical site.
 https://github.com/onnx/onnx
 """
@@ -44,7 +46,7 @@ def download(url, path, overwrite=False):
                      'super_resolution_0.2.onnx'])
 download(model_url, 'super_resolution.onnx', True)
 # now you have super_resolution.onnx on disk
-onnx_model = onnx.load('super_resolution.onnx')
+onnx_model = onnx.load_model('super_resolution.onnx')
 # we can load the graph as NNVM compatible model
 sym, params = nnvm.frontend.from_onnx(onnx_model)
 
@@ -69,7 +71,8 @@ def download(url, path, overwrite=False):
 # assume first input name is data
 input_name = sym.list_input_names()[0]
 shape_dict = {input_name: x.shape}
-graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, params=params)
+with nnvm.compiler.build_config(opt_level=3):
+    graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, params=params)
 
 ######################################################################
 # Execute on TVM
diff --git a/tutorials/nnvm/from_tensorflow.py b/tutorials/nnvm/from_tensorflow.py
index ee025c5b09ff..92c287e4ade7 100644
--- a/tutorials/nnvm/from_tensorflow.py
+++ b/tutorials/nnvm/from_tensorflow.py
@@ -5,9 +5,7 @@
 
 For us to begin with, tensorflow python module is required to be installed.
 
-A quick solution is to install tensorflow from
-
-https://www.tensorflow.org/install
+Please refer to https://www.tensorflow.org/install
 """
 
 # tvm and nnvm
@@ -34,13 +32,18 @@
 img_name = 'elephant-299.jpg'
 image_url = os.path.join(repo_base, img_name)
 
-# InceptionV1 model protobuf
+######################################################################
+# Tutorials
+# ---------
 # .. note::
 #
 #   protobuf should be exported with :any:`add_shapes=True` option.
 #   Could use https://github.com/dmlc/web-data/tree/master/tensorflow/scripts/tf-to-nnvm.py
 #   to add shapes for existing models.
 #
+# Please refer docs/frontend/tensorflow.md for more details for various models
+# from tensorflow.
+
 model_name = 'classify_image_graph_def-with_shapes.pb'
 model_url = os.path.join(repo_base, model_name)
 
@@ -52,6 +55,16 @@
 lable_map = 'imagenet_synset_to_human_label_map.txt'
 lable_map_url = os.path.join(repo_base, lable_map)
 
+# Target settings
+# Use these commented settings to build for cuda.
+#target = 'cuda'
+#target_host = 'llvm'
+#layout = "NCHW"
+#ctx = tvm.gpu(0)
+target = 'llvm'
+target_host = 'llvm'
+layout = None
+ctx = tvm.cpu(0)
 
 ######################################################################
 # Download required files
@@ -64,7 +77,6 @@
 download(map_proto_url, map_proto)
 download(lable_map_url, lable_map)
 
-
 ######################################################################
 # Import model
 # ------------
@@ -76,14 +88,16 @@
     graph = tf.import_graph_def(graph_def, name='')
     # Call the utility to import the graph definition into default graph.
     graph_def = nnvm.testing.tf.ProcessGraphDefParam(graph_def)
-
+    # Add shapes to the graph.
+    with tf.Session() as sess:
+        graph_def = nnvm.testing.tf.AddShapesToGraphDef(sess, 'softmax')
 
 ######################################################################
 # Decode image
 # ------------
 # .. note::
 #
-#   tensorflow frontend import doesn't support preprocessing ops like JpegDecode
+#   tensorflow frontend import doesn't support preprocessing ops like JpegDecode.
 #   JpegDecode is bypassed (just return source node).
 #   Hence we supply decoded frame to TVM instead.
 #
@@ -101,7 +115,7 @@
 # Results:
 #   sym: nnvm graph for given tensorflow protobuf.
 #   params: params converted from tensorflow params (tensor protobuf).
-sym, params = nnvm.frontend.from_tensorflow(graph_def)
+sym, params = nnvm.frontend.from_tensorflow(graph_def, layout=layout)
 
 print ("Tensorflow protobuf imported as nnvm graph")
 ######################################################################
@@ -115,18 +129,16 @@
 #   lib: target library which can be deployed on target with tvm runtime.
 
 import nnvm.compiler
-target = 'llvm'
 shape_dict = {'DecodeJpeg/contents': x.shape}
 dtype_dict = {'DecodeJpeg/contents': 'uint8'}
-graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, dtype=dtype_dict, params=params)
+graph, lib, params = nnvm.compiler.build(sym, shape=shape_dict, target=target, target_host=target_host, dtype=dtype_dict, params=params)
 
 ######################################################################
 # Execute the portable graph on TVM
 # ---------------------------------
-# Now we can try deploying the NNVM compiled model on cpu target.
+# Now we can try deploying the NNVM compiled model on target.
 
 from tvm.contrib import graph_runtime
-ctx = tvm.cpu(0)
 dtype = 'uint8'
 m = graph_runtime.create(graph, lib, ctx)
 # set inputs
diff --git a/tutorials/nnvm/nlp/keras_s2s_translate.py b/tutorials/nnvm/nlp/keras_s2s_translate.py
new file mode 100644
index 000000000000..77c7f23902f4
--- /dev/null
+++ b/tutorials/nnvm/nlp/keras_s2s_translate.py
@@ -0,0 +1,238 @@
+"""
+Keras LSTM Sequence to Sequence Model for Translation
+=================================
+**Author**: `Siju Samuel <https://siju-samuel.github.io/>`_
+
+This script demonstrates how to implement a basic character-level sequence-to-sequence model.
+We apply it to translating short English sentences into short French sentences,
+character-by-character.
+
+# Summary of the algorithm
+
+- We start with input sequences from a domain (e.g. English sentences)
+    and corresponding target sequences from another domain
+    (e.g. French sentences).
+- An encoder LSTM turns input sequences to 2 state vectors
+    (we keep the last LSTM state and discard the outputs).
+- A decoder LSTM is trained to turn the target sequences into
+    the same sequence but offset by one timestep in the future,
+    a training process called "teacher forcing" in this context.
+    Is uses as initial state the state vectors from the encoder.
+    Effectively, the decoder learns to generate `targets[t+1...]`
+    given `targets[...t]`, conditioned on the input sequence.
+
+This script loads the s2s.h5 model saved in repository
+https://github.com/dmlc/web-data/raw/master/keras/models/s2s_translate/lstm_seq2seq.py
+and generates sequences from it.  It assumes that no changes have been made (for example:
+latent_dim is unchanged, and the input data and model architecture are unchanged).
+
+# References
+
+- Sequence to Sequence Learning with Neural Networks
+    https://arxiv.org/abs/1409.3215
+- Learning Phrase Representations using
+    RNN Encoder-Decoder for Statistical Machine Translation
+    https://arxiv.org/abs/1406.1078
+
+See lstm_seq2seq.py for more details on the model architecture and how it is trained.
+"""
+
+from keras.models import Model, load_model
+from keras.layers import Input
+import random
+import os
+import numpy as np
+import keras
+import tvm
+import nnvm
+
+######################################################################
+# Download required files
+# -----------------------
+# Download files listed below from dmlc web-data repo.
+model_file = "s2s_translate.h5"
+data_file = "fra-eng.txt"
+
+# Base location for model related files.
+repo_base = 'https://github.com/dmlc/web-data/raw/master/keras/models/s2s_translate/'
+model_url = os.path.join(repo_base, model_file)
+data_url = os.path.join(repo_base, data_file)
+
+# Download files listed below.
+from mxnet.gluon.utils import download
+download(model_url, model_file)
+download(data_url, model_file)
+
+latent_dim = 256  # Latent dimensionality of the encoding space.
+test_samples = 10000  # Number of samples used for testing.
+
+######################################################################
+# Process the data file
+# ---------------------
+# Vectorize the data.  We use the same approach as the training script.
+# NOTE: the data must be identical, in order for the character -> integer
+# mappings to be consistent.
+input_texts = []
+target_texts = []
+input_characters = set()
+target_characters = set()
+with open(data_file, 'r', encoding='utf-8') as f:
+    lines = f.read().split('\n')
+test_samples = min(test_samples, len(lines))
+max_encoder_seq_length = 0
+max_decoder_seq_length = 0
+for line in lines[:test_samples]:
+    input_text, target_text = line.split('\t')
+    # We use "tab" as the "start sequence" character
+    # for the targets, and "\n" as "end sequence" character.
+    target_text = '\t' + target_text + '\n'
+    max_encoder_seq_length = max(max_encoder_seq_length, len(input_text))
+    max_decoder_seq_length = max(max_decoder_seq_length, len(target_text))
+    for char in input_text:
+        if char not in input_characters:
+            input_characters.add(char)
+    for char in target_text:
+        if char not in target_characters:
+            target_characters.add(char)
+
+input_characters = sorted(list(input_characters))
+target_characters = sorted(list(target_characters))
+num_encoder_tokens = len(input_characters)
+num_decoder_tokens = len(target_characters)
+input_token_index = dict(
+    [(char, i) for i, char in enumerate(input_characters)])
+target_token_index = dict(
+    [(char, i) for i, char in enumerate(target_characters)])
+
+# Reverse-lookup token index to decode sequences back to something readable.
+reverse_target_char_index = dict(
+    (i, char) for char, i in target_token_index.items())
+
+######################################################################
+# Load Keras Model
+# ----------------
+# Restore the model and construct the encoder and decoder.
+model = load_model(model_file)
+encoder_inputs = model.input[0]   # input_1
+
+encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output   # lstm_1
+encoder_states = [state_h_enc, state_c_enc]
+encoder_model = Model(encoder_inputs, encoder_states)
+
+decoder_inputs = model.input[1]   # input_2
+decoder_state_input_h = Input(shape=(latent_dim,), name='input_3')
+decoder_state_input_c = Input(shape=(latent_dim,), name='input_4')
+decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
+decoder_lstm = model.layers[3]
+decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
+    decoder_inputs, initial_state=decoder_states_inputs)
+decoder_states = [state_h_dec, state_c_dec]
+decoder_dense = model.layers[4]
+decoder_outputs = decoder_dense(decoder_outputs)
+decoder_model = Model(
+    [decoder_inputs] + decoder_states_inputs,
+    [decoder_outputs] + decoder_states)
+
+######################################################################
+# Compile both encoder and decoder model on NNVM
+# ----------------------------------------------
+# Creates NNVM graph definition from keras model file.
+from tvm.contrib import graph_runtime
+target = 'llvm'
+ctx = tvm.cpu(0)
+
+# Parse Encoder model
+sym, params = nnvm.frontend.from_keras(encoder_model)
+inp_enc_shape = (1, max_encoder_seq_length, num_encoder_tokens)
+shape_dict = {'input_1': inp_enc_shape}
+
+# Build Encoder model
+with nnvm.compiler.build_config(opt_level=2):
+    enc_graph, enc_lib, enc_params = nnvm.compiler.build(sym, target, shape_dict, params=params)
+print("Encoder build ok.")
+
+# Create graph runtime for encoder model
+tvm_enc = graph_runtime.create(enc_graph, enc_lib, ctx)
+tvm_enc.set_input(**enc_params)
+
+# Parse Decoder model
+inp_dec_shape = (1, 1, num_decoder_tokens)
+shape_dict = {'input_2': inp_dec_shape,
+              'input_3': (1, latent_dim),
+              'input_4': (1, latent_dim)}
+
+# Build Decoder model
+sym, params = nnvm.frontend.from_keras(decoder_model)
+with nnvm.compiler.build_config(opt_level=2):
+    dec_graph, dec_lib, dec_params = nnvm.compiler.build(sym, target, shape_dict, params=params)
+print("Decoder build ok.")
+
+# Create graph runtime for decoder model
+tvm_dec = graph_runtime.create(dec_graph, dec_lib, ctx)
+tvm_dec.set_input(**dec_params)
+
+# Decodes an input sequence.
+def decode_sequence(input_seq):
+    # Set the input for encoder model.
+    tvm_enc.set_input('input_1', input_seq)
+
+    # Run encoder model
+    tvm_enc.run()
+
+    # Get states from encoder network
+    h = tvm_enc.get_output(0).asnumpy()
+    c = tvm_enc.get_output(1).asnumpy()
+
+    # Populate the first character of target sequence with the start character.
+    sampled_token_index = target_token_index['\t']
+
+    # Sampling loop for a batch of sequences
+    decoded_sentence = ''
+    while True:
+        # Generate empty target sequence of length 1.
+        target_seq = np.zeros((1, 1, num_decoder_tokens), dtype='float32')
+        # Update the target sequence (of length 1).
+        target_seq[0, 0, sampled_token_index] = 1.
+
+        # Set the input and states for decoder model.
+        tvm_dec.set_input('input_2', target_seq)
+        tvm_dec.set_input('input_3', h)
+        tvm_dec.set_input('input_4', c)
+        # Run decoder model
+        tvm_dec.run()
+
+        output_tokens = tvm_dec.get_output(0).asnumpy()
+        h = tvm_dec.get_output(1).asnumpy()
+        c = tvm_dec.get_output(2).asnumpy()
+
+        # Sample a token
+        sampled_token_index = np.argmax(output_tokens[0, -1, :])
+        sampled_char = reverse_target_char_index[sampled_token_index]
+
+        # Exit condition: either hit max length or find stop character.
+        if sampled_char == '\n':
+            break
+
+        # Update the sentence
+        decoded_sentence += sampled_char
+        if len(decoded_sentence) > max_decoder_seq_length:
+            break
+    return decoded_sentence
+
+def generate_input_seq(input_text):
+    input_seq = np.zeros((1, max_encoder_seq_length, num_encoder_tokens), dtype='float32')
+    for t, char in enumerate(input_text):
+        input_seq[0, t, input_token_index[char]] = 1.
+    return input_seq
+
+######################################################################
+# Run the model
+# -------------
+# Randonly take some text from test samples and translate
+for seq_index in range(100):
+    # Take one sentence randomly and try to decode.
+    index = random.randint(1, test_samples)
+    input_text, _ = lines[index].split('\t')
+    input_seq = generate_input_seq(input_text)
+    decoded_sentence = decode_sequence(input_seq)
+    print((seq_index + 1), ": ", input_text,  "==>", decoded_sentence)
diff --git a/tutorials/nnvm/using_external_lib.py b/tutorials/nnvm/using_external_lib.py
index fd00768b93be..272dcbb2b808 100644
--- a/tutorials/nnvm/using_external_lib.py
+++ b/tutorials/nnvm/using_external_lib.py
@@ -195,7 +195,7 @@
 # -----------------
 # We can check that the results of two runs match.
 
-np.testing.assert_allclose(out_cuda, out_cudnn, rtol=1e-5)
+tvm.testing.assert_allclose(out_cuda, out_cudnn, rtol=1e-5)
 
 #####################################################################
 # Conclusion
diff --git a/tutorials/nnvm_quick_start.py b/tutorials/nnvm_quick_start.py
index c171823604cd..7ff7f89cfe39 100644
--- a/tutorials/nnvm_quick_start.py
+++ b/tutorials/nnvm_quick_start.py
@@ -49,8 +49,8 @@
 data_shape = (batch_size,) + image_shape
 out_shape = (batch_size, num_class)
 
-net, params = nnvm.testing.resnet.get_workload(layers=18,
-        batch_size=batch_size, image_shape=image_shape)
+net, params = nnvm.testing.resnet.get_workload(
+    num_layers=18, batch_size=batch_size, image_shape=image_shape)
 print(net.debug_str())
 
 ######################################################################
@@ -117,7 +117,7 @@
 from tvm.contrib import util
 
 temp = util.tempdir()
-path_lib = temp.relpath("deploy_lib.so")
+path_lib = temp.relpath("deploy_lib.tar")
 lib.export_library(path_lib)
 with open(temp.relpath("deploy_graph.json"), "w") as fo:
     fo.write(graph.json())
@@ -133,9 +133,7 @@
 loaded_params = bytearray(open(temp.relpath("deploy_param.params"), "rb").read())
 input_data = tvm.nd.array(np.random.uniform(size=data_shape).astype("float32"))
 
-module = graph_runtime.create(loaded_json, loaded_lib, tvm.gpu(0))
+module = graph_runtime.create(loaded_json, loaded_lib, ctx)
 module.load_params(loaded_params)
 module.run(data=input_data)
-
-out = module.get_output(0, out=tvm.nd.empty(out_shape))
-
+out = module.get_output(0).asnumpy()
diff --git a/tutorials/optimize/opt_gemm.py b/tutorials/optimize/opt_gemm.py
index 803b81e7d222..ecb8707d399b 100644
--- a/tutorials/optimize/opt_gemm.py
+++ b/tutorials/optimize/opt_gemm.py
@@ -1,4 +1,6 @@
 """
+.. _opt-gemm:
+
 How to optimize GEMM on CPU
 ===========================
 **Author**: `Jian Weng <https://github.com/were>`_, \
@@ -91,7 +93,7 @@
 
 c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx)
 func(a, b, c)
-numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
+tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
 
 evaluator = func.time_evaluator(func.entry_name, ctx, number=1)
 print('Baseline: %f' % evaluator(a, b, c).mean)
@@ -126,7 +128,7 @@
 
 c = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx)
 func(a, b, c)
-numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
+tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
 
 # By simply tiling the loop 32x32, and hoisting ko, ki outside the blocking loops,
 # we can see big speedup compared with the baseline.
@@ -162,7 +164,7 @@
 
 c = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx)
 func(a, b, c)
-numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
+tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
 
 evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
 print('Opt2: %f' % evaluator(a, b, c).mean)
@@ -195,7 +197,7 @@
 
 c = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx)
 func(a, b, c)
-numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
+tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
 
 evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
 print('Opt3: %f' % evaluator(a, b, c).mean)
@@ -250,7 +252,7 @@
 
 c = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx)
 func(a, b, c)
-numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
+tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
 
 evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
 print('Opt4: %f' % evaluator(a, b, c).mean)
@@ -296,7 +298,7 @@
 
 c = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx)
 func(a, b, c)
-numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
+tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
 
 evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
 print('Opt5: %f' % evaluator(a, b, c).mean)
@@ -339,7 +341,7 @@
 
 c = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx)
 func(a, b, c)
-numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
+tvm.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
 
 evaluator = func.time_evaluator(func.entry_name, ctx, number=50)
 opt6_time = evaluator(a, b, c).mean
diff --git a/tutorials/topi/intro_topi.py b/tutorials/topi/intro_topi.py
index cf21aa52261c..8b8124c95e2b 100644
--- a/tutorials/topi/intro_topi.py
+++ b/tutorials/topi/intro_topi.py
@@ -89,7 +89,7 @@
 b_nd = tvm.nd.array(b_np, ctx)
 g_nd = tvm.nd.array(np.zeros(g_np.shape, dtype=g_np.dtype), ctx)
 func(a_nd, b_nd, g_nd)
-np.testing.assert_allclose(g_nd.asnumpy(), g_np, rtol=1e-5)
+tvm.testing.assert_allclose(g_nd.asnumpy(), g_np, rtol=1e-5)
 
 ######################################################################
 # TOPI also provides common neural nets operations such as _softmax_ with optimized schedule
@@ -103,13 +103,22 @@
 ######################################################################
 # Fusing convolutions
 # -------------------
-# We can fuse :code:`topi.nn.conv2d` and :code:`topi.nn.relu` together
+# We can fuse :code:`topi.nn.conv2d` and :code:`topi.nn.relu` together.
 #
+# .. note::
+#
+#    TOPI functions are all generic functions. They have different implementations
+#    for different backends to optimize for performance.
+#    For each backend, it is necessary to call them under a target scope for both
+#    compute declaration and schedule. TVM will choose the right function to call with
+#    the target information.
+
 data = tvm.placeholder((1, 3, 224, 224))
 kernel = tvm.placeholder((10, 3, 5, 5))
-conv = topi.nn.conv2d(data, kernel, strides=1, padding=2)
-out = topi.nn.relu(conv)
+
 with tvm.target.create("cuda"):
+    conv = topi.nn.conv2d(data, kernel, strides=1, padding=2, dilation=1)
+    out = topi.nn.relu(conv)
     sconv = topi.generic.nn.schedule_conv2d_nchw(out)
     print(tvm.lower(sconv, [data, kernel], simple_mode=True))
 
diff --git a/verilog/tvm_vpi.cc b/verilog/tvm_vpi.cc
index c663f7df51be..949b660ce447 100644
--- a/verilog/tvm_vpi.cc
+++ b/verilog/tvm_vpi.cc
@@ -8,7 +8,9 @@
 #include <cstdlib>
 #include <memory>
 #include <queue>
-#include "./tvm_vpi.h"
+#include <string>
+#include <vector>
+#include "tvm_vpi.h"
 #include "../src/common/pipe.h"
 
 namespace tvm {
diff --git a/vta/hardware/xilinx/src/vta.cc b/vta/hardware/xilinx/src/vta.cc
index 8d0432477486..5ca2cec6575d 100644
--- a/vta/hardware/xilinx/src/vta.cc
+++ b/vta/hardware/xilinx/src/vta.cc
@@ -8,7 +8,7 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "./vta.h"
+#include "vta.h"
 
 void fetch(
   uint32_t insn_count,
diff --git a/vta/include/vta/driver.h b/vta/include/vta/driver.h
index 269728c51cda..588819a5fe6b 100644
--- a/vta/include/vta/driver.h
+++ b/vta/include/vta/driver.h
@@ -95,12 +95,6 @@ void VTAFlushCache(vta_phy_addr_t buf, int size);
  */
 void VTAInvalidateCache(vta_phy_addr_t buf, int size);
 
-/*!
- * \brief Programming the bit stream on the FPGA.
- * \param bitstream The path to the bit stream file.
- */
-void VTAProgram(const char* bitstream);
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/vta/include/vta/runtime.h b/vta/include/vta/runtime.h
index 6d77067be931..e58d45486282 100644
--- a/vta/include/vta/runtime.h
+++ b/vta/include/vta/runtime.h
@@ -11,7 +11,7 @@
 extern "C" {
 #endif
 
-#include "./driver.h"
+#include "driver.h"
 
 #define VTA_MEMCPY_H2D 1
 #define VTA_MEMCPY_D2H 2
diff --git a/vta/python/vta/exec/rpc_server.py b/vta/python/vta/exec/rpc_server.py
index 233d37ccad7c..c318e2dd5178 100644
--- a/vta/python/vta/exec/rpc_server.py
+++ b/vta/python/vta/exec/rpc_server.py
@@ -10,9 +10,9 @@
 import ctypes
 import json
 import tvm
-from tvm._ffi.base import c_str
 from tvm import rpc
 from tvm.contrib import cc
+from pynq import Bitstream
 
 from ..environment import get_env
 from ..pkg_config import PkgConfig
@@ -51,7 +51,8 @@ def ext_dev_callback():
     @tvm.register_func("tvm.contrib.vta.init", override=True)
     def program_fpga(file_name):
         path = tvm.get_global_func("tvm.rpc.server.workpath")(file_name)
-        load_vta_dll().VTAProgram(c_str(path))
+        bitstream = Bitstream(path)
+        bitstream.download()
         logging.info("Program FPGA with %s", file_name)
 
     @tvm.register_func("tvm.rpc.server.shutdown", override=True)
@@ -86,8 +87,8 @@ def reconfig_runtime(cfg_json):
         ldflags = pkg.ldflags
         lib_name = dll_path
         source = pkg.lib_source
-        logging.info("Rebuild runtime: output=%s, cflags=%s, source=%s, ldflags=%s",
-                     dll_path, str(cflags), str(source), str(ldflags))
+        logging.info("Rebuild runtime:\n output=%s,\n cflags=%s,\n source=%s,\n ldflags=%s",
+                     dll_path, '\n\t'.join(cflags), '\n\t'.join(source), '\n\t'.join(ldflags))
         cc.create_shared(lib_name, source, cflags + ldflags)
         with open(cfg_path, "w") as outputfile:
             outputfile.write(pkg.cfg_json)
@@ -98,10 +99,10 @@ def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('--host', type=str, default="0.0.0.0",
                         help='the hostname of the server')
-    parser.add_argument('--port', type=int, default=9090,
-                        help='The port of the PRC')
+    parser.add_argument('--port', type=int, default=9091,
+                        help='The port of the RPC')
     parser.add_argument('--port-end', type=int, default=9199,
-                        help='The end search port of the PRC')
+                        help='The end search port of the RPC')
     parser.add_argument('--key', type=str, default="",
                         help="RPC key used to identify the connection type.")
     parser.add_argument('--tracker', type=str, default="",
diff --git a/vta/python/vta/ir_pass.py b/vta/python/vta/ir_pass.py
index 90df67c53278..3efef7135edb 100644
--- a/vta/python/vta/ir_pass.py
+++ b/vta/python/vta/ir_pass.py
@@ -556,7 +556,7 @@ def _inject_copy(src, dst, pad_before, pad_after, pad_value):
             return irb.get()
 
         else:
-            raise RuntimeError("Donot support copy %s->%s" % (src.scope, dst.scope))
+            raise RuntimeError("Do not support copy %s->%s" % (src.scope, dst.scope))
 
     return tvm.ir_pass.InjectCopyIntrin(stmt_in, "dma_copy", _inject_copy)
 
diff --git a/vta/python/vta/libinfo.py b/vta/python/vta/libinfo.py
index 6cda7dfdeb7d..f7de9c55b1a0 100644
--- a/vta/python/vta/libinfo.py
+++ b/vta/python/vta/libinfo.py
@@ -21,5 +21,6 @@ def find_libvta(optional=False):
     lib_path = [os.path.join(x, lib_name) for x in lib_search]
     lib_found = [x for x in lib_path if os.path.exists(x)]
     if not lib_found and not optional:
-        raise RuntimeError("Cannot find libvta: candidates are: " % str(lib_path))
+        raise RuntimeError('Cannot find the files.\n' +
+                           'List of candidates:\n' + str('\n'.join(lib_path)))
     return lib_found
diff --git a/vta/python/vta/pkg_config.py b/vta/python/vta/pkg_config.py
index c3fe09effb76..30b4808f5e2d 100644
--- a/vta/python/vta/pkg_config.py
+++ b/vta/python/vta/pkg_config.py
@@ -42,8 +42,8 @@ def __init__(self, cfg, proj_root):
         self.include_path = [
             "-I%s/include" % proj_root,
             "-I%s/vta/include" % proj_root,
-            "-I%s/dlpack/include" % proj_root,
-            "-I%s/dmlc-core/include" % proj_root
+            "-I%s/3rdparty/dlpack/include" % proj_root,
+            "-I%s/3rdparty/dmlc-core/include" % proj_root
         ]
         # List of source files that can be used to build standalone library.
         self.lib_source = []
diff --git a/vta/src/device_api.cc b/vta/src/device_api.cc
index 88990e1b1331..a2729baa2591 100644
--- a/vta/src/device_api.cc
+++ b/vta/src/device_api.cc
@@ -72,8 +72,7 @@ class VTADeviceAPI final : public DeviceAPI {
 
 struct VTAWorkspacePool : public WorkspacePool {
   VTAWorkspacePool() :
-      WorkspacePool(static_cast<DLDeviceType>(kExtDev),
-                    VTADeviceAPI::Global()) {}
+      WorkspacePool(kDLExtDev, VTADeviceAPI::Global()) {}
 };
 
 void* VTADeviceAPI::AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) {
diff --git a/vta/src/pynq/pynq_driver.cc b/vta/src/pynq/pynq_driver.cc
index e2630b14acde..5c597d918b5f 100644
--- a/vta/src/pynq/pynq_driver.cc
+++ b/vta/src/pynq/pynq_driver.cc
@@ -6,7 +6,7 @@
 
 #include <vta/driver.h>
 #include <thread>
-#include "./pynq_driver.h"
+#include "pynq_driver.h"
 
 
 void* VTAMemAlloc(size_t size, int cached) {
@@ -136,34 +136,3 @@ int VTADeviceRun(VTADeviceHandle handle,
   return static_cast<VTADevice*>(handle)->Run(
       insn_phy_addr, insn_count, wait_cycles);
 }
-
-void VTAProgram(const char* bitstream) {
-  int elem;
-  FILE *src, *dst, *partial;
-  partial = fopen(VTA_PYNQ_BS_IS_PARTIAL, "w");
-  if (partial == NULL) {
-    printf("Cannot open partial config file %s\n", VTA_PYNQ_BS_IS_PARTIAL);
-        fclose(partial);
-        exit(1);
-  }
-  fputc('0', partial);
-  fclose(partial);
-  src = fopen(bitstream, "rb");
-  if (src == NULL) {
-    printf("Cannot open bitstream %s\n", bitstream);
-    exit(1);
-  }
-  dst = fopen(VTA_PYNQ_BS_XDEVCFG, "wb");
-  if (dst == NULL) {
-    printf("Cannot open device file %s\n", VTA_PYNQ_BS_XDEVCFG);
-    fclose(dst);
-    exit(1);
-  }
-  elem = fgetc(src);
-  while (elem != EOF) {
-    fputc(elem, dst);
-    elem = fgetc(src);
-  }
-  fclose(src);
-  fclose(dst);
-}
diff --git a/vta/src/pynq/pynq_driver.h b/vta/src/pynq/pynq_driver.h
index 7aba00441abd..4a0b03ac25bd 100644
--- a/vta/src/pynq/pynq_driver.h
+++ b/vta/src/pynq/pynq_driver.h
@@ -37,11 +37,6 @@ void VTAUnmapRegister(void *vta, size_t length);
 void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val);
 uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset);
 
-/*! \brief (Pynq only) Partial bitstream status file path */
-#define VTA_PYNQ_BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream"
-/*! \brief (Pynq only) Bitstream destination file path */
-#define VTA_PYNQ_BS_XDEVCFG "/dev/xdevcfg"
-
 /*! \brief (Pynq only) Path to /dev/mem */
 #define VTA_PYNQ_DEV_MEM_PATH "/dev/mem"
 /*! \brief (Pynq only) MMIO driver constant */
diff --git a/vta/tests/hardware/common/test_lib.cc b/vta/tests/hardware/common/test_lib.cc
index 6c6d28ec0c69..98b5f9a030b9 100644
--- a/vta/tests/hardware/common/test_lib.cc
+++ b/vta/tests/hardware/common/test_lib.cc
@@ -4,7 +4,7 @@
  * \brief Test library for the VTA design simulation and driver tests.
  */
 
-#include "./test_lib.h"
+#include "test_lib.h"
 
 #ifdef NO_SIM
 #ifdef VTA_TARGET_PYNQ
@@ -46,12 +46,12 @@ uint64_t vta(
   void* vta_store_handle = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE);
 
   // Physical address pointers
-  uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0;
-  uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0;
-  uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0;
-  uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
-  uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0;
-  uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0;
+  uint32_t insn_phy = insns ? VTAMemGetPhyAddr(insns) : 0;
+  uint32_t uop_phy = uops ? VTAMemGetPhyAddr(uops) : 0;
+  uint32_t input_phy = inputs ? VTAMemGetPhyAddr(inputs) : 0;
+  uint32_t weight_phy = weights ? VTAMemGetPhyAddr(weights) : 0;
+  uint32_t bias_phy = biases ? VTAMemGetPhyAddr(biases) : 0;
+  uint32_t output_phy = outputs ? VTAMemGetPhyAddr(outputs) : 0;
 
 #if VTA_DEBUG == 1
   printf("INFO - Starting FPGA!\n");
@@ -1453,4 +1453,4 @@ int gemm_test(int batch, int in_channels, int out_channels, bool uop_compression
     printf("INFO - Blocked GEMM test failed, got %d errors!\n", err);
     return -1;
   }
-}
\ No newline at end of file
+}
diff --git a/vta/tests/hardware/pynq/Makefile b/vta/tests/hardware/metal_test/Makefile
similarity index 94%
rename from vta/tests/hardware/pynq/Makefile
rename to vta/tests/hardware/metal_test/Makefile
index 7a862e22eff9..4174b4e4726d 100644
--- a/vta/tests/hardware/pynq/Makefile
+++ b/vta/tests/hardware/metal_test/Makefile
@@ -11,7 +11,7 @@ OBJECTS = pynq_driver.o test_lib.o metal_test.o
 EXECUTABLE = vta
 
 # Include VTA config
-VTA_CONFIG = python ../../../make/vta_config.py
+VTA_CONFIG = python ../../../config/vta_config.py
 CFLAGS += `${VTA_CONFIG} --cflags`
 LDFLAGS += `${VTA_CONFIG} --ldflags`
 VTA_TARGET := $(shell ${VTA_CONFIG} --target)
diff --git a/vta/tests/hardware/pynq/metal_test.cc b/vta/tests/hardware/metal_test/metal_test.cc
similarity index 94%
rename from vta/tests/hardware/pynq/metal_test.cc
rename to vta/tests/hardware/metal_test/metal_test.cc
index 56be244baa79..48d719ff4b32 100644
--- a/vta/tests/hardware/pynq/metal_test.cc
+++ b/vta/tests/hardware/metal_test/metal_test.cc
@@ -10,7 +10,9 @@
 #include <string.h>
 #include <time.h>
 #include <vta/driver.h>
-#include "../../../src/pynq/pynq_driver.h"
+#ifdef VTA_TARGET_PYNQ
+#  include "../../../src/pynq/pynq_driver.h"
+#endif  // VTA_TARGET_PYNQ
 #include "../common/test_lib.h"
 
 int main(void) {
diff --git a/vta/tests/python/integration/test_benchmark_gemm.py b/vta/tests/python/integration/test_benchmark_gemm.py
index 7201038b7be0..da867c9b8270 100644
--- a/vta/tests/python/integration/test_benchmark_gemm.py
+++ b/vta/tests/python/integration/test_benchmark_gemm.py
@@ -94,7 +94,7 @@ def verify(s, check_correctness=True):
                                                    env.BATCH,
                                                    env.BLOCK_OUT)
             if check_correctness:
-                np.testing.assert_allclose(res_unpack, res_ref)
+                tvm.testing.assert_allclose(res_unpack, res_ref)
             return cost
 
         def run_schedule(load_inp,
diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py
index ca2451dec614..6915ff8285ba 100644
--- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py
+++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py
@@ -33,6 +33,7 @@ def run_cpu_conv2d(env, remote, key, batch_size, wl, profile=True):
         res_conv = topi.nn.conv2d(
             data, kernel, padding=(wl.hpad, wl.wpad),
             strides=(wl.hstride, wl.wstride),
+            dilation=(1, 1),
             out_dtype="int32")
         res = topi.right_shift(res_conv, 8)
         res = my_clip(res, 0, 127)
@@ -87,7 +88,7 @@ def verify(s, check_correctness):
                 padding = wl.hpad
                 res_ref = res_ref >> 8
                 res_ref = np.clip(res_ref, 0, 127).astype("int8")
-                np.testing.assert_allclose(res_unpack, res_ref)
+                tvm.testing.assert_allclose(res_unpack, res_ref)
             return cost
 
         def conv_normal(print_ir):
@@ -128,7 +129,7 @@ def _run(env, remote):
                 run_cpu_conv2d(env, remote, key, batch_size, wl)
 
     # load pre-tuned operator parameters for ARM CPU
-    autotvm.tophub.check_package('vta')
+    autotvm.tophub.check_backend('vta')
     with autotvm.tophub.context('llvm -device=vtacpu'):
         vta.testing.run(_run)
 
@@ -219,7 +220,7 @@ def verify(s, check_correctness):
                 res_ref = res_ref >> 8
                 res_ref += bias_orig.reshape(wl.out_filter, 1, 1)
                 res_ref = np.clip(res_ref, 0, 127).astype("int8")
-                np.testing.assert_allclose(res_unpack, res_ref)
+                tvm.testing.assert_allclose(res_unpack, res_ref)
             return cost
 
         def conv_normal(print_ir):
diff --git a/vta/tutorials/convolution_opt.py b/vta/tutorials/convolution_opt.py
index 8e4b77d8b491..f4d3997b0146 100644
--- a/vta/tutorials/convolution_opt.py
+++ b/vta/tutorials/convolution_opt.py
@@ -413,7 +413,7 @@
                            env.BLOCK_OUT,
                            fout_height,
                            fout_width)).transpose((0, 2, 4, 5, 1, 3))
-np.testing.assert_allclose(res_ref, res_nd.asnumpy())
+tvm.testing.assert_allclose(res_ref, res_nd.asnumpy())
 print("Successful 2D convolution test!")
 
 ######################################################################
diff --git a/vta/tutorials/resnet.py b/vta/tutorials/resnet.py
index 8d33a91d5691..8bdb53d15583 100644
--- a/vta/tutorials/resnet.py
+++ b/vta/tutorials/resnet.py
@@ -61,8 +61,8 @@ def classify(m, image):
     m.set_input('data', image)
     timer = m.module.time_evaluator("run", ctx, number=1)
     tcost = timer()
-    tvm_output = m.get_output(0, tvm.nd.empty((1000,), "float32", remote.cpu(0)))
-    top = np.argmax(tvm_output.asnumpy())
+    tvm_output = m.get_output(0)
+    top = np.argmax(tvm_output.asnumpy()[0])
     tcost = "t={0:.2f}s".format(tcost.mean)
     return tcost + " {}".format(synset[top])
 
@@ -154,7 +154,7 @@ def generate_graph(graph_fn, params_fn, device="vta"):
 synset = eval(open(os.path.join(data_dir, categ_fn)).read())
 
 # Download pre-tuned op parameters of conv2d for ARM CPU used in VTA
-autotvm.tophub.check_package('vta')
+autotvm.tophub.check_backend('vta')
 
 
 ######################################################################
@@ -237,8 +237,8 @@ def generate_graph(graph_fn, params_fn, device="vta"):
 tcost = timer()
 
 # Get classification results
-tvm_output = m.get_output(0, tvm.nd.empty((1000,), "float32", remote.cpu(0)))
-top_categories = np.argsort(tvm_output.asnumpy())
+tvm_output = m.get_output(0)
+top_categories = np.argsort(tvm_output.asnumpy()[0])
 
 # Report top-5 classification results
 print("ResNet-18 Prediction #1:", synset[top_categories[-1]])
diff --git a/web/tvm_runtime.js b/web/tvm_runtime.js
index ef594e9433fb..2eab15093b72 100644
--- a/web/tvm_runtime.js
+++ b/web/tvm_runtime.js
@@ -2,7 +2,7 @@
  * TVM Javascript web runtime library.
  *
  * @projectname tvm
- * @version 0.1
+ * @version 0.5.dev
  */
 /* eslint no-unused-vars: "off" */
 /* eslint no-unexpected-multiline: "off" */
@@ -696,6 +696,7 @@ var tvm_runtime = tvm_runtime || {};
       1 : "cpu",
       2 : "gpu",
       4 : "opencl",
+      7 : "vulkan",
       8 : "metal",
       9 : "vpi",
       11 : "opengl",
@@ -706,6 +707,7 @@ var tvm_runtime = tvm_runtime || {};
       "cuda": 2,
       "cl": 4,
       "opencl": 4,
+      "vulkan": 7,
       "metal": 8,
       "vpi": 9,
       "opengl": 11,