diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 849e4606834e..59825d69d0d4 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1 +1 @@ -Thanks for contributing to TVM! Please refer to guideline https://docs.tvm.ai/contribute/ for useful information and tips. After the pull request is submitted, please request code reviews from [Reviewers](https://github.com/dmlc/tvm/blob/master/CONTRIBUTORS.md#reviewers). +Thanks for contributing to TVM! Please refer to the guidelines at https://docs.tvm.ai/contribute/ for useful information and tips. After the pull request is submitted, please request code reviews from [Reviewers](https://github.com/dmlc/tvm/blob/master/CONTRIBUTORS.md#reviewers) by @-mentioning them in the pull request thread. diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core index 808f485387f9..3943914eed66 160000 --- a/3rdparty/dmlc-core +++ b/3rdparty/dmlc-core @@ -1 +1 @@ -Subproject commit 808f485387f9a03f78fa9f1159f387d0d91b7a28 +Subproject commit 3943914eed66470bd010df581e29e4dca4f7df6f diff --git a/CMakeLists.txt b/CMakeLists.txt index 10730ac718b4..a5f5f1428859 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,7 @@ include(cmake/util/FindCUDA.cmake) include(cmake/util/FindVulkan.cmake) include(cmake/util/FindLLVM.cmake) include(cmake/util/FindROCM.cmake) +include(cmake/util/FindANTLR.cmake) if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/config.cmake) include(${CMAKE_CURRENT_BINARY_DIR}/config.cmake) @@ -33,6 +34,7 @@ tvm_option(USE_LLVM "Build with LLVM, can be set to specific llvm-config path" O tvm_option(USE_STACKVM_RUNTIME "Include stackvm into the runtime" OFF) tvm_option(USE_GRAPH_RUNTIME "Build with tiny graph runtime" ON) tvm_option(USE_GRAPH_RUNTIME_DEBUG "Build with tiny graph runtime debug mode" OFF) +tvm_option(USE_OPENMP "Build with OpenMP thread pool implementation" OFF) tvm_option(USE_RELAY_DEBUG "Building Relay in debug mode..."
OFF) tvm_option(USE_SGX "Build with SGX" OFF) tvm_option(USE_RTTI "Build with RTTI" ON) @@ -154,6 +156,7 @@ list(APPEND COMPILER_SRCS ${RELAY_BACKEND_SRCS}) list(APPEND COMPILER_SRCS ${RELAY_IR_SRCS}) list(APPEND COMPILER_SRCS ${RELAY_QNN_SRCS}) + if(USE_VM_PROFILER) message(STATUS "Build compiler with Relay VM profiler support...") file(GLOB BACKEND_VM_PROFILER_SRCS src/relay/backend/vm/profiler/*.cc) @@ -233,6 +236,7 @@ include(cmake/modules/VTA.cmake) include(cmake/modules/CUDA.cmake) include(cmake/modules/OpenCL.cmake) include(cmake/modules/OpenGL.cmake) +include(cmake/modules/OpenMP.cmake) include(cmake/modules/Vulkan.cmake) include(cmake/modules/Metal.cmake) include(cmake/modules/ROCM.cmake) @@ -264,6 +268,7 @@ add_library(tvm SHARED ${COMPILER_SRCS} ${RUNTIME_SRCS}) add_library(tvm_topi SHARED ${TOPI_SRCS}) add_library(tvm_runtime SHARED ${RUNTIME_SRCS}) + if(USE_RELAY_DEBUG) message(STATUS "Building Relay in debug mode...") set_target_properties(tvm PROPERTIES COMPILE_DEFINITIONS "USE_RELAY_DEBUG") diff --git a/Jenkinsfile b/Jenkinsfile index 6134023f9c21..a66c96f3396e 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -38,9 +38,15 @@ // - Tag the new version as the latest // - Periodically cleanup the old versions on local workers // + +// Commit hash of the source used to build the current CI docker images +// +// - ci-cpu:v0.54: e7c88a99f830de30814df14eaa980547ecbd61c1 +// + ci_lint = "tvmai/ci-lint:v0.51" ci_gpu = "tvmai/ci-gpu:v0.54" -ci_cpu = "tvmai/ci-cpu:v0.52" +ci_cpu = "tvmai/ci-cpu:v0.54" ci_i386 = "tvmai/ci-i386:v0.52" // tvm libraries @@ -196,10 +202,10 @@ stage('Build') { make(ci_cpu, 'build', '-j2') pack_lib('cpu', tvm_lib) timeout(time: max_time, unit: 'MINUTES') { - sh "${docker_run} ${ci_cpu} ./tests/scripts/task_golang.sh" sh "${docker_run} ${ci_cpu} ./tests/scripts/task_python_unittest.sh" sh "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh" sh "${docker_run} ${ci_cpu} ./tests/scripts/task_python_vta.sh" + sh "${docker_run} ${ci_cpu} ./tests/scripts/task_golang.sh" } } } diff --git a/cmake/config.cmake b/cmake/config.cmake index d92c2151d9c8..dbc2e80812fd 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -115,6 +115,10 @@ set(USE_BLAS none) # set(USE_MKL_PATH ) if using `pip install mkl` set(USE_MKL_PATH none) +# Whether to use OpenMP thread pool, choices: gnu, intel +# Note: "gnu" uses the gomp library, "intel" uses the iomp5 library +set(USE_OPENMP none) + # Whether use contrib.random in runtime set(USE_RANDOM OFF) @@ -140,6 +144,10 @@ set(USE_ROCBLAS OFF) set(USE_SORT ON) # Build ANTLR parser for Relay text format +# Possible values: +# - ON: enable ANTLR by searching default locations (cmake find_program for antlr4 and /usr/local for jar) +# - OFF: disable ANTLR +# - /path/to/antlr-*-complete.jar: path to specific ANTLR jar file set(USE_ANTLR OFF) # Whether use Relay debug mode diff --git a/cmake/modules/ANTLR.cmake b/cmake/modules/ANTLR.cmake index 5842c819099d..d3c1b4218253 100644 --- a/cmake/modules/ANTLR.cmake +++ b/cmake/modules/ANTLR.cmake @@ -15,29 +15,7 @@ # specific language governing permissions and limitations # under the License. if(USE_ANTLR) - find_program(ANTLR4 antlr4) - - if (NOT ANTLR4) - file(GLOB_RECURSE ANTLR4JAR - /usr/local/lib/antlr-*-complete.jar - /usr/local/Cellar/*antlr-*-complete.jar) - - # Get the first element of the list of antlr jars. - # Sort and reverse the list so the item selected is the highest - # version in lib or else in Cellar if no lib installation exists.
- list(SORT ANTLR4JAR) - list(REVERSE ANTLR4JAR) - list(GET ANTLR4JAR 0 ANTLR4JAR) - - set(JAVA_HOME $ENV{JAVA_HOME}) - if (NOT DEFINED JAVA_HOME) - # Hack to get system to search for Java itself. - set(JAVA_HOME "/usr") - endif() - - set(ANTLR4 ${JAVA_HOME}/bin/java -jar ${ANTLR4JAR}) - endif() - + find_antlr(${USE_ANTLR}) if(ANTLR4) set(RELAY_PARSER_DIR diff --git a/cmake/modules/OpenMP.cmake b/cmake/modules/OpenMP.cmake new file mode 100644 index 000000000000..5dd9be508342 --- /dev/null +++ b/cmake/modules/OpenMP.cmake @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# OpenMP Module +if(USE_OPENMP STREQUAL "gnu") + find_package(OpenMP) + if(OPENMP_FOUND) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + list(APPEND TVM_RUNTIME_LINKER_LIBS ${OpenMP_CXX_LIBRARIES}) + add_definitions(-DTVM_THREADPOOL_USE_OPENMP=1) + message(STATUS "Build with OpenMP ${OpenMP_CXX_LIBRARIES}") + else() + add_definitions(-DTVM_THREADPOOL_USE_OPENMP=0) + message(WARNING "OpenMP cannot be found, use TVM threadpool instead.") + endif() +elseif(USE_OPENMP STREQUAL "intel") + find_package(OpenMP) + if(OPENMP_FOUND) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + if (MSVC) + find_library(OMP_LIBRARY NAMES libiomp5md) + else() + find_library(OMP_LIBRARY NAMES iomp5) + endif() + list(APPEND TVM_RUNTIME_LINKER_LIBS ${OMP_LIBRARY}) + add_definitions(-DTVM_THREADPOOL_USE_OPENMP=1) + message(STATUS "Build with OpenMP " ${OMP_LIBRARY}) + else() + add_definitions(-DTVM_THREADPOOL_USE_OPENMP=0) + message(WARNING "OpenMP cannot be found, use TVM threadpool instead.") + endif() +else() + add_definitions(-DTVM_THREADPOOL_USE_OPENMP=0) +endif() diff --git a/cmake/util/FindANTLR.cmake b/cmake/util/FindANTLR.cmake new file mode 100644 index 000000000000..b68f90ead131 --- /dev/null +++ b/cmake/util/FindANTLR.cmake @@ -0,0 +1,65 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +####################################################### +# Enhanced version of find ANTLR. 
+# +# Usage: +# find_antlr(${USE_ANTLR}) +# +# - When USE_ANTLR=ON, use auto search by first trying to find antlr4 program, +# then trying to find antlr-*-complete.jar +# - When USE_ANTLR=/path/to/antlr-*-complete.jar, use provided jar +# +# Provide variables: +# - ANTLR4 +# +macro(find_antlr use_antlr) + set(JAVA_HOME $ENV{JAVA_HOME}) + if (NOT DEFINED JAVA_HOME) + # Hack to get system to search for Java itself. + message(STATUS "JAVA_HOME is not defined. Set it to ensure proper use") + set(JAVA_HOME "/usr") + endif() + if(MSVC) + set(JAVA_PROGRAM ${JAVA_HOME}/java.exe) + else() + set(JAVA_PROGRAM ${JAVA_HOME}/bin/java) + endif() + message(STATUS "Using Java at " ${JAVA_PROGRAM}) + + if (${use_antlr} STREQUAL "ON") + find_program(ANTLR4 antlr4) + if (NOT ANTLR4) + file(GLOB_RECURSE ANTLR4JAR + /usr/local/lib/antlr-*-complete.jar + /usr/local/Cellar/*antlr-*-complete.jar) + + # Get the first element of the list of antlr jars. + # Sort and reverse the list so the item selected is the highest + # version in lib or else in Cellar if no lib installation exists. + list(SORT ANTLR4JAR) + list(REVERSE ANTLR4JAR) + list(GET ANTLR4JAR 0 ANTLR4JAR) + + set(ANTLR4 ${JAVA_PROGRAM} -jar ${ANTLR4JAR}) + endif() + elseif(NOT ${use_antlr} STREQUAL "OFF") + set(ANTLR4 ${JAVA_PROGRAM} -jar ${use_antlr}) + endif() + message(STATUS "ANTLR4="${ANTLR4}) +endmacro(find_antlr) diff --git a/docker/install/ubuntu_install_nnpack.sh b/docker/install/ubuntu_install_nnpack.sh index 4f45f130e2e5..dc51fc28d492 100755 --- a/docker/install/ubuntu_install_nnpack.sh +++ b/docker/install/ubuntu_install_nnpack.sh @@ -6,9 +6,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -22,11 +22,14 @@ set -o pipefail apt-get update && apt-get install -y --no-install-recommends git cmake -# TODO: specific tag? git clone https://github.com/Maratyszcza/NNPACK NNPACK +git clone https://github.com/Maratyszcza/pthreadpool NNPACK/pthreadpool + +# Use specific versioning tag. (cd NNPACK && git checkout 1e005b0c2) +(cd NNPACK/pthreadpool && git checkout 13da0b4c) mkdir -p NNPACK/build cd NNPACK/build -cmake -DCMAKE_INSTALL_PREFIX:PATH=. -DNNPACK_INFERENCE_ONLY=OFF -DNNPACK_CONVOLUTION_ONLY=OFF -DNNPACK_BUILD_TESTS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON .. && make -j4 && make install +cmake -DCMAKE_INSTALL_PREFIX:PATH=. -DNNPACK_INFERENCE_ONLY=OFF -DNNPACK_CONVOLUTION_ONLY=OFF -DNNPACK_BUILD_TESTS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DPTHREADPOOL_SOURCE_DIR=pthreadpool .. && make -j4 && make install cd - diff --git a/docker/install/ubuntu_install_onnx.sh b/docker/install/ubuntu_install_onnx.sh index f3e8d8e8f540..54210b83f4d6 100755 --- a/docker/install/ubuntu_install_onnx.sh +++ b/docker/install/ubuntu_install_onnx.sh @@ -27,5 +27,4 @@ pip3 install onnx==1.5.0 # not expose that in the wheel!!! 
pip3 install future -pip3 install https://download.pytorch.org/whl/cu80/torch-1.0.1.post2-cp36-cp36m-linux_x86_64.whl -pip3 install torchvision +pip3 install torch==1.2.0 torchvision==0.4.0 diff --git a/docs/dev/virtual_machine.rst b/docs/dev/virtual_machine.rst index 2791ee71177e..cb08cc14e56e 100644 --- a/docs/dev/virtual_machine.rst +++ b/docs/dev/virtual_machine.rst @@ -121,7 +121,7 @@ AllocTensor Allocate a tensor value of the appropriate shape (stored in `shape_register`) and `dtype`. The result is saved to register `dst`. -AllocDatatype +AllocADT ^^^^^^^^^^^^^ **Arguments**: :: RegName dst @@ -176,7 +176,7 @@ GetTagi RegName object RegName dst -Get the object tag for Datatype object in register `object`. And saves the reult to register `dst`. +Get the object tag for the ADT object in register `object` and save the result to register `dst`. Fatal ^^^^^ @@ -251,9 +251,9 @@ Currently, we support 3 types of objects: tensors, data types, and closures. :: - VMObject VMTensor(const tvm::runtime::NDArray& data); - VMObject VMDatatype(size_t tag, const std::vector& fields); - VMObject VMClosure(size_t func_index, std::vector free_vars); + Object Tensor(const tvm::runtime::NDArray& data); + Object ADT(size_t tag, const std::vector& fields); + Object Closure(size_t func_index, std::vector free_vars); Stack and State diff --git a/docs/frontend/tensorflow.md b/docs/frontend/tensorflow.md deleted file mode 100644 index 06a6fcc32b4f..000000000000 --- a/docs/frontend/tensorflow.md +++ /dev/null @@ -1,53 +0,0 @@ - - - - - - - - - - - - - - - - - -# Tensorflow Frontend -Tensorflow frontend helps in importing tensorflow released model into TVM. - -This document helps few steps while importing various different models from -[tensorflow research/slim](https://github.com/tensorflow/models/tree/master/research/slim). - -Current frontend is tested with all versions of below models -- Inception (V1/V2/V3/V4) -- Resnet (All) -- Mobilenet (V1/V2 All) -- Vgg (16/19) - -Tensorflow frontend expects a freezed protobuf format as input. - -Not all models are released as freezed protobuf. Some of them are checkpoints (.ckpt). -Please refer to [export](https://github.com/tensorflow/models/tree/master/research/slim#exporting-the-inference-graph) -and [freeze](https://github.com/tensorflow/models/tree/master/research/slim#freezing-the-exported-graph) -instructions to generate protobuf from checkpoint. - -## General Instructions - -### Add Shapes: -While freezing of protobuf add additional option ```add_shapes=True``` to embed output shapes of each node into graph. -You may use ```tvm.relay.testing.tf.AddShapesToGraphDef``` from nnvm for the same. -Please refer to [tensorflow tutorial](https://github.com/dmlc/tvm/blob/master/tutorials/nnvm/from_tensorflow.py). - -### Explicit Shape: -There might be situations where the add_shapes=True may not provide sufficient information about shape. -You may pass explicit dictionary of input shapes argument for ```from_tensorflow```. -Please refer to [test cases](https://github.com/dmlc/tvm/blob/master/nnvm/tests/python/frontend/tensorflow/test_forward.py#L36). - -### GPU: -Most of these tensorflow models are released for CPU with NHWC layout. -To compile for GPU we need to pass extra argument ```layout='NCHW'``` for from_tensorflow. -This option will do a layout conversion before and after for neural network ops. -Remaining nnvm build options for GPU compilation remain as it is.
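Both the markdown page removed above and its reStructuredText replacement below describe the same import flow. As a rough illustration, here is a minimal Python sketch of that flow; the path "model.pb", the input name "input", and the shape are hypothetical placeholders, not taken from this patch:

import tensorflow as tf
import tvm
from tvm import relay

# Load a frozen graph that was exported with add_shapes=True.
with tf.gfile.GFile("model.pb", "rb") as f:
    graph_def = tf.GraphDef()
    graph_def.ParseFromString(f.read())

# Shapes must be static; layout="NCHW" asks the frontend to convert the
# data layout, which the docs recommend when targeting GPU.
mod, params = relay.frontend.from_tensorflow(
    graph_def, layout="NCHW", shape={"input": (1, 224, 224, 3)})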
diff --git a/docs/frontend/tensorflow.rst b/docs/frontend/tensorflow.rst new file mode 100644 index 000000000000..827f5d637988 --- /dev/null +++ b/docs/frontend/tensorflow.rst @@ -0,0 +1,241 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +TensorFlow Frontend +=================== + +The TensorFlow frontend helps in importing TensorFlow models into TVM. + +Supported versions: + +- 1.12 and below + +Tested models: + +- Inception (V1/V2/V3/V4) +- Resnet (All) +- Mobilenet (V1/V2 All) +- Vgg (16/19) +- BERT (Base/3-layer) + +Preparing a Model for Inference +------------------------------- + +Remove Unneeded Nodes +~~~~~~~~~~~~~~~~~~~~~ + +The export process will remove many nodes that are not needed for inference, but unfortunately will leave some remaining. The nodes that should be manually removed are: + +- Dropout, including `Dropout`_ and `DropoutWrapper`_ +- `Assert`_ + +.. _Dropout: https://www.tensorflow.org/api_docs/python/tf/nn/dropout +.. _DropoutWrapper: https://www.tensorflow.org/versions/r1.12/api_docs/python/tf/nn/rnn_cell/DropoutWrapper?hl=hr +.. _Assert: https://www.tensorflow.org/api_docs/python/tf/debugging/Assert + +Convert None Dimensions to Constants +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +TVM has minimal support for dynamic tensor shapes. Dimensions that are ``None`` should be replaced with constants. For example, a model may accept an input with shape ``(None,20)``. This should be converted to a shape like ``(1,20)``. The model should be modified accordingly to ensure that these shapes match throughout the graph. + +Export +~~~~~~ + +TensorFlow frontend expects a frozen protobuf (.pb) or saved model as input. It currently does not support checkpoint (.ckpt). The graphdef needed by the TensorFlow frontend can be extracted from the active session, or by using the `TFParser`_ helper class. + +.. _TFParser: https://github.com/dmlc/tvm/blob/master/python/tvm/relay/frontend/tensorflow_parser.py + +The model should be exported with a number of transformations to prepare the model for inference. It is also important to set ```add_shapes=True```, as this will embed the output shapes of each node into the graph. Here is one function to export a model as a protobuf given a session: + +.. 
code:: python + + import tensorflow as tf + from tensorflow.tools.graph_transforms import TransformGraph + + def export_pb(session): + with tf.gfile.GFile("myexportedmodel.pb", "wb") as f: + inputs = ["myinput1", "myinput2"] # replace with your input names + outputs = ["myoutput1"] # replace with your output names + graph_def = session.graph.as_graph_def(add_shapes=True) + graph_def = tf.graph_util.convert_variables_to_constants(session, graph_def, outputs) + graph_def = TransformGraph( + graph_def, + inputs, + outputs, + [ + "remove_nodes(op=Identity, op=CheckNumerics, op=StopGradient)", + "sort_by_execution_order", # sort by execution order after each transform to ensure correct node ordering + "remove_device", + "sort_by_execution_order", + "fold_batch_norms", + "sort_by_execution_order", + "fold_old_batch_norms", + "sort_by_execution_order" + ] + ) + f.write(graph_def.SerializeToString()) + +Another method is to `export and freeze the graph <https://github.com/tensorflow/models/tree/master/research/slim#freezing-the-exported-graph>`_. + +Import the Model +---------------- + +Explicit Shape: +~~~~~~~~~~~~~~~ + +To ensure shapes can be known throughout the entire graph, pass the ```shape``` argument to ```from_tensorflow```. This dictionary maps input names to input shapes. Please refer to these `test cases <https://github.com/dmlc/tvm/blob/master/tests/python/frontend/tensorflow/test_forward.py>`_ for examples. + +Data Layout +~~~~~~~~~~~ + +Most TensorFlow models are released with NHWC layout. NCHW layout often provides better performance, especially on GPU. The TensorFlow frontend can automatically convert the model's data layout by passing the argument ```layout='NCHW'``` to ```from_tensorflow```. + +Best Practices +-------------- + +- Use static tensor shapes instead of dynamic shapes (remove ```None``` dimensions). +- Use static RNN instead of dynamic RNN, as ```TensorArray``` isn't supported yet. + +Supported Ops +------------- + +- Abs +- Add +- All +- ArgMax +- ArgMin +- AvgPool +- BatchMatMul +- BatchMatMulV2 +- BatchNormWithGlobalNormalization +- BatchToSpaceND +- BiasAdd +- BroadcastTo +- Cast +- Ceil +- CheckNumerics +- ClipByValue +- Concat +- ConcatV2 +- Conv2D +- Cos +- CropAndResize +- DecodeJpeg +- DepthwiseConv2dNative +- DepthToSpace +- Equal +- Elu +- Enter +- Erf +- Exit +- Exp +- ExpandDims +- Fill +- Floor +- FloorDiv +- FusedBatchNorm +- FusedBatchNormV2 +- Gather +- GatherNd +- GatherV2 +- Greater +- GreaterEqual +- Identity +- LeakyRelu +- LeftShift +- Less +- LessEqual +- Log +- Log1p +- LoopCond +- LogicalAnd +- LogicalOr +- LogicalNot +- LogSoftmax +- LRN +- LSTMBlockCell +- MatMul +- Max +- MaxPool +- Maximum +- Mean +- Merge +- Min +- Minimum +- MirrorPad +- Mod +- Mul +- Neg +- NextIteration +- NotEqual +- OneHot +- Pack +- Pad +- PadV2 +- Pow +- Prod +- Range +- Rank +- RealDiv +- Relu +- Relu6 +- Reshape +- ResizeBilinear +- ResizeBicubic +- ResizeNearestNeighbor +- ReverseV2 +- RightShift +- Round +- Rsqrt +- Select +- Selu +- Shape +- Sigmoid +- Sign +- Sin +- Size +- Slice +- Softmax +- Softplus +- SpaceToBatchND +- SpaceToDepth +- Split +- SplitV +- Sqrt +- Square +- SquaredDifference +- Squeeze +- StridedSlice +- Sub +- Sum +- Switch +- Tanh +- TensorArrayV3 +- TensorArrayScatterV3 +- TensorArrayGatherV3 +- TensorArraySizeV3 +- TensorArrayWriteV3 +- TensorArrayReadV3 +- TensorArraySplitV3 +- TensorArrayConcatV3 +- Tile +- TopKV2 +- Transpose +- TruncateMod +- Unpack +- Where +- ZerosLike diff --git a/docs/index.rst b/docs/index.rst index 9666fff0c5d3..f02dcc7c91e2 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -47,6 +47,12 @@ Developer Guide dev/index nnvm_top +Frontends +---------------- +..
toctree:: + :maxdepth: 1 + + frontend/tensorflow Index ----- diff --git a/docs/vta/install.md b/docs/vta/install.md index c43a167292b4..02c50fbba481 100644 --- a/docs/vta/install.md +++ b/docs/vta/install.md @@ -229,7 +229,7 @@ Now you can connect the power cable and serial port to boot the Angstrom Linux. > In this case, you might need to build the `zImage` file of your own from [socfpga-4.9.78-ltsi](https://github.com/altera-opensource/linux-socfpga/tree/socfpga-4.9.78-ltsi) branch of the [linux-socfpga](https://github.com/altera-opensource/linux-socfpga) repository. > For a quick fix, you can also download a prebuilt version of the `zImage` file [here](https://raw.githubusercontent.com/liangfu/de10-nano-supplement/master/zImage). -After connecting he usb cables to the DE10-Nano board, power on the board by connecting the power cable. You may then connect to the serial port of the device by using `minicom` on your host PC: +After connecting the usb cables to the DE10-Nano board, power on the board by connecting the power cable. You may then connect to the serial port of the device by using `minicom` on your host PC: ``` bash # NOTE: root privilege is typically required to run the following command. diff --git a/golang/src/value.go b/golang/src/value.go index 576331a8cfa0..5e0f78270eaa 100644 --- a/golang/src/value.go +++ b/golang/src/value.go @@ -44,8 +44,8 @@ var KTVMType = int32(C.kTVMType) var KTVMContext = int32(C.kTVMContext) // KArrayHandle is golang type code for TVM kArrayHandle. var KArrayHandle = int32(C.kArrayHandle) -// KNodeHandle is golang type code for TVM kNodeHandle. -var KNodeHandle = int32(C.kNodeHandle) +// KObjectHandle is golang type code for TVM kObjectHandle. +var KObjectHandle = int32(C.kObjectHandle) // KModuleHandle is golang type code for TVM kModuleHandle. var KModuleHandle = int32(C.kModuleHandle) // KFuncHandle is golang type code for TVM kFuncHandle. diff --git a/include/tvm/api_registry.h b/include/tvm/api_registry.h index e12d841519ca..c41c3087f4ac 100644 --- a/include/tvm/api_registry.h +++ b/include/tvm/api_registry.h @@ -58,7 +58,7 @@ class EnvFuncNode : public Node { /*! \brief constructor */ EnvFuncNode() {} - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("name", &name); } @@ -79,7 +79,7 @@ class EnvFunc : public NodeRef { explicit EnvFunc(NodePtr n) : NodeRef(n) {} /*! \return The internal global function pointer */ const EnvFuncNode* operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } /*! * \brief Invoke the function. @@ -124,19 +124,19 @@ class TypedEnvFunc : public NodeRef { /*! \brief short hand for this function type */ using TSelf = TypedEnvFunc; TypedEnvFunc() {} - explicit TypedEnvFunc(NodePtr n) : NodeRef(n) {} + explicit TypedEnvFunc(ObjectPtr n) : NodeRef(n) {} /*! * \brief Assign global function to a TypedEnvFunc * \param other Another global function. * \return reference to self. */ TSelf& operator=(const EnvFunc& other) { - this->node_ = other.node_; + ObjectRef::operator=(other); return *this; } /*! \return The internal global function pointer */ const EnvFuncNode* operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } /*! * \brief Invoke the function.
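The header diffs above and below all apply one mechanical rewrite: VisitAttrs drops its final qualifier, reference constructors take an ObjectPtr, and operator-> casts the result of get()/get_mutable() instead of reading the removed node_ field. A minimal C++ sketch of the resulting pattern, using a hypothetical MyNode/MyRef pair rather than any type from this patch:

// Hypothetical node type following the post-refactor conventions.
class MyNode : public Node {
 public:
  int value{0};
  // No longer marked `final`, so derived nodes can extend visitation.
  void VisitAttrs(AttrVisitor* v) { v->Visit("value", &value); }
  static constexpr const char* _type_key = "example.MyNode";
  TVM_DECLARE_NODE_TYPE_INFO(MyNode, Node);
};

// Hypothetical reference type: constructed from an ObjectPtr and reading
// through get() rather than the old node_ member.
class MyRef : public NodeRef {
 public:
  MyRef() {}
  explicit MyRef(ObjectPtr<Object> n) : NodeRef(n) {}
  const MyNode* operator->() const {
    return static_cast<const MyNode*>(get());
  }
  using ContainerType = MyNode;
};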
diff --git a/include/tvm/arithmetic.h b/include/tvm/arithmetic.h index 8be1c3604813..bda6ac647f55 100644 --- a/include/tvm/arithmetic.h +++ b/include/tvm/arithmetic.h @@ -60,7 +60,7 @@ class ConstIntBoundNode : public Node { int64_t min_value; int64_t max_value; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("min_value", &min_value); v->Visit("max_value", &max_value); } @@ -162,7 +162,7 @@ class ModularSetNode : public Node { /*! \brief The base */ int64_t base; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("coeff", &coeff); v->Visit("base", &base); } @@ -351,7 +351,7 @@ enum SignType { */ struct IntSetNode : public Node { static constexpr const char* _type_key = "IntSet"; - TVM_DECLARE_BASE_NODE_INFO(IntSetNode, Node); + TVM_DECLARE_BASE_NODE_INFO(IntSetNode, Object); }; /*! @@ -362,7 +362,7 @@ class IntSet : public NodeRef { /*! \brief constructor */ IntSet() {} // constructor from not container. - explicit IntSet(NodePtr n) : NodeRef(n) {} + explicit IntSet(ObjectPtr n) : NodeRef(n) {} /*! * \brief access the internal node container * \return the pointer to the internal node container @@ -692,7 +692,7 @@ Array DetectClipBound(const Expr& e, // implementation inline const IntSetNode* IntSet::operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } } // namespace arith } // namespace tvm diff --git a/include/tvm/attrs.h b/include/tvm/attrs.h index 3b64d1f961e2..2fbb9e6a866e 100644 --- a/include/tvm/attrs.h +++ b/include/tvm/attrs.h @@ -115,7 +115,7 @@ class AttrFieldInfoNode : public Node { /*! \brief detailed description of the type */ std::string description; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("name", &name); v->Visit("type_info", &type_info); v->Visit("description", &description); @@ -163,7 +163,7 @@ class AttrsEqual { return lhs == rhs; } // node comparator - TVM_DLL bool operator()(const NodeRef& lhs, const NodeRef& rhs) const; + TVM_DLL bool operator()(const ObjectRef& lhs, const ObjectRef& rhs) const; protected: friend class AttrsEqualHandler; @@ -197,13 +197,13 @@ class AttrsHash { size_t operator()(const std::string& value) const { return std::hash()(value); } - size_t operator()(const Type& value) const { + size_t operator()(const DataType& value) const { return std::hash()( static_cast(value.code()) | (static_cast(value.bits()) << 8) | (static_cast(value.lanes()) << 16)); } - TVM_DLL size_t operator()(const NodeRef& value) const; + TVM_DLL size_t operator()(const ObjectRef& value) const; private: friend class AttrsHashHandler; @@ -221,6 +221,8 @@ class BaseAttrsNode : public Node { public: using TVMArgs = runtime::TVMArgs; using TVMRetValue = runtime::TVMRetValue; + // visit function + virtual void VisitAttrs(AttrVisitor* v) {} /*! * \brief Initialize the attributes by sequence of arguments * \param args The postional arguments in the form @@ -260,7 +262,7 @@ class BaseAttrsNode : public Node { * \return The comparison result. */ TVM_DLL virtual bool ContentEqual( - const Node* other, AttrsEqual equal) const = 0; + const Object* other, AttrsEqual equal) const = 0; /*! * \brief Content aware hash. * \param hasher The hasher to run the hash. @@ -290,7 +292,7 @@ class Attrs : public NodeRef { private: /*! 
\return the internal attribute node */ const BaseAttrsNode* ptr() const { - return static_cast(node_.get()); + return static_cast(get()); } }; @@ -315,7 +317,7 @@ class DictAttrsNode : public BaseAttrsNode { void VisitNonDefaultAttrs(AttrVisitor* v) final; void InitByPackedArgs(const runtime::TVMArgs& args, bool allow_unknown) final; Array ListFieldInfo() const final; - bool ContentEqual(const Node* other, AttrsEqual equal) const final; + bool ContentEqual(const Object* other, AttrsEqual equal) const final; size_t ContentHash(AttrsHash hasher) const final; // type info static constexpr const char* _type_key = "DictAttrs"; @@ -369,7 +371,7 @@ class AttrsEqualVisitor { public: bool result_{true}; // constructor - AttrsEqualVisitor(const Node* lhs, const Node* rhs, const AttrsEqual& equal) + AttrsEqualVisitor(const Object* lhs, const Object* rhs, const AttrsEqual& equal) : lhs_(lhs), rhs_(rhs), equal_(equal) { } template @@ -387,8 +389,8 @@ class AttrsEqualVisitor { } private: - const Node* lhs_; - const Node* rhs_; + const Object* lhs_; + const Object* rhs_; const AttrsEqual& equal_; }; @@ -488,7 +490,7 @@ inline void SetIntValue(T* ptr, const TVMArgValue& val) { } else if (const ir::UIntImm* op = expr.as()) { *ptr = static_cast(op->value); } else { - LOG(FATAL) << "Expect int value, but get " << expr->type_key(); + LOG(FATAL) << "Expect int value, but get " << expr->GetTypeKey(); } } } @@ -521,7 +523,7 @@ inline void SetValue(double* ptr, const TVMArgValue& val) { } else if (const ir::UIntImm* op = expr.as()) { *ptr = static_cast(op->value); } else { - LOG(FATAL) << "Expect float value, but get " << expr->type_key(); + LOG(FATAL) << "Expect float value, but get " << expr->GetTypeKey(); } } } @@ -753,12 +755,12 @@ class AttrNonDefaultVisitor { template class AttrsNode : public BaseAttrsNode { public: - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { ::tvm::detail::AttrNormalVisitor vis(v); self()->__VisitAttrs__(vis); } - void VisitNonDefaultAttrs(AttrVisitor* v) final { + void VisitNonDefaultAttrs(AttrVisitor* v) { ::tvm::detail::AttrNonDefaultVisitor vis(v); self()->__VisitAttrs__(vis); } @@ -827,7 +829,7 @@ class AttrsNode : public BaseAttrsNode { return visitor.fields_; } - bool ContentEqual(const Node* other, AttrsEqual equal) const final { + bool ContentEqual(const Object* other, AttrsEqual equal) const final { DerivedType* pself = self(); if (pself == other) return true; if (other == nullptr) return false; @@ -839,7 +841,7 @@ class AttrsNode : public BaseAttrsNode { size_t ContentHash(AttrsHash hasher) const final { ::tvm::detail::AttrsHashVisitor visitor(hasher); - visitor.result_ = std::hash()(this->type_key()); + visitor.result_ = this->GetTypeKeyHash(); self()->__VisitAttrs__(visitor); return visitor.result_; } diff --git a/include/tvm/base.h b/include/tvm/base.h index f358f7f5d447..9b3b4cd3e8df 100644 --- a/include/tvm/base.h +++ b/include/tvm/base.h @@ -19,88 +19,16 @@ /*! * \file tvm/base.h - * \brief Defines the base data structure + * \brief Base utilities */ #ifndef TVM_BASE_H_ #define TVM_BASE_H_ #include -#include -#include -#include -#include -#include #include -#include "runtime/registry.h" namespace tvm { -using ::tvm::Node; -using ::tvm::NodeRef; -using ::tvm::AttrVisitor; - -/*! - * \brief Macro to define common node ref methods. - * \param TypeName The name of the NodeRef. - * \param BaseTypeName The Base type. - * \param NodeName The node container type. 
- */ -#define TVM_DEFINE_NODE_REF_METHODS(TypeName, BaseTypeName, NodeName) \ - TypeName() {} \ - explicit TypeName(::tvm::NodePtr<::tvm::Node> n) : BaseTypeName(n) {} \ - const NodeName* operator->() const { \ - return static_cast(node_.get()); \ - } \ - operator bool() const { return this->defined(); } \ - using ContainerType = NodeName; - -/*! - * \brief Macro to define CopyOnWrite function in a NodeRef. - * \param NodeName The Type of the Node. - * - * CopyOnWrite will generate a unique copy of the internal node. - * The node will be copied if it is referenced by multiple places. - * The function returns the raw pointer to the node to allow modification - * of the content. - * - * \code - * - * MyCOWNodeRef ref, ref2; - * ref2 = ref; - * ref.CopyOnWrite()->value = new_value; - * assert(ref2->value == old_value); - * assert(ref->value == new_value); - * - * \endcode - */ -#define TVM_DEFINE_NODE_REF_COW(NodeName) \ - NodeName* CopyOnWrite() { \ - CHECK(node_ != nullptr); \ - if (!node_.unique()) { \ - NodePtr n = make_node(*(operator->())); \ - NodePtr(std::move(n)).swap(node_); \ - } \ - return static_cast(node_.get()); \ - } - -/*! \brief Macro to make it easy to define node ref type given node */ -#define TVM_DEFINE_NODE_REF(TypeName, NodeName) \ - class TypeName : public ::tvm::NodeRef { \ - public: \ - TVM_DEFINE_NODE_REF_METHODS(TypeName, ::tvm::NodeRef, NodeName); \ - }; \ - -/*! - * \brief Macro to make it easy to define node ref type that - * has a CopyOnWrite member function. - */ -#define TVM_DEFINE_COW_NODE_REF(TypeName, BaseType, NodeName) \ - class TypeName : public BaseType { \ - public: \ - TVM_DEFINE_NODE_REF_METHODS(TypeName, BaseType, NodeName); \ - TVM_DEFINE_NODE_REF_COW(NodeName); \ - }; - /*! * \brief RAII wrapper function to enter and exit a context object * similar to python's with syntax. @@ -145,99 +73,6 @@ class With { ContextType ctx_; }; -/*! - * \brief save the node as well as all the node it depends on as json. - * This can be used to serialize any TVM object - * - * \return the string representation of the node. - */ -std::string SaveJSON(const NodeRef& node); - -/*! - * \brief Internal implementation of LoadJSON - * Load tvm Node object from json and return a shared_ptr of Node. - * \param json_str The json string to load from. - * - * \return The shared_ptr of the Node. - */ -NodePtr LoadJSON_(std::string json_str); - -/*! - * \brief Load the node from json string. - * This can be used to deserialize any TVM object. - * - * \param json_str The json string to load from. - * - * \tparam NodeType the nodetype - * - * \code - * Expr e = LoadJSON(json_str); - * \endcode - */ -template::value>::type > -inline NodeType LoadJSON(const std::string& json_str) { - return NodeType(LoadJSON_(json_str)); -} - -/*! - * \brief Registry entry for NodeFactory. - * - * There are two types of Nodes that can be serialized. - * The normal node requires a registration a creator function that - * constructs an empty Node of the corresponding type. - * - * The global singleton(e.g. global operator) where only global_key need to be serialized, - * in this case, FGlobalKey need to be defined. - */ -struct NodeFactoryReg { - /*! - * \brief creator function. - * \param global_key Key that identifies a global single object. - * If this is not empty then FGlobalKey - * \return The created function. - */ - using FCreate = std::function(const std::string& global_key)>; - /*! - * \brief Global key function, only needed by global objects. - * \param node The node pointer. 
- * \return node The global key to the node. - */ - using FGlobalKey = std::function; - /*! \brief registered name */ - std::string name; - /*! - * \brief The creator function - */ - FCreate fcreator = nullptr; - /*! - * \brief The global key function. - */ - FGlobalKey fglobal_key = nullptr; - // setter of creator - NodeFactoryReg& set_creator(FCreate f) { // NOLINT(*) - this->fcreator = f; - return *this; - } - // setter of creator - NodeFactoryReg& set_global_key(FGlobalKey f) { // NOLINT(*) - this->fglobal_key = f; - return *this; - } - // global registry singleton - TVM_DLL static ::dmlc::Registry<::tvm::NodeFactoryReg> *Registry(); -}; - -/*! - * \brief Register a Node type - * \note This is necessary to enable serialization of the Node. - */ -#define TVM_REGISTER_NODE_TYPE(TypeName) \ - static DMLC_ATTRIBUTE_UNUSED ::tvm::NodeFactoryReg & __make_Node ## _ ## TypeName ## __ = \ - ::tvm::NodeFactoryReg::Registry()->__REGISTER__(TypeName::_type_key) \ - .set_creator([](const std::string&) { return ::tvm::make_node(); }) - - #define TVM_STRINGIZE_DETAIL(x) #x #define TVM_STRINGIZE(x) TVM_STRINGIZE_DETAIL(x) #define TVM_DESCRIBE(...) describe(__VA_ARGS__ "\n\nFrom:" __FILE__ ":" TVM_STRINGIZE(__LINE__)) diff --git a/include/tvm/buffer.h b/include/tvm/buffer.h index 1233e9b0b89b..d2c2b40661e2 100644 --- a/include/tvm/buffer.h +++ b/include/tvm/buffer.h @@ -51,7 +51,7 @@ enum BufferType : int { class Buffer : public NodeRef { public: Buffer() {} - explicit Buffer(NodePtr n) : NodeRef(n) {} + explicit Buffer(ObjectPtr n) : NodeRef(n) {} /*! * \brief Return a new buffer that is equivalent with current one * but always add stride field. @@ -135,7 +135,7 @@ class BufferNode : public Node { /*! \brief constructor */ BufferNode() {} - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("data", &data); v->Visit("dtype", &dtype); v->Visit("shape", &shape); @@ -171,7 +171,7 @@ class BufferNode : public Node { }; inline const BufferNode* Buffer::operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } /*! diff --git a/include/tvm/build_module.h b/include/tvm/build_module.h index 1d57d82e66c6..7114a4550331 100644 --- a/include/tvm/build_module.h +++ b/include/tvm/build_module.h @@ -61,7 +61,7 @@ class TargetNode : public Node { /*! \return the full device string to pass to codegen::Build */ TVM_DLL const std::string& str() const; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("target_name", &target_name); v->Visit("device_name", &device_name); v->Visit("device_type", &device_type); @@ -93,7 +93,7 @@ class TargetNode : public Node { class Target : public NodeRef { public: Target() {} - explicit Target(NodePtr n) : NodeRef(n) {} + explicit Target(ObjectPtr n) : NodeRef(n) {} /*! * \brief Create a Target given a string * \param target_str the string to parse @@ -110,7 +110,7 @@ class Target : public NodeRef { TVM_DLL static tvm::Target Current(bool allow_not_defined = true); const TargetNode* operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } using ContainerType = TargetNode; @@ -229,7 +229,7 @@ class BuildConfigNode : public Node { /*! \brief Whether to disable loop vectorization. 
*/ bool disable_vectorize = false; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("data_alignment", &data_alignment); v->Visit("offset_factor", &offset_factor); v->Visit("double_buffer_split_loop", &double_buffer_split_loop); @@ -256,12 +256,12 @@ class BuildConfigNode : public Node { class BuildConfig : public ::tvm::NodeRef { public: BuildConfig() {} - explicit BuildConfig(NodePtr<::tvm::Node> n) : NodeRef(n) {} + explicit BuildConfig(ObjectPtr n) : NodeRef(n) {} const BuildConfigNode* operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } BuildConfigNode* operator->() { - return static_cast(node_.get()); + return static_cast(get_mutable()); } /*! * \brief Construct a BuildConfig containing a empty build config node. @@ -371,7 +371,7 @@ class GenericFuncNode; class GenericFunc : public NodeRef { public: GenericFunc() {} - explicit GenericFunc(NodePtr n) : NodeRef(n) {} + explicit GenericFunc(ObjectPtr n) : NodeRef(n) {} /*! * \brief Set the default function implementaiton. @@ -473,15 +473,17 @@ class GenericFuncNode : public Node { /* \brief map from keys to registered functions */ std::unordered_map dispatch_dict_; + void VisitAttrs(AttrVisitor* v) {} + static constexpr const char* _type_key = "GenericFunc"; TVM_DECLARE_NODE_TYPE_INFO(GenericFuncNode, Node); }; inline GenericFuncNode* GenericFunc::operator->() { - return static_cast(node_.get()); + return static_cast(get_mutable()); } -#define TVM_GENERIC_FUNC_REG_VAR_DEF \ +#define TVM_GENERIC_FUNC_REG_VAR_DEF \ static TVM_ATTRIBUTE_UNUSED ::tvm::GenericFunc& __mk_ ## TVM /*! diff --git a/include/tvm/c_dsl_api.h b/include/tvm/c_dsl_api.h deleted file mode 100644 index bbbb84926e8e..000000000000 --- a/include/tvm/c_dsl_api.h +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file tvm/c_dsl_api.h - * - * \brief TVM DSL Node C API, used to interact to DSL compilation. - * - * These are only a few functions needed for DSL construction time. - * These function are only available when link libtvm. - * If only TVM runtime is linked, calling these function will trigger error. - * - * \note Most API functions are registerd as PackedFunc and - * can be grabbed via TVMFuncGetGlobal - */ -#ifndef TVM_C_DSL_API_H_ -#define TVM_C_DSL_API_H_ - -#include "runtime/c_runtime_api.h" - -#ifdef __cplusplus -extern "C" { -#endif - -/*! \brief handle to node */ -typedef void* NodeHandle; - -/*! - * \brief free the node handle - * \param handle The node handle to be freed. - * \return 0 when success, -1 when failure happens - */ -TVM_DLL int TVMNodeFree(NodeHandle handle); - -/*! - * \brief Convert type key to type index. - * \param type_key The key of the type. 
- * \param out_index the corresponding type index. - * \return 0 when success, -1 when failure happens - */ -TVM_DLL int TVMNodeTypeKey2Index(const char* type_key, - int* out_index); - -/*! - * \brief Get runtime type index of the node. - * \param handle the node handle. - * \param out_index the corresponding type index. - * \return 0 when success, -1 when failure happens - */ -TVM_DLL int TVMNodeGetTypeIndex(NodeHandle handle, - int* out_index); - -/*! - * \brief get attributes given key - * \param handle The node handle - * \param key The attribute name - * \param out_value The attribute value - * \param out_type_code The type code of the attribute. - * \param out_success Whether get is successful. - * \return 0 when success, -1 when failure happens - * \note API calls always exchanges with type bits=64, lanes=1 - */ -TVM_DLL int TVMNodeGetAttr(NodeHandle handle, - const char* key, - TVMValue* out_value, - int* out_type_code, - int* out_success); - -/*! - * \brief get attributes names in the node. - * \param handle The node handle - * \param out_size The number of functions - * \param out_array The array of function names. - * \return 0 when success, -1 when failure happens - */ -TVM_DLL int TVMNodeListAttrNames(NodeHandle handle, - int *out_size, - const char*** out_array); -#ifdef __cplusplus -} // TVM_EXTERN_C -#endif -#endif // TVM_C_DSL_API_H_ diff --git a/include/tvm/channel.h b/include/tvm/channel.h index 143d4295f3e3..3a40a787d891 100644 --- a/include/tvm/channel.h +++ b/include/tvm/channel.h @@ -35,7 +35,7 @@ class Channel : public NodeRef { public: /*! \brief default constructor */ Channel() {} - explicit Channel(NodePtr n) : NodeRef(n) {} + explicit Channel(ObjectPtr n) : NodeRef(n) {} /*! * \brief access the internal node container * \return the pointer to the internal node container @@ -54,7 +54,7 @@ struct ChannelNode : public Node { /*! \brief default data type in read/write */ Type dtype; // visit all attributes - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("handle_var", &handle_var); v->Visit("dtype", &dtype); } @@ -67,7 +67,7 @@ struct ChannelNode : public Node { // Inline implementations inline const ChannelNode* Channel::operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } } // namespace tvm #endif // TVM_CHANNEL_H_ diff --git a/include/tvm/data_layout.h b/include/tvm/data_layout.h index c2ae572de818..5e2cc08660db 100644 --- a/include/tvm/data_layout.h +++ b/include/tvm/data_layout.h @@ -104,7 +104,7 @@ class LayoutNode : public Node { */ Array axes; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("name", &name); v->Visit("axes", &axes); } @@ -127,7 +127,7 @@ class LayoutNode : public Node { */ class Layout : public NodeRef { public: - explicit Layout(NodePtr n) : NodeRef(n) {} + explicit Layout(ObjectPtr n) : NodeRef(n) {} /*! \brief default constructor */ Layout() = default; @@ -152,7 +152,7 @@ class Layout : public NodeRef { * \return the pointer to the internal node container */ const LayoutNode* operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } /*! @@ -160,7 +160,7 @@ class Layout : public NodeRef { * \return the pointer to the internal node container */ LayoutNode* operator->() { - return static_cast(node_.get()); + return static_cast(get_mutable()); } /*! @@ -325,7 +325,7 @@ class BijectiveLayoutNode : public Node { /*! 
\brief The destination layout */ Layout dst_layout; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("src_layout", &src_layout); v->Visit("dst_layout", &dst_layout); v->Visit("forward_rule", &forward_rule); @@ -369,7 +369,7 @@ class BijectiveLayout : public NodeRef { }; inline const BijectiveLayoutNode* BijectiveLayout::operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } } // namespace tvm diff --git a/include/tvm/expr.h b/include/tvm/expr.h index 201a2b485aa6..ea578152899d 100644 --- a/include/tvm/expr.h +++ b/include/tvm/expr.h @@ -27,8 +27,10 @@ #include #include #include +#include #include "base.h" #include "dtype.h" +#include "node/node.h" #include "node/container.h" #include "node/ir_functor.h" #include "runtime/c_runtime_api.h" @@ -49,7 +51,7 @@ class ExprNode : public Node { class Expr : public NodeRef { public: Expr() {} - explicit Expr(NodePtr ptr) : NodeRef(ptr) {} + explicit Expr(ObjectPtr ptr) : NodeRef(ptr) {} /*! * \brief construct from integer. * \param value The value to be constructed. @@ -110,7 +112,7 @@ class Variable : public ExprNode { static Var make(DataType dtype, std::string name_hint); - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("dtype", &type); v->Visit("name", &name_hint); } @@ -122,7 +124,7 @@ class Variable : public ExprNode { /*! \brief a named variable in TVM */ class Var : public Expr { public: - explicit Var(NodePtr n) : Expr(n) {} + explicit Var(ObjectPtr n) : Expr(n) {} TVM_DLL explicit Var(std::string name_hint = "v", Type t = Int(32)); /*! @@ -145,7 +147,7 @@ class Var : public Expr { * \return the corresponding Variable. */ const Variable* get() const { - return static_cast(node_.get()); + return static_cast(data_.get()); } /*! \brief type indicate the container type */ using ContainerType = Variable; @@ -164,7 +166,7 @@ class IntImm : public ExprNode { /*! \brief the Internal value. */ int64_t value; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("dtype", &type); v->Visit("value", &value); } @@ -187,7 +189,7 @@ class Integer : public Expr { /*! * \brief constructor from node. */ - explicit Integer(NodePtr node) : Expr(node) {} + explicit Integer(ObjectPtr node) : Expr(node) {} /*! * \brief Construct integer from int value. */ @@ -197,7 +199,7 @@ class Integer : public Expr { * \param other another expression. */ Integer& operator=(const Integer& other) { - node_ = other.node_; + data_ = other.data_; return *this; } /*! @@ -205,13 +207,13 @@ class Integer : public Expr { * \return the content of the integer. */ const IntImm* operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } /*! * \brief convert to int64_t */ operator int64_t() const { - CHECK(node_ != nullptr) + CHECK(data_ != nullptr) << " Trying to reference a null Integer"; return (*this)->value; } @@ -230,7 +232,7 @@ class RangeNode : public Node { RangeNode() {} RangeNode(Expr min, Expr extent) : min(min), extent(extent) {} - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("min", &min); v->Visit("extent", &extent); } @@ -346,7 +348,7 @@ class IterVar : public NodeRef { // construct a new iter var without a domain IterVar() {} // construct from shared ptr. - explicit IterVar(NodePtr n) : NodeRef(n) {} + explicit IterVar(ObjectPtr n) : NodeRef(n) {} /*! 
* \brief access the internal node container * \return the pointer to the internal node container @@ -406,7 +408,7 @@ class IterVarNode : public Node { */ std::string thread_tag; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("dom", &dom); v->Visit("var", &var); v->Visit("iter_type", &iter_type); @@ -423,7 +425,7 @@ class IterVarNode : public Node { // inline implementations inline const IterVarNode* IterVar::operator->() const { - return static_cast(node_.get()); + return static_cast(data_.get()); } inline IterVar::operator Expr() const { @@ -481,16 +483,16 @@ class IRPrinter { : stream(stream) {} /*! \brief The node to be printed. */ - TVM_DLL void Print(const NodeRef& node); + TVM_DLL void Print(const ObjectRef& node); /*! \brief Print indent to the stream */ TVM_DLL void PrintIndent(); // Allow registration to be printer. - using FType = IRFunctor; + using FType = IRFunctor; TVM_DLL static FType& vtable(); }; // default print function for all nodes -inline std::ostream& operator<<(std::ostream& os, const NodeRef& n) { // NOLINT(*) +inline std::ostream& operator<<(std::ostream& os, const ObjectRef& n) { // NOLINT(*) IRPrinter(os).Print(n); return os; } @@ -498,10 +500,7 @@ inline std::ostream& operator<<(std::ostream& os, const NodeRef& n) { // NOLINT namespace std { template <> -struct hash<::tvm::IterVar> { - std::size_t operator()(const ::tvm::IterVar& k) const { - return k.hash(); - } +struct hash<::tvm::IterVar> : public ::tvm::NodeHash { }; } #endif // TVM_EXPR_H_ diff --git a/include/tvm/ir.h b/include/tvm/ir.h index 079f05f5a7f2..b6c3028d892f 100644 --- a/include/tvm/ir.h +++ b/include/tvm/ir.h @@ -45,7 +45,7 @@ class UIntImm : public ExprNode { /*! \brief The constant value content. */ uint64_t value; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("dtype", &type); v->Visit("value", &value); } @@ -62,7 +62,7 @@ class FloatImm : public ExprNode { /*! \brief The constant value content. */ double value; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("dtype", &type); v->Visit("value", &value); } @@ -79,7 +79,7 @@ class StringImm : public ExprNode { /*! \brief The constant value content. */ std::string value; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("dtype", &type); v->Visit("value", &value); } @@ -99,7 +99,7 @@ class Cast : public ExprNode { /*! \brief Original data type. */ Expr value; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("dtype", &type); v->Visit("value", &value); } @@ -122,7 +122,7 @@ class BinaryOpNode : public ExprNode { /*! \brief The right operand. */ Expr b; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("dtype", &(this->type)); v->Visit("a", &a); v->Visit("b", &b); @@ -214,7 +214,7 @@ class CmpOpNode : public ExprNode { /*! \brief The right operand. */ Expr b; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("dtype", &(this->type)); v->Visit("a", &a); v->Visit("b", &b); @@ -278,7 +278,7 @@ class And : public ExprNode { /*! \brief The right operand. */ Expr b; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("dtype", &(this->type)); v->Visit("a", &a); v->Visit("b", &b); @@ -298,7 +298,7 @@ class Or : public ExprNode { /*! \brief The right operand. 
*/ Expr b; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("dtype", &type); v->Visit("a", &a); v->Visit("b", &b); @@ -316,7 +316,7 @@ class Not : public ExprNode { /*! \brief The input operand. */ Expr a; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("dtype", &type); v->Visit("a", &a); } @@ -343,7 +343,7 @@ class Select : public ExprNode { /*! \brief value to be returned when condition is false. */ Expr false_value; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("dtype", &type); v->Visit("condition", &condition); v->Visit("true_value", &true_value); @@ -380,7 +380,7 @@ class Load : public ExprNode { /*! \brief The predicate to mask which lanes would be loaded. */ Expr predicate; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("dtype", &type); v->Visit("buffer_var", &buffer_var); v->Visit("index", &index); @@ -411,7 +411,7 @@ class Ramp : public ExprNode { /*! \brief Total number of lanes. */ int lanes; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("dtype", &type); v->Visit("base", &base); v->Visit("stride", &stride); @@ -432,7 +432,7 @@ class Broadcast : public ExprNode { /*! \brief The numerb of lanes. */ int lanes; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("dtype", &type); v->Visit("value", &value); v->Visit("lanes", &lanes); @@ -456,7 +456,7 @@ class Let : public ExprNode { /*! \brief The result expression. */ Expr body; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("dtype", &type); v->Visit("var", &var); v->Visit("value", &value); @@ -522,7 +522,7 @@ class Call : public ExprNode { /*! \brief The output value index if func's value is a tuple. */ int value_index{0}; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("dtype", &type); v->Visit("name", &name); v->Visit("args", &args); @@ -592,7 +592,7 @@ class Shuffle : public ExprNode { /*! \brief The indices of each element. */ Array indices; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("vectors", &vectors); v->Visit("indices", &indices); } @@ -652,7 +652,7 @@ class CommReducerNode : public Node { Array result, Array identity_element); - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("lhs", &lhs); v->Visit("rhs", &rhs); v->Visit("result", &result); @@ -664,10 +664,10 @@ class CommReducerNode : public Node { }; inline const CommReducerNode* CommReducer::get() const { - return static_cast(node_.get()); + return static_cast(data_.get()); } inline const CommReducerNode* CommReducer::operator->() const { - return static_cast(node_.get()); + return get(); } /*! \brief Reduction operator operator */ @@ -694,7 +694,7 @@ class Reduce : public ExprNode { Expr condition, int value_index); - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("dtype", &type); v->Visit("combiner", &combiner); v->Visit("source", &source); @@ -710,7 +710,7 @@ class Reduce : public ExprNode { /*! \brief Any shape. */ class Any : public ExprNode { public: - void VisitAttrs(AttrVisitor* v) final {} + void VisitAttrs(AttrVisitor* v) {} /*! \brief Convert to var. */ Var ToVar() const { return Variable::make(Int(32), "any_dim"); @@ -735,7 +735,7 @@ class LetStmt : public StmtNode { /*! \brief The body block. 
*/ Stmt body; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("var", &var); v->Visit("value", &value); v->Visit("body", &body); @@ -768,7 +768,7 @@ class AttrStmt : public StmtNode { /*! \brief The body statement to be executed */ Stmt body; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("node", &node); v->Visit("attr_key", &attr_key); v->Visit("value", &value); @@ -799,7 +799,7 @@ class AssertStmt : public StmtNode { */ Stmt body; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("condition", &condition); v->Visit("message", &message); v->Visit("body", &body); @@ -822,7 +822,7 @@ class ProducerConsumer : public StmtNode { /*! \brief Body to be executed. */ Stmt body; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("func", &func); v->Visit("is_producer", &is_producer); v->Visit("body", &body); @@ -863,7 +863,7 @@ class Store : public StmtNode { /*! \brief The predicate to mask which lanes would be stored. */ Expr predicate; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("buffer_var", &buffer_var); v->Visit("value", &value); v->Visit("index", &index); @@ -893,7 +893,7 @@ class Provide : public StmtNode { /*! \brief The index arguments of the function. */ Array args; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("func", &func); v->Visit("value_index", &value_index); v->Visit("value", &value); @@ -929,7 +929,7 @@ class Allocate : public StmtNode { Expr new_expr; std::string free_function; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("buffer_var", &buffer_var); v->Visit("dtype", &type); v->Visit("extents", &extents); @@ -972,7 +972,7 @@ class Free : public StmtNode { /*! \brief The buffer variable. */ Var buffer_var; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("buffer_var", &buffer_var); } @@ -1001,7 +1001,7 @@ class Realize : public StmtNode { /*! \brief The body of realization. */ Stmt body; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("func", &func); v->Visit("value_index", &value_index); v->Visit("dtype", &type); @@ -1031,7 +1031,7 @@ class Block : public StmtNode { /*! \brief The restof statments. */ Stmt rest; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("first", &first); v->Visit("rest", &rest); } @@ -1055,7 +1055,7 @@ class IfThenElse : public StmtNode { /*! \brief The branch to be executed when condition is false, can be null. */ Stmt else_case; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("condition", &condition); v->Visit("then_case", &then_case); v->Visit("else_case", &else_case); @@ -1078,7 +1078,7 @@ class Evaluate : public StmtNode { /*! \brief The expression to be evaluated. */ Expr value; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("value", &value); } @@ -1142,7 +1142,7 @@ class For : public StmtNode { DeviceAPI device_api, Stmt body); - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("loop_var", &loop_var); v->Visit("min", &min); v->Visit("extent", &extent); @@ -1169,7 +1169,7 @@ class Prefetch : public StmtNode { /*! \brief Bounds to be prefetched. 
@@ -1169,7 +1169,7 @@ class Prefetch : public StmtNode {
   /*! \brief Bounds to be prefetched. */
   Region bounds;

-  void VisitAttrs(AttrVisitor* v) final {
+  void VisitAttrs(AttrVisitor* v) {
     v->Visit("func", &func);
     v->Visit("value_index", &value_index);
     v->Visit("type", &type);
@@ -1310,6 +1310,16 @@ constexpr const char* opengl_stage_scope = "opengl_stage_scope";
  */
 constexpr const char* device_scope = "device_scope";

+/*!
+ * \brief Mark the shape of a TensorCore fragment
+ */
+constexpr const char* fragment_shape = "fragment_shape";
+
+/*!
+ * \brief Mark the layout of a TensorCore fragment
+ */
+constexpr const char* fragment_layout = "fragment_layout";
+
 /*!
  * \brief Check if attr_key is a pragma key extension
  * \param attr_key The attr key to be compared
@@ -1552,6 +1562,54 @@ constexpr const char* tvm_global_barrier_kinit = "tvm_global_barrier_kinit";
  * }
  */
 constexpr const char* tvm_thread_allreduce = "tvm_thread_allreduce";
+/*!
+ * \brief tvm intrinsic for tensor core load operators.
+ *
+ *  void tvm_load_matrix_sync(Var fragment, UIntImm m, UIntImm n, UIntImm k,
+ *                            Expr index, Expr buffer_ptr, Expr stride,
+ *                            StringImm layout) {
+ *    // m, n, k are the shape of the wmma fragment.
+ *    // Determine the fragment layout (column-major or row-major) from layout.
+ *    // fragments must be in 'wmma.matrix_a' or 'wmma.matrix_b' scope.
+ *    nvcuda::wmma::load_matrix_sync(fragment[index], buffer_ptr, stride);
+ *  }
+ */
+constexpr const char* tvm_load_matrix_sync = "tvm_load_matrix_sync";
+/*!
+ * \brief tvm intrinsic for tensor core mma_sync operators.
+ *
+ *  void tvm_mma_sync(Var fragment_d, Expr index_d,
+ *                    Var fragment_a, Expr index_a,
+ *                    Var fragment_b, Expr index_b,
+ *                    Var fragment_c, Expr index_c) {
+ *    nvcuda::wmma::mma_sync(fragment_d[index_d], fragment_a[index_a],
+ *                           fragment_b[index_b], fragment_c[index_c]);
+ *  }
+ */
+constexpr const char* tvm_mma_sync = "tvm_mma_sync";
+/*!
+ * \brief tvm intrinsic for tensor core fill_fragment operators.
+ *
+ *  void tvm_fill_fragment(Var fragment, UIntImm m, UIntImm n, UIntImm k,
+ *                         Expr index, Expr value) {
+ *    // m, n, k are the shape of the wmma fragment
+ *    // fragments must be in 'wmma.accumulator' scope.
+ *    nvcuda::wmma::fill_fragment(fragment[index], value);
+ *  }
+ */
+constexpr const char* tvm_fill_fragment = "tvm_fill_fragment";
+/*!
+ * \brief tvm intrinsic for tensor core store operators.
+ *
+ *  void tvm_store_matrix_sync(Var fragment, UIntImm m, UIntImm n, UIntImm k,
+ *                             Expr index, Expr buffer_ptr, Expr stride,
+ *                             StringImm layout) {
+ *    // m, n, k are the shape of the wmma fragment
+ *    // fragments must be in 'wmma.accumulator' scope.
+ *    nvcuda::wmma::store_matrix_sync(fragment[index], buffer_ptr, stride, layout);
+ *  }
+ */
+constexpr const char* tvm_store_matrix_sync = "tvm_store_matrix_sync";
 }   // namespace intrinsic
@@ -1576,7 +1634,7 @@ namespace std {
 template <>
 struct hash<::tvm::ir::TensorKey> {
   std::size_t operator()(const ::tvm::ir::TensorKey& k) const {
-    size_t lhs = k.f.hash();
+    size_t lhs = ::tvm::NodeHash()(k.f);
     size_t rhs = static_cast<size_t>(k.value_index);
     lhs ^= rhs + 0x9e3779b9 + (lhs << 6) + (lhs >> 2);
     return lhs;
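The rewritten TensorKey hash above mixes the two component hashes with the boost-style hash_combine step; 0x9e3779b9 is the usual 32-bit golden-ratio constant. Factored out for clarity (sketch):

#include <cstddef>

// hash_combine as used by the TensorKey hash above.
inline std::size_t HashCombine(std::size_t seed, std::size_t value) {
  seed ^= value + 0x9e3779b9 + (seed << 6) + (seed >> 2);
  return seed;
}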
diff --git a/include/tvm/ir_functor_ext.h b/include/tvm/ir_functor_ext.h
index a7d91eacf851..54a5eff6846b 100644
--- a/include/tvm/ir_functor_ext.h
+++ b/include/tvm/ir_functor_ext.h
@@ -84,19 +84,19 @@ class StmtFunctor;
 }

 #define STMT_FUNCTOR_DEFAULT {                                   \
     return VisitStmtDefault_(op, std::forward<Args>(args)...);  \
-}
+  }

 #define IR_EXPR_FUNCTOR_DISPATCH(OP)                             \
   vtable.template set_dispatch<OP>(                              \
-    [](const NodeRef& n, TSelf* self, Args... args) {            \
-      return self->VisitExpr_(static_cast<const OP*>(n.node_.get()), \
+    [](const ObjectRef& n, TSelf* self, Args... args) {          \
+      return self->VisitExpr_(static_cast<const OP*>(n.get()),   \
                               std::forward<Args>(args)...);      \
     });                                                          \

 #define IR_STMT_FUNCTOR_DISPATCH(OP)                             \
   vtable.template set_dispatch<OP>(                              \
-    [](const NodeRef& n, TSelf* self, Args... args) {            \
-      return self->VisitStmt_(static_cast<const OP*>(n.node_.get()), \
+    [](const ObjectRef& n, TSelf* self, Args... args) {          \
+      return self->VisitStmt_(static_cast<const OP*>(n.get()),   \
                               std::forward<Args>(args)...);      \
     });                                                          \

@@ -104,7 +104,7 @@ template
 class ExprFunctor {
  private:
   using TSelf = ExprFunctor;
-  using FType = IRFunctor<R(const NodeRef& n, TSelf* self, Args... args)>;
+  using FType = IRFunctor<R(const ObjectRef& n, TSelf* self, Args... args)>;

  public:
   /*! \brief the result type of this functor */
@@ -164,7 +164,7 @@ class ExprFunctor {
   virtual R VisitExpr_(const FloatImm* op, Args... args) EXPR_FUNCTOR_DEFAULT;
   virtual R VisitExpr_(const StringImm* op, Args... args) EXPR_FUNCTOR_DEFAULT;
   virtual R VisitExprDefault_(const Node* op, Args ...) {
-    LOG(FATAL) << "Do not have a default for " << op->type_key();
+    LOG(FATAL) << "Do not have a default for " << op->GetTypeKey();
     return R();
   }

@@ -213,7 +213,7 @@ template
 class StmtFunctor {
  private:
   using TSelf = StmtFunctor;
-  using FType = IRFunctor<R(const NodeRef& n, TSelf* self, Args... args)>;
+  using FType = IRFunctor<R(const ObjectRef& n, TSelf* self, Args... args)>;

  public:
   /*! \brief the result type of this functor */
@@ -255,7 +255,7 @@ class StmtFunctor {
   virtual R VisitStmt_(const Block* op, Args... args) STMT_FUNCTOR_DEFAULT;
   virtual R VisitStmt_(const Evaluate* op, Args... args) STMT_FUNCTOR_DEFAULT;
   virtual R VisitStmtDefault_(const Node* op, Args ...) {
-    LOG(FATAL) << "Do not have a default for " << op->type_key();
+    LOG(FATAL) << "Do not have a default for " << op->GetTypeKey();
     return R();
   }

diff --git a/include/tvm/ir_mutator.h b/include/tvm/ir_mutator.h
index b82a19d4689c..c910a48620c8 100644
--- a/include/tvm/ir_mutator.h
+++ b/include/tvm/ir_mutator.h
@@ -65,9 +65,9 @@ class TVM_DLL IRMutator {
   /*! \brief destructor */
   virtual ~IRMutator() {}
   /*! \brief functor type of expr mutation */
-  using FMutateExpr = IRFunctor<Expr(const NodeRef&, const Expr&, IRMutator*)>;
+  using FMutateExpr = IRFunctor<Expr(const ObjectRef&, const Expr&, IRMutator*)>;
   /*! \brief functor type of stmt mutation */
-  using FMutateStmt = IRFunctor<Stmt(const NodeRef&, const Stmt&, IRMutator*)>;
+  using FMutateStmt = IRFunctor<Stmt(const ObjectRef&, const Stmt&, IRMutator*)>;
   /*! \return internal vtable of expr */
   static FMutateExpr& vtable_expr();  // NOLINT(*)
   /*! \return internal vtable of stmt */
   static FMutateStmt& vtable_stmt();

diff --git a/include/tvm/ir_pass.h b/include/tvm/ir_pass.h
index 5ac71fdce47b..842c6af8cf5d 100644
--- a/include/tvm/ir_pass.h
+++ b/include/tvm/ir_pass.h
@@ -377,6 +377,13 @@ Stmt LowerStorageAccessInfo(Stmt stmt);
  */
 Stmt DecorateDeviceScope(Stmt stmt);

+/*!
+ * \brief Loop invariant code motion which locates and hoists if statements.
+ * \param stmt The stmt to do if statement hoisting.
+ * \return Transformed stmt.
+ */
+Stmt HoistIfThenElse(Stmt stmt);
+
 /*!
  * \brief Make a user callable API LoweredFunc.
  *
@@ -506,6 +513,15 @@ LoweredFunc CombineContextCall(LoweredFunc f);
  */
 LoweredFunc PointerValueTypeRewrite(LoweredFunc f);

+/*!
+ * \brief Lower attached storage access information on device.
+ *  Do this pass after all storage access analysis finishes.
+ *
+ * \param func The device function to be lowered.
+ * \return Transformed function.
+ */
+LoweredFunc LowerDeviceStorageAccessInfo(LoweredFunc func);
+
 /*!
  * \brief Lower intrinsic function calls.
  * \param f The device function to be lowered.
@@ -525,6 +541,14 @@ LoweredFunc LowerIntrin(LoweredFunc f, const std::string& target);
  */
 LoweredFunc LowerCustomDatatypes(LoweredFunc f, const std::string& target);

+/*!
+ * \brief Infer the TensorCore fragment information using tensor intrinsics
+ *
+ * \param f The device function to be lowered.
+ * \return Transformed function.
+ */
+LoweredFunc InferFragment(LoweredFunc f);
+
 /*!
  * \brief Verify if memory accesses are legal for a specific target device type.
  *

diff --git a/include/tvm/ir_visitor.h b/include/tvm/ir_visitor.h
index f20b91368587..bebf94585ed6 100644
--- a/include/tvm/ir_visitor.h
+++ b/include/tvm/ir_visitor.h
@@ -49,7 +49,7 @@ namespace ir {
  *  // The use case is to count number of Variables in the ir tree.
  *  class MyCounter : public IRVisitor {
  *   public:
- *    int Count(const NodeRef& n) {
+ *    int Count(const ObjectRef& n) {
 *      ret_ = 0;
 *      this->Visit(n);
 *      return ret_;
@@ -94,7 +94,7 @@ class TVM_DLL IRVisitor {
   /*! \brief destructor */
   virtual ~IRVisitor() {}
   /*! \brief functor type of visitor */
-  using FVisit = IRFunctor<void(const NodeRef&)>;
+  using FVisit = IRFunctor<void(const ObjectRef&)>;
   /*! \return internal vtable*/
   static FVisit& vtable();
   // overloadable visit function.

diff --git a/include/tvm/lowered_func.h b/include/tvm/lowered_func.h
index 4da93b80c2ab..6709f545cb39 100644
--- a/include/tvm/lowered_func.h
+++ b/include/tvm/lowered_func.h
@@ -44,7 +44,7 @@ class LoweredFuncNode;
 class LoweredFunc : public ir::FunctionRef {
  public:
   LoweredFunc() {}
-  explicit LoweredFunc(NodePtr<Node> n) : FunctionRef(n) {}
+  explicit LoweredFunc(ObjectPtr<Object> n) : FunctionRef(n) {}
   /*!
    * \brief access the internal node container
    * \return the pointer to the internal node container
@@ -119,7 +119,7 @@ class LoweredFuncNode : public ir::FunctionBaseNode {
   int num_outputs() const final {
     return 1;
   }
-  void VisitAttrs(AttrVisitor* v) final {
+  void VisitAttrs(AttrVisitor* v) {
     v->Visit("name", &name);
     v->Visit("args", &args);
     v->Visit("thread_axis", &thread_axis);
@@ -136,17 +136,14 @@ class LoweredFuncNode : public ir::FunctionBaseNode {

 // Implementations of inline functions
 inline const LoweredFuncNode* LoweredFunc::operator->() const {
-  return static_cast<const LoweredFuncNode*>(node_.get());
+  return static_cast<const LoweredFuncNode*>(get());
 }
 }  // namespace tvm

 namespace std {
 template <>
-struct hash<::tvm::LoweredFunc> {
-  std::size_t operator()(const ::tvm::LoweredFunc& k) const {
-    return k.hash();
-  }
+struct hash<::tvm::LoweredFunc> : public tvm::NodeHash {
 };
 }

diff --git a/include/tvm/node/container.h b/include/tvm/node/container.h
index c2c639e374f5..c36c6c141451 100644
--- a/include/tvm/node/container.h
+++ b/include/tvm/node/container.h
@@ -38,67 +38,49 @@ namespace tvm {
 class ArrayNode : public Node {
  public:
   /*! \brief the data content */
-  std::vector<NodePtr<Node> > data;
+  std::vector<ObjectRef> data;

-  void VisitAttrs(AttrVisitor* visitor) final {
-    // Visitor to array have no effect.
+  void VisitAttrs(AttrVisitor* visitor) {
   }

   static constexpr const char* _type_key = "Array";
-  TVM_DECLARE_NODE_TYPE_INFO(ArrayNode, Node);
+  TVM_DECLARE_FINAL_OBJECT_INFO(ArrayNode, Node);
 };
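ArrayNode now stores plain ObjectRef elements, and the MapNode change just below swaps the hand-written Hash/Equal functors for the runtime's ObjectHash/ObjectEqual. Both keep pointer-identity semantics: keys compare by reference, not by structure. A sketch of those semantics (not the runtime's exact code):

#include <cstddef>
#include <functional>
#include <tvm/runtime/object.h>

struct RefHash {
  std::size_t operator()(const tvm::runtime::ObjectRef& r) const {
    // Hash the node's address, exactly like the deleted MapNode::Hash did.
    return std::hash<const void*>()(static_cast<const void*>(r.get()));
  }
};

struct RefEqual {
  bool operator()(const tvm::runtime::ObjectRef& a,
                  const tvm::runtime::ObjectRef& b) const {
    return a.get() == b.get();  // same underlying node, not structural equality
  }
};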
 /*! \brief map node content */
 class MapNode : public Node {
  public:
-  void VisitAttrs(AttrVisitor* visitor) final {
-    // Visitor to map have no effect.
+  void VisitAttrs(AttrVisitor* visitor) {
   }

-  // hash function
-  struct Hash {
-    size_t operator()(const NodePtr<Node>& n) const {
-      return std::hash<Node*>()(n.get());
-    }
-  };
-  // comparator
-  struct Equal {
-    bool operator()(
-        const NodePtr<Node>& a,
-        const NodePtr<Node>& b) const {
-      return a.get() == b.get();
-    }
-  };

   /*! \brief The corresponding container type */
   using ContainerType = std::unordered_map<
-      NodePtr<Node>,
-      NodePtr<Node>,
-      Hash, Equal>;
+      ObjectRef,
+      ObjectRef,
+      ObjectHash, ObjectEqual>;

   /*! \brief the data content */
   ContainerType data;

   static constexpr const char* _type_key = "Map";
-  TVM_DECLARE_NODE_TYPE_INFO(MapNode, Node);
+  TVM_DECLARE_FINAL_OBJECT_INFO(MapNode, Node);
 };

 /*! \brief specialized map node with string as key */
 class StrMapNode : public Node {
  public:
-  void VisitAttrs(AttrVisitor* visitor) final {
-    // Visitor to map have no effect.
-  }

   /*! \brief The corresponding container type */
-  using ContainerType = std::unordered_map<
-      std::string,
-      NodePtr<Node> >;
+  using ContainerType = std::unordered_map<std::string, ObjectRef>;
+
+  void VisitAttrs(AttrVisitor* visitor) {
+  }

   /*! \brief the data content */
   ContainerType data;

   static constexpr const char* _type_key = "StrMap";
-  TVM_DECLARE_NODE_TYPE_INFO(StrMapNode, Node);
+  TVM_DECLARE_FINAL_OBJECT_INFO(StrMapNode, Node);
 };

 /*!
@@ -111,9 +93,9 @@ template
   using difference_type = typename std::iterator_traits<TIter>::difference_type;
-  using value_type = typename std::iterator_traits<TIter>::value_type;
-  using pointer = typename std::iterator_traits<TIter>::pointer;
-  using reference = typename std::iterator_traits<TIter>::reference;
+  using value_type = typename Converter::ResultType;
+  using pointer = typename Converter::ResultType*;
+  using reference = typename Converter::ResultType&;  // NOLINT(*)
   using iterator_category = typename std::iterator_traits<TIter>::iterator_category;

   explicit IterAdapter(TIter iter) : iter_(iter) {}
@@ -138,7 +120,7 @@ class IterAdapter {
   inline bool operator!=(IterAdapter other) const {
     return !(*this == other);
   }
-  inline const typename Converter::ResultType operator*() const {
+  inline const value_type operator*() const {
     return Converter::convert(*iter_);
   }
@@ -162,26 +144,27 @@ class Array : public NodeRef {
    * \brief default constructor
    */
   Array() {
-    node_ = make_node<ArrayNode>();
+    data_ = make_node<ArrayNode>();
   }
   /*!
    * \brief move constructor
    * \param other source
    */
   Array(Array && other) {  // NOLINT(*)
-    node_ = std::move(other.node_);
+    data_ = std::move(other.data_);
   }
   /*!
    * \brief copy constructor
    * \param other source
    */
-  Array(const Array &other) : NodeRef(other.node_) { // NOLINT(*)
+  Array(const Array &other) { // NOLINT(*)
+    data_ = std::move(other.data_);
   }
   /*!
    * \brief constructor from pointer
    * \param n the container pointer
    */
-  explicit Array(NodePtr<Node> n) : NodeRef(n) {}
+  explicit Array(ObjectPtr<Object> n) : NodeRef(n) {}
   /*!
    * \brief constructor from iterator
    * \param begin begin of iterator
@@ -214,9 +197,9 @@ class Array : public NodeRef {
   explicit Array(size_t n, const T& val) {
     auto tmp_node = make_node<ArrayNode>();
     for (size_t i = 0; i < n; ++i) {
-      tmp_node->data.push_back(val.node_);
+      tmp_node->data.push_back(val);
     }
-    node_ = std::move(tmp_node);
+    data_ = std::move(tmp_node);
   }
   /*!
    * \brief move assign operator
@@ -224,7 +207,7 @@
    * \return reference to self.
    */
   Array& operator=(Array && other) {
-    node_ = std::move(other.node_);
+    data_ = std::move(other.data_);
     return *this;
   }
   /*!
@@ -233,7 +216,7 @@
    * \return reference to self.
    */
   Array& operator=(const Array & other) {
-    node_ = other.node_;
+    data_ = other.data_;
     return *this;
   }
   /*!
@@ -246,9 +229,9 @@
   void assign(IterType begin, IterType end) {
     auto n = make_node<ArrayNode>();
     for (IterType it = begin; it != end; ++it) {
-      n->data.push_back((*it).node_);
+      n->data.push_back(T(*it));
     }
-    node_ = std::move(n);
+    data_ = std::move(n);
   }
   /*!
   * \brief Read i-th element from array.
   * \param i The index
   * \return the i-th element.
   */
  inline const T operator[](size_t i) const {
-    return T(static_cast<const ArrayNode*>(node_.get())->data[i]);
+    return DowncastNoCheck<T>(
+        static_cast<const ArrayNode*>(data_.get())->data[i]);
  }
  /*! \return The size of the array */
  inline size_t size() const {
-    if (node_.get() == nullptr) return 0;
-    return static_cast<const ArrayNode*>(node_.get())->data.size();
+    if (data_.get() == nullptr) return 0;
+    return static_cast<const ArrayNode*>(data_.get())->data.size();
  }
  /*!
   * \brief copy on write semantics
@@ -272,12 +256,12 @@
   * \return Handle to the internal node container (which is guaranteed to be unique)
   */
  inline ArrayNode* CopyOnWrite() {
-    if (node_.get() == nullptr || !node_.unique()) {
+    if (data_.get() == nullptr || !data_.unique()) {
      NodePtr<ArrayNode> n = make_node<ArrayNode>();
-      n->data = static_cast<ArrayNode*>(node_.get())->data;
-      NodePtr<Node>(std::move(n)).swap(node_);
+      n->data = static_cast<ArrayNode*>(data_.get())->data;
+      ObjectPtr<Object>(std::move(n)).swap(data_);
    }
-    return static_cast<ArrayNode*>(node_.get());
+    return static_cast<ArrayNode*>(data_.get());
  }
  /*!
   * \brief push a new item to the back of the list
@@ -285,7 +269,7 @@
   */
  inline void push_back(const T& item) {
    ArrayNode* n = this->CopyOnWrite();
-    n->data.push_back(item.node_);
+    n->data.push_back(item);
  }
  /*!
   * \brief set i-th element of the array.
@@ -294,7 +278,7 @@
   */
  inline void Set(size_t i, const T& value) {
    ArrayNode* n = this->CopyOnWrite();
-    n->data[i] = value.node_;
+    n->data[i] = value;
  }
  /*! \return whether array is empty */
  inline bool empty() const {
@@ -303,34 +287,34 @@
    return size() == 0;
  }
  /*! \brief specify container node */
  using ContainerType = ArrayNode;

-  struct Ptr2NodeRef {
+  struct ValueConverter {
    using ResultType = T;
-    static inline T convert(const NodePtr<Node>& n) {
-      return T(n);
+    static inline T convert(const ObjectRef& n) {
+      return DowncastNoCheck<T>(n);
    }
  };

-  using iterator = IterAdapter<Ptr2NodeRef,
-      std::vector<NodePtr<Node> >::const_iterator>;
+  using iterator = IterAdapter<ValueConverter,
+      std::vector<ObjectRef>::const_iterator>;

  using reverse_iterator = IterAdapter<
-      Ptr2NodeRef,
-      std::vector<NodePtr<Node> >::const_reverse_iterator>;
+      ValueConverter,
+      std::vector<ObjectRef>::const_reverse_iterator>;

  /*! \return begin iterator */
  inline iterator begin() const {
-    return iterator(static_cast<const ArrayNode*>(node_.get())->data.begin());
+    return iterator(static_cast<const ArrayNode*>(data_.get())->data.begin());
  }
  /*! \return end iterator */
  inline iterator end() const {
-    return iterator(static_cast<const ArrayNode*>(node_.get())->data.end());
+    return iterator(static_cast<const ArrayNode*>(data_.get())->data.end());
  }
  /*! \return rbegin iterator */
  inline reverse_iterator rbegin() const {
-    return reverse_iterator(static_cast<const ArrayNode*>(node_.get())->data.rbegin());
+    return reverse_iterator(static_cast<const ArrayNode*>(data_.get())->data.rbegin());
  }
  /*! \return rend iterator */
  inline reverse_iterator rend() const {
-    return reverse_iterator(static_cast<const ArrayNode*>(node_.get())->data.rend());
+    return reverse_iterator(static_cast<const ArrayNode*>(data_.get())->data.rend());
  }
 };
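Array's CopyOnWrite gives the usual persistent-value behavior: copies share the underlying ArrayNode until one of them mutates. A quick illustration (hypothetical helper; x, y, z are assumed to be tvm::Expr values in scope):

void CopyOnWriteDemo(tvm::Expr x, tvm::Expr y, tvm::Expr z) {
  tvm::Array<tvm::Expr> a = {x, y};
  tvm::Array<tvm::Expr> b = a;  // b shares a's ArrayNode; no copy yet
  a.push_back(z);               // a is no longer unique, so CopyOnWrite() clones first
  // afterwards: a.size() == 3, b.size() == 2 -- b never observes the mutation
}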
@@ -355,26 +339,26 @@ class Map : public NodeRef {
   /*!
    * \brief default constructor
    */
   Map() {
-    node_ = make_node<MapNode>();
+    data_ = make_node<MapNode>();
   }
   /*!
    * \brief move constructor
    * \param other source
    */
   Map(Map && other) {  // NOLINT(*)
-    node_ = std::move(other.node_);
+    data_ = std::move(other.data_);
   }
   /*!
    * \brief copy constructor
    * \param other source
    */
-  Map(const Map &other) : NodeRef(other.node_) { // NOLINT(*)
+  Map(const Map &other) : NodeRef(other.data_) { // NOLINT(*)
   }
   /*!
    * \brief constructor from pointer
    * \param n the container pointer
    */
-  explicit Map(NodePtr<Node> n) : NodeRef(n) {}
+  explicit Map(ObjectPtr<Object> n) : NodeRef(n) {}
   /*!
   * \brief constructor from iterator
   * \param begin begin of iterator
@@ -406,7 +390,7 @@ class Map : public NodeRef {
   * \return reference to self.
   */
  Map& operator=(Map && other) {
-    node_ = std::move(other.node_);
+    data_ = std::move(other.data_);
    return *this;
  }
  /*!
@@ -415,7 +399,7 @@
   * \return reference to self.
   */
  Map& operator=(const Map & other) {
-    node_ = other.node_;
+    data_ = other.data_;
    return *this;
  }
  /*!
@@ -428,10 +412,9 @@
  void assign(IterType begin, IterType end) {
    NodePtr<MapNode> n = make_node<MapNode>();
    for (IterType i = begin; i != end; ++i) {
-      n->data.emplace(std::make_pair(i->first.node_,
-                                     i->second.node_));
+      n->data.emplace(std::make_pair(i->first, i->second));
    }
-    node_ = std::move(n);
+    data_ = std::move(n);
  }
  /*!
   * \brief Read element from map.
   * \param key The key
   * \return the corresponding element.
   */
  inline const V operator[](const K& key) const {
-    return V(static_cast<const MapNode*>(node_.get())->data.at(key.node_));
+    return DowncastNoCheck<V>(
+        static_cast<const MapNode*>(data_.get())->data.at(key));
  }
  /*!
   * \brief Read element from map.
@@ -447,17 +431,18 @@
   * \param key The key
   * \return the corresponding element.
   */
  inline const V at(const K& key) const {
-    return V(static_cast<const MapNode*>(node_.get())->data.at(key.node_));
+    return DowncastNoCheck<V>(
+        static_cast<const MapNode*>(data_.get())->data.at(key));
  }
  /*! \return The size of the array */
  inline size_t size() const {
-    if (node_.get() == nullptr) return 0;
-    return static_cast<const MapNode*>(node_.get())->data.size();
+    if (data_.get() == nullptr) return 0;
+    return static_cast<const MapNode*>(data_.get())->data.size();
  }
  /*! \return The number of elements of the key */
  inline size_t count(const K& key) const {
-    if (node_.get() == nullptr) return 0;
-    return static_cast<const MapNode*>(node_.get())->data.count(key.node_);
+    if (data_.get() == nullptr) return 0;
+    return static_cast<const MapNode*>(data_.get())->data.count(key);
  }
  /*!
   * \brief copy on write semantics
@@ -468,12 +453,12 @@
   * \return Handle to the internal node container (which is guaranteed to be unique)
   */
  inline MapNode* CopyOnWrite() {
-    if (node_.get() == nullptr || !node_.unique()) {
+    if (data_.get() == nullptr || !data_.unique()) {
      NodePtr<MapNode> n = make_node<MapNode>();
-      n->data = static_cast<const MapNode*>(node_.get())->data;
-      NodePtr<Node>(std::move(n)).swap(node_);
+      n->data = static_cast<const MapNode*>(data_.get())->data;
+      ObjectPtr<Object>(std::move(n)).swap(data_);
    }
-    return static_cast<MapNode*>(node_.get());
+    return static_cast<MapNode*>(data_.get());
  }
  /*!
   * \brief set the Map.
   * \param key The index key.
   * \param value The value to be set.
   */
  inline void Set(const K& key, const V& value) {
    MapNode* n = this->CopyOnWrite();
-    n->data[key.node_] = value.node_;
+    n->data[key] = value;
  }

  /*! \return whether array is empty */
@@ -492,29 +477,31 @@
    return size() == 0;
  }
  /*! \brief specify container node */
  using ContainerType = MapNode;

-  struct Ptr2NodeRef {
+  struct ValueConverter {
    using ResultType = std::pair<K, V>;
    static inline ResultType convert(const std::pair<
-        NodePtr<Node>,
-        NodePtr<Node> >& n) {
-      return std::make_pair(K(n.first), V(n.second));
+        ObjectRef,
+        ObjectRef>& n) {
+      return std::make_pair(DowncastNoCheck<K>(n.first),
+                            DowncastNoCheck<V>(n.second));
    }
  };

  using iterator = IterAdapter<
-      Ptr2NodeRef, MapNode::ContainerType::const_iterator>;
+      ValueConverter, MapNode::ContainerType::const_iterator>;

  /*! \return begin iterator */
  inline iterator begin() const {
-    return iterator(static_cast<const MapNode*>(node_.get())->data.begin());
+    return iterator(static_cast<const MapNode*>(data_.get())->data.begin());
  }
  /*!
\return end iterator */ inline iterator end() const { - return iterator(static_cast(node_.get())->data.end()); + return iterator(static_cast(data_.get())->data.end()); } /*! \return begin iterator */ inline iterator find(const K& key) const { - return iterator(static_cast(node_.get())->data.find(key.node_)); + return iterator( + static_cast(data_.get())->data.find(key)); } }; @@ -524,14 +511,14 @@ class Map : public NodeRef { public: // for code reuse Map() { - node_ = make_node(); + data_ = make_node(); } Map(Map && other) { // NOLINT(*) - node_ = std::move(other.node_); + data_ = std::move(other.data_); } - Map(const Map &other) : NodeRef(other.node_) { // NOLINT(*) + Map(const Map &other) : NodeRef(other.data_) { // NOLINT(*) } - explicit Map(NodePtr n) : NodeRef(n) {} + explicit Map(ObjectPtr n) : NodeRef(n) {} template Map(IterType begin, IterType end) { assign(begin, end); @@ -545,76 +532,77 @@ class Map : public NodeRef { assign(init.begin(), init.end()); } Map& operator=(Map && other) { - node_ = std::move(other.node_); + data_ = std::move(other.data_); return *this; } Map& operator=(const Map & other) { - node_ = other.node_; + data_ = other.data_; return *this; } template void assign(IterType begin, IterType end) { auto n = make_node(); for (IterType i = begin; i != end; ++i) { - n->data.emplace(std::make_pair(i->first, - i->second.node_)); + n->data.emplace(std::make_pair(i->first, i->second)); } - node_ = std::move(n); + data_ = std::move(n); } inline const V operator[](const std::string& key) const { - return V(static_cast(node_.get())->data.at(key)); + return DowncastNoCheck( + static_cast(data_.get())->data.at(key)); } inline const V at(const std::string& key) const { - return V(static_cast(node_.get())->data.at(key)); + return DowncastNoCheck( + static_cast(data_.get())->data.at(key)); } inline size_t size() const { - if (node_.get() == nullptr) return 0; - return static_cast(node_.get())->data.size(); + if (data_.get() == nullptr) return 0; + return static_cast(data_.get())->data.size(); } inline size_t count(const std::string& key) const { - if (node_.get() == nullptr) return 0; - return static_cast(node_.get())->data.count(key); + if (data_.get() == nullptr) return 0; + return static_cast(data_.get())->data.count(key); } inline StrMapNode* CopyOnWrite() { - if (node_.get() == nullptr || !node_.unique()) { + if (data_.get() == nullptr || !data_.unique()) { NodePtr n = make_node(); - n->data = static_cast(node_.get())->data; - NodePtr(std::move(n)).swap(node_); + n->data = static_cast(data_.get())->data; + ObjectPtr(std::move(n)).swap(data_); } - return static_cast(node_.get()); + return static_cast(data_.get()); } inline void Set(const std::string& key, const V& value) { StrMapNode* n = this->CopyOnWrite(); - n->data[key] = value.node_; + n->data[key] = value; } inline bool empty() const { return size() == 0; } using ContainerType = StrMapNode; - struct Ptr2NodeRef { + struct ValueConverter { using ResultType = std::pair; static inline ResultType convert(const std::pair< - std::string, - NodePtr >& n) { - return std::make_pair(n.first, V(n.second)); + std::string, + ObjectRef>& n) { + return std::make_pair(n.first, DowncastNoCheck(n.second)); } }; using iterator = IterAdapter< - Ptr2NodeRef, StrMapNode::ContainerType::const_iterator>; + ValueConverter, StrMapNode::ContainerType::const_iterator>; /*! 
 \return begin iterator */
  inline iterator begin() const {
-    return iterator(static_cast<const StrMapNode*>(node_.get())->data.begin());
+    return iterator(static_cast<const StrMapNode*>(data_.get())->data.begin());
  }
  /*! \return end iterator */
  inline iterator end() const {
-    return iterator(static_cast<const StrMapNode*>(node_.get())->data.end());
+    return iterator(static_cast<const StrMapNode*>(data_.get())->data.end());
  }
  /*! \return begin iterator */
  inline iterator find(const std::string& key) const {
-    return iterator(static_cast<const StrMapNode*>(node_.get())->data.find(key));
+    return iterator(static_cast<const StrMapNode*>(data_.get())->data.find(key));
  }
 };

diff --git a/include/tvm/node/ir_functor.h b/include/tvm/node/ir_functor.h
index 23c5a3fafdab..e902e8fb6d44 100644
--- a/include/tvm/node/ir_functor.h
+++ b/include/tvm/node/ir_functor.h
@@ -34,10 +34,10 @@
 namespace tvm {
 /*!
- * \brief A dynamically dispatched functor on NodeRef in the first argument.
+ * \brief A dynamically dispatched functor on ObjectRef in the first argument.
  *
  * \code
- *   IRFunctor<std::string(const NodeRef& n, std::string prefix)> tostr;
+ *   IRFunctor<std::string(const ObjectRef& n, std::string prefix)> tostr;
  *   tostr.set_dispatch<Add>([](const Add* op, std::string prefix) {
  *     return prefix + "Add";
  *   });
@@ -60,10 +60,10 @@
 template
 class IRFunctor;

 template
-class IRFunctor<R(const NodeRef& n, Args... args)> {
+class IRFunctor<R(const ObjectRef& n, Args... args)> {
  private:
-  using Function = std::function<R(const NodeRef& n, Args... args)>;
-  using TSelf = IRFunctor<R(const NodeRef& n, Args... args)>;
+  using Function = std::function<R(const ObjectRef& n, Args... args)>;
+  using TSelf = IRFunctor<R(const ObjectRef& n, Args... args)>;
   /*! \brief internal function table */
   std::vector<Function> func_;
@@ -75,8 +75,8 @@ class IRFunctor {
    * \param n The node to be dispatched
    * \return Whether dispatching function is registered for n's type.
    */
-  inline bool can_dispatch(const NodeRef& n) const {
-    uint32_t type_index = n.type_index();
+  inline bool can_dispatch(const ObjectRef& n) const {
+    uint32_t type_index = n->type_index();
     return type_index < func_.size() && func_[type_index] != nullptr;
   }
   /*!
@@ -85,12 +85,12 @@ class IRFunctor {
    * \param args The additional arguments
    * \return The result.
    */
-  inline R operator()(const NodeRef& n, Args... args) const {
-    uint32_t type_index = n.type_index();
+  inline R operator()(const ObjectRef& n, Args... args) const {
+    uint32_t type_index = n->type_index();
     CHECK(type_index < func_.size() &&
           func_[type_index] != nullptr)
         << "IRFunctor calls un-registered function on type "
-        << Node::TypeIndex2Key(type_index);
+        << n->GetTypeKey();
     return func_[type_index](n, std::forward<Args>(args)...);
   }
   /*!
@@ -101,19 +101,19 @@
    */
   template
   inline TSelf& set_dispatch(Function f) {  // NOLINT(*)
-    uint32_t tindex = Node::TypeKey2Index(TNode::_type_key);
+    uint32_t tindex = TNode::RuntimeTypeIndex();
     if (func_.size() <= tindex) {
       func_.resize(tindex + 1, nullptr);
     }
     CHECK(func_[tindex] == nullptr)
-        << "Dispatch for " << Node::TypeIndex2Key(tindex)
+        << "Dispatch for " << TNode::_type_key
         << " is already set";
     func_[tindex] = f;
     return *this;
   }
   /*!
    * \brief set the dispatcher for type TNode
-   * This allows f to used detailed const Node pointer to replace NodeRef
+   * This allows f to use a detailed const Node pointer to replace ObjectRef
    *
    * \param f The function to be set.
    * \tparam TNode the type of Node to be dispatched.
@@ -121,8 +121,8 @@
    */
   template
   inline TSelf& set_dispatch(std::function f) {  // NOLINT(*)
-    Function fun = [f](const NodeRef& n, Args... args) {
-      return f(static_cast<const TNode*>(n.node_.get()),
+    Function fun = [f](const ObjectRef& n, Args...
args) { + return f(static_cast(n.get()), std::forward(args)...); }; return this->set_dispatch(fun); @@ -135,7 +135,7 @@ class IRFunctor { */ template inline TSelf& clear_dispatch() { // NOLINT(*) - uint32_t tindex = Node::TypeKey2Index(TNode::_type_key); + uint32_t tindex = TNode::RuntimeTypeIndex(); CHECK_LT(tindex, func_.size()) << "clear_dispatch: index out of range"; func_[tindex] = nullptr; return *this; @@ -172,7 +172,7 @@ class IRFunctor { * f(e, this); * } * - * using FType = IRFunctor; + * using FType = IRFunctor; * // function to return global function table * static FType& vtable(); * }; @@ -232,15 +232,15 @@ template class IRFunctorStaticRegistry; template -class IRFunctorStaticRegistry { +class IRFunctorStaticRegistry { private: - IRFunctor *irf_; + IRFunctor *irf_; std::shared_ptr free_list; - using TSelf = IRFunctorStaticRegistry; + using TSelf = IRFunctorStaticRegistry; public: - IRFunctorStaticRegistry(IRFunctor *irf) { + IRFunctorStaticRegistry(IRFunctor *irf) { irf_ = irf; free_list = std::make_shared(); } @@ -261,12 +261,12 @@ class IRFunctorStaticRegistry { * the compiler to deduce the template types. */ template -IRFunctorStaticRegistry MakeIRFunctorStaticRegistry( - IRFunctor *irf) { - return IRFunctorStaticRegistry(irf); +IRFunctorStaticRegistry MakeIRFunctorStaticRegistry( + IRFunctor *irf) { + return IRFunctorStaticRegistry(irf); } -#define TVM_AUTO_REGISTER_VAR_DEF(ClsName) \ +#define TVM_AUTO_REGISTER_VAR_DEF(ClsName) \ static TVM_ATTRIBUTE_UNUSED auto __make_functor ## _ ## ClsName /*! diff --git a/include/tvm/node/memory.h b/include/tvm/node/memory.h deleted file mode 100644 index 1bba57144e19..000000000000 --- a/include/tvm/node/memory.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -/*! - * \file tvm/node/memory.h - * \brief Node memory management. - */ -#ifndef TVM_NODE_MEMORY_H_ -#define TVM_NODE_MEMORY_H_ - -#include -#include "node.h" - -namespace tvm { -/*! - * \brief Allocate a node object. - * \param args arguments to the constructor. - * \tparam T the node type. - * \return The NodePtr to the allocated object. - */ -template -inline NodePtr make_node(Args&&... args); - -// Detail implementations after this -// -// The current design allows swapping the -// allocator pattern when necessary. -// -// Possible future allocator optimizations: -// - Arena allocator that gives ownership of memory to arena (deleter_= nullptr) -// - Thread-local object pools: one pool per size and alignment requirement. -// - Can specialize by type of object to give the specific allocator to each object. -// -template -class SimpleNodeAllocator { - public: - template - static T* New(Args&&... 
args) {
-    return new T(std::forward<Args>(args)...);
-  }
-  static NodeBase::FDeleter Deleter() {
-    return Deleter_;
-  }
-
- private:
-  static void Deleter_(NodeBase* ptr) {
-    delete static_cast<T*>(ptr);
-  }
-};
-
-template
-inline NodePtr make_node(Args&&... args) {
-  using Allocator = SimpleNodeAllocator;
-  static_assert(std::is_base_of<NodeBase, T>::value,
-                "make_node can only be used to create NodeBase");
-  T* node = Allocator::New(std::forward<Args>(args)...);
-  node->deleter_ = Allocator::Deleter();
-  return NodePtr<T>(node);
-}
-
-}  // namespace tvm
-#endif  // TVM_NODE_MEMORY_H_

diff --git a/include/tvm/node/node.h b/include/tvm/node/node.h
index cb18e46e9a5c..4014c3700596 100644
--- a/include/tvm/node/node.h
+++ b/include/tvm/node/node.h
@@ -18,344 +18,143 @@
 */
 /*!
 * \file tvm/node/node.h
- * \brief Node system data structure.
+ * \brief Definitions and helper macros for IR/AST nodes.
+ *
+ *  The node folder contains base utilities for IR/AST nodes,
+ *  independent of the specific language dialect.
+ *
+ *  We implement AST/IR nodes as sub-classes of runtime::Object.
+ *  The base class Node is just an alias of runtime::Object.
+ *
+ *  Besides the runtime type checking provided by Object,
+ *  the node folder contains additional functionality, such as
+ *  reflection and serialization, which are important features
+ *  for building a compiler infrastructure.
 */
 #ifndef TVM_NODE_NODE_H_
 #define TVM_NODE_NODE_H_

-#include
 #include
-#include
+#include
+#include
+#include
+
 #include
 #include
 #include
 #include
-
 namespace tvm {

-// forward declaration
-class DataType;
-class Node;
-class NodeRef;

-namespace runtime {
-// forward declaration
-class NDArray;
-// forward declaration
-class ObjectRef;
-}  // namespace runtime
+using runtime::TypeIndex;
+using runtime::Object;
+using runtime::ObjectPtr;
+using runtime::ObjectRef;
+using runtime::GetRef;
+using runtime::Downcast;
+using runtime::ObjectHash;
+using runtime::ObjectEqual;
+using runtime::make_object;

-/*!
- * \brief Visitor class to each node content.
- *  The content is going to be called for each field.
- */
-class TVM_DLL AttrVisitor {
- public:
-//! \cond Doxygen_Suppress
-  virtual ~AttrVisitor() = default;
-  virtual void Visit(const char* key, double* value) = 0;
-  virtual void Visit(const char* key, int64_t* value) = 0;
-  virtual void Visit(const char* key, uint64_t* value) = 0;
-  virtual void Visit(const char* key, int* value) = 0;
-  virtual void Visit(const char* key, bool* value) = 0;
-  virtual void Visit(const char* key, std::string* value) = 0;
-  virtual void Visit(const char* key, void** value) = 0;
-  virtual void Visit(const char* key, DataType* value) = 0;
-  virtual void Visit(const char* key, NodeRef* value) = 0;
-  virtual void Visit(const char* key, runtime::NDArray* value) = 0;
-  virtual void Visit(const char* key, runtime::ObjectRef* value) = 0;
-  template<typename ENum,
-           typename = typename std::enable_if<std::is_enum<ENum>::value>::type>
-  void Visit(const char* key, ENum* ptr) {
-    static_assert(std::is_same<int, typename std::underlying_type<ENum>::type>::value,
-                  "declare enum to be enum int to use visitor");
-    this->Visit(key, reinterpret_cast<int*>(ptr));
-  }
-//! \endcond
-};
+using NodeHash = ObjectHash;
+using NodeEqual = ObjectEqual;
+using Node = Object;

 /*!
- * \brief base class of node container in DSL AST.
+ * \brief Base class of all references to AST/IR nodes.
 */
-class TVM_DLL Node : public NodeBase {
+class NodeRef : public ObjectRef {
  public:
-  /*! \brief virtual destructor */
-  virtual ~Node() {}
-  /*! \return The unique type key of the node */
-  virtual const char* type_key() const = 0;
-  /*!
- * \brief Apply visitor to each field of the Node - * Visitor could mutate the content of the node. - * override if Node contains attribute fields. - * \param visitor The visitor - */ - virtual void VisitAttrs(AttrVisitor* visitor) {} - /*! \return the type index of the node */ - virtual uint32_t type_index() const = 0; - /*! - * \brief Whether this node derives from node with type_index=tid. - * Implemented by TVM_DECLARE_NODE_TYPE_INFO - * - * \param tid The type index. - * \return the check result. - */ - virtual bool _DerivedFrom(uint32_t tid) const; - /*! - * \brief get a runtime unique type index given a type key - * \param type_key Type key of a type. - * \return the corresponding type index. - */ - static uint32_t TypeKey2Index(const char* type_key); - /*! - * \brief get type key from type index. - * \param index The type index - * \return the corresponding type key. - */ - static const char* TypeIndex2Key(uint32_t index); - /*! - * \return whether the type is derived from - */ - template - inline bool derived_from() const; - /*! - * \return whether the node is of type T - * \tparam The type to be checked. - */ - template - inline bool is_type() const; - /*! - * \brief Get a NodePtr that holds reference to this Node. - * \return the NodePtr - */ - inline NodePtr GetNodePtr() const; - // node ref can see this - friend class NodeRef; - static constexpr const char* _type_key = "Node"; -}; - -/*! \brief Base class of all node reference object */ -class NodeRef { - public: - /*! \brief type indicate the container type */ - using ContainerType = Node; - /*! - * \brief Comparator - * \param other Another node ref. - * \return the compare result. - */ - inline bool operator==(const NodeRef& other) const; - /*! - * \brief Comparator - * \param other Another node ref. - * \return the compare result. - */ - inline bool same_as(const NodeRef& other) const; - /*! - * \brief Comparator - * \param other Another node ref. - * \return the compare result. - */ - inline bool operator<(const NodeRef& other) const; - /*! - * \brief Comparator - * \param other Another node ref. - * \return the compare result. - */ - inline bool operator!=(const NodeRef& other) const; - /*! \return the hash function for NodeRef */ - inline size_t hash() const; - /*! \return whether the expression is null */ - inline bool defined() const; - /*! \return the internal type index of IRNode */ - inline uint32_t type_index() const; - /*! \return the internal node pointer */ - inline const Node* get() const; - /*! \return the internal node pointer */ - inline const Node* operator->() const; - /*! - * \brief Downcast this ir node to its actual type (e.g. Add, or - * Select). This returns nullptr if the node is not of the requested - * type. Example usage: - * - * if (const Add *add = node->as()) { - * // This is an add node - * } - * \tparam T the target type, must be subtype of IRNode - */ - template - inline const T *as() const; - /*! - * \brief A more powerful version of as that also works with - * intermediate base types. - * \tparam T the target type, must be subtype of IRNode - */ - template - inline const T *as_derived() const; - /*! \brief default constructor */ - NodeRef() = default; - explicit NodeRef(NodePtr node) : node_(node) {} - /*! \brief the internal node object, do not touch */ - NodePtr node_; + NodeRef() {} + explicit NodeRef(ObjectPtr n) : ObjectRef(n) {} }; /*! 
- * \brief Get a reference type from a Node ptr type - * - * It is always important to get a reference type - * if we want to return a value as reference or keep - * the node alive beyond the scope of the function. - * - * \param ptr The node pointer - * \tparam RefType The reference type - * \tparam NodeType The node type - * \return The corresponding RefType + * \brief Allocate a node object. + * \param args arguments to the constructor. + * \tparam T the node type. + * \return The NodePtr to the allocated object. + * \note This function is an alias of make_object. */ -template -inline RefType GetRef(const NodeType* ptr); - -/*! - * \brief Downcast a base reference type to a more specific type. - * - * \param ref The inptut reference - * \return The corresponding SubRef. - * \tparam SubRef The target specific reference type. - * \tparam BaseRef the current reference type. - */ -template -inline SubRef Downcast(BaseRef ref); +template +inline NodePtr make_node(Args&&... args) { + return runtime::make_object(std::forward(args)...); +} /*! * \brief helper macro to declare type information in a base node. */ -#define TVM_DECLARE_BASE_NODE_INFO(TypeName, Parent) \ - bool _DerivedFrom(uint32_t tid) const override { \ - static uint32_t tidx = TypeKey2Index(TypeName::_type_key); \ - if (tidx == tid) return true; \ - return Parent::_DerivedFrom(tid); \ - } +#define TVM_DECLARE_BASE_NODE_INFO(TypeName, Parent) \ + TVM_DECLARE_BASE_OBJECT_INFO(TypeName, Parent) /*! * \brief helper macro to declare type information in a terminal node */ -#define TVM_DECLARE_NODE_TYPE_INFO(TypeName, Parent) \ - const char* type_key() const final { \ - return TypeName::_type_key; \ - } \ - uint32_t type_index() const final { \ - static uint32_t tidx = TypeKey2Index(TypeName::_type_key); \ - return tidx; \ - } \ - bool _DerivedFrom(uint32_t tid) const final { \ - static uint32_t tidx = TypeKey2Index(TypeName::_type_key); \ - if (tidx == tid) return true; \ - return Parent::_DerivedFrom(tid); \ - } - -// implementations of inline functions after this -template -inline bool Node::derived_from() const { - // use static field so query only happens once. - static uint32_t type_id = Node::TypeKey2Index(T::_type_key); - return this->_DerivedFrom(type_id); -} - - -template -inline bool Node::is_type() const { - // use static field so query only happens once. 
- static uint32_t type_id = Node::TypeKey2Index(T::_type_key); - return type_id == this->type_index(); -} - - -inline NodePtr Node::GetNodePtr() const { - return NodePtr(const_cast(this)); -} - -template -inline RefType GetRef(const NodeType* ptr) { - static_assert(std::is_base_of::value, - "Can only cast to the ref of same container type"); - return RefType(ptr->GetNodePtr()); -} - -template -inline SubRef Downcast(BaseRef ref) { - CHECK(ref->template is_type() || - ref->template derived_from()) - << "Downcast from " << ref->type_key() << " to " - << SubRef::ContainerType::_type_key << " failed."; - return SubRef(std::move(ref.node_)); -} +#define TVM_DECLARE_NODE_TYPE_INFO(TypeName, Parent) \ + TVM_DECLARE_FINAL_OBJECT_INFO(TypeName, Parent); -inline const Node* NodeRef::get() const { - return node_.get(); -} - -inline const Node* NodeRef::operator->() const { - return node_.get(); -} - -inline bool NodeRef::defined() const { - return node_.get() != nullptr; -} - -inline bool NodeRef::operator==(const NodeRef& other) const { - return node_.get() == other.node_.get(); -} - -inline bool NodeRef::same_as(const NodeRef& other) const { - return node_.get() == other.node_.get(); -} - -inline bool NodeRef::operator<(const NodeRef& other) const { - return node_.get() < other.node_.get(); -} - -inline bool NodeRef::operator!=(const NodeRef& other) const { - return node_.get() != other.node_.get(); -} - -inline size_t NodeRef::hash() const { - return std::hash()(node_.get()); -} - -inline uint32_t NodeRef::type_index() const { - CHECK(node_.get() != nullptr) - << "null type"; - return get()->type_index(); -} -template -inline const T* NodeRef::as() const { - const Node* ptr = static_cast(get()); - if (ptr && ptr->is_type()) { - return static_cast(ptr); - } - return nullptr; -} - -template -inline const T* NodeRef::as_derived() const { - const Node* ptr = static_cast(get()); - if (ptr && (ptr->is_type() || ptr->derived_from())) { - return static_cast(ptr); - } - return nullptr; -} +/*! + * \brief Macro to define common node ref methods. + * \param TypeName The name of the NodeRef. + * \param BaseTypeName The Base type. + * \param NodeName The node container type. + */ +#define TVM_DEFINE_NODE_REF_METHODS(TypeName, BaseTypeName, NodeName) \ + TypeName() {} \ + explicit TypeName(::tvm::ObjectPtr<::tvm::Object> n) \ + : BaseTypeName(n) {} \ + const NodeName* operator->() const { \ + return static_cast(data_.get()); \ + } \ + operator bool() const { return this->defined(); } \ + using ContainerType = NodeName; -/*! \brief The hash function for nodes */ -struct NodeHash { - size_t operator()(const NodeRef& a) const { - return a.hash(); - } -}; +/*! + * \brief Macro to define CopyOnWrite function in a NodeRef. + * \param NodeName The Type of the Node. + * + * CopyOnWrite will generate a unique copy of the internal node. + * The node will be copied if it is referenced by multiple places. + * The function returns the raw pointer to the node to allow modification + * of the content. + * + * \code + * + * MyCOWNodeRef ref, ref2; + * ref2 = ref; + * ref.CopyOnWrite()->value = new_value; + * assert(ref2->value == old_value); + * assert(ref->value == new_value); + * + * \endcode + */ +#define TVM_DEFINE_NODE_REF_COW(NodeName) \ + NodeName* CopyOnWrite() { \ + CHECK(data_ != nullptr); \ + if (!data_.unique()) { \ + NodePtr n = make_node(*(operator->())); \ + ObjectPtr(std::move(n)).swap(data_); \ + } \ + return static_cast(data_.get()); \ + } + +/*! 
 \brief Macro to make it easy to define node ref type given node */
+#define TVM_DEFINE_NODE_REF(TypeName, NodeName)                       \
+  class TypeName : public ::tvm::NodeRef {                            \
+   public:                                                            \
+    TVM_DEFINE_NODE_REF_METHODS(TypeName, ::tvm::NodeRef, NodeName);  \
+  };                                                                  \

-/*! \brief The hash function for nodes */
-struct NodeHash {
-  size_t operator()(const NodeRef& a) const {
-    return a.hash();
-  }
-};
+/*!
+ * \brief Macro to make it easy to define node ref type that
+ *  has a CopyOnWrite member function.
+ */
+#define TVM_DEFINE_COW_NODE_REF(TypeName, BaseType, NodeName)   \
+  class TypeName : public BaseType {                            \
+   public:                                                      \
+    TVM_DEFINE_NODE_REF_METHODS(TypeName, BaseType, NodeName);  \
+    TVM_DEFINE_NODE_REF_COW(NodeName);                          \
+  };

-/*! \brief The equal comparator for nodes */
-struct NodeEqual {
-  bool operator()(const NodeRef& a, const NodeRef& b) const {
-    return a.get() == b.get();
-  }
-};
 }  // namespace tvm
 #endif  // TVM_NODE_NODE_H_

diff --git a/include/tvm/node/reflection.h b/include/tvm/node/reflection.h
new file mode 100644
index 000000000000..e6caa443ab9c
--- /dev/null
+++ b/include/tvm/node/reflection.h
@@ -0,0 +1,241 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*!
+ * \file tvm/node/reflection.h
+ * \brief Reflection and serialization of compiler IR/AST nodes.
+ */
+#ifndef TVM_NODE_REFLECTION_H_
+#define TVM_NODE_REFLECTION_H_
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+namespace tvm {
+
+// forward declaration
+class DataType;
+
+using runtime::Object;
+using runtime::ObjectPtr;
+using runtime::ObjectRef;
+
+/*!
+ * \brief Visitor class to get the attributes of an AST/IR node.
+ *  The content is going to be called for each field.
+ *
+ *  Each object that wants reflection needs to implement
+ *  a VisitAttrs function and call visitor->Visit on each of its fields.
+ */
+class TVM_DLL AttrVisitor {
+ public:
+//! \cond Doxygen_Suppress
+  virtual ~AttrVisitor() = default;
+  virtual void Visit(const char* key, double* value) = 0;
+  virtual void Visit(const char* key, int64_t* value) = 0;
+  virtual void Visit(const char* key, uint64_t* value) = 0;
+  virtual void Visit(const char* key, int* value) = 0;
+  virtual void Visit(const char* key, bool* value) = 0;
+  virtual void Visit(const char* key, std::string* value) = 0;
+  virtual void Visit(const char* key, void** value) = 0;
+  virtual void Visit(const char* key, DataType* value) = 0;
+  virtual void Visit(const char* key, runtime::NDArray* value) = 0;
+  virtual void Visit(const char* key, runtime::ObjectRef* value) = 0;
+  template<typename ENum,
+           typename = typename std::enable_if<std::is_enum<ENum>::value>::type>
+  void Visit(const char* key, ENum* ptr) {
+    static_assert(std::is_same<int, typename std::underlying_type<ENum>::type>::value,
+                  "declare enum to be enum int to use visitor");
+    this->Visit(key, reinterpret_cast<int*>(ptr));
+  }
+//! \endcond
+};
+
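A node participates in reflection by exposing its fields to this visitor. What a conforming node looks like under the scheme this header sets up (hypothetical node, for illustration only):

#include <string>
#include <cstdint>
#include <tvm/node/node.h>
#include <tvm/node/reflection.h>

class MyNode : public tvm::Object {
 public:
  std::string name;
  int64_t value{0};

  // Called through the reflection vtable; note it is neither virtual nor final.
  void VisitAttrs(tvm::AttrVisitor* v) {
    v->Visit("name", &name);
    v->Visit("value", &value);
  }

  static constexpr const char* _type_key = "test.MyNode";
  TVM_DECLARE_FINAL_OBJECT_INFO(MyNode, tvm::Object);
};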
+/*!
+ * \brief Virtual function table to support IR/AST node reflection.
+ *
+ * Functions are stored in a columnar manner.
+ * Each column is a vector indexed by Object's type_index.
+ */
+class ReflectionVTable {
+ public:
+  /*!
+   * \brief Visitor function.
+   * \note We use function pointer, instead of std::function
+   *       to reduce the dispatch overhead as field visit
+   *       does not need as much customization.
+   */
+  typedef void (*FVisitAttrs)(Object* self, AttrVisitor* visitor);
+  /*!
+   * \brief creator function.
+   * \param global_key Key that identifies a global single object.
+   *        If this is not empty then FGlobalKey must be defined for the object.
+   * \return The created object.
+   */
+  using FCreate = std::function<ObjectPtr<Object>(const std::string& global_key)>;
+  /*!
+   * \brief Global key function, only needed by global objects.
+   * \param node The node pointer.
+   * \return The global key to the node.
+   */
+  using FGlobalKey = std::function<std::string(const Object* node)>;
+  /*!
+   * \brief Dispatch the VisitAttrs function.
+   * \param self The pointer to the object.
+   * \param visitor The attribute visitor.
+   */
+  inline void VisitAttrs(Object* self, AttrVisitor* visitor) const;
+  /*!
+   * \brief Get global key of the object, if any.
+   * \param self The pointer to the object.
+   * \return the global key if object has one, otherwise return empty string.
+   */
+  inline std::string GetGlobalKey(Object* self) const;
+  /*!
+   * \brief Create an initial object using default constructor
+   *        by type_key and global key.
+   *
+   * \param type_key The type key of the object.
+   * \param global_key A global key that can be used to uniquely identify the object if any.
+   */
+  TVM_DLL ObjectPtr<Object> CreateInitObject(const std::string& type_key,
+                                             const std::string& global_key = "") const;
+  /*!
+   * \brief Get a field object by the attr name.
+   * \param self The pointer to the object.
+   * \param attr_name The name of the field.
+   * \return The corresponding attribute value.
+   * \note This function will throw an exception if the object does not contain the field.
+   */
+  TVM_DLL runtime::TVMRetValue GetAttr(Object* self, const std::string& attr_name) const;
+
+  /*!
+   * \brief List all the fields in the object.
+   * \return All the fields.
+   */
+  TVM_DLL std::vector<std::string> ListAttrNames(Object* self) const;
+
+  /*! \return The global singleton. */
+  TVM_DLL static ReflectionVTable* Global();
+
+  class Registry;
+  template <typename T>
+  inline Registry Register();
+
+ private:
+  /*! \brief Attribute visitor. */
+  std::vector<FVisitAttrs> fvisit_attrs_;
+  /*! \brief Creation function. */
+  std::vector<FCreate> fcreate_;
+  /*! \brief Global key function. */
+  std::vector<FGlobalKey> fglobal_key_;
+};
+
+/*! \brief Registry of a reflection table. */
+class ReflectionVTable::Registry {
+ public:
+  Registry(ReflectionVTable* parent, uint32_t type_index)
+      : parent_(parent), type_index_(type_index) { }
+  /*!
+   * \brief Set fcreate function.
+   * \param f The creator function.
+   * \return reference to self.
+   */
+  Registry& set_creator(FCreate f) {  // NOLINT(*)
+    CHECK_LT(type_index_, parent_->fcreate_.size());
+    parent_->fcreate_[type_index_] = f;
+    return *this;
+  }
+  /*!
+   * \brief Set global_key function.
+   * \param f The global key function.
+   * \return reference to self.
+   */
+  Registry& set_global_key(FGlobalKey f) {  // NOLINT(*)
+    CHECK_LT(type_index_, parent_->fglobal_key_.size());
+    parent_->fglobal_key_[type_index_] = f;
+    return *this;
+  }
+
+ private:
+  ReflectionVTable* parent_;
+  uint32_t type_index_;
+};
+
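Registration, then, is one macro invocation per node type, optionally followed by the hooks above. For the hypothetical MyNode from the previous sketch:

// Hooks MyNode into both the object registry and the reflection table,
// with a default creator that default-constructs the node.
TVM_REGISTER_NODE_TYPE(MyNode);

// For a singleton-like node one could additionally set a global key
// (sketch; assumes FGlobalKey receives the object pointer, per the docs above):
// ::tvm::ReflectionVTable::Global()->Register<MySingletonNode>()
//     .set_global_key([](const tvm::Object* n) { return std::string("my.singleton"); });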
+/*!
+ * \brief Register a node type to object registry and reflection registry.
+ * \param TypeName The name of the type.
+ * \note This macro will call TVM_REGISTER_OBJECT_TYPE for the type as well.
+ */
+#define TVM_REGISTER_NODE_TYPE(TypeName)                            \
+  TVM_REGISTER_OBJECT_TYPE(TypeName);                               \
+  static DMLC_ATTRIBUTE_UNUSED ::tvm::ReflectionVTable::Registry &  \
+  __make_Node ## _ ## TypeName ## __ =                              \
+      ::tvm::ReflectionVTable::Global()->Register<TypeName>()       \
+          .set_creator([](const std::string&) {                     \
+            return ::tvm::runtime::make_object<TypeName>();         \
+          })
+
+// Implementation details
+template <typename T>
+inline ReflectionVTable::Registry
+ReflectionVTable::Register() {
+  uint32_t tindex = T::RuntimeTypeIndex();
+  if (tindex >= fvisit_attrs_.size()) {
+    fvisit_attrs_.resize(tindex + 1, nullptr);
+    fcreate_.resize(tindex + 1, nullptr);
+    fglobal_key_.resize(tindex + 1, nullptr);
+  }
+  // functor that implements the redirection.
+  struct Functor {
+    static void VisitAttrs(Object* self, AttrVisitor* v) {
+      static_cast<T*>(self)->VisitAttrs(v);
+    }
+  };
+
+  fvisit_attrs_[tindex] = Functor::VisitAttrs;
+  return Registry(this, tindex);
+}
+
+inline void ReflectionVTable::
+VisitAttrs(Object* self, AttrVisitor* visitor) const {
+  uint32_t tindex = self->type_index();
+  if (tindex >= fvisit_attrs_.size() || fvisit_attrs_[tindex] == nullptr) {
+    LOG(FATAL) << "TypeError: " << self->GetTypeKey()
+               << " is not registered via TVM_REGISTER_NODE_TYPE";
+  }
+  fvisit_attrs_[tindex](self, visitor);
+}
+
+inline std::string ReflectionVTable::GetGlobalKey(Object* self) const {
+  uint32_t tindex = self->type_index();
+  if (tindex < fglobal_key_.size() && fglobal_key_[tindex] != nullptr) {
+    return fglobal_key_[tindex](self);
+  } else {
+    return std::string();
+  }
+}
+
+}  // namespace tvm
+#endif  // TVM_NODE_REFLECTION_H_

diff --git a/include/tvm/node/serialization.h b/include/tvm/node/serialization.h
new file mode 100644
index 000000000000..ac675946e0eb
--- /dev/null
+++ b/include/tvm/node/serialization.h
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Utility functions for serialization.
+ * \file tvm/node/serialization.h
+ */
+#ifndef TVM_NODE_SERIALIZATION_H_
+#define TVM_NODE_SERIALIZATION_H_
+
+#include
+#include
+
+#include
+
+namespace tvm {
+/*!
+ * \brief Save the node, as well as all the nodes it depends on, as JSON.
+ *  This can be used to serialize any TVM object.
+ *
+ * \return the string representation of the node.
+ */
+TVM_DLL std::string SaveJSON(const runtime::ObjectRef& node);
+
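Round-tripping through SaveJSON and the LoadJSON entry point declared just below is the common pattern (sketch; `ref` stands for any object whose type is registered via TVM_REGISTER_NODE_TYPE):

#include <string>
#include <tvm/node/serialization.h>

void RoundTrip(const tvm::runtime::ObjectRef& ref) {
  std::string json = tvm::SaveJSON(ref);               // serialize ref and its dependencies
  tvm::runtime::ObjectRef back = tvm::LoadJSON(json);  // reconstruct a fresh copy
  (void)back;  // 'back' is a deep copy of 'ref', not the same node
}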
+/*!
+ * \brief Internal implementation of LoadJSON.
+ *  Load a TVM Node object from JSON and return a shared_ptr of the Node.
+ * \param json_str The JSON string to load from.
+ *
+ * \return The shared_ptr of the Node.
+ */
+TVM_DLL runtime::ObjectRef LoadJSON(std::string json_str);
+
+}  // namespace tvm
+#endif  // TVM_NODE_SERIALIZATION_H_

diff --git a/include/tvm/operation.h b/include/tvm/operation.h
index b950aa952f04..f53c1ce56a93 100644
--- a/include/tvm/operation.h
+++ b/include/tvm/operation.h
@@ -188,7 +188,7 @@ class PlaceholderOpNode : public OperationNode {
       const std::unordered_map& dom_map,
       bool debug_keep_trivial_loop) const final;

-  void VisitAttrs(AttrVisitor* v) final {
+  void VisitAttrs(AttrVisitor* v) {
     v->Visit("name", &name);
     v->Visit("tag", &tag);
     v->Visit("attrs", &attrs);
@@ -259,7 +259,7 @@ class TVM_DLL ComputeOpNode : public BaseComputeOpNode {
       bool debug_keep_trivial_loop) const final;
   size_t num_schedulable_dims() const final;

-  void VisitAttrs(AttrVisitor* v) final {
+  void VisitAttrs(AttrVisitor* v) {
     v->Visit("name", &name);
     v->Visit("tag", &tag);
     v->Visit("attrs", &attrs);
@@ -312,7 +312,7 @@ class TensorComputeOpNode : public BaseComputeOpNode {
       bool debug_keep_trivial_loop) const final;
   size_t num_schedulable_dims() const final;

-  void VisitAttrs(AttrVisitor* v) final {
+  void VisitAttrs(AttrVisitor* v) {
     v->Visit("name", &name);
     v->Visit("tag", &tag);
     v->Visit("axis", &axis);
@@ -394,7 +394,7 @@ class ScanOpNode : public OperationNode {
       const std::unordered_map& dom_map,
       bool debug_keep_trivial_loop) const final;

-  void VisitAttrs(AttrVisitor* v) final {
+  void VisitAttrs(AttrVisitor* v) {
     v->Visit("name", &name);
     v->Visit("tag", &tag);
     v->Visit("attrs", &attrs);
@@ -461,7 +461,7 @@ class ExternOpNode : public OperationNode {
       const std::unordered_map& dom_map,
       bool debug_keep_trivial_loop) const final;

-  void VisitAttrs(AttrVisitor* v) final {
+  void VisitAttrs(AttrVisitor* v) {
     v->Visit("name", &name);
     v->Visit("tag", &tag);
     v->Visit("attrs", &attrs);
@@ -529,7 +529,7 @@ class HybridOpNode : public OperationNode {
       const std::unordered_map& dom_map,
       bool debug_keep_trivial_loop) const final;

-  void VisitAttrs(AttrVisitor* v) final {
+  void VisitAttrs(AttrVisitor* v) {
     v->Visit("name", &name);
     v->Visit("tag", &tag);
     v->Visit("attrs", &attrs);
@@ -651,7 +651,7 @@ inline Tensor compute(Array shape,
 // inline function.
 inline const OperationNode* Operation::operator->() const {
-  return static_cast<const OperationNode*>(node_.get());
+  return static_cast<const OperationNode*>(get());
 }
 }  // namespace tvm
 #endif  // TVM_OPERATION_H_

diff --git a/include/tvm/packed_func_ext.h b/include/tvm/packed_func_ext.h
index 5951594b873c..71f8f55b2655 100644
--- a/include/tvm/packed_func_ext.h
+++ b/include/tvm/packed_func_ext.h
@@ -20,7 +20,7 @@
 /*!
 * \file tvm/packed_func_ext.h
 * \brief Extension package to PackedFunc
- *  This enales pass NodeRef types into/from PackedFunc.
+ *  This enables passing ObjectRef types into/from PackedFunc.
 */
 #ifndef TVM_PACKED_FUNC_EXT_H_
 #define TVM_PACKED_FUNC_EXT_H_
@@ -37,6 +37,7 @@
 #include "runtime/packed_func.h"

 namespace tvm {
+
 using runtime::TVMArgs;
 using runtime::TVMRetValue;
 using runtime::PackedFunc;
@@ -47,103 +48,99 @@ namespace runtime {
 * \tparam T the type to be checked.
 */
 template
-struct NodeTypeChecker {
-  static inline bool Check(Node* sptr) {
-    // This is the only place in the project where RTTI is used
-    // It can be turned off, but will make non strict checking.
-    // TODO(tqchen) possibly find alternative to turn of RTTI
+struct ObjectTypeChecker {
+  static bool Check(const Object* ptr) {
     using ContainerType = typename T::ContainerType;
-    // always allow nullptr.
- if (sptr == nullptr) return true; - return sptr->derived_from(); + if (ptr == nullptr) return true; + return ptr->IsInstance(); } - static inline void PrintName(std::ostringstream& os) { // NOLINT(*) + static void PrintName(std::ostream& os) { // NOLINT(*) using ContainerType = typename T::ContainerType; os << ContainerType::_type_key; } }; template -struct NodeTypeChecker > { - static inline bool Check(Node* sptr) { - if (sptr == nullptr) return true; - if (!sptr->is_type()) return false; - ArrayNode* n = static_cast(sptr); +struct ObjectTypeChecker > { + static bool Check(const Object* ptr) { + if (ptr == nullptr) return true; + if (!ptr->IsInstance()) return false; + const ArrayNode* n = static_cast(ptr); for (const auto& p : n->data) { - if (!NodeTypeChecker::Check(p.get())) { + if (!ObjectTypeChecker::Check(p.get())) { return false; } } return true; } - static inline void PrintName(std::ostringstream& os) { // NOLINT(*) - os << "array<"; - NodeTypeChecker::PrintName(os); - os << ">"; + static void PrintName(std::ostream& os) { // NOLINT(*) + os << "List["; + ObjectTypeChecker::PrintName(os); + os << "]"; } }; template -struct NodeTypeChecker > { - static inline bool Check(Node* sptr) { - if (sptr == nullptr) return true; - if (!sptr->is_type()) return false; - StrMapNode* n = static_cast(sptr); +struct ObjectTypeChecker > { + static bool Check(const Object* ptr) { + if (ptr == nullptr) return true; + if (!ptr->IsInstance()) return false; + const StrMapNode* n = static_cast(ptr); for (const auto& kv : n->data) { - if (!NodeTypeChecker::Check(kv.second.get())) return false; + if (!ObjectTypeChecker::Check(kv.second.get())) return false; } return true; } - static inline void PrintName(std::ostringstream& os) { // NOLINT(*) - os << "map::PrintName(os); - os << '>'; + ObjectTypeChecker::PrintName(os); + os << ']'; } }; template -struct NodeTypeChecker > { - static inline bool Check(Node* sptr) { - if (sptr == nullptr) return true; - if (!sptr->is_type()) return false; - MapNode* n = static_cast(sptr); +struct ObjectTypeChecker > { + static bool Check(const Object* ptr) { + if (ptr == nullptr) return true; + if (!ptr->IsInstance()) return false; + const MapNode* n = static_cast(ptr); for (const auto& kv : n->data) { - if (!NodeTypeChecker::Check(kv.first.get())) return false; - if (!NodeTypeChecker::Check(kv.second.get())) return false; + if (!ObjectTypeChecker::Check(kv.first.get())) return false; + if (!ObjectTypeChecker::Check(kv.second.get())) return false; } return true; } - static inline void PrintName(std::ostringstream& os) { // NOLINT(*) - os << "map<"; - NodeTypeChecker::PrintName(os); + static void PrintName(std::ostringstream& os) { // NOLINT(*) + os << "Map["; + ObjectTypeChecker::PrintName(os); os << ','; - NodeTypeChecker::PrintName(os); - os << '>'; + ObjectTypeChecker::PrintName(os); + os << ']'; } }; template -inline std::string NodeTypeName() { +inline std::string ObjectTypeName() { std::ostringstream os; - NodeTypeChecker::PrintName(os); + ObjectTypeChecker::PrintName(os); return os.str(); } // extensions for tvm arg value -template -inline TNodeRef TVMArgValue::AsNodeRef() const { +template +inline TObjectRef TVMArgValue::AsObjectRef() const { static_assert( - std::is_base_of::value, - "Conversion only works for NodeRef"); - if (type_code_ == kNull) return TNodeRef(NodePtr(nullptr)); - TVM_CHECK_TYPE_CODE(type_code_, kNodeHandle); - NodePtr& sptr = *ptr >(); - CHECK(NodeTypeChecker::Check(sptr.get())) - << "Expected type " << NodeTypeName() - << " but get " << 
sptr->type_key(); - return TNodeRef(sptr); + std::is_base_of::value, + "Conversion only works for ObjectRef"); + if (type_code_ == kNull) return TObjectRef(NodePtr(nullptr)); + TVM_CHECK_TYPE_CODE(type_code_, kObjectHandle); + Object* ptr = static_cast(value_.v_handle); + CHECK(ObjectTypeChecker::Check(ptr)) + << "Expected type " << ObjectTypeName() + << " but get " << ptr->GetTypeKey(); + return TObjectRef(ObjectPtr(ptr)); } inline TVMArgValue::operator tvm::Expr() const { @@ -156,18 +153,20 @@ inline TVMArgValue::operator tvm::Expr() const { if (type_code_ == kDLFloat) { return Expr(static_cast(value_.v_float64)); } - TVM_CHECK_TYPE_CODE(type_code_, kNodeHandle); - NodePtr& sptr = *ptr >(); - if (sptr->is_type()) { - return IterVar(sptr)->var; + + TVM_CHECK_TYPE_CODE(type_code_, kObjectHandle); + Object* ptr = static_cast(value_.v_handle); + + if (ptr->IsInstance()) { + return IterVar(ObjectPtr(ptr))->var; } - if (sptr->is_type()) { - return Tensor(sptr)(); + if (ptr->IsInstance()) { + return Tensor(ObjectPtr(ptr))(); } - CHECK(NodeTypeChecker::Check(sptr.get())) - << "Expected type " << NodeTypeName() - << " but get " << sptr->type_key(); - return Expr(sptr); + CHECK(ObjectTypeChecker::Check(ptr)) + << "Expected type " << ObjectTypeName() + << " but get " << ptr->GetTypeKey(); + return Expr(ObjectPtr(ptr)); } inline TVMArgValue::operator tvm::Integer() const { @@ -177,68 +176,36 @@ inline TVMArgValue::operator tvm::Integer() const { CHECK_GE(value_.v_int64, std::numeric_limits::min()); return Integer(static_cast(value_.v_int64)); } - NodePtr& sptr = *ptr >(); - CHECK(NodeTypeChecker::Check(sptr.get())) - << "Expected type " << NodeTypeName() - << " but get " << sptr->type_key(); - return Integer(sptr); -} - -inline NodePtr& TVMArgValue::node_sptr() { - TVM_CHECK_TYPE_CODE(type_code_, kNodeHandle); - return *ptr >(); + TVM_CHECK_TYPE_CODE(type_code_, kObjectHandle); + Object* ptr = static_cast(value_.v_handle); + CHECK(ObjectTypeChecker::Check(ptr)) + << "Expected type " << ObjectTypeName() + << " but get " << ptr->GetTypeKey(); + return Integer(ObjectPtr(ptr)); } - -template -inline bool TVMArgValue::IsNodeType() const { - TVM_CHECK_TYPE_CODE(type_code_, kNodeHandle); - NodePtr& sptr = - *ptr >(); - return NodeTypeChecker::Check(sptr.get()); +template +inline bool TVMPODValue_::IsObjectRef() const { + TVM_CHECK_TYPE_CODE(type_code_, kObjectHandle); + Object* ptr = static_cast(value_.v_handle); + return ObjectTypeChecker::Check(ptr); } // extensions for TVMRetValue -inline TVMRetValue& TVMRetValue::operator=( - const NodePtr& other) { - if (other.get() == nullptr) { - SwitchToPOD(kNull); - } else { - SwitchToClass >(kNodeHandle, other); - } - return *this; -} - -inline TVMRetValue& TVMRetValue::operator=(const NodeRef& other) { - if (!other.defined()) { - SwitchToPOD(kNull); - } else { - SwitchToClass >(kNodeHandle, other.node_); - } - return *this; -} - -template -inline TNodeRef TVMRetValue::AsNodeRef() const { +template +inline TObjectRef TVMRetValue::AsObjectRef() const { static_assert( - std::is_base_of::value, - "Conversion only works for NodeRef"); - if (type_code_ == kNull) return TNodeRef(); - TVM_CHECK_TYPE_CODE(type_code_, kNodeHandle); - NodePtr& sptr = *ptr >(); - CHECK(NodeTypeChecker::Check(sptr.get())) - << "Expected type " << NodeTypeName() - << " but get " << sptr->type_key(); - return TNodeRef(sptr); -} + std::is_base_of::value, + "Conversion only works for ObjectRef"); + if (type_code_ == kNull) return TObjectRef(); + TVM_CHECK_TYPE_CODE(type_code_, kObjectHandle); 
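+ // Example (illustrative): with a TVMRetValue rv holding an object, + //   Expr e = rv.AsObjectRef<Expr>(); + // routes through ObjectTypeChecker<Expr>::Check(ptr) before wrapping + // the raw handle into the reference type.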
-inline void TVMArgsSetter::operator()(size_t i, const NodeRef& other) const { // NOLINT(*) - if (other.defined()) { - values_[i].v_handle = const_cast*>(&(other.node_)); - type_codes_[i] = kNodeHandle; - } else { - type_codes_[i] = kNull; - } + Object* ptr = static_cast(value_.v_handle); + + CHECK(ObjectTypeChecker::Check(ptr)) + << "Expected type " << ObjectTypeName() + << " but get " << ptr->GetTypeKey(); + return TObjectRef(ObjectPtr(ptr)); } // type related stuffs diff --git a/include/tvm/relay/adt.h b/include/tvm/relay/adt.h index 4329c438e8a0..a74353239a00 100644 --- a/include/tvm/relay/adt.h +++ b/include/tvm/relay/adt.h @@ -52,7 +52,7 @@ class PatternNode : public RelayNode { class Pattern : public NodeRef { public: Pattern() {} - explicit Pattern(NodePtr p) : NodeRef(p) {} + explicit Pattern(ObjectPtr p) : NodeRef(p) {} using ContainerType = PatternNode; }; @@ -66,7 +66,7 @@ class PatternWildcardNode : public PatternNode { TVM_DLL static PatternWildcard make(); - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("span", &span); } @@ -88,7 +88,7 @@ class PatternVarNode : public PatternNode { TVM_DLL static PatternVar make(tvm::relay::Var var); - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("var", &var); v->Visit("span", &span); } @@ -122,7 +122,7 @@ class ConstructorNode : public ExprNode { tvm::Array inputs, GlobalTypeVar belong_to); - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("name_hint", &name_hint); v->Visit("inputs", &inputs); v->Visit("belong_to", &belong_to); @@ -151,7 +151,7 @@ class PatternConstructorNode : public PatternNode { TVM_DLL static PatternConstructor make(Constructor constructor, tvm::Array var); - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("constructor", &constructor); v->Visit("patterns", &patterns); v->Visit("span", &span); @@ -175,7 +175,7 @@ class PatternTupleNode : public PatternNode { TVM_DLL static PatternTuple make(tvm::Array var); - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("patterns", &patterns); v->Visit("span", &span); } @@ -213,7 +213,7 @@ class TypeDataNode : public TypeNode { /*! \brief The constructors. */ tvm::Array constructors; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("header", &header); v->Visit("type_vars", &type_vars); v->Visit("constructors", &constructors); @@ -240,7 +240,7 @@ class ClauseNode : public Node { /*! \brief The resulting value. 
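* (In a match clause, the lhs pattern guards this expression: when the * pattern matches, this rhs is evaluated with the pattern's variables bound.)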
*/ Expr rhs; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("lhs", &lhs); v->Visit("rhs", &rhs); } @@ -269,7 +269,7 @@ class MatchNode : public ExprNode { */ bool complete; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("data", &data); v->Visit("clauses", &clauses); v->Visit("complete", &complete); diff --git a/include/tvm/relay/base.h b/include/tvm/relay/base.h index f94ba5e26068..5a2326ece05d 100644 --- a/include/tvm/relay/base.h +++ b/include/tvm/relay/base.h @@ -83,10 +83,12 @@ using NodeEqual = ::tvm::NodeEqual; #define RELAY_DEFINE_NODE_REF(TypeName, NodeName, NodeRefBase) \ class TypeName : public NodeRefBase { \ public: \ - TypeName() {} \ - explicit TypeName(::tvm::NodePtr<::tvm::Node> n) : NodeRefBase(n) {} \ + TypeName() {} \ + explicit TypeName(::tvm::ObjectPtr<::tvm::Object> n) \ + : NodeRefBase(n) { \ + } \ const NodeName* operator->() const { \ - return static_cast(node_.get()); \ + return static_cast(get()); \ } \ operator bool() { return this->defined(); } \ using ContainerType = NodeName; \ @@ -105,7 +107,7 @@ class SourceNameNode : public Node { /*! \brief The source name. */ std::string name; // override attr visitor - void VisitAttrs(AttrVisitor* v) final { v->Visit("name", &name); } + void VisitAttrs(AttrVisitor* v) { v->Visit("name", &name); } static constexpr const char* _type_key = "relay.SourceName"; TVM_DECLARE_NODE_TYPE_INFO(SourceNameNode, Node); @@ -127,7 +129,7 @@ class SourceName : public NodeRef { * \return the pointer to the internal node container */ inline const SourceNameNode* operator->() const { - return static_cast(this->node_.get()); + return static_cast(get()); } /*! @@ -158,7 +160,7 @@ class SpanNode : public Node { /*! \brief column offset */ int col_offset; // override attr visitor - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("source", &source); v->Visit("lineno", &lineno); v->Visit("col_offset", &col_offset); @@ -202,7 +204,7 @@ class IdNode : public Node { */ std::string name_hint; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("name_hint", &name_hint); } diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h index b1b8d6a7154e..ff075e3a8970 100644 --- a/include/tvm/relay/expr.h +++ b/include/tvm/relay/expr.h @@ -95,7 +95,7 @@ class ConstantNode : public ExprNode { return data->ndim == 0; } - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("data", &data); v->Visit("span", &span); v->Visit("_checked_type_", &checked_type_); @@ -117,7 +117,7 @@ class TupleNode : public ExprNode { /*! \brief the fields of the tuple */ tvm::Array fields; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("fields", &fields); v->Visit("span", &span); v->Visit("_checked_type_", &checked_type_); @@ -165,7 +165,7 @@ class VarNode : public ExprNode { return vid->name_hint; } - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("vid", &vid); v->Visit("type_annotation", &type_annotation); v->Visit("span", &span); @@ -197,7 +197,7 @@ class GlobalVarNode : public ExprNode { /*! \brief The name of the variable, this only acts as a hint. 
*/ std::string name_hint; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("name_hint", &name_hint); v->Visit("span", &span); v->Visit("_checked_type_", &checked_type_); @@ -243,7 +243,7 @@ class FunctionNode : public ExprNode { */ tvm::Attrs attrs; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("params", ¶ms); v->Visit("body", &body); v->Visit("ret_type", &ret_type); @@ -274,6 +274,19 @@ class FunctionNode : public ExprNode { tvm::Array ty_params, tvm::Attrs attrs = Attrs()); + /*! + * \brief Attach the function's parameters to its attributes for use in analysis. + * \return The function with its parameters attached. + */ + Function SetParams(const tvm::Map& parameters) const; + + /*! + * \brief Retrieve the function's parameters. + * + * \return The function's parameter. + */ + tvm::Map GetParams() const; + static constexpr const char* _type_key = "relay.Function"; TVM_DECLARE_NODE_TYPE_INFO(FunctionNode, ExprNode); }; @@ -284,7 +297,6 @@ RELAY_DEFINE_NODE_REF(Function, FunctionNode, Expr); TVM_DLL NodeRef FunctionGetAttr(const Function& func, const std::string& key); TVM_DLL Function FunctionSetAttr(const Function& func, const std::string& key, const NodeRef& data); - /*! * \brief Call corresponds to operator invocation. * Corresponds to the operator in computational graph terminology. @@ -327,7 +339,7 @@ class CallNode : public ExprNode { */ tvm::Array type_args; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("op", &op); v->Visit("args", &args); v->Visit("attrs", &attrs); @@ -369,7 +381,7 @@ class LetNode : public ExprNode { /*! \brief The body of the let binding */ Expr body; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("var", &var); v->Visit("value", &value); v->Visit("body", &body); @@ -407,7 +419,7 @@ class IfNode : public ExprNode { /*! \brief The expression evaluated when condition is false */ Expr false_branch; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("cond", &cond); v->Visit("true_branch", &true_branch); v->Visit("false_branch", &false_branch); @@ -432,7 +444,7 @@ class TupleGetItemNode : public ExprNode { /*! \brief which value to get */ int index; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("tuple_value", &tuple); v->Visit("index", &index); v->Visit("span", &span); @@ -454,7 +466,7 @@ class RefCreateNode : public ExprNode { /*! \brief The initial value of the Reference. */ Expr value; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("value", &value); v->Visit("span", &span); v->Visit("_checked_type_", &checked_type_); @@ -475,7 +487,7 @@ class RefReadNode : public ExprNode { /*! \brief The Reference Expression. */ Expr ref; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("ref", &ref); v->Visit("span", &span); v->Visit("_checked_type_", &checked_type_); @@ -498,7 +510,7 @@ class RefWriteNode : public ExprNode { /*! \brief The value to write into. 
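* (That is, the new value that the RefWrite stores into the reference.)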
*/ Expr value; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("ref", &ref); v->Visit("value", &value); v->Visit("span", &span); @@ -541,10 +553,11 @@ RELAY_DEFINE_NODE_REF(TempExpr, TempExprNode, Expr); // implementataions inline const Type& ExprNode::checked_type() const { - CHECK(checked_type_.defined()) << "internal error: the type checker has " - "not populated the checked_type " - "field for " - << GetRef(this); + CHECK(checked_type_.defined()) + << "internal error: the type checker has " + << "not populated the checked_type " + << "field for " + << GetRef(this); return this->checked_type_; } @@ -557,7 +570,7 @@ inline const TTypeNode* ExprNode::type_as() const { const TTypeNode* node = checked_type_.as(); CHECK(node != nullptr) << "Expected type to be " << TTypeNode::_type_key - << ", but get " << checked_type_->type_key(); + << ", but get " << checked_type_->GetTypeKey(); return node; } diff --git a/include/tvm/relay/expr_functor.h b/include/tvm/relay/expr_functor.h index e0d940c5d1a5..8bc87a27f66f 100644 --- a/include/tvm/relay/expr_functor.h +++ b/include/tvm/relay/expr_functor.h @@ -57,8 +57,8 @@ class ExprFunctor; #define RELAY_EXPR_FUNCTOR_DISPATCH(OP) \ vtable.template set_dispatch( \ - [](const NodeRef& n, TSelf* self, Args... args) { \ - return self->VisitExpr_(static_cast(n.node_.get()), \ + [](const ObjectRef& n, TSelf* self, Args... args) { \ + return self->VisitExpr_(static_cast(n.get()), \ std::forward(args)...); \ }); @@ -66,7 +66,7 @@ template class ExprFunctor { private: using TSelf = ExprFunctor; - using FType = tvm::IRFunctor; + using FType = tvm::IRFunctor; public: /*! \brief the result type of this functor */ @@ -117,7 +117,7 @@ class ExprFunctor { virtual R VisitExpr_(const ConstructorNode* op, Args... args) EXPR_FUNCTOR_DEFAULT; virtual R VisitExpr_(const MatchNode* op, Args... args) EXPR_FUNCTOR_DEFAULT; virtual R VisitExprDefault_(const Node* op, Args...) { - LOG(FATAL) << "Do not have a default for " << op->type_key(); + LOG(FATAL) << "Do not have a default for " << op->GetTypeKey(); throw; } diff --git a/include/tvm/relay/interpreter.h b/include/tvm/relay/interpreter.h index d05099f781ac..d5d783d4804a 100644 --- a/include/tvm/relay/interpreter.h +++ b/include/tvm/relay/interpreter.h @@ -78,9 +78,9 @@ class ValueNode : public RelayNode { class Value : public NodeRef { public: Value() {} - explicit Value(NodePtr n) : NodeRef(n) {} + explicit Value(ObjectPtr n) : NodeRef(n) {} const ValueNode* operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } using ContainerType = ValueNode; @@ -106,7 +106,7 @@ class ClosureNode : public ValueNode { ClosureNode() {} - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("env", &env); v->Visit("func", &func); } @@ -119,6 +119,32 @@ class ClosureNode : public ValueNode { RELAY_DEFINE_NODE_REF(Closure, ClosureNode, Value); +/*! \brief A Relay Recursive Closure. A closure that has a name. */ +class RecClosure; + +/*! \brief The container type of RecClosure. */ +class RecClosureNode : public ValueNode { + public: + /*! \brief The closure. */ + Closure clos; + /*! \brief variable the closure bind to. 
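+ * (A recursive function is interpreted as a RecClosure so that its body + * can refer back to itself through this variable.)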
*/ + Var bind; + + RecClosureNode() {} + + void VisitAttrs(tvm::AttrVisitor* v) { + v->Visit("clos", &clos); + v->Visit("bind", &bind); + } + + TVM_DLL static RecClosure make(Closure clos, Var bind); + + static constexpr const char* _type_key = "relay.RecClosure"; + TVM_DECLARE_NODE_TYPE_INFO(RecClosureNode, ValueNode); +}; + +RELAY_DEFINE_NODE_REF(RecClosure, RecClosureNode, Value); + /*! \brief A tuple value. */ class TupleValue; @@ -128,7 +154,7 @@ struct TupleValueNode : ValueNode { TupleValueNode() {} - void VisitAttrs(tvm::AttrVisitor* v) final { v->Visit("fields", &fields); } + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("fields", &fields); } TVM_DLL static TupleValue make(tvm::Array<Value> value); @@ -147,7 +173,7 @@ struct TensorValueNode : ValueNode { TensorValueNode() {} - void VisitAttrs(tvm::AttrVisitor* v) final { v->Visit("data", &data); } + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("data", &data); } /*! \brief Build a value from an NDArray. */ TVM_DLL static TensorValue make(runtime::NDArray data); @@ -166,7 +192,7 @@ struct RefValueNode : ValueNode { RefValueNode() {} - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("value", &value); } @@ -189,7 +215,7 @@ struct ConstructorValueNode : ValueNode { /*! \brief Optional field tracking ADT constructor. */ Constructor constructor; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("tag", &tag); v->Visit("fields", &fields); v->Visit("constructor", &constructor); diff --git a/include/tvm/relay/module.h b/include/tvm/relay/module.h index 8b17020a1132..160ae5db8265 100644 --- a/include/tvm/relay/module.h +++ b/include/tvm/relay/module.h @@ -68,7 +68,7 @@ class ModuleNode : public RelayNode { ModuleNode() {} - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("functions", &functions); v->Visit("type_definitions", &type_definitions); v->Visit("global_var_map_", &global_var_map_); @@ -281,10 +281,10 @@ class ModuleNode : public RelayNode { struct Module : public NodeRef { Module() {} - explicit Module(NodePtr<Node> p) : NodeRef(p) {} + explicit Module(ObjectPtr<::tvm::Object> p) : NodeRef(p) {} - inline ModuleNode* operator->() const { - return static_cast<ModuleNode*>(node_.get()); + ModuleNode* operator->() const { + return static_cast<ModuleNode*>(get_mutable()); } using ContainerType = ModuleNode; diff --git a/include/tvm/relay/op.h b/include/tvm/relay/op.h index 0a6d3725655f..7d2a1f653a93 100644 --- a/include/tvm/relay/op.h +++ b/include/tvm/relay/op.h @@ -24,6 +24,8 @@ #ifndef TVM_RELAY_OP_H_ #define TVM_RELAY_OP_H_ +#include + +#include #include #include @@ -82,7 +84,7 @@ class OpNode : public relay::ExprNode { */ int32_t support_level = 10; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("name", &name); v->Visit("op_type", &op_type); v->Visit("description", &description); @@ -138,7 +140,7 @@ class Op : public relay::Expr { /*! \brief default constructor  */ Op() {} /*! \brief constructor from node pointer */ - explicit Op(NodePtr<Node> n) : Expr(n) {} + explicit Op(ObjectPtr<Object> n) : Expr(n) {} /*! * \brief access the internal node container * \return the pointer to the internal node container */ @@ -221,11 +223,12 @@ class OpRegistry { const Attrs&, const TypeReporter&)> type_rel_func); /*! - * \brief Set the type key of attributes. - * \param type_key The type of of the attrs field. + * \brief Set the attrs type key and index to be AttrsType.
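+ * (For example, an operator registration might call + * set_attrs_type<RequantizeAttrs>(); an illustrative use, cf. the + * qnn/attrs.h change below.)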
+ * \tparam AttrsType the attribute type to be set. * \return reference to self. */ - inline OpRegistry& set_attrs_type_key(const std::string& type_key); + template <typename AttrsType> + inline OpRegistry& set_attrs_type(); /*! * \brief Set the num_inputs * \param n The number of inputs to be set. * \return reference to self. */ @@ -397,7 +400,7 @@ class OpMap { // implementations inline const OpNode* Op::operator->() const { - return static_cast<const OpNode*>(node_.get()); + return static_cast<const OpNode*>(get()); } template @@ -496,10 +499,10 @@ inline OpRegistry& OpRegistry::set_num_inputs(int32_t n) {  // NOLINT(*) return *this; } -inline OpRegistry& OpRegistry::set_attrs_type_key(  // NOLINT(*) - const std::string& type_key) { - get()->attrs_type_key = type_key; - get()->attrs_type_index = Node::TypeKey2Index(type_key.c_str()); +template <typename AttrsType> +inline OpRegistry& OpRegistry::set_attrs_type() {  // NOLINT(*) + get()->attrs_type_key = AttrsType::_type_key; + get()->attrs_type_index = AttrsType::RuntimeTypeIndex(); return *this; } diff --git a/include/tvm/relay/pattern_functor.h b/include/tvm/relay/pattern_functor.h index 7f1c47e03592..c15523cb25de 100644 --- a/include/tvm/relay/pattern_functor.h +++ b/include/tvm/relay/pattern_functor.h @@ -57,8 +57,8 @@ class PatternFunctor; #define RELAY_PATTERN_FUNCTOR_DISPATCH(OP) \ vtable.template set_dispatch<OP>( \ - [](const NodeRef& n, TSelf* self, Args... args) { \ - return self->VisitPattern_(static_cast<const OP*>(n.node_.get()), \ + [](const ObjectRef& n, TSelf* self, Args... args) { \ + return self->VisitPattern_(static_cast<const OP*>(n.get()), \ std::forward<Args>(args)...); \ }); @@ -66,7 +66,7 @@ template class PatternFunctor { private: using TSelf = PatternFunctor; - using FType = tvm::IRFunctor<R(const NodeRef& n, TSelf* self, Args... args)>; + using FType = tvm::IRFunctor<R(const ObjectRef& n, TSelf* self, Args... args)>; public: /*! \brief the result type of this functor */ @@ -103,7 +103,7 @@ class PatternFunctor { virtual R VisitPattern_(const PatternTupleNode* op, Args... args) PATTERN_FUNCTOR_DEFAULT; virtual R VisitPatternDefault_(const Node* op, Args...) { - LOG(FATAL) << "Do not have a default for " << op->type_key(); + LOG(FATAL) << "Do not have a default for " << op->GetTypeKey(); throw; } diff --git a/include/tvm/relay/qnn/attrs.h b/include/tvm/relay/qnn/attrs.h index 83b55b04222a..e5f4ba94e12e 100644 --- a/include/tvm/relay/qnn/attrs.h +++ b/include/tvm/relay/qnn/attrs.h @@ -49,7 +49,7 @@ struct RequantizeAttrs : public tvm::AttrsNode<RequantizeAttrs> { .describe("The scale of the output tensor."); TVM_ATTR_FIELD(output_zero_point) .describe("The zero point of the output tensor."); - TVM_ATTR_FIELD(rounding).set_default("TONEAREST") + TVM_ATTR_FIELD(rounding).set_default("UPWARD") .describe("Defines the rounding direction when the value is midway between" "two representable values. There are two supported modes - UPWARD" "or TONEAREST. Both modes behave exactly same except at the" diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h index a2119c90f750..82144d76e565 100644 --- a/include/tvm/relay/transform.h +++ b/include/tvm/relay/transform.h @@ -101,7 +101,7 @@ class PassContextNode : public RelayNode { PassContextNode() = default; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("opt_level", &opt_level); v->Visit("fallback_device", &fallback_device); v->Visit("required_pass", &required_pass); @@ -134,16 +134,16 @@ class PassContext : public NodeRef { * \return const access pointer. */ const PassContextNode* operator->() const { - CHECK(node_.get() != nullptr); - return static_cast<const PassContextNode*>(node_.get()); + CHECK(get() != nullptr); + return static_cast<const PassContextNode*>(get()); } /*! * \brief mutable accessor.
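* (Sketch, assuming a default factory such as PassContext::Create(): * PassContext ctx = PassContext::Create(); ctx->opt_level = 3;)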
* \return mutable access pointer. */ PassContextNode* operator->() { - CHECK(node_.get() != nullptr); - return static_cast(node_.get()); + CHECK(get() != nullptr); + return static_cast(get_mutable()); } /*! * \brief Construct a PassContext containing the default configurations. @@ -196,7 +196,7 @@ class PassInfoNode : public RelayNode { PassInfoNode() = default; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("opt_level", &opt_level); v->Visit("name", &name); v->Visit("required", &required); @@ -221,6 +221,7 @@ class Pass; */ class PassNode : public RelayNode { public: + virtual ~PassNode() {} /*! * \brief Get the pass information/meta data. */ virtual PassInfo Info() const = 0; @@ -247,7 +248,7 @@ class PassNode : public RelayNode { virtual Module operator()(const Module& mod, const PassContext& pass_ctx) const = 0; - void VisitAttrs(tvm::AttrVisitor* v) override {} + void VisitAttrs(tvm::AttrVisitor* v) {} static constexpr const char* _type_key = "relay.Pass"; TVM_DECLARE_BASE_NODE_INFO(PassNode, RelayNode); diff --git a/include/tvm/relay/type.h b/include/tvm/relay/type.h index 16e36785c533..e0c056c1216b 100644 --- a/include/tvm/relay/type.h +++ b/include/tvm/relay/type.h @@ -58,7 +58,7 @@ class TypeNode : public RelayNode { class Type : public NodeRef { public: Type() {} - explicit Type(NodePtr p) : NodeRef(p) {} + explicit Type(ObjectPtr p) : NodeRef(p) {} using ContainerType = TypeNode; }; @@ -96,7 +96,7 @@ class TensorTypeNode : public BaseTensorTypeNode { /*! \brief The content data type */ DataType dtype; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("shape", &shape); v->Visit("dtype", &dtype); v->Visit("span", &span); @@ -159,7 +159,7 @@ class TypeVarNode : public TypeNode { /*! \brief The kind of type parameter */ Kind kind; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("var", &var); v->Visit("kind", &kind); v->Visit("span", &span); @@ -188,7 +188,7 @@ class GlobalTypeVarNode : public TypeNode { /*! \brief The kind of type parameter */ Kind kind; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("var", &var); v->Visit("kind", &kind); v->Visit("span", &span); @@ -216,7 +216,7 @@ class TypeCallNode : public TypeNode { /*! \brief The arguments. 
*/ tvm::Array args; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("func", &func); v->Visit("args", &args); v->Visit("span", &span); @@ -245,7 +245,7 @@ class IncompleteTypeNode : public TypeNode { public: Kind kind; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("kind", &kind); v->Visit("span", &span); } @@ -297,7 +297,7 @@ class FuncTypeNode : public TypeNode { */ tvm::Array type_constraints; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("arg_types", &arg_types); v->Visit("ret_type", &ret_type); v->Visit("type_params", &type_params); @@ -330,7 +330,7 @@ class TupleTypeNode : public TypeNode { TupleTypeNode() {} - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("fields", &fields); v->Visit("span", &span); } @@ -357,7 +357,7 @@ class RefTypeNode : public TypeNode { RefTypeNode() {} - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("value", &value); v->Visit("span", &span); } @@ -417,7 +417,7 @@ class TypeReporterNode : public Node { TVM_DLL virtual Module GetModule() = 0; // solver is not serializable. - void VisitAttrs(tvm::AttrVisitor* v) final {} + void VisitAttrs(tvm::AttrVisitor* v) {} static constexpr const char* _type_key = "relay.TypeReporter"; TVM_DECLARE_NODE_TYPE_INFO(TypeReporterNode, Node); @@ -430,10 +430,11 @@ class TypeReporterNode : public Node { class TypeReporter : public NodeRef { public: TypeReporter() {} - explicit TypeReporter(::tvm::NodePtr<::tvm::Node> n) : NodeRef(n) { + explicit TypeReporter(::tvm::ObjectPtr<::tvm::Object> n) : NodeRef(n) { } TypeReporterNode* operator->() const { - return static_cast(node_.get()); + return const_cast( + static_cast(get())); } using ContainerType = TypeReporterNode; }; @@ -487,7 +488,7 @@ class TypeRelationNode : public TypeConstraintNode { /*! \brief Attributes to the relation function */ Attrs attrs; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("func", &func); v->Visit("args", &args); v->Visit("num_inputs", &num_inputs); diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h index 54e6f98e8ee5..267504beb11a 100644 --- a/include/tvm/runtime/c_runtime_api.h +++ b/include/tvm/runtime/c_runtime_api.h @@ -98,13 +98,12 @@ typedef enum { kTVMType = 5U, kTVMContext = 6U, kArrayHandle = 7U, - kNodeHandle = 8U, + kObjectHandle = 8U, kModuleHandle = 9U, kFuncHandle = 10U, kStr = 11U, kBytes = 12U, kNDArrayContainer = 13U, - kObjectCell = 14U, // Extension codes for other frameworks to integrate TVM PackedFunc. // To make sure each framework's id do not conflict, use first and // last sections to mark ranges. @@ -549,13 +548,31 @@ TVM_DLL int TVMStreamStreamSynchronize(int device_type, TVMStreamHandle dst); /*! - * \brief Get the tag from an object. + * \brief Get the type_index from an object. * * \param obj The object handle. - * \param tag The tag of object. + * \param out_tindex the output type index. * \return 0 when success, -1 when failure happens */ -TVM_DLL int TVMGetObjectTag(TVMObjectHandle obj, int* tag); +TVM_DLL int TVMObjectGetTypeIndex(TVMObjectHandle obj, unsigned* out_tindex); + +/*! + * \brief Convert type key to type index. + * \param type_key The key of the type. + * \param out_tindex the corresponding type index. 
+ * \return 0 when success, -1 when failure happens + */ +TVM_DLL int TVMObjectTypeKey2Index(const char* type_key, unsigned* out_tindex); + +/*! + * \brief Free the object. + * + * \param obj The object handle. + * \note Internally we decrease the reference counter of the object. + * The object will be freed when every reference to the object are removed. + * \return 0 when success, -1 when failure happens + */ +TVM_DLL int TVMObjectFree(TVMObjectHandle obj); #ifdef __cplusplus } // TVM_EXTERN_C diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h index 68029c13cb93..bb362dcdec66 100644 --- a/include/tvm/runtime/device_api.h +++ b/include/tvm/runtime/device_api.h @@ -230,6 +230,7 @@ inline std::ostream& operator<<(std::ostream& os, DLContext ctx) { // NOLINT(*) os << runtime::DeviceName(device_type) << "(" << ctx.device_id << ")"; return os; } + #endif } // namespace runtime } // namespace tvm diff --git a/include/tvm/runtime/memory.h b/include/tvm/runtime/memory.h index 6b4f01e4ac9b..d28552eaf7fd 100644 --- a/include/tvm/runtime/memory.h +++ b/include/tvm/runtime/memory.h @@ -69,7 +69,7 @@ class ObjAllocatorBase { "make_node can only be used to create NodeBase"); T* ptr = Handler::New(static_cast(this), std::forward(args)...); - ptr->type_index_ = T::type_index(); + ptr->type_index_ = T::RuntimeTypeIndex(); ptr->deleter_ = Handler::Deleter(); return ObjectPtr(ptr); } @@ -82,6 +82,8 @@ class SimpleObjAllocator : template class Handler { public: + using StorageType = typename std::aligned_storage::type; + template static T* New(SimpleObjAllocator*, Args&&... args) { // NOTE: the first argument is not needed for SimpleObjAllocator @@ -91,7 +93,15 @@ class SimpleObjAllocator : // In the case of an object pool, an allocator needs to create // a special chunk memory that hides reference to the allocator // and call allocator's release function in the deleter. - return new T(std::forward(args)...); + + // NOTE2: Use inplace new to allocate + // This is used to get rid of warning when deleting a virtual + // class with non-virtual destructor. + // We are fine here as we captured the right deleter during construction. + // This is also the right way to get storage type for an object pool. + StorageType* data = new StorageType(); + new (data) T(std::forward(args)...); + return reinterpret_cast(data); } static Object::FDeleter Deleter() { @@ -99,8 +109,17 @@ class SimpleObjAllocator : } private: - static void Deleter_(Object* ptr) { - delete static_cast(ptr); + static void Deleter_(Object* objptr) { + // NOTE: this is important to cast back to T* + // because objptr and tptr may not be the same + // depending on how sub-class allocates the space. + T* tptr = static_cast(objptr); + // It is important to do tptr->T::~T(), + // so that we explicitly call the specific destructor + // instead of tptr->~T(), which could mean the intention + // call a virtual destructor(which may not be available and is not required). + tptr->T::~T(); + delete reinterpret_cast(tptr); } }; }; diff --git a/include/tvm/runtime/node_base.h b/include/tvm/runtime/node_base.h deleted file mode 100644 index 8b47c18a09a7..000000000000 --- a/include/tvm/runtime/node_base.h +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file tvm/runtime/node_base.h - * \brief Base data structure for Node. - * - * \note Node is not a runtime feature. - * This file only exposes the signature of NodePtr for PackedFunc. - */ -#ifndef TVM_RUNTIME_NODE_BASE_H_ -#define TVM_RUNTIME_NODE_BASE_H_ - -#include -#include - -namespace tvm { - -// forward declarations -template -class NodePtr; -class Node; -class NodeRef; - -/*! - * \brief Base class of Node for runtime destructor purposes. - * - * Node is a reference counted object which is used to construct AST. - * Each node is backed by a custom deleter, which deletes the object. - * Do not call create raw Node pointer, always use tvm::make_node. - * - * \note In most cases, please inheritate tvm::Node. - * \sa Node, NodePtr, make_node - */ -class NodeBase { - public: - /*! - * \brief type of NodeBase deleter - * \param self pointer to the NodeBase. - */ - typedef void (*FDeleter)(NodeBase* self); - - protected: - // default constructor and copy constructor - NodeBase() {} - // override the copy and assign constructors to do nothing. - // This is to make sure only contents, but not deleter and ref_counter - // are copied when a child class copies itself. - NodeBase(const NodeBase& other) { // NOLINT(*) - } - NodeBase(NodeBase&& other) { // NOLINT(*) - } - NodeBase& operator=(const NodeBase& other) { //NOLINT(*) - return *this; - } - NodeBase& operator=(NodeBase&& other) { //NOLINT(*) - return *this; - } - - private: - /*! \brief Internal reference counter */ - std::atomic ref_counter_{0}; - /*! - * \brief deleter of this object to enable customized allocation. - * If the deleter is nullptr, no deletion will be performed. - * The creator of the Node must always set the deleter field properly. - */ - FDeleter deleter_ = nullptr; - // reference counting functions - void IncRef() { - ref_counter_.fetch_add(1, std::memory_order_relaxed); - } - void DecRef() { - if (ref_counter_.fetch_sub(1, std::memory_order_release) == 1) { - std::atomic_thread_fence(std::memory_order_acquire); - if (this->deleter_ != nullptr) { - (*this->deleter_)(this); - } - } - } - int use_count() const { - return ref_counter_.load(std::memory_order_relaxed); - } - // friend declaration - template - friend class NodePtr; - template - friend NodePtr make_node(Args&&...); -}; - -/*! - * \brief Smart pointer for Node containers, - * must be subclass of NodeBase - * \tparam T the content data type. - */ -template -class NodePtr { - public: - /*! \brief default constructor */ - NodePtr() {} - /*! \brief default constructor */ - NodePtr(std::nullptr_t) {} // NOLINT(*) - /*! - * \brief copy constructor - * \param other The value to be moved - */ - NodePtr(const NodePtr& other) // NOLINT(*) - : NodePtr(other.data_) { - } - /*! 
- * \brief copy constructor - * \param other The value to be moved - */ - template - NodePtr(const NodePtr& other) // NOLINT(*) - : NodePtr(other.data_) { - static_assert(std::is_base_of::value, - "can only assign of child class NodePtr to parent"); - } - /*! - * \brief move constructor - * \param other The value to be moved - */ - NodePtr(NodePtr&& other) // NOLINT(*) - : data_(other.data_) { - other.data_ = nullptr; - } - /*! - * \brief move constructor - * \param other The value to be moved - */ - template - NodePtr(NodePtr&& other) // NOLINT(*) - : data_(other.data_) { - static_assert(std::is_base_of::value, - "can only assign of child class NodePtr to parent"); - other.data_ = nullptr; - } - /*! \brief destructor */ - ~NodePtr() { - this->reset(); - } - /*! - * \brief Swap this array with another NDArray - * \param other The other NDArray - */ - void swap(NodePtr& other) { // NOLINT(*) - std::swap(data_, other.data_); - } - /*! - * \return Get the content of the pointer - */ - T* get() const { - return static_cast(data_); - } - /*! - * \return The pointer - */ - T* operator->() const { - return get(); - } - /*! - * \return The reference - */ - T& operator*() const { // NOLINT(*) - return *get(); - } - /*! - * \brief copy assignmemt - * \param other The value to be assigned. - * \return reference to self. - */ - NodePtr& operator=(const NodePtr& other) { // NOLINT(*) - // takes in plane operator to enable copy elison. - // copy-and-swap idiom - NodePtr(other).swap(*this); // NOLINT(*) - return *this; - } - /*! - * \brief move assignmemt - * \param other The value to be assigned. - * \return reference to self. - */ - NodePtr& operator=(NodePtr&& other) { // NOLINT(*) - // copy-and-swap idiom - NodePtr(std::move(other)).swap(*this); // NOLINT(*) - return *this; - } - /*! \brief reset the content of ptr to be nullptr */ - void reset() { - if (data_ != nullptr) { - data_->DecRef(); - data_ = nullptr; - } - } - /*! \return The use count of the ptr, for debug purposes */ - int use_count() const { - return data_ != nullptr ? data_->use_count() : 0; - } - /*! \return whether the reference is unique */ - bool unique() const { - return data_ != nullptr && data_->use_count() == 1; - } - /*! \return Whether two NodePtr do not equals each other */ - bool operator==(const NodePtr& other) const { - return data_ == other.data_; - } - /*! \return Whether two NodePtr equals each other */ - bool operator!=(const NodePtr& other) const { - return data_ != other.data_; - } - /*! \return Whether the pointer is nullptr */ - bool operator==(std::nullptr_t null) const { - return data_ == nullptr; - } - /*! \return Whether the pointer is not nullptr */ - bool operator!=(std::nullptr_t null) const { - return data_ != nullptr; - } - - private: - /*! \brief internal pointer field */ - NodeBase* data_{nullptr}; - /*! 
- * \brief constructor from NodeBase - * \param data The node base pointer - */ - explicit NodePtr(NodeBase* data) - : data_(data) { - if (data != nullptr) { - data_->IncRef(); - } - } - // friend declaration - friend class Node; - template - friend class NodePtr; - template - friend NodePtr make_node(Args&&...); -}; -} // namespace tvm - -#endif // TVM_RUNTIME_NODE_BASE_H_ diff --git a/include/tvm/runtime/object.h b/include/tvm/runtime/object.h index 7b0653ae5485..cc4a295cc5d4 100644 --- a/include/tvm/runtime/object.h +++ b/include/tvm/runtime/object.h @@ -23,6 +23,7 @@ #ifndef TVM_RUNTIME_OBJECT_H_ #define TVM_RUNTIME_OBJECT_H_ +#include #include #include #include @@ -51,7 +52,7 @@ enum TypeIndex { kRoot = 0, kVMTensor = 1, kVMClosure = 2, - kVMDatatype = 3, + kVMADT = 3, kStaticIndexEnd, /*! \brief Type index is allocated during runtime. */ kDynamic = kStaticIndexEnd @@ -65,7 +66,7 @@ enum TypeIndex { * - _type_index: * Static type index of the object, if assigned to TypeIndex::kDynamic * the type index will be assigned during runtime. - * Runtime type index can be accessed by ObjectType::type_index(); + * Runtime type index can be accessed by ObjectType::TypeIndex(); * - _type_key: * The unique string identifier of tyep type. * - _type_final: @@ -147,10 +148,23 @@ class Object { * \param self pointer to the Object. */ typedef void (*FDeleter)(Object* self); - /*! \return The internal type index of the object. */ + /*! \return The internal runtime type index of the object. */ uint32_t type_index() const { return type_index_; } + /*! + * \return the type key of the object. + * \note this operation is expensive, can be used for error reporting. + */ + std::string GetTypeKey() const { + return TypeIndex2Key(type_index_); + } + /*! + * \return A hash value of the return of GetTypeKey. + */ + size_t GetTypeKeyHash() const { + return TypeIndex2KeyHash(type_index_); + } /*! * Check if the object is an instance of TargetType. * \tparam TargetType The target type to be checked. @@ -159,19 +173,65 @@ class Object { template inline bool IsInstance() const; + /*! + * \brief Get the type key of the corresponding index from runtime. + * \param tindex The type index. + * \return the result. + */ + TVM_DLL static std::string TypeIndex2Key(uint32_t tindex); + /*! + * \brief Get the type key hash of the corresponding index from runtime. + * \param tindex The type index. + * \return the related key-hash. + */ + TVM_DLL static size_t TypeIndex2KeyHash(uint32_t tindex); + /*! + * \brief Get the type index of the corresponding key from runtime. + * \param key The type key. + * \return the result. + */ + TVM_DLL static uint32_t TypeKey2Index(const std::string& key); + #if TVM_OBJECT_ATOMIC_REF_COUNTER using RefCounterType = std::atomic; #else using RefCounterType = int32_t; #endif - // Object type properties static constexpr const char* _type_key = "Object"; + + static uint32_t _GetOrAllocRuntimeTypeIndex() { + return TypeIndex::kRoot; + } + static uint32_t RuntimeTypeIndex() { + return TypeIndex::kRoot; + } + + // Default object type properties for sub-classes static constexpr bool _type_final = false; static constexpr uint32_t _type_child_slots = 0; static constexpr bool _type_child_slots_can_overflow = true; - static const uint32_t _GetOrAllocRuntimeTypeIndex() { - return 0; + // NOTE: the following field is not type index of Object + // but was intended to be used by sub-classes as default value. 
+ // The type index of Object is TypeIndex::kRoot + static constexpr uint32_t _type_index = TypeIndex::kDynamic; + + // Default constructor and copy constructor + Object() {} + // Override the copy and assign constructors to do nothing. + // This is to make sure only contents, but not deleter and ref_counter + // are copied when a child class copies itself. + // This will enable us to use make_object(*obj_ptr) + // to copy an existing object. + Object(const Object& other) { // NOLINT(*) + } + Object(Object&& other) { // NOLINT(*) + } + Object& operator=(const Object& other) { //NOLINT(*) + return *this; + } + Object& operator=(Object&& other) { //NOLINT(*) + return *this; } protected: @@ -209,25 +269,12 @@ class Object { * \return The allocated type index. */ TVM_DLL static uint32_t GetOrAllocRuntimeTypeIndex( - const char* key, + const std::string& key, uint32_t static_tindex, uint32_t parent_tindex, uint32_t type_child_slots, bool type_child_slots_can_overflow); - /*! - * \brief Get the type key of the corresponding index from runtime. - * \param tindex The type index. - */ - TVM_DLL static std::string TypeIndex2Key(uint32_t tindex); - - /*! - * \brief Get the type index of the corresponding key from runtime. - * \param key The type key. - */ - TVM_DLL static uint32_t TypeKey2Index(const char* key); - - private: // reference counter related operations /*! \brief developer function, increases reference counter. */ inline void IncRef(); @@ -253,8 +300,35 @@ class Object { template friend class ObjectPtr; friend class TVMRetValue; + friend class TVMObjectCAPI; }; +/*! + * \brief Get a reference type from a raw object ptr type + * + * It is always important to get a reference type + * if we want to return a value as reference or keep + * the node alive beyond the scope of the function. + * + * \param ptr The node pointer + * \tparam RefType The reference type + * \tparam ObjectType The node type + * \return The corresponding RefType + */ +template +inline RefType GetRef(const ObjectType* ptr); + +/*! + * \brief Downcast a base reference type to a more specific type. + * + * \param ref The inptut reference + * \return The corresponding SubRef. + * \tparam SubRef The target specific reference type. + * \tparam BaseRef the current reference type. + */ +template +inline SubRef Downcast(BaseRef ref); + /*! * \brief A custom smart pointer for Object. * \tparam T the content data type. @@ -388,7 +462,7 @@ class ObjectPtr { /*! \brief internal pointer field */ Object* data_{nullptr}; /*! - * \brief constructor from NodeBase + * \brief constructor from Object * \param data The data pointer */ explicit ObjectPtr(Object* data) : data_(data) { @@ -399,6 +473,7 @@ class ObjectPtr { // friend classes friend class Object; friend class ObjectRef; + friend struct ObjectHash; template friend class ObjectPtr; template @@ -406,6 +481,9 @@ class ObjectPtr { friend class TVMPODValue_; friend class TVMArgsSetter; friend class TVMRetValue; + friend class TVMArgValue; + template + friend RefType GetRef(const ObjType* ptr); }; /*! \brief Base class of all object reference */ @@ -415,10 +493,54 @@ class ObjectRef { ObjectRef() = default; /*! \brief Constructor from existing object ptr */ explicit ObjectRef(ObjectPtr data) : data_(data) {} + /*! + * \brief Comparator + * \param other Another object ref. + * \return the compare result. + */ + bool same_as(const ObjectRef& other) const { + return data_ == other.data_; + } + /*! + * \brief Comparator + * \param other Another object ref. + * \return the compare result. 
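+ * \note Like same_as, this compares the underlying object pointers, + * so two references are equal only if they refer to the same object.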
+ */ + bool operator==(const ObjectRef& other) const { + return data_ == other.data_; + } + /*! + * \brief Comparator + * \param other Another node ref. + * \return the compare result. + */ + bool operator!=(const ObjectRef& other) const { + return data_ != other.data_; + } + /*! + * \brief Comparator + * \param other Another object ref by address. + * \return the compare result. + */ + bool operator<(const ObjectRef& other) const { + return data_.get() < other.data_.get(); + } + /*! \return whether the expression is null */ + bool defined() const { + return data_ != nullptr; + } /*! \return the internal object pointer */ - inline const Object* get() const; + const Object* get() const { + return data_.get(); + } /*! \return the internal node pointer */ - inline const Object* operator->() const; + const Object* operator->() const { + return get(); + } + /*! \return whether the reference is unique */ + bool unique() const { + return data_.unique(); + } /*! * \brief Try to downcast the internal Object to a * raw pointer of a corresponding type. @@ -433,25 +555,81 @@ class ObjectRef { template inline const ObjectType* as() const; - /*! \brief type indicate the container type */ + /*! \brief type indicate the container type. */ using ContainerType = Object; protected: /*! \brief Internal pointer that backs the reference. */ ObjectPtr data_; + /*! \return return a mutable internal ptr, can be used by sub-classes. */ + Object* get_mutable() const { + return data_.get(); + } + /*! + * \brief Internal helper function downcast a ref without check. + * \note Only used for internal dev purposes. + * \tparam T The target reference type. + * \return The casted result. + */ + template + static T DowncastNoCheck(ObjectRef ref) { + return T(std::move(ref.data_)); + } + /*! + * \brief Internal helper function get data_ as ObjectPtr of ObjectType. + * \note only used for internal dev purpose. + * \tparam ObjectType The corresponding object type. + * \return the corresponding type. + */ + template + static ObjectPtr GetDataPtr(const ObjectRef& ref) { + return ObjectPtr(ref.data_.data_); + } // friend classes. + friend struct ObjectHash; friend class TVMRetValue; friend class TVMArgsSetter; + template + friend SubRef Downcast(BaseRef ref); }; + +/*! \brief ObjectRef hash functor */ +struct ObjectHash { + size_t operator()(const ObjectRef& a) const { + return operator()(a.data_); + } + + template + size_t operator()(const ObjectPtr& a) const { + return std::hash()(a.get()); + } +}; + + +/*! \brief ObjectRef equal functor */ +struct ObjectEqual { + bool operator()(const ObjectRef& a, const ObjectRef& b) const { + return a.same_as(b); + } + + template + size_t operator()(const ObjectPtr& a, const ObjectPtr& b) const { + return a == b; + } +}; + + /*! * \brief helper macro to declare a base object type that can be inheritated. * \param TypeName The name of the current type. 
* \param ParentType The name of the ParentType */ #define TVM_DECLARE_BASE_OBJECT_INFO(TypeName, ParentType) \ - static const uint32_t type_index() { \ - if (_type_index != TypeIndex::kDynamic) return _type_index; \ + static const uint32_t RuntimeTypeIndex() { \ + if (TypeName::_type_index != ::tvm::runtime::TypeIndex::kDynamic) { \ + return TypeName::_type_index; \ + } \ return _GetOrAllocRuntimeTypeIndex(); \ } \ static const uint32_t _GetOrAllocRuntimeTypeIndex() { \ @@ -550,11 +728,11 @@ inline bool Object::IsInstance() const { if (TargetType::_type_final) { // if the target type is a final type // then we only need to check the equivalence. - return self->type_index_ == TargetType::type_index(); + return self->type_index_ == TargetType::RuntimeTypeIndex(); } else { // if target type is a non-leaf type // Check if type index falls into the range of reserved slots. - uint32_t begin = TargetType::type_index(); + uint32_t begin = TargetType::RuntimeTypeIndex(); // The condition will be optimized by constant-folding. if (TargetType::_type_child_slots != 0) { uint32_t end = begin + TargetType::_type_child_slots; @@ -564,22 +742,15 @@ inline bool Object::IsInstance() const { } if (!TargetType::_type_child_slots_can_overflow) return false; // Invariance: parent index is always smaller than the child. - if (self->type_index_ < TargetType::type_index()) return false; + if (self->type_index_ < TargetType::RuntimeTypeIndex()) return false; // The rare slower-path, check type hierachy. - return self->DerivedFrom(TargetType::type_index()); + return self->DerivedFrom(TargetType::RuntimeTypeIndex()); } } else { return false; } } -inline const Object* ObjectRef::get() const { - return data_.data_; -} - -inline const Object* ObjectRef::operator->() const { - return get(); -} template inline const ObjectType* ObjectRef::as() const { @@ -590,7 +761,27 @@ inline const ObjectType* ObjectRef::as() const { return nullptr; } } + +template +inline RefType GetRef(const ObjType* ptr) { + static_assert(std::is_base_of::value, + "Can only cast to the ref of same container type"); + return RefType(ObjectPtr(const_cast(static_cast(ptr)))); +} + +template +inline SubRef Downcast(BaseRef ref) { + CHECK(ref->template IsInstance()) + << "Downcast from " << ref->GetTypeKey() << " to " + << SubRef::ContainerType::_type_key << " failed."; + return SubRef(std::move(ref.data_)); +} + } // namespace runtime + +template +using NodePtr = runtime::ObjectPtr; + } // namespace tvm #endif // TVM_RUNTIME_OBJECT_H_ diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h index 5b71bbc66142..a42946ac2d2c 100644 --- a/include/tvm/runtime/packed_func.h +++ b/include/tvm/runtime/packed_func.h @@ -40,7 +40,6 @@ #include "module.h" #include "ndarray.h" #include "object.h" -#include "node_base.h" // Whether use TVM runtime in header only mode. 
#ifndef TVM_RUNTIME_HEADER_ONLY @@ -490,9 +489,12 @@ class TVMPODValue_ { return NDArray(static_cast(value_.v_handle)); } operator ObjectRef() const { - if (type_code_ == kNull) return ObjectRef(ObjectPtr(nullptr)); - TVM_CHECK_TYPE_CODE(type_code_, kObjectCell); - return ObjectRef(ObjectPtr(static_cast(value_.v_handle))); + if (type_code_ == kNull) { + return ObjectRef(ObjectPtr(nullptr)); + } + TVM_CHECK_TYPE_CODE(type_code_, kObjectHandle); + return ObjectRef( + ObjectPtr(static_cast(value_.v_handle))); } operator TVMContext() const { TVM_CHECK_TYPE_CODE(type_code_, kTVMContext); @@ -512,9 +514,14 @@ class TVMPODValue_ { CHECK_LT(type_code_, kExtEnd); return static_cast(value_.v_handle)[0]; } + template::value>::type> + inline bool IsObjectRef() const; int type_code() const { return type_code_; } + /*! * \brief return handle as specific pointer type. * \tparam T the data type. @@ -567,6 +574,7 @@ class TVMArgValue : public TVMPODValue_ { using TVMPODValue_::operator NDArray; using TVMPODValue_::operator TVMContext; using TVMPODValue_::operator ObjectRef; + using TVMPODValue_::IsObjectRef; // conversion operator. operator std::string() const { @@ -610,21 +618,15 @@ class TVMArgValue : public TVMPODValue_ { return value_; } // Deferred extension handler. - template - inline TNodeRef AsNodeRef() const; + template + inline TObjectRef AsObjectRef() const; template::value>::type> inline operator T() const; - template::value>::type> - inline bool IsNodeType() const; inline operator tvm::DataType() const; inline operator tvm::Expr() const; inline operator tvm::Integer() const; - // get internal node ptr, if it is node - inline NodePtr& node_sptr(); }; /*! @@ -663,6 +665,8 @@ class TVMRetValue : public TVMPODValue_ { using TVMPODValue_::operator TVMContext; using TVMPODValue_::operator NDArray; using TVMPODValue_::operator ObjectRef; + using TVMPODValue_::IsObjectRef; + TVMRetValue(const TVMRetValue& other) : TVMPODValue_() { this->Assign(other); } @@ -760,11 +764,19 @@ class TVMRetValue : public TVMPODValue_ { return *this; } TVMRetValue& operator=(ObjectRef other) { - this->Clear(); - type_code_ = kObjectCell; - // move the handle out - value_.v_handle = other.data_.data_; - other.data_.data_ = nullptr; + return operator=(std::move(other.data_)); + } + template + TVMRetValue& operator=(ObjectPtr other) { + if (other.data_ != nullptr) { + this->Clear(); + type_code_ = kObjectHandle; + // move the handle out + value_.v_handle = other.data_; + other.data_ = nullptr; + } else { + SwitchToPOD(kNull); + } return *this; } TVMRetValue& operator=(PackedFunc f) { @@ -814,21 +826,19 @@ class TVMRetValue : public TVMPODValue_ { } /*! 
 \return The value field, if the data is POD
   */
  const TVMValue& value() const {
-    CHECK(type_code_ != kNodeHandle &&
+    CHECK(type_code_ != kObjectHandle &&
           type_code_ != kFuncHandle &&
           type_code_ != kModuleHandle &&
           type_code_ != kStr) << "TVMRetValue.value can only be used for POD data";
     return value_;
   }
-  // NodeRef related extenstions: in tvm/packed_func_ext.h
+  // ObjectRef related extensions: in tvm/packed_func_ext.h
   template<typename T,
            typename = typename std::enable_if<
              std::is_class<T>::value>::type>
   inline operator T() const;
-  template<typename TNodeRef>
-  inline TNodeRef AsNodeRef() const;
-  inline TVMRetValue& operator=(const NodeRef& other);
-  inline TVMRetValue& operator=(const NodePtr<Node>& other);
+  template<typename TObjectRef>
+  inline TObjectRef AsObjectRef() const;
   // type related
   inline operator tvm::DataType() const;
   inline TVMRetValue& operator=(const tvm::DataType& other);
@@ -857,12 +867,7 @@ class TVMRetValue : public TVMPODValue_ {
         *this = other.operator NDArray();
         break;
       }
-      case kNodeHandle: {
-        SwitchToClass<NodePtr<Node> >(
-            kNodeHandle, *other.template ptr<NodePtr<Node> >());
-        break;
-      }
-      case kObjectCell: {
+      case kObjectHandle: {
         *this = other.operator ObjectRef();
         break;
       }
@@ -908,12 +913,11 @@
       case kStr: delete ptr<std::string>(); break;
       case kFuncHandle: delete ptr<PackedFunc>(); break;
       case kModuleHandle: delete ptr<Module>(); break;
-      case kNodeHandle: delete ptr<NodePtr<Node> >(); break;
       case kNDArrayContainer: {
         static_cast<NDArray::Container*>(value_.v_handle)->DecRef();
         break;
       }
-      case kObjectCell: {
+      case kObjectHandle: {
         static_cast<Object*>(value_.v_handle)->DecRef();
         break;
       }
@@ -939,14 +943,13 @@ inline const char* TypeCode2Str(int type_code) {
     case kBytes: return "bytes";
     case kHandle: return "handle";
     case kNull: return "NULL";
-    case kNodeHandle: return "NodeHandle";
     case kArrayHandle: return "ArrayHandle";
     case kTVMType: return "TVMType";
     case kTVMContext: return "TVMContext";
     case kFuncHandle: return "FunctionHandle";
     case kModuleHandle: return "ModuleHandle";
     case kNDArrayContainer: return "NDArrayContainer";
-    case kObjectCell: return "ObjectCell";
+    case kObjectHandle: return "ObjectHandle";
     default: LOG(FATAL) << "unknown type_code="
              << static_cast<int>(type_code); return "";
   }
@@ -1057,8 +1060,6 @@ inline PackedFunc::FType PackedFunc::body() const {
   return body_;
 }
-
-
 // internal namespace
 namespace detail {
@@ -1163,8 +1164,12 @@ class TVMArgsSetter {
     type_codes_[i] = kNDArrayContainer;
   }
   void operator()(size_t i, const ObjectRef& value) const {  // NOLINT(*)
-    values_[i].v_handle = value.data_.data_;
-    type_codes_[i] = kObjectCell;
+    if (value.defined()) {
+      values_[i].v_handle = value.data_.data_;
+      type_codes_[i] = kObjectHandle;
+    } else {
+      type_codes_[i] = kNull;
+    }
   }
   void operator()(size_t i, const TVMRetValue& value) const {  // NOLINT(*)
     if (value.type_code() == kStr) {
@@ -1181,8 +1186,6 @@
            typename = typename std::enable_if<
              extension_type_info<T>::code != 0>::type>
   inline void operator()(size_t i, const T& value) const;
-  // NodeRef related extenstions: in tvm/packed_func_ext.h
-  inline void operator()(size_t i, const NodeRef& other) const;  // NOLINT(*)
   inline void operator()(size_t i, const tvm::DataType& t) const;

 private:
@@ -1301,7 +1304,7 @@
 template<typename T, typename TSrc, bool is_ext, bool is_nd>
 struct TVMValueCast {
   static T Apply(const TSrc* self) {
     static_assert(!is_ext && !is_nd, "The default case accepts only non-extensions");
-    return self->template AsNodeRef<T>();
+    return self->template AsObjectRef<T>();
   }
 };
diff --git a/include/tvm/runtime/registry.h b/include/tvm/runtime/registry.h
index 40e1a520cb67..d668984f50e2 100644
--- a/include/tvm/runtime/registry.h
+++ b/include/tvm/runtime/registry.h
@@ -91,7 +91,7 @@ class Registry {
  * Note that this will ignore default arg values and always require all arguments to be provided.
  *
  * \code
- *
+ *
  * int multiply(int x, int y) {
  *   return x * y;
  * }
@@ -115,7 +115,7 @@ class Registry {
  * Note that this will ignore default arg values and always require all arguments to be provided.
  *
  * \code
- *
+ *
  * // node subclass:
  * struct Example {
  *    int doThing(int x);
  * }
@@ -143,7 +143,7 @@ class Registry {
  * Note that this will ignore default arg values and always require all arguments to be provided.
  *
  * \code
- *
+ *
  * // node subclass:
  * struct Example {
  *    int doThing(int x);
  * }
@@ -168,22 +168,22 @@ class Registry {

  /*!
   * \brief set the body of the function to be the passed method pointer.
-   * Used when calling a method on a Node subclass through a NodeRef subclass.
+   * Used when calling a method on a Node subclass through an ObjectRef subclass.
   * Note that this will ignore default arg values and always require all arguments to be provided.
   *
   * \code
-   *
+   *
   * // node subclass:
   * struct ExampleNode: BaseNode {
   *    int doThing(int x);
   * }
-   *
+   *
   * // noderef subclass
-   * struct Example;
+   * struct Example;
   *
   * TVM_REGISTER_API("Example_doThing")
   * .set_body_method<Example>(&ExampleNode::doThing); // will have type int(Example, int)
-   *
+   *
   * // note that just doing:
   * // .set_body_method(&ExampleNode::doThing);
   * // wouldn't work, because ExampleNode can't be taken from a TVMArgValue.
@@ -191,15 +191,15 @@ class Registry {
   * \endcode
   *
   * \param f the method pointer to forward to.
-   * \tparam TNodeRef the node reference type to call the method on
+   * \tparam TObjectRef the object reference type to call the method on
   * \tparam TNode the node type containing the method (inferred).
   * \tparam R the return type of the function (inferred).
   * \tparam Args the argument types of the function (inferred).
   */
-  template<typename TNodeRef, typename TNode, typename R, typename... Args,
-           typename = typename std::enable_if<
-             std::is_base_of<NodeRef, TNodeRef>::value>::type>
+  template<typename TObjectRef, typename TNode, typename R, typename... Args,
+           typename = typename std::enable_if<
+             std::is_base_of<ObjectRef, TObjectRef>::value>::type>
  Registry& set_body_method(R (TNode::*f)(Args...)) {
-    return set_body_typed<R(TNodeRef, Args...)>([f](TNodeRef ref, Args... params) {
+    return set_body_typed<R(TObjectRef, Args...)>([f](TObjectRef ref, Args... params) {
      TNode* target = ref.operator->();
      // call method pointer
      return (target->*f)(params...);
@@ -208,22 +208,22 @@ class Registry {

  /*!
   * \brief set the body of the function to be the passed method pointer.
-   * Used when calling a method on a Node subclass through a NodeRef subclass.
+   * Used when calling a method on a Node subclass through an ObjectRef subclass.
   * Note that this will ignore default arg values and always require all arguments to be provided.
   *
   * \code
-   *
+   *
   * // node subclass:
   * struct ExampleNode: BaseNode {
   *    int doThing(int x);
   * }
-   *
+   *
   * // noderef subclass
-   * struct Example;
+   * struct Example;
   *
   * TVM_REGISTER_API("Example_doThing")
   * .set_body_method<Example>(&ExampleNode::doThing); // will have type int(Example, int)
-   *
+   *
   * // note that just doing:
   * // .set_body_method(&ExampleNode::doThing);
   * // wouldn't work, because ExampleNode can't be taken from a TVMArgValue.
@@ -231,15 +231,15 @@ class Registry {
   * \endcode
   *
   * \param f the method pointer to forward to.
-   * \tparam TNodeRef the node reference type to call the method on
+   * \tparam TObjectRef the object reference type to call the method on
   * \tparam TNode the node type containing the method (inferred).
   * \tparam R the return type of the function (inferred).
   * \tparam Args the argument types of the function (inferred).
   */
-  template<typename TNodeRef, typename TNode, typename R, typename... Args,
-           typename = typename std::enable_if<
-             std::is_base_of<NodeRef, TNodeRef>::value>::type>
+  template<typename TObjectRef, typename TNode, typename R, typename... Args,
+           typename = typename std::enable_if<
+             std::is_base_of<ObjectRef, TObjectRef>::value>::type>
  Registry& set_body_method(R (TNode::*f)(Args...) const) {
-    return set_body_typed<R(TNodeRef, Args...)>([f](TNodeRef ref, Args...
params) { + return set_body_typed([f](TObjectRef ref, Args... params) { const TNode* target = ref.operator->(); // call method pointer return (target->*f)(params...); diff --git a/include/tvm/runtime/vm.h b/include/tvm/runtime/vm.h index aa8543d569af..ee973cb62092 100644 --- a/include/tvm/runtime/vm.h +++ b/include/tvm/runtime/vm.h @@ -26,6 +26,7 @@ #include #include +#include #include #include #include @@ -56,31 +57,31 @@ class Tensor : public ObjectRef { /*! \brief An object representing a structure or enumeration. */ -class DatatypeObj : public Object { +class ADTObj : public Object { public: /*! \brief The tag representing the constructor used. */ size_t tag; /*! \brief The fields of the structure. */ std::vector fields; - static constexpr const uint32_t _type_index = TypeIndex::kVMDatatype; - static constexpr const char* _type_key = "vm.Datatype"; - TVM_DECLARE_FINAL_OBJECT_INFO(DatatypeObj, Object); + static constexpr const uint32_t _type_index = TypeIndex::kVMADT; + static constexpr const char* _type_key = "vm.ADT"; + TVM_DECLARE_FINAL_OBJECT_INFO(ADTObj, Object); }; -/*! \brief reference to data type. */ -class Datatype : public ObjectRef { +/*! \brief reference to algebraic data type objects. */ +class ADT : public ObjectRef { public: - Datatype(size_t tag, std::vector fields); + ADT(size_t tag, std::vector fields); /*! * \brief construct a tuple object. * \param fields The fields of the tuple. * \return The constructed tuple type. */ - static Datatype Tuple(std::vector fields); + static ADT Tuple(std::vector fields); - TVM_DEFINE_OBJECT_REF_METHODS(Datatype, ObjectRef, DatatypeObj); + TVM_DEFINE_OBJECT_REF_METHODS(ADT, ObjectRef, ADTObj); }; /*! \brief An object representing a closure. */ @@ -128,7 +129,7 @@ enum class Opcode { InvokePacked = 4U, AllocTensor = 5U, AllocTensorReg = 6U, - AllocDatatype = 7U, + AllocADT = 7U, AllocClosure = 8U, GetField = 9U, If = 10U, @@ -236,7 +237,7 @@ struct Instruction { /*! \brief The register to project from. */ RegName object; } get_tag; - struct /* AllocDatatype Operands */ { + struct /* AllocADT Operands */ { /*! \brief The datatype's constructor tag. */ Index constructor_tag; /*! \brief The number of fields to store in the datatype. */ @@ -293,7 +294,7 @@ struct Instruction { * \param dst The register name of the destination. * \return The allocate instruction tensor. */ - static Instruction AllocDatatype(Index tag, Index num_fields, const std::vector& fields, + static Instruction AllocADT(Index tag, Index num_fields, const std::vector& fields, RegName dst); /*! \brief Construct an allocate closure instruction. * \param func_index The index of the function table. @@ -430,15 +431,184 @@ struct VMFrame { caller_return_register(0) {} }; +/*! \brief The executable emitted by the VM compiler. + * + * The executable contains information (e.g. data in different memory regions) + * to run in a virtual machine. + * + * - Global section, containing all globals. + * - Constant section, storing the constant pool. + * - Primitive name section, containing the function name of the primitive ops + * used by the virtual machine. + * - Code section, handling the VM functions and bytecode. + */ +class Executable : public ModuleNode { + public: + /*! + * \brief Get a PackedFunc from an executable module. + * + * \param name the name of the function. + * \param sptr_to_self The shared_ptr that points to this module node. + * + * \return PackedFunc or nullptr when it is not available. 
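Stepping out of the Executable declaration for a moment: the Datatype-to-ADT rename above is easiest to see in use. A hedged sketch follows (not code from this patch; `a` and `b` stand in for real VM values such as vm::Tensor objects):

    // Hedged sketch of the renamed runtime ADT API (Datatype -> ADT).
    ADT MakePair(ObjectRef a, ObjectRef b) {
      std::vector<ObjectRef> fields{a, b};
      ADT tuple = ADT::Tuple(fields);  // tuples reuse the same ADT machinery
      ADT ctor0(0, fields);            // a value built with constructor tag 0
      CHECK_EQ(ctor0->tag, 0U);        // tag and fields live on the ADTObj payload
      return tuple;
    }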
+   */
+  PackedFunc GetFunction(const std::string& name,
+                         const std::shared_ptr<ModuleNode>& sptr_to_self) final;
+
+  /*!
+   * \brief Serialize the executable into global section, constant section, and
+   * code section.
+   *
+   * \return The binary representation of the VM.
+   */
+  TVMByteArray Save();
+
+  /*!
+   * \brief Load the saved VM executable.
+   *
+   * \param code The bytecode as a string.
+   * \param lib The compiled runtime library.
+   *
+   * \return The constructed executable.
+   */
+  static runtime::Module Load(const std::string& code, const runtime::Module lib);
+
+  /*!
+   * \brief Get the serialized form of the `functions`. This is
+   * essentially bytecode serialization.
+   *
+   * \return The serialized vm bytecode.
+   *
+   * \note The bytecode is in the following format:
+   *   func_name reg_file_size num_instructions
+   *   param1 param2 ... paramM
+   *   instruction1
+   *   instruction2
+   *   ...
+   *   instructionN
+   *
+   * Each instruction is printed in the following format:
+   *   opcode num_fields field1 ... fieldX # The text format.
+   *
+   * Serializing an `Instruction` requires us to deal with the bytecode. Each line
+   * of the instructions could be serialized as the following format:
+   *   hash, opcode, f1, f2, ..., fX, field with variable length
+   * 1. hash: the hash of the instruction. This number will be used to help us
+   * validate if an instruction is well-formed during deserialization.
+   * 2. opcode: the opcode of the instruction.
+   * 3. f1, f2, ..., fX. These fields together represent the fixed fields in
+   * an instruction, e.g., `from` and `dst` fields of a `Move` instruction. For
+   * example, `DLDataType` will be unpacked into three fields (code, bits, lanes).
+   * 4. The rest of the line indicates the field with variable length, e.g.,
+   * the shape of a tensor, the args used by an `InvokePacked` instruction, etc.
+
+   * The field starting from # is only used for debugging. The serialized code
+   * doesn't contain it, therefore the deserializer doesn't need to handle it.
+   */
+  std::string GetBytecode() const;
+
+  /*!
+   * \brief Print the detailed statistics of the given code, i.e. number of
+   * globals and constants, etc.
+   */
+  std::string Stats() const;
+
+  /*! \brief Get the `lib` module in an executable. Users have the flexibility to call
+   * `export_library` from the frontend to save the library to disk.
+   *
+   * \return The runtime module that contains the hardware-dependent code.
+   */
+  runtime::Module GetLib() const { return lib; }
+
+  virtual ~Executable() {}
+
+  const char* type_key() const final {
+    return "VMExecutable";
+  }
+
+  /*! \brief The runtime module/library that contains both the host and also the device
+   * code when executing on non-CPU devices. */
+  runtime::Module lib;
+  /*! \brief The global constant pool. */
+  std::vector<ObjectRef> constants;
+  /*! \brief A map from globals (as strings) to their index in the function map. */
+  std::unordered_map<std::string, Index> global_map;
+  /*! \brief A mapping from the packed function (as string) to the index that
+   * corresponds to the position of the `packed_funcs` list in a `VirtualMachine` object.
+   */
+  std::unordered_map<std::string, Index> primitive_map;
+  /*! \brief The virtual machine's function table. */
+  std::vector<VMFunction> functions;
+
+ private:
+  /*!
+   * \brief Save the globals.
+   *
+   * \param strm The output stream.
+   */
+  void SaveGlobalSection(dmlc::Stream* strm);
+
+  /*!
+   * \brief Save the constant pool.
+   *
+   * \param strm The output stream.
+   */
+  void SaveConstantSection(dmlc::Stream* strm);
+
+  /*!
+   * \brief Save primitive op names.
+   *
+   * \param strm The output stream.
+   */
+  void SavePrimitiveOpNames(dmlc::Stream* strm);
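A brief usage sketch may help reviewers here: round-tripping an executable through the new interface. This is hypothetical glue code, not part of the patch; how `exe` is obtained from the VM compiler frontend is out of scope.

    // Usage sketch only, under the assumptions stated above.
    void RoundTrip(Executable* exe) {
      TVMByteArray code = exe->Save();      // global/constant/primitive/code sections
      runtime::Module lib = exe->GetLib();  // hardware-dependent kernels, saved separately
      LOG(INFO) << exe->GetBytecode();      // text listing in the format documented above
      LOG(INFO) << exe->Stats();            // counts of globals, constants, etc.
      std::string blob(code.data, code.size);
      runtime::Module reloaded = Executable::Load(blob, lib);
      (void)reloaded;
    }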
+
+  /*!
+   * \brief Save the vm functions.
+   *
+   * \param strm The output stream.
+   */
+  void SaveCodeSection(dmlc::Stream* strm);
+
+  /*!
+   * \brief Load the globals.
+   *
+   * \param strm The input stream.
+   */
+  void LoadGlobalSection(dmlc::Stream* strm);
+
+  /*!
+   * \brief Load the constant pool.
+   *
+   * \param strm The input stream.
+   */
+  void LoadConstantSection(dmlc::Stream* strm);
+
+  /*!
+   * \brief Load primitive op names.
+   *
+   * \param strm The input stream.
+   */
+  void LoadPrimitiveOpNames(dmlc::Stream* strm);
+
+  /*!
+   * \brief Load the vm functions.
+   *
+   * \param strm The input stream.
+   */
+  void LoadCodeSection(dmlc::Stream* strm);
+
+  /*! \brief The serialized bytecode. */
+  std::string code_;
+};

 /*! \brief The virtual machine.
  *
  * The virtual machine contains all the current execution state,
- * as well as the global view of functions, the global constant
- * table, the compiled operators.
+ * as well as the executable.
  *
  * The goal is to have a single self-contained object,
  * enabling one to easily pass around VMs, execute them on
- * multiple threads, or serialized them to disk or over the
+ * multiple threads, or serialize them to disk or over the
  * wire.
  */
 class VirtualMachine : public runtime::ModuleNode {
@@ -486,16 +656,18 @@ class VirtualMachine : public runtime::ModuleNode {
     return "VirtualMachine";
   }

-  /*! \brief The runtime module/library that contains generated code. */
-  runtime::Module lib;
+  VirtualMachine() : frames(), func_index(0), code(nullptr), pc(0), exec(nullptr) {}
+
+  /*! \brief Load the executable for the virtual machine.
+   *  \param exec The executable.
+   */
+  void LoadExecutable(const Executable* exec);
+
+ protected:
   /*! \brief The virtual machine's packed function table. */
   std::vector<PackedFunc> packed_funcs;
-  /*! \brief The virtual machine's function table. */
-  std::vector<VMFunction> functions;
   /*! \brief The current stack of call frames. */
   std::vector<VMFrame> frames;
-  /*! \brief The global constant pool. */
-  std::vector<ObjectRef> constants;
   /*! \brief The function table index of the current function. */
   Index func_index;
   /*! \brief The current pointer to the code section. */
@@ -506,6 +678,9 @@
   /*! \brief The special return register. */
   ObjectRef return_register;

+  /*! \brief The executable the VM will operate on. */
+  const Executable* exec;
+
   /*! \brief The set of TVM contexts the VM is currently executing on. */
   std::vector<TVMContext> ctxs;
@@ -550,8 +725,6 @@
   */
  ObjectRef Invoke(const std::string& name, const std::vector<ObjectRef>& args);

-  VirtualMachine() : functions(), frames(), func_index(0), code(nullptr), pc(0) {}
-
  /*! \brief Initialize the virtual machine for a set of contexts.
   *  \param contexts The set of TVM contexts.
   */
@@ -565,21 +738,6 @@
   */
  TVMContext GetParamsContext() const;

-  /*!
-   * \brief Load parameters from the parameter bytearray.
-   * \param params The binary file that contains parameters.
-   */
-  void LoadParams(const std::string& params);
-
-  /*! \brief A map from globals (as strings) to their index in the function map.
-   */
-  std::unordered_map<std::string, Index> global_map;
-
-  /*! \brief A mapping from the packed function (as string) to the index that
-   * corresponds to the position of the `packed_funcs` list.
-   */
-  std::unordered_map<std::string, Index> primitive_map;
-
  private:
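Taken together, the refactor changes how a VM is driven: it no longer owns functions and constants itself but borrows them from a loaded Executable. A minimal, hedged sketch of the intended sequence (not code from the patch; it assumes the tvm::runtime::vm namespace, and member visibility at real call sites may differ):

    ObjectRef RunMain(const Executable* exe, const std::vector<ObjectRef>& args) {
      VirtualMachine vm;
      vm.LoadExecutable(exe);   // wires up code, constants and primitive functions
      TVMContext ctx;
      ctx.device_type = kDLCPU;
      ctx.device_id = 0;
      vm.Init({ctx});           // the contexts the VM executes on
      return vm.Invoke("main", args);
    }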
  /*! \brief Invoke a global setting up the VM state to execute.
   *
@@ -589,6 +747,12 @@ class VirtualMachine : public runtime::ModuleNode {

   /*! \brief The parameter name to data mapping. */
   std::unordered_map<std::string, ObjectRef> params_;
+
+  /*!
+   * \brief The constant pool for runtime. It caches the device-dependent
+   * objects to avoid relocation of constants during inference.
+   */
+  std::vector<ObjectRef> const_pool_;
 };

 }  // namespace vm
diff --git a/include/tvm/schedule.h b/include/tvm/schedule.h
index af3e929ac3fa..3f4ee38a7695 100644
--- a/include/tvm/schedule.h
+++ b/include/tvm/schedule.h
@@ -56,7 +56,7 @@ enum AttachType : int {
 class Stage : public NodeRef {
  public:
   Stage() {}
-  explicit Stage(NodePtr<Node> n) : NodeRef(n) {}
+  explicit Stage(ObjectPtr<Object> n) : NodeRef(n) {}
   /*!
    * \brief create a new schedule for op.
    * \param op The operator in the schedule
@@ -280,7 +280,7 @@
 class Schedule : public NodeRef {
  public:
   Schedule() {}
-  explicit Schedule(NodePtr<Node> n) : NodeRef(n) {}
+  explicit Schedule(ObjectPtr<Object> n) : NodeRef(n) {}
   /*!
    * \brief Get a copy of current schedule.
    * \return The copied schedule.
@@ -403,7 +403,7 @@
 class IterVarRelation : public NodeRef {
  public:
   IterVarRelation() {}
-  explicit IterVarRelation(NodePtr<Node> n) : NodeRef(n) {}
+  explicit IterVarRelation(ObjectPtr<Object> n) : NodeRef(n) {}
   /*!
    * \brief access the internal node container
    * \return the pointer to the internal node container
@@ -417,7 +417,7 @@
 class IterVarAttr : public NodeRef {
  public:
   IterVarAttr() {}
-  explicit IterVarAttr(NodePtr<Node> n) : NodeRef(n) {}
+  explicit IterVarAttr(ObjectPtr<Object> n) : NodeRef(n) {}
   /*!
    * \brief access the internal node container
    * \return the pointer to the internal node container
@@ -495,7 +495,7 @@ class StageNode : public Node {
   /*! \brief Number of direct child stages, only used for group stage.*/
   int num_child_stages{0};

-  void VisitAttrs(AttrVisitor* v) final {
+  void VisitAttrs(AttrVisitor* v) {
     v->Visit("op", &op);
     v->Visit("origin_op", &origin_op);
     v->Visit("all_iter_vars", &all_iter_vars);
@@ -540,7 +540,7 @@ class ScheduleNode : public Node {
    */
   std::unordered_map<const Node*, Stage> op2stage_cache_;

-  void VisitAttrs(AttrVisitor* v) final {
+  void VisitAttrs(AttrVisitor* v) {
     v->Visit("outputs", &outputs);
     v->Visit("stages", &stages);
     v->Visit("groups", &groups);
@@ -617,7 +617,7 @@ class IterVarAttrNode : public Node {
    */
   Array<Expr> pragma_values;

-  void VisitAttrs(AttrVisitor* v) final {
+  void VisitAttrs(AttrVisitor* v) {
     v->Visit("iter_type", &iter_type);
     v->Visit("bind_thread", &bind_thread);
     v->Visit("prefetch_data", &prefetch_data);
@@ -657,7 +657,7 @@ class SplitNode : public IterVarRelationNode {
   /*! \brief Number of parts, only factor or nparts can be given */
   Expr nparts;

-  void VisitAttrs(AttrVisitor* v) final {
+  void VisitAttrs(AttrVisitor* v) {
     v->Visit("parent", &parent);
     v->Visit("outer", &outer);
     v->Visit("inner", &inner);
@@ -687,7 +687,7 @@ class FuseNode : public IterVarRelationNode {
   /*! \brief The target domain */
   IterVar fused;

-  void VisitAttrs(AttrVisitor* v) final {
+  void VisitAttrs(AttrVisitor* v) {
     v->Visit("outer", &outer);
     v->Visit("inner", &inner);
     v->Visit("fused", &fused);
@@ -712,7 +712,7 @@ class RebaseNode : public IterVarRelationNode {
   /*! \brief The inner domain */
   IterVar rebased;

-  void VisitAttrs(AttrVisitor* v) final {
+  void VisitAttrs(AttrVisitor* v) {
     v->Visit("parent", &parent);
     v->Visit("rebased", &rebased);
   }
@@ -732,7 +732,7 @@ class SingletonNode : public IterVarRelationNode {
  /*!
\brief The singleton iterator */ IterVar iter; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("iter", &iter); } @@ -745,25 +745,25 @@ class SingletonNode : public IterVarRelationNode { // implementations inline const StageNode* Stage::operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } inline StageNode* Stage::operator->() { - return static_cast(node_.get()); + return static_cast(get_mutable()); } inline const ScheduleNode* Schedule::operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } inline ScheduleNode* Schedule::operator->() { - return static_cast(node_.get()); + return static_cast(get_mutable()); } inline const IterVarRelationNode* IterVarRelation::operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } inline const IterVarAttrNode* IterVarAttr::operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } } // namespace tvm #endif // TVM_SCHEDULE_H_ diff --git a/include/tvm/target_info.h b/include/tvm/target_info.h index 1e3a7686ca00..86cb0e275609 100644 --- a/include/tvm/target_info.h +++ b/include/tvm/target_info.h @@ -47,7 +47,7 @@ struct MemoryInfoNode : public Node { */ Expr head_address; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("unit_bits", &unit_bits); v->Visit("max_num_bits", &max_num_bits); v->Visit("max_simd_bits", &max_simd_bits); diff --git a/include/tvm/tensor.h b/include/tvm/tensor.h index f37cc7bed7d1..599d6ff657d1 100644 --- a/include/tvm/tensor.h +++ b/include/tvm/tensor.h @@ -50,7 +50,7 @@ class Tensor : public NodeRef { public: /*! \brief default constructor, used internally */ Tensor() {} - explicit Tensor(NodePtr n) : NodeRef(n) {} + explicit Tensor(ObjectPtr n) : NodeRef(n) {} /*! * \brief access the internal node container * \return the pointer to the internal node container @@ -141,7 +141,7 @@ class Operation : public ir::FunctionRef { public: /*! \brief default constructor */ Operation() {} - explicit Operation(NodePtr n) : FunctionRef(n) {} + explicit Operation(ObjectPtr n) : FunctionRef(n) {} /*! * \brief access the internal node container * \return the pointer to the internal node container @@ -171,7 +171,7 @@ class TensorNode : public Node { /*! \brief constructor */ TensorNode() {} - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("shape", &shape); v->Visit("dtype", &dtype); v->Visit("op", &op); @@ -189,7 +189,7 @@ class TensorNode : public Node { // Implementations of inline functions inline const TensorNode* Tensor::operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } inline size_t Tensor::ndim() const { @@ -250,19 +250,17 @@ DEFINE_OVERLOAD_SLICE_BINARY_OP(<); // NOLINT(*) namespace std { template <> -struct hash<::tvm::Operation> { - std::size_t operator()(const ::tvm::Operation& k) const { - return k.hash(); - } +struct hash<::tvm::Operation> : public ::tvm::NodeHash { }; template <> struct hash<::tvm::Tensor> { std::size_t operator()(const ::tvm::Tensor& k) const { + ::tvm::NodeHash hasher; if (k.defined() && k->op.defined()) { - return k->op.hash(); + return hasher(k->op); } else{ - return k.hash(); + return hasher(k); } } }; diff --git a/include/tvm/tensor_intrin.h b/include/tvm/tensor_intrin.h index b5ca6eb4358b..0d4795ad5440 100644 --- a/include/tvm/tensor_intrin.h +++ b/include/tvm/tensor_intrin.h @@ -87,7 +87,7 @@ class TensorIntrinNode : public Node { /*! 
\brief constructor */ TensorIntrinNode() {} - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("name", &name); v->Visit("op", &op); v->Visit("inputs", &inputs); @@ -112,7 +112,7 @@ class TensorIntrinNode : public Node { }; inline const TensorIntrinNode* TensorIntrin::operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } // Internal node container of tensor intrinsic calling. @@ -152,7 +152,7 @@ class TensorIntrinCallNode : public Node { /*! \brief scalar expression inputs */ Array scalar_inputs; - void VisitAttrs(AttrVisitor* v) final { + void VisitAttrs(AttrVisitor* v) { v->Visit("intrin", &intrin); v->Visit("tensors", &tensors); v->Visit("regions", ®ions); @@ -170,7 +170,7 @@ class TensorIntrinCallNode : public Node { }; inline const TensorIntrinCallNode* TensorIntrinCall::operator->() const { - return static_cast(node_.get()); + return static_cast(get()); } } // namespace tvm diff --git a/jvm/core/src/main/java/ml/dmlc/tvm/rpc/ConnectProxyServerProcessor.java b/jvm/core/src/main/java/ml/dmlc/tvm/rpc/ConnectProxyServerProcessor.java index 2fc97f65aca4..04888f568be3 100644 --- a/jvm/core/src/main/java/ml/dmlc/tvm/rpc/ConnectProxyServerProcessor.java +++ b/jvm/core/src/main/java/ml/dmlc/tvm/rpc/ConnectProxyServerProcessor.java @@ -30,7 +30,6 @@ public class ConnectProxyServerProcessor implements ServerProcessor { private final String host; private final int port; private final String key; - private final SocketFileDescriptorGetter socketFileDescriptorGetter; private volatile Socket currSocket = new Socket(); private Runnable callback; @@ -40,14 +39,11 @@ public class ConnectProxyServerProcessor implements ServerProcessor { * @param host Proxy server host. * @param port Proxy server port. * @param key Proxy server key. - * @param sockFdGetter Method to get file descriptor from Java socket. 
*/ - public ConnectProxyServerProcessor(String host, int port, String key, - SocketFileDescriptorGetter sockFdGetter) { + public ConnectProxyServerProcessor(String host, int port, String key) { this.host = host; this.port = port; this.key = "server:" + key; - socketFileDescriptorGetter = sockFdGetter; } /** @@ -70,8 +66,8 @@ public void setStartTimeCallback(Runnable callback) { try { SocketAddress address = new InetSocketAddress(host, port); currSocket.connect(address, 6000); - InputStream in = currSocket.getInputStream(); - OutputStream out = currSocket.getOutputStream(); + final InputStream in = currSocket.getInputStream(); + final OutputStream out = currSocket.getOutputStream(); out.write(Utils.toBytes(RPC.RPC_MAGIC)); out.write(Utils.toBytes(key.length())); out.write(Utils.toBytes(key)); @@ -91,11 +87,10 @@ public void setStartTimeCallback(Runnable callback) { if (callback != null) { callback.run(); } - final int sockFd = socketFileDescriptorGetter.get(currSocket); - if (sockFd != -1) { - new NativeServerLoop(sockFd).run(); - System.err.println("Finish serving " + address); - } + + SocketChannel sockChannel = new SocketChannel(currSocket); + new NativeServerLoop(sockChannel.getFsend(), sockChannel.getFrecv()).run(); + System.err.println("Finish serving " + address); } catch (Throwable e) { e.printStackTrace(); throw new RuntimeException(e); diff --git a/jvm/core/src/main/java/ml/dmlc/tvm/rpc/ConnectTrackerServerProcessor.java b/jvm/core/src/main/java/ml/dmlc/tvm/rpc/ConnectTrackerServerProcessor.java index 47881eb350c3..c449bb18a565 100644 --- a/jvm/core/src/main/java/ml/dmlc/tvm/rpc/ConnectTrackerServerProcessor.java +++ b/jvm/core/src/main/java/ml/dmlc/tvm/rpc/ConnectTrackerServerProcessor.java @@ -37,7 +37,6 @@ */ public class ConnectTrackerServerProcessor implements ServerProcessor { private ServerSocket server; - private final SocketFileDescriptorGetter socketFileDescriptorGetter; private final String trackerHost; private final int trackerPort; // device key @@ -62,10 +61,11 @@ public class ConnectTrackerServerProcessor implements ServerProcessor { * @param trackerHost Tracker host. * @param trackerPort Tracker port. * @param key Device key. - * @param sockFdGetter Method to get file descriptor from Java socket. + * @param watchdog watch for timeout, etc. + * @throws java.io.IOException when socket fails to open. 
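On the native side, the same change means the server loop is handed a send and a receive callback rather than a raw socket file descriptor. A hedged C++ analogue of what such a callback looks like follows (not code from this patch; POSIX send stands in for the Java OutputStream, and the matching frecv would mirror it with recv and return the bytes read):

    #include <string>
    #include <sys/socket.h>
    #include <tvm/runtime/packed_func.h>

    // Build the "fsend" callback: takes the bytes to write, returns the count.
    tvm::runtime::PackedFunc MakeFSend(int sock) {
      using namespace tvm::runtime;
      return PackedFunc([sock](TVMArgs args, TVMRetValue* rv) {
        std::string data = args[0];  // bytes handed over by the RPC session
        int64_t sent = send(sock, data.data(), data.size(), 0);
        *rv = sent;                  // the native loop checks the returned count
      });
    }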
*/ public ConnectTrackerServerProcessor(String trackerHost, int trackerPort, String key, - SocketFileDescriptorGetter sockFdGetter, RPCWatchdog watchdog) throws IOException { + RPCWatchdog watchdog) throws IOException { while (true) { try { this.server = new ServerSocket(serverPort); @@ -81,7 +81,6 @@ public ConnectTrackerServerProcessor(String trackerHost, int trackerPort, String } } System.err.println("using port: " + serverPort); - this.socketFileDescriptorGetter = sockFdGetter; this.trackerHost = trackerHost; this.trackerPort = trackerPort; this.key = key; @@ -163,11 +162,9 @@ public String getMatchKey() { System.err.println("Connection from " + socket.getRemoteSocketAddress().toString()); // received timeout in seconds watchdog.startTimeout(timeout * 1000); - final int sockFd = socketFileDescriptorGetter.get(socket); - if (sockFd != -1) { - new NativeServerLoop(sockFd).run(); - System.err.println("Finish serving " + socket.getRemoteSocketAddress().toString()); - } + SocketChannel sockChannel = new SocketChannel(socket); + new NativeServerLoop(sockChannel.getFsend(), sockChannel.getFrecv()).run(); + System.err.println("Finish serving " + socket.getRemoteSocketAddress().toString()); Utils.closeQuietly(socket); } catch (ConnectException e) { // if the tracker connection failed, wait a bit before retrying diff --git a/jvm/core/src/main/java/ml/dmlc/tvm/rpc/NativeServerLoop.java b/jvm/core/src/main/java/ml/dmlc/tvm/rpc/NativeServerLoop.java index 255dabb438d5..697ce45fa04f 100644 --- a/jvm/core/src/main/java/ml/dmlc/tvm/rpc/NativeServerLoop.java +++ b/jvm/core/src/main/java/ml/dmlc/tvm/rpc/NativeServerLoop.java @@ -28,14 +28,17 @@ * Call native ServerLoop on socket file descriptor. */ public class NativeServerLoop implements Runnable { - private final int sockFd; + private final Function fsend; + private final Function frecv; /** * Constructor for NativeServerLoop. - * @param nativeSockFd native socket file descriptor. + * @param fsend socket.send function. + * @param frecv socket.recv function. */ - public NativeServerLoop(final int nativeSockFd) { - sockFd = nativeSockFd; + public NativeServerLoop(final Function fsend, final Function frecv) { + this.fsend = fsend; + this.frecv = frecv; } @Override public void run() { @@ -43,7 +46,7 @@ public NativeServerLoop(final int nativeSockFd) { try { tempDir = serverEnv(); System.err.println("starting server loop..."); - RPC.getApi("_ServerLoop").pushArg(sockFd).invoke(); + RPC.getApi("_ServerLoop").pushArg(fsend).pushArg(frecv).invoke(); System.err.println("done server loop..."); } catch (IOException e) { e.printStackTrace(); diff --git a/jvm/core/src/main/java/ml/dmlc/tvm/rpc/RPCSession.java b/jvm/core/src/main/java/ml/dmlc/tvm/rpc/RPCSession.java index 8ebf188b0667..278ef9fe8eef 100644 --- a/jvm/core/src/main/java/ml/dmlc/tvm/rpc/RPCSession.java +++ b/jvm/core/src/main/java/ml/dmlc/tvm/rpc/RPCSession.java @@ -200,6 +200,7 @@ public void upload(byte[] data, String target) { * Upload file to remote runtime temp folder. * @param data The file in local to upload. * @param target The path in remote. + * @throws java.io.IOException for network failure. */ public void upload(File data, String target) throws IOException { byte[] blob = getBytesFromFile(data); @@ -209,6 +210,7 @@ public void upload(File data, String target) throws IOException { /** * Upload file to remote runtime temp folder. * @param data The file in local to upload. + * @throws java.io.IOException for network failure. 
*/ public void upload(File data) throws IOException { upload(data, data.getName()); diff --git a/jvm/core/src/main/java/ml/dmlc/tvm/rpc/Server.java b/jvm/core/src/main/java/ml/dmlc/tvm/rpc/Server.java index c81faa0ca999..a9ea2d89a62c 100644 --- a/jvm/core/src/main/java/ml/dmlc/tvm/rpc/Server.java +++ b/jvm/core/src/main/java/ml/dmlc/tvm/rpc/Server.java @@ -17,31 +17,12 @@ package ml.dmlc.tvm.rpc; -import sun.misc.SharedSecrets; - -import java.io.FileDescriptor; -import java.io.FileInputStream; import java.io.IOException; -import java.io.InputStream; -import java.net.Socket; /** * RPC Server. */ public class Server { - private static SocketFileDescriptorGetter defaultSocketFdGetter - = new SocketFileDescriptorGetter() { - @Override public int get(Socket socket) { - try { - InputStream is = socket.getInputStream(); - FileDescriptor fd = ((FileInputStream) is).getFD(); - return SharedSecrets.getJavaIOFileDescriptorAccess().get(fd); - } catch (IOException e) { - e.printStackTrace(); - return -1; - } - } - }; private final WorkerThread worker; private static class WorkerThread extends Thread { @@ -72,35 +53,10 @@ public void terminate() { /** * Start a standalone server. * @param serverPort Port. - * @param socketFdGetter Method to get system file descriptor of the server socket. - * @throws IOException if failed to bind localhost:port. - */ - public Server(int serverPort, SocketFileDescriptorGetter socketFdGetter) throws IOException { - worker = new WorkerThread(new StandaloneServerProcessor(serverPort, socketFdGetter)); - } - - /** - * Start a standalone server. - * Use sun.misc.SharedSecrets.getJavaIOFileDescriptorAccess - * to get file descriptor for the socket. - * @param serverPort Port. * @throws IOException if failed to bind localhost:port. */ public Server(int serverPort) throws IOException { - this(serverPort, defaultSocketFdGetter); - } - - /** - * Start a server connected to proxy. - * @param proxyHost The proxy server host. - * @param proxyPort The proxy server port. - * @param key The key to identify the server. - * @param socketFdGetter Method to get system file descriptor of the server socket. - */ - public Server(String proxyHost, int proxyPort, String key, - SocketFileDescriptorGetter socketFdGetter) { - worker = new WorkerThread( - new ConnectProxyServerProcessor(proxyHost, proxyPort, key, socketFdGetter)); + worker = new WorkerThread(new StandaloneServerProcessor(serverPort)); } /** @@ -112,7 +68,8 @@ public Server(String proxyHost, int proxyPort, String key, * @param key The key to identify the server. */ public Server(String proxyHost, int proxyPort, String key) { - this(proxyHost, proxyPort, key, defaultSocketFdGetter); + worker = new WorkerThread( + new ConnectProxyServerProcessor(proxyHost, proxyPort, key)); } /** diff --git a/jvm/core/src/main/java/ml/dmlc/tvm/rpc/SocketChannel.java b/jvm/core/src/main/java/ml/dmlc/tvm/rpc/SocketChannel.java new file mode 100644 index 000000000000..e72581b2358f --- /dev/null +++ b/jvm/core/src/main/java/ml/dmlc/tvm/rpc/SocketChannel.java @@ -0,0 +1,49 @@ +package ml.dmlc.tvm.rpc; + +import ml.dmlc.tvm.Function; +import ml.dmlc.tvm.TVMValue; +import ml.dmlc.tvm.TVMValueBytes; + +import java.io.IOException; +import java.net.Socket; + +public class SocketChannel { + private final Socket socket; + + SocketChannel(Socket sock) { + socket = sock; + } + + private Function fsend = Function.convertFunc(new Function.Callback() { + @Override public Object invoke(TVMValue... 
args) { + byte[] data = args[0].asBytes(); + try { + socket.getOutputStream().write(data); + } catch (IOException e) { + e.printStackTrace(); + return -1; + } + return data.length; + } + }); + + private Function frecv = Function.convertFunc(new Function.Callback() { + @Override public Object invoke(TVMValue... args) { + long size = args[0].asLong(); + try { + return new TVMValueBytes(Utils.recvAll(socket.getInputStream(), (int) size)); + } catch (IOException e) { + e.printStackTrace(); + return -1; + } + } + }); + + public Function getFsend() { + return fsend; + } + + public Function getFrecv() { + return frecv; + } +} diff --git a/jvm/core/src/main/java/ml/dmlc/tvm/rpc/SocketFileDescriptorGetter.java b/jvm/core/src/main/java/ml/dmlc/tvm/rpc/SocketFileDescriptorGetter.java deleted file mode 100644 index 4c35f720009d..000000000000 --- a/jvm/core/src/main/java/ml/dmlc/tvm/rpc/SocketFileDescriptorGetter.java +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package ml.dmlc.tvm.rpc; - -import java.net.Socket; - -/** - * Interface for defining different socket fd getter. - */ -public interface SocketFileDescriptorGetter { - /** - * Get native socket file descriptor. - * @param socket Java socket. - * @return native socket fd. 
- */ - public int get(Socket socket); -} diff --git a/jvm/core/src/main/java/ml/dmlc/tvm/rpc/StandaloneServerProcessor.java b/jvm/core/src/main/java/ml/dmlc/tvm/rpc/StandaloneServerProcessor.java index 06e3303d1523..2d2303d3fe8a 100644 --- a/jvm/core/src/main/java/ml/dmlc/tvm/rpc/StandaloneServerProcessor.java +++ b/jvm/core/src/main/java/ml/dmlc/tvm/rpc/StandaloneServerProcessor.java @@ -28,12 +28,9 @@ */ public class StandaloneServerProcessor implements ServerProcessor { private final ServerSocket server; - private final SocketFileDescriptorGetter socketFileDescriptorGetter; - public StandaloneServerProcessor(int serverPort, - SocketFileDescriptorGetter sockFdGetter) throws IOException { + public StandaloneServerProcessor(int serverPort) throws IOException { this.server = new ServerSocket(serverPort); - this.socketFileDescriptorGetter = sockFdGetter; } @Override public void terminate() { @@ -46,9 +43,9 @@ public StandaloneServerProcessor(int serverPort, @Override public void run() { try { - Socket socket = server.accept(); - InputStream in = socket.getInputStream(); - OutputStream out = socket.getOutputStream(); + final Socket socket = server.accept(); + final InputStream in = socket.getInputStream(); + final OutputStream out = socket.getOutputStream(); int magic = Utils.wrapBytes(Utils.recvAll(in, 4)).getInt(); if (magic != RPC.RPC_MAGIC) { Utils.closeQuietly(socket); @@ -66,12 +63,10 @@ public StandaloneServerProcessor(int serverPort, out.write(Utils.toBytes(serverKey)); } + SocketChannel sockChannel = new SocketChannel(socket); System.err.println("Connection from " + socket.getRemoteSocketAddress().toString()); - final int sockFd = socketFileDescriptorGetter.get(socket); - if (sockFd != -1) { - new NativeServerLoop(sockFd).run(); - System.err.println("Finish serving " + socket.getRemoteSocketAddress().toString()); - } + new NativeServerLoop(sockChannel.getFsend(), sockChannel.getFrecv()).run(); + System.err.println("Finish serving " + socket.getRemoteSocketAddress().toString()); Utils.closeQuietly(socket); } catch (Throwable e) { e.printStackTrace(); diff --git a/jvm/core/src/test/java/ml/dmlc/tvm/contrib/GraphRuntimeTest.java b/jvm/core/src/test/java/ml/dmlc/tvm/contrib/GraphRuntimeTest.java index d719eb6f61e7..a29402867381 100644 --- a/jvm/core/src/test/java/ml/dmlc/tvm/contrib/GraphRuntimeTest.java +++ b/jvm/core/src/test/java/ml/dmlc/tvm/contrib/GraphRuntimeTest.java @@ -17,7 +17,10 @@ package ml.dmlc.tvm.contrib; -import ml.dmlc.tvm.*; +import ml.dmlc.tvm.Module; +import ml.dmlc.tvm.NDArray; +import ml.dmlc.tvm.TVMContext; +import ml.dmlc.tvm.TestUtils; import ml.dmlc.tvm.rpc.Client; import ml.dmlc.tvm.rpc.RPCSession; import ml.dmlc.tvm.rpc.Server; diff --git a/jvm/native/src/main/native/ml_dmlc_tvm_native_c_api.cc b/jvm/native/src/main/native/ml_dmlc_tvm_native_c_api.cc index 1eff6c45e1fc..b4bfd4270775 100644 --- a/jvm/native/src/main/native/ml_dmlc_tvm_native_c_api.cc +++ b/jvm/native/src/main/native/ml_dmlc_tvm_native_c_api.cc @@ -242,7 +242,7 @@ extern "C" int funcInvokeCallback(TVMValue *args, for (int i = 0; i < numArgs; ++i) { TVMValue arg = args[i]; int tcode = typeCodes[i]; - if (tcode == kNodeHandle || tcode == kFuncHandle || tcode == kModuleHandle) { + if (tcode == kObjectHandle || tcode == kFuncHandle || tcode == kModuleHandle) { TVMCbArgToReturn(&arg, tcode); } jobject jarg = tvmRetValueToJava(env, arg, tcode); @@ -259,8 +259,8 @@ extern "C" int funcInvokeCallback(TVMValue *args, reinterpret_cast(resourceHandle), jargs); TVMFuncArgsThreadLocalEntry *e = 
TVMFuncArgsThreadLocalStore::Get(); - const int prevNumStrArg = e->tvmFuncArgPushedStrs.size(); - const int prevNumBytesArg = e->tvmFuncArgPushedBytes.size(); + const size_t prevNumStrArg = e->tvmFuncArgPushedStrs.size(); + const size_t prevNumBytesArg = e->tvmFuncArgPushedBytes.size(); // convert returned (java) TVMValue to (C) TVMValue env->CallStaticVoidMethod(clsFunc, pushArgToStack, jretValue); diff --git a/jvm/pom.xml b/jvm/pom.xml index 99cfe0d7b5ec..150c3a00a894 100644 --- a/jvm/pom.xml +++ b/jvm/pom.xml @@ -164,8 +164,8 @@ maven-compiler-plugin 3.3 - 1.6 - 1.6 + 1.7 + 1.7 UTF-8 diff --git a/nnvm/include/nnvm/compiler/util.h b/nnvm/include/nnvm/compiler/util.h index fa8b69f9b70a..9555c0e7b3ea 100644 --- a/nnvm/include/nnvm/compiler/util.h +++ b/nnvm/include/nnvm/compiler/util.h @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -56,7 +56,7 @@ inline tvm::Array ShapeToArray(TShape shape) { * \return An Array of Expr, where each element is a constant int32 */ inline tvm::Array ShapeToIntArray(TShape shape) { - return tvm::Array(ShapeToArray(shape).node_); + return tvm::Downcast >(ShapeToArray(shape)); } } // namespace compiler } // namespace nnvm diff --git a/nnvm/src/compiler/compile_engine.cc b/nnvm/src/compiler/compile_engine.cc index 542455969b8b..5ce78d1d58d6 100644 --- a/nnvm/src/compiler/compile_engine.cc +++ b/nnvm/src/compiler/compile_engine.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -388,6 +388,9 @@ TVM_REGISTER_GLOBAL("nnvm.compiler.CacheItem2ScheduleArgs") *rv = ret; }); +TVM_REGISTER_NODE_TYPE(GraphFuncNode); +TVM_REGISTER_NODE_TYPE(GraphCacheEntryNode); + TVM_STATIC_IR_FUNCTOR_REGISTER(IRPrinter, vtable) .set_dispatch([](const GraphFuncNode *op, IRPrinter *p) { p->stream << "GraphFunc(name=" << op->func_name diff --git a/nnvm/src/compiler/compile_engine.h b/nnvm/src/compiler/compile_engine.h index 35287f5a9358..ec9a13b13b17 100644 --- a/nnvm/src/compiler/compile_engine.h +++ b/nnvm/src/compiler/compile_engine.h @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -55,7 +55,7 @@ struct GraphFuncNode : public tvm::Node { /*! \brief The lowered functions */ tvm::Array funcs; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("target", &target); v->Visit("func_name", &func_name); v->Visit("inputs", &inputs); @@ -78,7 +78,7 @@ struct GraphCacheEntryNode : public tvm::Node { /*! 
\brief Index of the master node for calling schedule*/ int master_idx; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("graph_func", &graph_func); v->Visit("use_count", &use_count); v->Visit("master_idx", &master_idx); @@ -92,7 +92,7 @@ class GraphCacheEntry : public ::tvm::NodeRef { GraphCacheEntry() {} explicit GraphCacheEntry(::tvm::NodePtr<::tvm::Node> n) : NodeRef(n) {} GraphCacheEntryNode* operator->() { - return static_cast(node_.get()); + return static_cast(get_mutable()); } using ContainerType = GraphCacheEntryNode; }; diff --git a/nnvm/src/compiler/graph_hash.h b/nnvm/src/compiler/graph_hash.h index aed3462cf128..6966a152224b 100644 --- a/nnvm/src/compiler/graph_hash.h +++ b/nnvm/src/compiler/graph_hash.h @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -48,7 +48,7 @@ struct GraphKeyNode : public tvm::Node { // The graph hash key is ensured always not to be 0 mutable size_t cache_hash_key_{0}; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("inputs", &inputs); v->Visit("target", &target); } diff --git a/nnvm/src/compiler/graph_runtime.cc b/nnvm/src/compiler/graph_runtime.cc index 3bfebe3ba4e8..d8ff3bf34bf8 100644 --- a/nnvm/src/compiler/graph_runtime.cc +++ b/nnvm/src/compiler/graph_runtime.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -18,11 +18,12 @@ */ /*! - * Copyright (c) 2017 by Contributors * \file graph_runtime.cc * \brief Interface code with TVM graph runtime. */ #include +#include + #include #include "graph_runtime.h" diff --git a/nnvm/src/compiler/graph_runtime.h b/nnvm/src/compiler/graph_runtime.h index 3a847de83d9f..770c98e83261 100644 --- a/nnvm/src/compiler/graph_runtime.h +++ b/nnvm/src/compiler/graph_runtime.h @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -62,13 +61,13 @@ struct NDArrayWrapperNode : public ::tvm::Node { std::string name; tvm::runtime::NDArray array; - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("name", &name); v->Visit("array", &array); } static constexpr const char* _type_key = "NDArrayWrapper"; - TVM_DECLARE_NODE_TYPE_INFO(NDArrayWrapperNode, Node); + TVM_DECLARE_NODE_TYPE_INFO(NDArrayWrapperNode, tvm::Node); }; TVM_DEFINE_NODE_REF(NDArrayWrapper, NDArrayWrapperNode); diff --git a/nnvm/src/compiler/packed_func_ext.cc b/nnvm/src/compiler/packed_func_ext.cc index bbcc62a99ad8..45f1451663e6 100644 --- a/nnvm/src/compiler/packed_func_ext.cc +++ b/nnvm/src/compiler/packed_func_ext.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -115,7 +115,7 @@ TVM_REGISTER_GLOBAL("nnvm._register_compute") const Array& out_info) -> Array { TVMRetValue ret = (*f)(GetAttrDict(attrs), inputs, out_info); - if ((*ret.ptr<::tvm::NodePtr >())->derived_from()) { + if (ret.IsObjectRef()) { return {ret.operator Tensor()}; } else { return ret; diff --git a/nnvm/src/top/tensor/transform.cc b/nnvm/src/top/tensor/transform.cc index 5496a4c674f6..c48ae0061f9e 100644 --- a/nnvm/src/top/tensor/transform.cc +++ b/nnvm/src/top/tensor/transform.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -1237,7 +1237,7 @@ Array GetIntArray(Array arr) { CHECK(!arr[i].defined() || arr[i].as()) << "Expect an int array"; } - return Array(arr.node_); + return Downcast >(arr); } NNVM_REGISTER_OP(slice_like) diff --git a/python/tvm/_ffi/_ctypes/function.py b/python/tvm/_ffi/_ctypes/function.py index 895c72d28d01..2f0b5babda4d 100644 --- a/python/tvm/_ffi/_ctypes/function.py +++ b/python/tvm/_ffi/_ctypes/function.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. # coding: utf-8 -# pylint: disable=invalid-name, protected-access, too-many-branches, global-statement +# pylint: disable=invalid-name, protected-access, too-many-branches, global-statement, unused-import """Function configuration API.""" from __future__ import absolute_import @@ -32,8 +32,8 @@ from .types import TVMValue, TypeCode from .types import TVMPackedCFunc, TVMCFuncFinalizer from .types import RETURN_SWITCH, C_TO_PY_ARG_SWITCH, _wrap_arg_func, _ctx_to_int64 -from .node import NodeBase -from . import node as _node +from .object import ObjectBase, _set_class_node +from . 
import object as _object FunctionHandle = ctypes.c_void_p ModuleHandle = ctypes.c_void_p @@ -107,9 +107,9 @@ def _make_tvm_args(args, temp_args): values = (TVMValue * num_args)() type_codes = (ctypes.c_int * num_args)() for i, arg in enumerate(args): - if isinstance(arg, NodeBase): + if isinstance(arg, ObjectBase): values[i].v_handle = arg.handle - type_codes[i] = TypeCode.NODE_HANDLE + type_codes[i] = TypeCode.OBJECT_HANDLE elif arg is None: values[i].v_handle = None type_codes[i] = TypeCode.NULL @@ -147,7 +147,7 @@ def _make_tvm_args(args, temp_args): elif isinstance(arg, (list, tuple, dict, NodeGeneric)): arg = convert_to_node(arg) values[i].v_handle = arg.handle - type_codes[i] = TypeCode.NODE_HANDLE + type_codes[i] = TypeCode.OBJECT_HANDLE temp_args.append(arg) elif isinstance(arg, _CLASS_MODULE): values[i].v_handle = arg.handle @@ -163,9 +163,6 @@ def _make_tvm_args(args, temp_args): values[i].v_handle = arg.handle type_codes[i] = TypeCode.FUNC_HANDLE temp_args.append(arg) - elif isinstance(arg, _CLASS_OBJECT): - values[i].v_handle = arg.handle - type_codes[i] = TypeCode.OBJECT_CELL else: raise TypeError("Don't know how to handle type %s" % type(arg)) return values, type_codes, num_args @@ -225,7 +222,7 @@ def __init_handle_by_constructor__(fconstructor, args): raise get_last_ffi_error() _ = temp_args _ = args - assert ret_tcode.value == TypeCode.NODE_HANDLE + assert ret_tcode.value == TypeCode.OBJECT_HANDLE handle = ret_val.v_handle return handle @@ -246,7 +243,7 @@ def _handle_return_func(x): return _CLASS_FUNCTION(handle, False) # setup return handle for function type -_node.__init_by_constructor__ = __init_handle_by_constructor__ +_object.__init_by_constructor__ = __init_handle_by_constructor__ RETURN_SWITCH[TypeCode.FUNC_HANDLE] = _handle_return_func RETURN_SWITCH[TypeCode.MODULE_HANDLE] = _return_module RETURN_SWITCH[TypeCode.NDARRAY_CONTAINER] = lambda x: _make_array(x.v_handle, False, True) diff --git a/python/tvm/_ffi/_ctypes/node.py b/python/tvm/_ffi/_ctypes/object.py similarity index 52% rename from python/tvm/_ffi/_ctypes/node.py rename to python/tvm/_ffi/_ctypes/object.py index 39fe0ef35525..c3ae56822198 100644 --- a/python/tvm/_ffi/_ctypes/node.py +++ b/python/tvm/_ffi/_ctypes/object.py @@ -14,66 +14,59 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
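At the C ABI level, the ctypes changes above reduce to packing every node or object argument with the single unified type code. A hedged sketch of that packing (the helper name is hypothetical; the enum values appear in the cython base.pxi hunk further down):

    // What the Python-side argument packing amounts to in C terms.
    void PackObjectArg(void* handle, TVMValue* out_value, int* out_tcode) {
      out_value->v_handle = handle;  // the Object* held by the Python-side wrapper
      *out_tcode = kObjectHandle;    // code 8; kNodeHandle and kObjectCell are retired
    }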
-# pylint: disable=invalid-name, protected-access -# pylint: disable=no-member, missing-docstring, not-callable +# pylint: disable=invalid-name +"""Runtime Object api""" from __future__ import absolute_import import ctypes -from ..base import _LIB, check_call, c_str +from ..base import _LIB, check_call +from .types import TypeCode, RETURN_SWITCH, C_TO_PY_ARG_SWITCH, _wrap_arg_func from ..node_generic import _set_class_node_base -from .types import TVMValue, TypeCode -from .types import RETURN_SWITCH, C_TO_PY_ARG_SWITCH, _wrap_arg_func -NodeHandle = ctypes.c_void_p + +ObjectHandle = ctypes.c_void_p __init_by_constructor__ = None -"""Maps node type to its constructor""" -NODE_TYPE = {} +"""Maps object type to its constructor""" +OBJECT_TYPE = {} + +_CLASS_NODE = None + +def _set_class_node(node_class): + global _CLASS_NODE + _CLASS_NODE = node_class + -def _register_node(index, cls): - """register node class""" - NODE_TYPE[index] = cls +def _register_object(index, cls): + """register object class""" + OBJECT_TYPE[index] = cls -def _return_node(x): - """Return node function""" + +def _return_object(x): handle = x.v_handle - if not isinstance(handle, NodeHandle): - handle = NodeHandle(handle) - tindex = ctypes.c_int() - check_call(_LIB.TVMNodeGetTypeIndex(handle, ctypes.byref(tindex))) - cls = NODE_TYPE.get(tindex.value, NodeBase) + if not isinstance(handle, ObjectHandle): + handle = ObjectHandle(handle) + tindex = ctypes.c_uint() + check_call(_LIB.TVMObjectGetTypeIndex(handle, ctypes.byref(tindex))) + cls = OBJECT_TYPE.get(tindex.value, _CLASS_NODE) # Avoid calling __init__ of cls, instead directly call __new__ # This allows child class to implement their own __init__ - node = cls.__new__(cls) - node.handle = handle - return node - + obj = cls.__new__(cls) + obj.handle = handle + return obj -RETURN_SWITCH[TypeCode.NODE_HANDLE] = _return_node -C_TO_PY_ARG_SWITCH[TypeCode.NODE_HANDLE] = _wrap_arg_func( - _return_node, TypeCode.NODE_HANDLE) +RETURN_SWITCH[TypeCode.OBJECT_HANDLE] = _return_object +C_TO_PY_ARG_SWITCH[TypeCode.OBJECT_HANDLE] = _wrap_arg_func( + _return_object, TypeCode.OBJECT_HANDLE) -class NodeBase(object): +class ObjectBase(object): + """Base object for all object types""" __slots__ = ["handle"] - # pylint: disable=no-member + def __del__(self): if _LIB is not None: - check_call(_LIB.TVMNodeFree(self.handle)) - - def __getattr__(self, name): - ret_val = TVMValue() - ret_type_code = ctypes.c_int() - ret_success = ctypes.c_int() - check_call(_LIB.TVMNodeGetAttr( - self.handle, c_str(name), - ctypes.byref(ret_val), - ctypes.byref(ret_type_code), - ctypes.byref(ret_success))) - if not ret_success.value: - raise AttributeError( - "'%s' object has no attribute '%s'" % (str(type(self)), name)) - return RETURN_SWITCH[ret_type_code.value](ret_val) + check_call(_LIB.TVMObjectFree(self.handle)) def __init_handle_by_constructor__(self, fconstructor, *args): """Initialize the handle by calling constructor function. 
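The new ObjectBase leans on two C-API entry points, declared in the cython base.pxi hunk below. A hedged C++-side sketch of that contract (the function name is hypothetical; the signatures match the declarations in this patch):

    // Type-index lookup for class dispatch, then reference release.
    void InspectAndRelease(void* obj) {
      unsigned tindex = 0;
      TVMObjectGetTypeIndex(obj, &tindex);  // drives the OBJECT_TYPE[tindex] dispatch
      TVMObjectFree(obj);                   // the counterpart of ObjectBase.__del__
    }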
@@ -95,8 +88,9 @@ def __init_handle_by_constructor__(self, fconstructor, *args): # assign handle first to avoid error raising self.handle = None handle = __init_by_constructor__(fconstructor, args) - if not isinstance(handle, NodeHandle): - handle = NodeHandle(handle) + if not isinstance(handle, ObjectHandle): + handle = ObjectHandle(handle) self.handle = handle -_set_class_node_base(NodeBase) + +_set_class_node_base(ObjectBase) diff --git a/python/tvm/_ffi/_ctypes/vmobj.py b/python/tvm/_ffi/_ctypes/vmobj.py deleted file mode 100644 index 59930e55c382..000000000000 --- a/python/tvm/_ffi/_ctypes/vmobj.py +++ /dev/null @@ -1,52 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=invalid-name -"""Runtime Object api""" -from __future__ import absolute_import - -import ctypes -from ..base import _LIB, check_call -from .types import TypeCode, RETURN_SWITCH - -ObjectHandle = ctypes.c_void_p - -"""Maps object type to its constructor""" -OBJECT_TYPE = {} - -def _register_object(index, cls): - """register object class""" - OBJECT_TYPE[index] = cls - - -def _return_object(x): - handle = x.v_handle - if not isinstance(handle, ObjectHandle): - handle = ObjectHandle(handle) - tag = ctypes.c_int() - check_call(_LIB.TVMGetObjectTag(handle, ctypes.byref(tag))) - cls = OBJECT_TYPE.get(tag.value, ObjectBase) - obj = cls(handle) - return obj - -RETURN_SWITCH[TypeCode.OBJECT_CELL] = _return_object - - -class ObjectBase(object): - __slots__ = ["handle"] - - def __init__(self, handle): - self.handle = handle diff --git a/python/tvm/_ffi/_cython/base.pxi b/python/tvm/_ffi/_cython/base.pxi index 63130ef67d38..4b7b2c88ffa5 100644 --- a/python/tvm/_ffi/_cython/base.pxi +++ b/python/tvm/_ffi/_cython/base.pxi @@ -31,13 +31,12 @@ cdef enum TVMTypeCode: kTVMType = 5 kTVMContext = 6 kArrayHandle = 7 - kNodeHandle = 8 + kObjectHandle = 8 kModuleHandle = 9 kFuncHandle = 10 kStr = 11 kBytes = 12 kNDArrayContainer = 13 - kObjectCell = 14 kExtBegin = 15 cdef extern from "tvm/runtime/c_runtime_api.h": @@ -78,7 +77,7 @@ ctypedef void* TVMStreamHandle ctypedef void* TVMRetValueHandle ctypedef void* TVMFunctionHandle ctypedef void* ObjectHandle -ctypedef void* NodeHandle + ctypedef struct TVMNDArrayContainer: DLTensor dl_tensor @@ -130,19 +129,9 @@ cdef extern from "tvm/runtime/c_runtime_api.h": int TVMArrayToDLPack(DLTensorHandle arr_from, DLManagedTensor** out) void TVMDLManagedTensorCallDeleter(DLManagedTensor* dltensor) - int TVMGetObjectTag(ObjectHandle obj, int* tag) - -cdef extern from "tvm/c_dsl_api.h": - int TVMNodeFree(NodeHandle handle) - int TVMNodeTypeKey2Index(const char* type_key, - int* out_index) - int TVMNodeGetTypeIndex(NodeHandle handle, - int* out_index) - int TVMNodeGetAttr(NodeHandle handle, - const char* key, - TVMValue* out_value, - int* 
out_type_code, - int* out_success) + int TVMObjectFree(ObjectHandle obj) + int TVMObjectGetTypeIndex(ObjectHandle obj, unsigned* out_index) + cdef inline py_str(const char* x): if PY_MAJOR_VERSION < 3: diff --git a/python/tvm/_ffi/_cython/core.pyx b/python/tvm/_ffi/_cython/core.pyx index 4b8536c726aa..cbf9d5859046 100644 --- a/python/tvm/_ffi/_cython/core.pyx +++ b/python/tvm/_ffi/_cython/core.pyx @@ -16,7 +16,8 @@ # under the License. include "./base.pxi" -include "./node.pxi" +include "./object.pxi" +# include "./node.pxi" include "./function.pxi" include "./ndarray.pxi" -include "./vmobj.pxi" + diff --git a/python/tvm/_ffi/_cython/function.pxi b/python/tvm/_ffi/_cython/function.pxi index cf1884c32486..a2360427b6c7 100644 --- a/python/tvm/_ffi/_cython/function.pxi +++ b/python/tvm/_ffi/_cython/function.pxi @@ -41,10 +41,9 @@ cdef int tvm_callback(TVMValue* args, for i in range(num_args): value = args[i] tcode = type_codes[i] - if (tcode == kNodeHandle or + if (tcode == kObjectHandle or tcode == kFuncHandle or tcode == kModuleHandle or - tcode == kObjectCell or tcode > kExtBegin): CALL(TVMCbArgToReturn(&value, tcode)) @@ -98,9 +97,9 @@ cdef inline int make_arg(object arg, list temp_args) except -1: """Pack arguments into c args tvm call accept""" cdef unsigned long long ptr - if isinstance(arg, NodeBase): - value[0].v_handle = (arg).chandle - tcode[0] = kNodeHandle + if isinstance(arg, ObjectBase): + value[0].v_handle = (arg).chandle + tcode[0] = kObjectHandle elif isinstance(arg, NDArrayBase): value[0].v_handle = (arg).chandle tcode[0] = (kNDArrayContainer if @@ -152,15 +151,12 @@ cdef inline int make_arg(object arg, temp_args.append(tstr) elif isinstance(arg, (list, tuple, dict, NodeGeneric)): arg = convert_to_node(arg) - value[0].v_handle = (arg).chandle - tcode[0] = kNodeHandle + value[0].v_handle = (arg).chandle + tcode[0] = kObjectHandle temp_args.append(arg) elif isinstance(arg, _CLASS_MODULE): value[0].v_handle = c_handle(arg.handle) tcode[0] = kModuleHandle - elif isinstance(arg, _CLASS_OBJECT): - value[0].v_handle = c_handle(arg.handle) - tcode[0] = kObjectCell elif isinstance(arg, FunctionBase): value[0].v_handle = (arg).chandle tcode[0] = kFuncHandle @@ -188,8 +184,8 @@ cdef inline bytearray make_ret_bytes(void* chandle): cdef inline object make_ret(TVMValue value, int tcode): """convert result to return value.""" - if tcode == kNodeHandle: - return make_ret_node(value.v_handle) + if tcode == kObjectHandle: + return make_ret_object(value.v_handle) elif tcode == kNull: return None elif tcode == kInt: @@ -212,8 +208,6 @@ cdef inline object make_ret(TVMValue value, int tcode): fobj = _CLASS_FUNCTION(None, False) (fobj).chandle = value.v_handle return fobj - elif tcode == kObjectCell: - return make_ret_object(value.v_handle) elif tcode in _TVM_EXT_RET: return _TVM_EXT_RET[tcode](ctypes_handle(value.v_handle)) @@ -314,6 +308,7 @@ cdef class FunctionBase: _CLASS_FUNCTION = None _CLASS_MODULE = None _CLASS_OBJECT = None +_CLASS_NODE = None def _set_class_module(module_class): """Initialize the module.""" @@ -327,3 +322,7 @@ def _set_class_function(func_class): def _set_class_object(obj_class): global _CLASS_OBJECT _CLASS_OBJECT = obj_class + +def _set_class_node(node_class): + global _CLASS_NODE + _CLASS_NODE = node_class diff --git a/python/tvm/_ffi/_cython/node.pxi b/python/tvm/_ffi/_cython/object.pxi similarity index 64% rename from python/tvm/_ffi/_cython/node.pxi rename to python/tvm/_ffi/_cython/object.pxi index 5e0c366e5600..9561eab94ea2 100644 --- 
a/python/tvm/_ffi/_cython/node.pxi +++ b/python/tvm/_ffi/_cython/object.pxi @@ -15,43 +15,46 @@ # specific language governing permissions and limitations # under the License. -from ... import _api_internal -from ..base import string_types +"""Maps object type to its constructor""" from ..node_generic import _set_class_node_base -"""Maps node type to its constructor""" -NODE_TYPE = [] +OBJECT_TYPE = [] -def _register_node(int index, object cls): - """register node class""" - while len(NODE_TYPE) <= index: - NODE_TYPE.append(None) - NODE_TYPE[index] = cls +def _register_object(int index, object cls): + """register object class""" + while len(OBJECT_TYPE) <= index: + OBJECT_TYPE.append(None) + OBJECT_TYPE[index] = cls -cdef inline object make_ret_node(void* chandle): - global NODE_TYPE - cdef int tindex - cdef list node_type +cdef inline object make_ret_object(void* chandle): + global OBJECT_TYPE + global _CLASS_NODE + cdef unsigned tindex + cdef list object_type cdef object cls - node_type = NODE_TYPE - CALL(TVMNodeGetTypeIndex(chandle, &tindex)) - if tindex < len(node_type): - cls = node_type[tindex] + cdef object handle + object_type = OBJECT_TYPE + handle = ctypes_handle(chandle) + CALL(TVMObjectGetTypeIndex(chandle, &tindex)) + if tindex < len(object_type): + cls = object_type[tindex] if cls is not None: obj = cls.__new__(cls) else: - obj = NodeBase.__new__(NodeBase) + # default use node base class + # TODO(tqchen) change to object after Node unifies with Object + obj = _CLASS_NODE.__new__(_CLASS_NODE) else: - obj = NodeBase.__new__(NodeBase) - (obj).chandle = chandle + obj = _CLASS_NODE.__new__(_CLASS_NODE) + (obj).chandle = chandle return obj -cdef class NodeBase: +cdef class ObjectBase: cdef void* chandle - cdef _set_handle(self, handle): + cdef inline _set_handle(self, handle): cdef unsigned long long ptr if handle is None: self.chandle = NULL @@ -70,17 +73,7 @@ cdef class NodeBase: self._set_handle(value) def __dealloc__(self): - CALL(TVMNodeFree(self.chandle)) - - def __getattr__(self, name): - cdef TVMValue ret_val - cdef int ret_type_code, ret_succ - CALL(TVMNodeGetAttr(self.chandle, c_str(name), - &ret_val, &ret_type_code, &ret_succ)) - if ret_succ == 0: - raise AttributeError( - "'%s' object has no attribute '%s'" % (type(self), name)) - return make_ret(ret_val, ret_type_code) + CALL(TVMObjectFree(self.chandle)) def __init_handle_by_constructor__(self, fconstructor, *args): """Initialize the handle by calling constructor function. @@ -104,7 +97,8 @@ cdef class NodeBase: cdef void* chandle ConstructorCall( (fconstructor).chandle, - kNodeHandle, args, &chandle) + kObjectHandle, args, &chandle) self.chandle = chandle -_set_class_node_base(NodeBase) + +_set_class_node_base(ObjectBase) diff --git a/python/tvm/_ffi/_cython/vmobj.pxi b/python/tvm/_ffi/_cython/vmobj.pxi deleted file mode 100644 index 9b487566a6a6..000000000000 --- a/python/tvm/_ffi/_cython/vmobj.pxi +++ /dev/null @@ -1,67 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Maps object type to its constructor""" -OBJECT_TYPE = [] - -def _register_object(int index, object cls): - """register node class""" - while len(OBJECT_TYPE) <= index: - OBJECT_TYPE.append(None) - OBJECT_TYPE[index] = cls - - -cdef inline object make_ret_object(void* chandle): - global OBJECT_TYPE - cdef int tag - cdef list object_type - cdef object cls - cdef object handle - object_type = OBJECT_TYPE - handle = ctypes_handle(chandle) - CALL(TVMGetObjectTag(chandle, &tag)) - if tag < len(object_type): - cls = object_type[tag] - if cls is not None: - obj = cls(handle) - else: - obj = ObjectBase(handle) - else: - obj = ObjectBase(handle) - return obj - - -cdef class ObjectBase: - cdef ObjectHandle chandle - - cdef inline _set_handle(self, handle): - if handle is None: - self.chandle = NULL - else: - self.chandle = c_handle(handle) - - property handle: - def __get__(self): - if self.chandle == NULL: - return None - else: - return ctypes.cast(self.chandle, ctypes.c_void_p) - def __set__(self, value): - self._set_handle(value) - - def __init__(self, handle): - self._set_handle(handle) diff --git a/python/tvm/_ffi/function.py b/python/tvm/_ffi/function.py index 4bb31820548f..60e7aeb9aec5 100644 --- a/python/tvm/_ffi/function.py +++ b/python/tvm/_ffi/function.py @@ -22,7 +22,6 @@ import sys import ctypes from .base import _LIB, check_call, py_str, c_str, string_types, _FFI_MODE -from . import vmobj as _vmobj IMPORT_EXCEPT = RuntimeError if _FFI_MODE == "cython" else ImportError diff --git a/python/tvm/_ffi/node.py b/python/tvm/_ffi/node.py index baca89d628b8..c6c151af9053 100644 --- a/python/tvm/_ffi/node.py +++ b/python/tvm/_ffi/node.py @@ -21,21 +21,8 @@ import ctypes import sys from .. 
import _api_internal +from .object import Object, register_object, _set_class_node from .node_generic import NodeGeneric, convert_to_node, const -from .base import _LIB, check_call, c_str, py_str, _FFI_MODE - -IMPORT_EXCEPT = RuntimeError if _FFI_MODE == "cython" else ImportError -try: - # pylint: disable=wrong-import-position - if _FFI_MODE == "ctypes": - raise ImportError() - if sys.version_info >= (3, 0): - from ._cy3.core import _register_node, NodeBase as _NodeBase - else: - from ._cy2.core import _register_node, NodeBase as _NodeBase -except IMPORT_EXCEPT: - # pylint: disable=wrong-import-position - from ._ctypes.node import _register_node, NodeBase as _NodeBase def _new_object(cls): @@ -43,20 +30,22 @@ def _new_object(cls): return cls.__new__(cls) -class NodeBase(_NodeBase): +class NodeBase(Object): """NodeBase is the base class of all TVM language AST object.""" def __repr__(self): return _api_internal._format_str(self) def __dir__(self): - plist = ctypes.POINTER(ctypes.c_char_p)() - size = ctypes.c_uint() - check_call(_LIB.TVMNodeListAttrNames( - self.handle, ctypes.byref(size), ctypes.byref(plist))) - names = [] - for i in range(size.value): - names.append(py_str(plist[i])) - return names + fnames = _api_internal._NodeListAttrNames(self) + size = fnames(-1) + return [fnames(i) for i in range(size)] + + def __getattr__(self, name): + try: + return _api_internal._NodeGetAttr(self, name) + except AttributeError: + raise AttributeError( + "%s has no attribute %s" % (str(type(self)), name)) def __hash__(self): return _api_internal._raw_ptr(self) @@ -95,24 +84,6 @@ def same_as(self, other): return self.__hash__() == other.__hash__() -def register_node(type_key=None): - """register node type - - Parameters - ---------- - type_key : str or cls - The type key of the node - """ - node_name = type_key if isinstance(type_key, str) else type_key.__name__ - - def register(cls): - """internal register function""" - tindex = ctypes.c_int() - ret = _LIB.TVMNodeTypeKey2Index(c_str(node_name), ctypes.byref(tindex)) - if ret == 0: - _register_node(tindex.value, cls) - return cls - - if isinstance(type_key, str): - return register - return register(type_key) +# pylint: disable=invalid-name +register_node = register_object +_set_class_node(NodeBase) diff --git a/python/tvm/_ffi/object.py b/python/tvm/_ffi/object.py new file mode 100644 index 000000000000..002fd27af0fd --- /dev/null +++ b/python/tvm/_ffi/object.py @@ -0,0 +1,137 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
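With the node.py changes above, ``NodeBase`` attribute access is routed through packed functions (``_api_internal._NodeGetAttr`` / ``_NodeListAttrNames``) instead of the removed C DSL API, and ``register_node`` becomes an alias of ``register_object``. A quick sketch of the user-visible behavior (``tvm.var`` is existing public API; the attribute names depend on the node type):

.. code-block:: python

    import tvm

    x = tvm.var("x", dtype="int32")  # a NodeBase, now backed by Object
    print(x.dtype)                   # resolved via _api_internal._NodeGetAttr
    print(dir(x))                    # names via _api_internal._NodeListAttrNames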
+# pylint: disable=invalid-name +"""Runtime Object API""" +from __future__ import absolute_import + +import sys +import ctypes +from .base import _FFI_MODE, _RUNTIME_ONLY, check_call, _LIB, c_str + +IMPORT_EXCEPT = RuntimeError if _FFI_MODE == "cython" else ImportError + +try: + # pylint: disable=wrong-import-position,unused-import + if _FFI_MODE == "ctypes": + raise ImportError() + if sys.version_info >= (3, 0): + from ._cy3.core import _set_class_object, _set_class_node + from ._cy3.core import ObjectBase as _ObjectBase + from ._cy3.core import _register_object + else: + from ._cy2.core import _set_class_object, _set_class_node + from ._cy2.core import ObjectBase as _ObjectBase + from ._cy2.core import _register_object +except IMPORT_EXCEPT: + # pylint: disable=wrong-import-position,unused-import + from ._ctypes.function import _set_class_object, _set_class_node + from ._ctypes.object import ObjectBase as _ObjectBase + from ._ctypes.object import _register_object + + +class Object(_ObjectBase): + """Base class for all tvm's runtime objects.""" + pass + + +def register_object(type_key=None): + """register object type. + + Parameters + ---------- + type_key : str or cls + The type key of the node + + Examples + -------- + The following code registers MyObject + using type key "test.MyObject" + + .. code-block:: python + + @tvm.register_object("test.MyObject") + class MyObject(Object): + pass + """ + object_name = type_key if isinstance(type_key, str) else type_key.__name__ + + def register(cls): + """internal register function""" + if hasattr(cls, "_type_index"): + tindex = cls._type_index + else: + tidx = ctypes.c_uint() + if not _RUNTIME_ONLY: + check_call(_LIB.TVMObjectTypeKey2Index( + c_str(object_name), ctypes.byref(tidx))) + else: + # directly skip unknown objects during runtime. + ret = _LIB.TVMObjectTypeKey2Index( + c_str(object_name), ctypes.byref(tidx)) + if ret != 0: + return cls + tindex = tidx.value + _register_object(tindex, cls) + return cls + + if isinstance(type_key, str): + return register + + return register(type_key) + + +def getitem_helper(obj, elem_getter, length, idx): + """Helper function to implement a pythonic getitem function. + + Parameters + ---------- + obj: object + The original object + + elem_getter : function + A simple function that takes index and return a single element. + + length : int + The size of the array + + idx : int or slice + The argument passed to getitem + + Returns + ------- + result : object + The result of getitem + """ + if isinstance(idx, slice): + start = idx.start if idx.start is not None else 0 + stop = idx.stop if idx.stop is not None else length + step = idx.step if idx.step is not None else 1 + if start < 0: + start += length + if stop < 0: + stop += length + return [elem_getter(obj, i) for i in range(start, stop, step)] + + if idx < -length or idx >= length: + raise IndexError("Index out of range. 
size: {}, got index {}" + .format(length, idx)) + if idx < 0: + idx += length + return elem_getter(obj, idx) + + +_set_class_object(Object) diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py index 0d28abd46cb2..2dbb67dfbf73 100644 --- a/python/tvm/_ffi/runtime_ctypes.py +++ b/python/tvm/_ffi/runtime_ctypes.py @@ -36,13 +36,12 @@ class TypeCode(object): TVM_TYPE = 5 TVM_CONTEXT = 6 ARRAY_HANDLE = 7 - NODE_HANDLE = 8 + OBJECT_HANDLE = 8 MODULE_HANDLE = 9 FUNC_HANDLE = 10 STR = 11 BYTES = 12 NDARRAY_CONTAINER = 13 - OBJECT_CELL = 14 EXT_BEGIN = 15 diff --git a/python/tvm/_ffi/vmobj.py b/python/tvm/_ffi/vmobj.py deleted file mode 100644 index ea3431aa973c..000000000000 --- a/python/tvm/_ffi/vmobj.py +++ /dev/null @@ -1,61 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=invalid-name -"""Runtime Object api""" -from __future__ import absolute_import - -import sys -from .base import _FFI_MODE - -IMPORT_EXCEPT = RuntimeError if _FFI_MODE == "cython" else ImportError - -try: - # pylint: disable=wrong-import-position - if _FFI_MODE == "ctypes": - raise ImportError() - if sys.version_info >= (3, 0): - from ._cy3.core import _set_class_object - from ._cy3.core import ObjectBase as _ObjectBase - from ._cy3.core import _register_object - else: - from ._cy2.core import _set_class_object - from ._cy2.core import ObjectBase as _ObjectBase - from ._cy2.core import _register_object -except IMPORT_EXCEPT: - # pylint: disable=wrong-import-position - from ._ctypes.function import _set_class_object - from ._ctypes.vmobj import ObjectBase as _ObjectBase - from ._ctypes.vmobj import _register_object - - -class ObjectTag(object): - """Type code used in API calls""" - TENSOR = 1 - CLOSURE = 2 - DATATYPE = 3 - - -class Object(_ObjectBase): - """The VM Object used in Relay virtual machine.""" - - -def register_object(cls): - _register_object(cls.tag, cls) - return cls - - -_set_class_object(Object) diff --git a/python/tvm/api.py b/python/tvm/api.py index e7523bd733f9..f0261be37e41 100644 --- a/python/tvm/api.py +++ b/python/tvm/api.py @@ -21,6 +21,7 @@ from numbers import Integral as _Integral from ._ffi.base import string_types +from ._ffi.object import register_object, Object from ._ffi.node import register_node, NodeBase from ._ffi.node import convert_to_node as _convert_to_node from ._ffi.node_generic import _scalar_type_inference diff --git a/python/tvm/autotvm/task/relay_integration.py b/python/tvm/autotvm/task/relay_integration.py index 5b0294ef2d07..55be05f4b88f 100644 --- a/python/tvm/autotvm/task/relay_integration.py +++ b/python/tvm/autotvm/task/relay_integration.py @@ -52,8 +52,7 @@ def _build(func, def extract_from_program(func, params, ops, target, target_host=None): """ Extract tuning tasks from a relay program. 
- This function collects tuning tasks by building the program - with a "tracing" target and tracing all the calls to topi. + This function is the single program version of extract_from_multiple_program. Parameters ---------- @@ -73,66 +72,14 @@ def extract_from_program(func, params, ops, target, target_host=None): task: Array of autotvm.task.Task collected tasks """ - import tvm.relay.op - from tvm import relay - import topi - - env = TaskExtractEnv.get() - - # NOTE: To add more ops, you only need to change the following lists - # relay op -> topi compute - OP2TOPI = { - tvm.relay.op.nn.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw, - topi.nn.group_conv2d_nchw, topi.nn.conv2d_NCHWc], - tvm.relay.op.nn.conv2d_transpose: [topi.nn.conv2d_transpose_nchw], - tvm.relay.op.nn.dense: [topi.nn.dense], - tvm.relay.op.nn.deformable_conv2d: [topi.nn.deformable_conv2d_nchw], - } - - topi_funcs = [] - for op_name in ops: - if op_name in OP2TOPI: - topi_funcs.extend(OP2TOPI[op_name]) - else: - warnings.warn("Op %s is not tunable, ignored" % op_name) - - # run compiler to collect all TOPI calls during compilation - env.reset(topi_funcs) - with env: - # disable logger temporarily - old_state = logger.disabled - logger.disabled = True - - relay.backend.compile_engine.get().clear() - # wrap build call in thread to avoid multiprocessing problems - mod = relay.Module.from_expr(func) - build_thread = threading.Thread(target=_build, - args=(mod, - target, - target_host, - params)) - build_thread.start() - build_thread.join() - - logger.disabled = old_state - - # create tasks for target - tasks = [] - for task_name, args in env.get_tasks(): - try: - tsk = create(task_name, args, - target=target, target_host=target_host, - template_key='direct') - tasks.append(tsk) - except topi.InvalidShapeError: - warnings.warn("Invalid shape during AutoTVM task creation") - return tasks + return extract_from_multiple_program([func], [params], ops, target, target_host) def extract_from_multiple_program(funcs, params, ops, target, target_host=None): """ Extract tuning tasks from multiple relay programs. - This function is the multiple program version of extract_from_program + This function collects tuning tasks by building a list of programs + with a "tracing" target and tracing all the calls to topi. 
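A minimal invocation sketch of the task-extraction API described above (``net`` and ``net_params`` stand in for a Relay function and its parameter dict, and are not names from this patch):

.. code-block:: python

    import tvm.relay.op
    from tvm import autotvm

    tasks = autotvm.task.extract_from_multiple_program(
        [net], [net_params], ops=(tvm.relay.op.nn.conv2d,),
        target="llvm")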
Parameters ---------- @@ -152,19 +99,20 @@ def extract_from_multiple_program(funcs, params, ops, target, target_host=None): task: Array of autotvm.task.Task collected tasks """ - env = TaskExtractEnv.get() import tvm.relay.op from tvm import relay import topi + env = TaskExtractEnv.get() + # NOTE: To add more ops, you only need to change the following lists # relay op -> topi compute OP2TOPI = { tvm.relay.op.nn.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw, - topi.nn.group_conv2d_nchw], + topi.nn.group_conv2d_nchw, topi.nn.conv2d_NCHWc], tvm.relay.op.nn.conv2d_transpose: [topi.nn.conv2d_transpose_nchw], tvm.relay.op.nn.dense: [topi.nn.dense], - tvm.relay.op.nn.contrib_deformable_conv2d: [topi.nn.deformable_conv2d_nchw], + tvm.relay.op.nn.deformable_conv2d: [topi.nn.deformable_conv2d_nchw], } topi_funcs = [] @@ -185,11 +133,8 @@ def extract_from_multiple_program(funcs, params, ops, target, target_host=None): relay.backend.compile_engine.get().clear() # wrap build call in thread to avoid multiprocessing problems mod = relay.Module.from_expr(func) - build_thread = threading.Thread(target=my_build, - args=(mod, - target, - target_host, - params)) + build_thread = threading.Thread(target=_build, + args=(mod, target, target_host, param)) build_thread.start() build_thread.join() diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py index e0db27574898..4f3cc90b474e 100644 --- a/python/tvm/autotvm/task/task.py +++ b/python/tvm/autotvm/task/task.py @@ -226,7 +226,7 @@ def args_to_workload(x, topi_compute_func=None): elif x is None: workload = 0 else: - raise RuntimeError('Do not support type "%s" in argument. Consider to use' + raise RuntimeError('Do not support type "%s" in argument. Consider to use ' 'primitive types only' % type(x)) return (get_func_name(topi_compute_func), ) + workload if topi_compute_func else workload diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index 09f08ad8b4ae..ac4683d4ae0b 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -176,9 +176,12 @@ def _topi_nn_conv2d(*args, **kwargs): args = deserialize_args(args) A, W = args[:2] layout = args[-2] - assert layout == 'NCHW', "only support NCHW currently" + assert layout == 'NCHW' or layout == 'HWCN', "only support NCHW/HWCN currently" C = topi.nn.conv2d(*args, **kwargs) - s = topi.generic.schedule_conv2d_nchw([C]) + if layout == 'NCHW': + s = topi.generic.schedule_conv2d_nchw([C]) + else: + s = topi.generic.schedule_conv2d_hwcn([C]) return s, [A, W, C] @register("topi_nn_depthwise_conv2d_nchw") diff --git a/python/tvm/build_module.py b/python/tvm/build_module.py index 4cb09931616e..fe2f64142c56 100644 --- a/python/tvm/build_module.py +++ b/python/tvm/build_module.py @@ -413,7 +413,6 @@ def lower(sch, # Phase 3 stmt = ir_pass.Simplify(stmt) - stmt = ir_pass.LowerStorageAccessInfo(stmt) stmt = ir_pass.RemoveNoOp(stmt) if not cfg.disable_select_rewriting: stmt = ir_pass.RewriteUnsafeSelect(stmt) @@ -465,6 +464,7 @@ def _build_for_device(flist, target, target_host): func = ir_pass.ThreadSync(func, "global") func = ir_pass.ThreadSync(func, "shared") func = ir_pass.ThreadSync(func, "warp") + func = ir_pass.InferFragment(func) warp_size = target.thread_warp_size func = ir_pass.LowerThreadAllreduce(func, warp_size) fsplits = [s for s in ir_pass.SplitHostDevice(func)] @@ -494,6 +494,8 @@ def _build_for_device(flist, target, target_host): assert not fdevice target_host = 
_target.create(target_host) + fdevice = [ir_pass.LowerDeviceStorageAccessInfo(x) for x in fdevice] + fhost = [ir_pass.LowerDeviceStorageAccessInfo(x) for x in fhost] fdevice = [ir_pass.LowerIntrin(x, target.target_name) for x in fdevice] fhost = [ir_pass.LowerIntrin(x, target_host.target_name) for x in fhost] fhost = [ir_pass.CombineContextCall(x) for x in fhost] @@ -568,10 +570,11 @@ def build(inputs, B = tvm.placeholder((n,), name='B') C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') s1 = tvm.create_schedule(C.op) - s2 = topi.cpp.cuda.schedule_injective("cuda", [C]) - f1 = tvm.lower(s1, [A, B, C], name="test_add1") - f2 = tvm.lower(s2, [A, B, C], name="test_add2") - m = tvm.build({"llvm": [f1], "cuda": [f2]}, target_host="llvm") + with tvm.target.cuda() as cuda_tgt: + s2 = topi.cuda.schedule_injective(cuda_tgt, [C]) + f1 = tvm.lower(s1, [A, B, C], name="test_add1") + f2 = tvm.lower(s2, [A, B, C], name="test_add2") + m = tvm.build({"llvm": [f1], "cuda": [f2]}, target_host="llvm") Note ---- diff --git a/python/tvm/error.py b/python/tvm/error.py index b5a7ed2374b7..a6d4f701d2a6 100644 --- a/python/tvm/error.py +++ b/python/tvm/error.py @@ -49,6 +49,7 @@ def __init__(self, msg): register_error("ValueError", ValueError) register_error("TypeError", TypeError) +register_error("AttributeError", AttributeError) @register_error diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py index ceb98c4d251e..fff9c99e5007 100644 --- a/python/tvm/relay/__init__.py +++ b/python/tvm/relay/__init__.py @@ -37,8 +37,6 @@ from . import feature from .backend import vm from .backend import profiler_vm -from .backend import serializer -from .backend import deserializer from .backend import vmobj # Root operators diff --git a/python/tvm/relay/backend/deserializer.py b/python/tvm/relay/backend/deserializer.py deleted file mode 100644 index fde702b1cd04..000000000000 --- a/python/tvm/relay/backend/deserializer.py +++ /dev/null @@ -1,81 +0,0 @@ -# License .to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=invalid-name -""" -The Relay Virtual Machine deserializer. - -Python interface for deserializing a Relay VM. -""" -from tvm import module -from tvm._ffi.runtime_ctypes import TVMByteArray -from . import _vm -from . import vm as rly_vm - -def _create_deserializer(code, lib): - """Create a deserializer object. - - Parameters - ---------- - code : bytearray - The serialized virtual machine code. - - lib : :py:class:`~tvm.module.Module` - The serialized runtime module/library that contains the hardware - dependent binary code. - - Returns - ------- - ret : Deserializer - The created virtual machine deserializer. 
- """ - if isinstance(code, (bytes, str)): - code = bytearray(code) - elif not isinstance(code, (bytearray, TVMByteArray)): - raise TypeError("vm is expected to be the type of bytearray or " + - "TVMByteArray, but received {}".format(type(code))) - - if not isinstance(lib, module.Module): - raise TypeError("lib is expected to be the type of tvm.module.Module" + - ", but received {}".format(type(lib))) - return _vm._Deserializer(code, lib) - - -class Deserializer: - """Relay VM deserializer. - - Parameters - ---------- - code : bytearray - The serialized virtual machine code. - - lib : :py:class:`~tvm.module.Module` - The serialized runtime module/library that contains the hardware - dependent binary code. - """ - def __init__(self, code, lib): - self.mod = _create_deserializer(code, lib) - self._deserialize = self.mod["deserialize"] - - def deserialize(self): - """Deserialize the serialized bytecode into a Relay VM. - - Returns - ------- - ret : VirtualMachine - The deserialized Relay VM. - """ - return rly_vm.VirtualMachine(self._deserialize()) diff --git a/python/tvm/relay/backend/interpreter.py b/python/tvm/relay/backend/interpreter.py index ae60b7a89b2f..1d53f6a92b07 100644 --- a/python/tvm/relay/backend/interpreter.py +++ b/python/tvm/relay/backend/interpreter.py @@ -72,6 +72,11 @@ class Closure(Value): """A closure produced by the interpreter.""" +@register_relay_node +class RecClosure(Value): + """A recursive closure produced by the interpreter.""" + + @register_relay_node class ConstructorValue(Value): def __init__(self, tag, fields, constructor): diff --git a/python/tvm/relay/backend/profiler_vm.py b/python/tvm/relay/backend/profiler_vm.py index 8ae3161e0b83..ded5d0d13bd7 100644 --- a/python/tvm/relay/backend/profiler_vm.py +++ b/python/tvm/relay/backend/profiler_vm.py @@ -49,8 +49,8 @@ def compile(mod, target=None, target_host=None, params=None): Returns ------- - vm : VirtualMachineProfiler - The profile VM runtime. + exec : Executable + The executable with profiling code. """ compiler = VMCompilerProfiler() target = compiler.update_target(target) @@ -60,7 +60,11 @@ def compile(mod, target=None, target_host=None, params=None): tophub_context = compiler.tophub_context(target) with tophub_context: compiler._compile(mod, target, target_host) - return VirtualMachineProfiler(compiler._get_vm()) + return vm.Executable(compiler._get_exec()) + +def enabled(): + """Whether vm profiler is enabled.""" + return hasattr(_vm, "_VMCompilerProfiler") class VMCompilerProfiler(vm.VMCompiler): """Build Relay module to run on VM runtime.""" @@ -68,13 +72,17 @@ def __init__(self): super().__init__() self.mod = _vm._VMCompilerProfiler() self._compile = self.mod["compile"] - self._get_vm = self.mod["get_vm"] + self._get_exec = self.mod["get_executable"] self._set_params_func = self.mod["set_params"] class VirtualMachineProfiler(vm.VirtualMachine): """Relay profile VM runtime.""" def __init__(self, mod): super().__init__(mod) + m = mod.module if isinstance(mod, vm.Executable) else mod + self.mod = _vm._VirtualMachineDebug(m) + self._init = self.mod["init"] + self._invoke = self.mod["invoke"] self._get_stat = self.mod["get_stat"] def get_stat(self): diff --git a/python/tvm/relay/backend/serializer.py b/python/tvm/relay/backend/serializer.py deleted file mode 100644 index b45ba9116a15..000000000000 --- a/python/tvm/relay/backend/serializer.py +++ /dev/null @@ -1,191 +0,0 @@ -# License .to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=invalid-name -""" -The Relay Virtual Machine serializer. - -Python interface for serializing a Relay VM. -""" -import tvm -from . import _vm -from . import vm as rly_vm - -def _create_serializer(vm): - """Create a VM serializer. - - Parameters - ---------- - vm : Union[VirtualMachine, :py:class:`~tvm.module.Module`] - The virtual machine to be serialized. - - Returns - ------- - ret : Serializer - The created virtual machine serializer. - """ - if isinstance(vm, rly_vm.VirtualMachine): - vm = vm.module - elif not isinstance(vm, tvm.module.Module): - raise TypeError("vm is expected to be the type of VirtualMachine or " + - "tvm.Module, but received {}".format(type(vm))) - - return _vm._Serializer(vm) - - -class Serializer: - """Relay VM serializer.""" - def __init__(self, vm): - self.mod = _create_serializer(vm) - self._get_lib = self.mod["get_lib"] - self._get_bytecode = self.mod["get_bytecode"] - self._get_globals = self.mod["get_globals"] - self._get_stats = self.mod["get_stats"] - self._get_primitive_ops = self.mod["get_primitive_ops"] - self._serialize = self.mod["serialize"] - - @property - def stats(self): - """Get the statistics of the Relay VM. - - Returns - ------- - ret : String - The serialized statistic information. - """ - return self._get_stats() - - @property - def primitive_ops(self): - """Get the name of the primitive ops that are executed in the VM. - - Returns - ------- - ret : List[:py:class:`~tvm.expr.StringImm`] - The list of primitive ops. - """ - return [prim_op.value for prim_op in self._get_primitive_ops()] - - @property - def bytecode(self): - """Get the bytecode of the Relay VM. - - Returns - ------- - ret : String - The serialized bytecode. - - Notes - ----- - The bytecode is in the following format: - func_name reg_file_size num_instructions - param1 param2 ... paramM - instruction1 - instruction2 - ... - instructionN - - Each instruction is printed in the following format: - hash opcode field1 ... fieldX # The text format. - - The part starting from # is only used for visualization and debugging. - The real serialized code doesn't contain it, therefore the deserializer - doesn't need to deal with it as well. - """ - return self._get_bytecode() - - @property - def globals(self): - """Get the globals used by the Relay VM. - - Returns - ------- - ret : List[:py:class:`~tvm.expr.StringImm`] - The serialized globals. - """ - return [glb.value for glb in self._get_globals()] - - def serialize(self): - """Serialize the Relay VM. - - Returns - ------- - code : bytearray - The binary blob representing a serialized Relay VM. It can then be - saved to disk and later deserialized into a new VM. - - lib : :py:class:`~tvm.module.Module` - The runtime module that contains the generated code. It is - basically a library that is composed of hardware dependent code. 
- - Notes - ----- - The returned code is organized with the following sections in order. - - Global section. This section contains the globals used by the - virtual machine. - - Constant section. This section is used to store the constant pool of - a virtual machine. - - Primitive name section. This section is introduced to accommodate - the list of primitive operator names that will be invoked by the - virtual machine. - - Code section. The VM functions, including bytecode, are sitting in - this section. - - Examples - -------- - .. code-block:: python - - import numpy as np - import tvm - from tvm import relay - - # define a simple network. - x = relay.var('x', shape=(10, 10)) - f = relay.Function([x], x + x) - mod = relay.Module({"main": f}) - - # create a Relay VM. - ctx = tvm.cpu() - target = "llvm" - compiler = relay.vm.VMCompiler() - vm = compiler.compile(mod, target) - vm.init(ctx) - - # serialize. - ser = relay.serializer.Serializer(vm) - code, lib = ser.serialize() - - # save and load the code and lib file. - tmp = tvm.contrib.util.tempdir() - path_lib = tmp.relpath("lib.so") - lib.export_library(path_lib) - with open(tmp.relpath("code.bc"), "wb") as fo: - fo.write(code) - - loaded_lib = tvm.module.load(path_lib) - loaded_code = bytearray(open(tmp.relpath("code.bc"), "rb").read()) - - # deserialize. - deser = relay.deserializer.Deserializer(loaded_code, loaded_lib) - des_vm = deser.deserialize() - - # execute the deserialized vm. - des_vm.init(ctx) - x_data = np.random.rand(10, 10).astype('float32') - res = des_vm.run(x_data) - print(res.asnumpy()) - """ - return self._serialize(), self._get_lib() diff --git a/python/tvm/relay/backend/vm.py b/python/tvm/relay/backend/vm.py index e54629dd1344..e190e3f1eb41 100644 --- a/python/tvm/relay/backend/vm.py +++ b/python/tvm/relay/backend/vm.py @@ -24,15 +24,18 @@ import tvm from tvm import autotvm -from tvm._ffi.runtime_ctypes import TVMByteArray from tvm.relay import expr as _expr +from tvm._ffi.runtime_ctypes import TVMByteArray from . import _vm from . import vmobj as _obj from .interpreter import Executor +Tensor = _obj.Tensor +ADT = _obj.ADT + def _convert(arg, cargs): if isinstance(arg, (np.ndarray, tvm.nd.NDArray)): - cargs.append(_obj.tensor_object(arg)) + cargs.append(_obj.Tensor(arg)) elif isinstance(arg, (tuple, list)): field_args = [] for field in arg: @@ -41,6 +44,7 @@ def _convert(arg, cargs): else: raise "unsupported type" + def convert(args): cargs = [] for arg in args: @@ -49,12 +53,202 @@ def convert(args): return cargs +class Executable(object): + """Relay VM executable""" + def __init__(self, mod): + self.mod = mod + self._save = self.mod["save"] + self._get_lib = self.mod["get_lib"] + self._get_bytecode = self.mod["get_bytecode"] + self._get_stats = self.mod["get_stats"] + + def save(self): + """Save the Relay VM Executable. + + Returns + ------- + code : bytearray + The binary blob representing a serialized Relay VM executable. It + can then be saved to disk and later deserialized into a new + Executable. + + lib : :py:class:`~tvm.module.Module` + The runtime module that contains the generated code. It is + basically a library that is composed of hardware dependent code. + + Notes + ----- + The returned code is organized with the following sections in order. + - Global section. This section contains the globals used by the + virtual machine. + - Constant section. This section is used to store the constant pool of + a virtual machine. + - Primitive name section. 
This section is introduced to accommodate
+            the list of primitive operator names that will be invoked by the
+            virtual machine.
+          - Code section. The VM functions, including bytecode, are sitting in
+            this section.
+
+        Examples
+        --------
+
+        .. code-block:: python
+
+            import numpy as np
+            import tvm
+            from tvm import relay
+            # define a simple network.
+            x = relay.var('x', shape=(10, 10))
+            f = relay.Function([x], x + x)
+            mod = relay.Module({"main": f})
+            # create a Relay VM.
+            ctx = tvm.cpu()
+            target = "llvm"
+            executable = relay.vm.compile(mod, target)
+            code, lib = executable.save()
+            # save and load the code and lib file.
+            tmp = tvm.contrib.util.tempdir()
+            path_lib = tmp.relpath("lib.so")
+            lib.export_library(path_lib)
+            with open(tmp.relpath("code.ro"), "wb") as fo:
+                fo.write(code)
+            loaded_lib = tvm.module.load(path_lib)
+            loaded_code = bytearray(open(tmp.relpath("code.ro"), "rb").read())
+            # deserialize.
+            des_exec = relay.vm.Executable.load_exec(loaded_code, loaded_lib)
+            # execute the deserialized executable.
+            x_data = np.random.rand(10, 10).astype('float32')
+            des_vm = relay.vm.VirtualMachine(des_exec)
+            des_vm.init(ctx)
+            res = des_vm.run(x_data)
+            print(res.asnumpy())
+        """
+        return self._save(), self._get_lib()
+
+    @staticmethod
+    def load_exec(bytecode, lib):
+        """Construct an executable from saved artifacts.
+
+        Parameters
+        ----------
+        bytecode : bytearray
+            The binary blob representing the Relay VM bytecode.
+
+        lib : :py:class:`~tvm.module.Module`
+            The runtime module that contains the generated code.
+
+        Returns
+        -------
+        exec: Executable
+            An executable constructed using the provided artifacts.
+        """
+        if isinstance(bytecode, (bytes, str)):
+            bytecode = bytearray(bytecode)
+        elif not isinstance(bytecode, (bytearray, TVMByteArray)):
+            raise TypeError("bytecode is expected to be the type of bytearray " +
+                            "or TVMByteArray, but received {}".format(type(bytecode)))
+
+        if not isinstance(lib, tvm.module.Module):
+            raise TypeError("lib is expected to be the type of tvm.module.Module" +
+                            ", but received {}".format(type(lib)))
+
+        return Executable(_vm.Load_Executable(bytecode, lib))
+
+    @property
+    def lib(self):
+        """Get the library that contains hardware dependent code.
+
+        Returns
+        -------
+        ret : :py:class:`~tvm.Module`
+            The runtime module that contains hardware dependent code.
+        """
+        return self._get_lib()
+
+    @property
+    def stats(self):
+        """Get the statistics of the Relay VM executable.
+
+        Returns
+        -------
+        ret : String
+            The statistic information of the VM executable.
+        """
+        return self._get_stats()
+
+    @property
+    def primitive_ops(self):
+        """Get the name of the primitive ops contained in the executable.
+
+        Returns
+        -------
+        ret : List[String]
+            The list of primitive ops.
+        """
+        ret = []
+        num_primitives = _vm.GetNumOfPrimitives(self.module)
+        for i in range(num_primitives):
+            ret.append(_vm.GetPrimitiveFields(self.module, i))
+        return ret
+
+    @property
+    def bytecode(self):
+        """Get the bytecode of the Relay VM executable.
+
+        Returns
+        -------
+        ret : String
+            The bytecode of the executable.
+
+        Notes
+        -----
+        The bytecode is in the following format:
+            func_name reg_file_size num_instructions
+            param1 param2 ... paramM
+            instruction1
+            instruction2
+            ...
+            instructionN
+
+        Each instruction is printed in the following format:
+            hash opcode field1 ... fieldX # The text format.
+
+        The part starting from # is only used for visualization and debugging.
+        The real serialized code doesn't contain it, so the deserializer
+        doesn't need to handle it either.
+        """
+        return self._get_bytecode()
+
+    @property
+    def globals(self):
+        """Get the globals used by the Relay VM executable.
+
+        Returns
+        -------
+        ret : List[String]
+            The globals contained in the executable.
+        """
+        ret = []
+        num_globals = _vm.GetNumOfGlobals(self.module)
+        for i in range(num_globals):
+            ret.append(_vm.GetGlobalFields(self.module, i))
+        return ret
+
+    @property
+    def module(self):
+        """Return the runtime module contained in a virtual machine executable."""
+        return self.mod
+
+
 class VirtualMachine(object):
     """Relay VM runtime."""
     def __init__(self, mod):
-        self.mod = mod
+        if not isinstance(mod, (Executable, tvm.module.Module)):
+            raise TypeError("mod is expected to be the type of Executable or " +
+                            "tvm.Module, but received {}".format(type(mod)))
+        m = mod.module if isinstance(mod, Executable) else mod
+        self.mod = _vm._VirtualMachine(m)
         self._init = self.mod["init"]
-        self._load_params = self.mod["load_params"]
         self._invoke = self.mod["invoke"]

     def init(self, ctx):
@@ -68,23 +262,6 @@ def init(self, ctx):
         args = [ctx.device_type, ctx.device_id]
         self._init(*args)

-    def load_params(self, params):
-        """Load parameters for the VM.
-
-        Parameters
-        ----------
-        params : Union[bytearray, Dict]
-            The dictionary that contains serialized parameters.
-        """
-        if isinstance(params, dict):
-            params = tvm.relay.save_param_dict(params)
-        elif isinstance(params, (bytes, str)):
-            params = bytearray(params)
-        if not isinstance(params, (bytearray, TVMByteArray)):
-            raise TypeError("params must be a bytearray")
-
-        self._load_params(bytearray(params))
-
     def invoke(self, func_name, *args):
         """Invoke a function.

@@ -119,11 +296,6 @@ def run(self, *args):
         """
         return self.invoke("main", *args)

-    @property
-    def module(self):
-        """Return the runtime module contained in a virtual machine."""
-        return self.mod
-

 def compile(mod, target=None, target_host=None, params=None):
     """
@@ -152,8 +324,8 @@ def compile(mod, target=None, target_host=None, params=None):
     Returns
     -------
-    vm : VirtualMachine
-        The VM runtime.
+    exec : Executable
+        The VM executable that contains both library code and bytecode.
     """
     compiler = VMCompiler()
@@ -164,14 +336,14 @@
     tophub_context = compiler.tophub_context(target)
     with tophub_context:
         compiler._compile(mod, target, target_host)
-    return VirtualMachine(compiler._get_vm())
+    return Executable(compiler._get_exec())

 class VMCompiler(object):
     """Build Relay module to run on VM runtime."""
     def __init__(self):
         self.mod = _vm._VMCompiler()
         self._compile = self.mod["compile"]
-        self._get_vm = self.mod["get_vm"]
+        self._get_exec = self.mod["get_executable"]
         self._set_params_func = self.mod["set_params"]

     def set_params(self, params):
@@ -237,7 +409,7 @@ class VMExecutor(Executor):
     mod : :py:class:`~tvm.relay.module.Module`
         The module to support the execution.

-    ctx : :py:class:`TVMContext`
+    ctx : :py:class:`~tvm.TVMContext`
         The runtime context to run the code on.

     target : :py:class:`Target`
@@ -249,7 +421,8 @@ def __init__(self, mod, ctx, target):
         self.mod = mod
         self.ctx = ctx
         self.target = target
-        self.vm = compile(mod, target)
+        self.executable = compile(mod, target)
+        self.vm = VirtualMachine(self.executable)
         self.vm.init(ctx)

     def _make_executor(self, expr=None):
diff --git a/python/tvm/relay/backend/vmobj.py b/python/tvm/relay/backend/vmobj.py
index 4c92e9bf38a6..f3fdb763209d 100644
--- a/python/tvm/relay/backend/vmobj.py
+++ b/python/tvm/relay/backend/vmobj.py
@@ -18,32 +18,37 @@
 from __future__ import absolute_import as _abs
 import numpy as _np

-from tvm._ffi.vmobj import Object, ObjectTag, register_object
+from tvm._ffi.object import Object, register_object, getitem_helper
 from tvm import ndarray as _nd
 from . import _vmobj

-# TODO(@icemelon9): Add ClosureObject
-@register_object
-class TensorObject(Object):
-    """Tensor object."""
-    tag = ObjectTag.TENSOR
+@register_object("vm.Tensor")
+class Tensor(Object):
+    """Tensor object.

-    def __init__(self, handle):
-        """Constructs a Tensor object
-
-        Parameters
-        ----------
-        handle : object
-            Object handle
+    Parameters
+    ----------
+    arr : numpy.ndarray or tvm.nd.NDArray
+        The source array.

-        Returns
-        -------
-        obj : TensorObject
-            A tensor object.
-        """
-        super(TensorObject, self).__init__(handle)
-        self.data = _vmobj.GetTensorData(self)
+    ctx : TVMContext, optional
+        The device context to create the array.
+    """
+    def __init__(self, arr, ctx=None):
+        if isinstance(arr, _np.ndarray):
+            ctx = ctx if ctx else _nd.cpu(0)
+            self.__init_handle_by_constructor__(
+                _vmobj.Tensor, _nd.array(arr, ctx=ctx))
+        elif isinstance(arr, _nd.NDArray):
+            self.__init_handle_by_constructor__(
+                _vmobj.Tensor, arr)
+        else:
+            raise RuntimeError("Unsupported type for tensor object.")
+
+    @property
+    def data(self):
+        return _vmobj.GetTensorData(self)

     def asnumpy(self):
         """Convert data to numpy array
@@ -56,69 +61,38 @@ def asnumpy(self):
         return self.data.asnumpy()

-@register_object
-class DatatypeObject(Object):
-    """Datatype object."""
-    tag = ObjectTag.DATATYPE
+@register_object("vm.ADT")
+class ADT(Object):
+    """Algebraic data type (ADT) object.

-    def __init__(self, handle):
-        """Constructs a Datatype object
+    Parameters
+    ----------
+    tag : int
+        The tag of ADT.

-        Parameters
-        ----------
-        handle : object
-            Object handle
+    fields : list[Object] or tuple[Object]
+        The source tuple.
+    """
+    def __init__(self, tag, fields):
+        for f in fields:
+            assert isinstance(f, Object)
+        self.__init_handle_by_constructor__(
+            _vmobj.ADT, tag, *fields)

-        Returns
-        -------
-        obj : DatatypeObject
-            A Datatype object.
-        """
-        super(DatatypeObject, self).__init__(handle)
-        self.tag = _vmobj.GetDatatypeTag(self)
-        num_fields = _vmobj.GetDatatypeNumberOfFields(self)
-        self.fields = []
-        for i in range(num_fields):
-            self.fields.append(_vmobj.GetDatatypeFields(self, i))
+    @property
+    def tag(self):
+        return _vmobj.GetADTTag(self)

     def __getitem__(self, idx):
-        return self.fields[idx]
+        return getitem_helper(
+            self, _vmobj.GetADTFields, len(self), idx)

     def __len__(self):
-        return len(self.fields)
-
-    def __iter__(self):
-        return iter(self.fields)
-
-# TODO(icemelon9): Add closure object
-
-def tensor_object(arr, ctx=_nd.cpu(0)):
-    """Create a tensor object from source arr.
-
-    Parameters
-    ----------
-    arr : numpy.ndarray or tvm.nd.NDArray
-        The source array.
-
-    ctx : TVMContext, optional
-        The device context to create the array
-
-    Returns
-    -------
-    ret : TensorObject
-        The created object.
-    """
-    if isinstance(arr, _np.ndarray):
-        tensor = _vmobj.Tensor(_nd.array(arr, ctx))
-    elif isinstance(arr, _nd.NDArray):
-        tensor = _vmobj.Tensor(arr)
-    else:
-        raise RuntimeError("Unsupported type for tensor object.")
-    return tensor
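A short sketch of the constructors that replace ``tensor_object``/``datatype_object`` (assuming a TVM build with the Relay VM enabled):

.. code-block:: python

    import numpy as np
    from tvm.relay.backend import vmobj

    t = vmobj.Tensor(np.ones((2, 2), dtype="float32"))  # wraps a tvm.nd.NDArray
    adt = vmobj.ADT(0, [t, t])                          # tag 0 with two fields
    assert len(adt) == 2
    assert adt[0].asnumpy().shape == (2, 2)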
- """ - if isinstance(arr, _np.ndarray): - tensor = _vmobj.Tensor(_nd.array(arr, ctx)) - elif isinstance(arr, _nd.NDArray): - tensor = _vmobj.Tensor(arr) - else: - raise RuntimeError("Unsupported type for tensor object.") - return tensor + return _vmobj.GetADTNumberOfFields(self) def tuple_object(fields): - """Create a datatype object from source tuple. + """Create a ADT object from source tuple. Parameters ---------- @@ -127,30 +101,9 @@ def tuple_object(fields): Returns ------- - ret : DatatypeObject + ret : ADT The created object. """ for f in fields: assert isinstance(f, Object) return _vmobj.Tuple(*fields) - - -def datatype_object(tag, fields): - """Create a datatype object from tag and source fields. - - Parameters - ---------- - tag : int - The tag of datatype. - - fields : list[Object] or tuple[Object] - The source tuple. - - Returns - ------- - ret : DatatypeObject - The created object. - """ - for f in fields: - assert isinstance(f, Object) - return _vmobj.Datatype(tag, *fields) diff --git a/python/tvm/relay/debug.py b/python/tvm/relay/debug.py index ee30f25d88c1..8887a7eb3c7c 100644 --- a/python/tvm/relay/debug.py +++ b/python/tvm/relay/debug.py @@ -17,12 +17,8 @@ # pylint: disable=wildcard-import, redefined-builtin, invalid-name """The Relay IR namespace containing the IR definition and compiler.""" from __future__ import absolute_import -from .base import NodeBase, register_relay_node from ..api import register_func -@register_relay_node -class InterpreterState(NodeBase): - pass # pylint: disable=unused-argument def _debugger_init(expr, stack): diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py index 88779dfd76e0..8d59e99d8388 100644 --- a/python/tvm/relay/expr.py +++ b/python/tvm/relay/expr.py @@ -27,6 +27,7 @@ from .._ffi import base as _base from .. import nd as _nd from .. import convert +from ..ndarray import NDArray # will be registered afterwards _op_make = None @@ -305,6 +306,17 @@ def __call__(self, *args): """ return Call(self, args, None, None) + def get_params(self): + return _expr.FunctionGetParams(self) + + def set_params(self, params): + for key in params: + value = params[key] + if isinstance(value, NDArray): + params[key] = Constant(value) + + return _expr.FunctionSetParams(self, params) + @register_relay_node class Call(Expr): diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py index 637e1f0860da..d4b9162d6f3d 100644 --- a/python/tvm/relay/frontend/common.py +++ b/python/tvm/relay/frontend/common.py @@ -259,7 +259,7 @@ def get_relay_op(op_name): op = None else: # try search op in various modules - for candidate in (_op, _op.nn, _op.image, _op.vision): + for candidate in (_op, _op.nn, _op.image, _op.vision, _op.contrib): op = getattr(candidate, op_name, None) if op is not None: break diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index a7f787484b2c..a1f51ad41fb0 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -915,6 +915,7 @@ def _impl_v1(cls, inputs, attr, params): reps = attr.pop('repeats') # The number of times repeating the tensor data. 
+
 class Erf(OnnxOpConverter):
     """Operator converter for Erf
     """
@@ -922,6 +923,28 @@ def _impl_v1(cls, inputs, attr, params):
         return _op.erf(inputs[0])

+class Where(OnnxOpConverter):
+    """Operator converter for Where
+    """
+    @classmethod
+    def _impl_v9(cls, inputs, attr, params):
+        return _op.where(inputs[0], inputs[1], inputs[2])
+
+
+class ConstantOfShape(Elemwise):
+    """Operator converter for ConstantOfShape
+    """
+    @classmethod
+    def _impl_v1(cls, inputs, attr, params):
+        if not isinstance(inputs, list) or len(inputs) < 2:
+            raise ValueError("Expect minimum 2 inputs")
+        # tile the value tensor to the shape given by the second input
+        try:
+            shape = tuple(params[inputs[1].name_hint].asnumpy().astype('int').tolist())
+        except Exception as e:
+            raise ValueError(e)
+        return _op.tile(inputs[0], reps=shape)
+

 # compatible operators that do NOT require any conversion.
 _identity_list = []
@@ -1042,7 +1077,9 @@
     'Not': Not.get_converter(opset),
     'And': And.get_converter(opset),
     'Tile': Tile.get_converter(opset),
-    'Erf': Erf.get_converter(opset)
+    'Erf': Erf.get_converter(opset),
+    'Where': Where.get_converter(opset),
+    'ConstantOfShape': ConstantOfShape.get_converter(opset)
 }
@@ -1162,7 +1199,14 @@ def from_onnx(self, graph, opset):
                 self._params[i_name] = fill_value
                 self._nodes[i_name] = new_var(node.output[0], shape=(), dtype=dtype)
                 inputs.append(self._nodes[i_name])
-
+            if op_name == "ConstantOfShape":
+                t_proto = self._parse_attr(node.attribute)["value"]
+                i_name = node.output[0]
+                self._params[i_name] = self._parse_array(t_proto)
+                self._nodes[i_name] = new_var(i_name,
+                                              shape=list(t_proto.dims),
+                                              dtype=self._params[i_name].dtype)
+                inputs.append(self._nodes[i_name])
             i_name = self._parse_value_proto(node)
             attr['tvm_custom'] = {}
             attr['tvm_custom']['name'] = i_name
diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py
index 38f9c523e0b1..bfa3431ba29e 100644
--- a/python/tvm/relay/frontend/tensorflow.py
+++ b/python/tvm/relay/frontend/tensorflow.py
@@ -22,10 +22,14 @@
 import warnings
 from collections import defaultdict

+
 # Numpy support
 import numpy as np

 import tvm
+
+from tvm.relay.prelude import Prelude
+
 from .. import analysis
 from .. import expr as _expr
 from .. import op as _op
@@ -432,6 +436,24 @@ def _impl(inputs, attr, params):
         return AttrCvt(op_name="copy", ignores=['message'])(inputs, attr)
     return _impl

+def _assert():
+    # ToDo: In general people want asserts to be gone from TensorFlow graphs
+    # when they are optimizing them, so converting it to a no-op is
+    # reasonable. However, it would be nice to have the option to keep them
+    # once Relay gets a Halt or Assert op.
+    return _no_op()
+
+def _no_op():
+    def _impl(inputs, attr, params):
+        # ToDo: This should really be an op that returns nothing, which could
+        # be represented as an empty tuple. It turns out that TVM
+        # infrastructure doesn't like running functions that return None and
+        # also doesn't like running functions that return an empty tuple.
So it + # doesn't work, but it should be made to work and then this could be + # improved. In the mean time, it is hard to imagine a case where it + # matters in any real way that a no-op is converted to a constant 0. + return tvm.relay.const(0) + return _impl def _matmul(): def _impl(inputs, attr, params): @@ -508,6 +530,69 @@ def _impl(inputs, attr, params): return _op.concatenate(inputs_reshaped, axis) return _impl +def _tensor_array(): + def _impl(inputs, attr, params, prelude): + dtype_str = attr.get('dtype').name + tensor_array_constructor = prelude.get_var('tensor_array', dtype_str) + return tensor_array_constructor(_op.take(inputs[0], tvm.relay.const(0))) + return _impl + +def _tensor_array_scatter(): + def _impl(inputs, attr, params, prelude): + dtype_str = attr.get('T').name + values_rank = len(inputs[2].type_annotation.shape) + unstack_name = "tensor_array_unstack_tensor{}".format(values_rank) + unstack_function = prelude.get_var(unstack_name, dtype_str) + values = unstack_function(inputs[2]) + tensor_array_scatter_func = prelude.get_var('tensor_array_scatter', dtype_str) + return tensor_array_scatter_func(inputs[0], inputs[1], values) + return _impl + +def _tensor_array_gather(): + def _impl(inputs, attr, params, prelude): + return prelude.tensor_array_gather(inputs[2], inputs[1]) + return _impl + +def _tensor_array_size(): + def _impl(inputs, attr, params, prelude): + return prelude.length(inputs[0]) + return _impl + +def _tensor_array_write(): + def _impl(inputs, attr, params, prelude): + input_rank = len(inputs[2].type_annotation.shape) + dtype = attr.get('T').name + + tensor_name = 'tensor{}'.format(input_rank) + tensor_func = prelude.get_var(tensor_name, dtype) + v = tensor_func(inputs[2]) + write_func = prelude.get_var('tensor_array_write', dtype) + + return write_func(inputs[3], _op.take(inputs[1], tvm.relay.const(0)), v) + return _impl + +def _tensor_array_read(): + def _impl(inputs, attr, params, prelude): + read_func = prelude.get_var('tensor_array_read', attr.get('dtype').name) + return read_func(inputs[2], _op.take(inputs[1], tvm.relay.const(0))) + return _impl + +def _tensor_array_split(): + def _impl(inputs, attr, params, prelude): + input_rank = len(inputs[1].type_annotation.shape) + dtype_str = attr.get('T').name + v = prelude.get_var("tensor{}".format(input_rank), dtype_str)(inputs[1]) + lengths = _op.cast(inputs[2], 'int32') + split_var = prelude.get_var('tensor_array_split', dtype_str) + return split_var(inputs[0], v, lengths) + return _impl + +def _tensor_array_concat(): + def _impl(inputs, attr, params, prelude): + concat_func = prelude.get_var('tensor_array_concat', attr['dtype'].name) + return concat_func(inputs[1]) + return _impl + def _tile(): def _impl(inputs, attr, params): reps = _get_list_param(params, inputs.pop()) @@ -1238,6 +1323,13 @@ def _impl(inputs, attr, params): return _op.multiply(difference, difference) return _impl +def _size(): + def _impl(inputs, attr, params): + new_attr = attr + new_attr['out_type'] = attr['out_type'].name + return AttrCvt('ndarray_size', transforms={'out_type' : 'dtype'})(inputs, new_attr) + return _impl + # compatible operators that do NOT require any conversion. 
_identity_list = [] @@ -1252,6 +1344,7 @@ def _impl(inputs, attr, params): 'All' : _reduce('all'), 'ArgMax' : _argx(_op.argmax, 'argmax'), 'ArgMin' : _argx(_op.argmin, 'argmin'), + 'Assert' : _assert(), 'AvgPool' : _pooling('avg_pool'), 'BatchMatMul' : _batch_matmul(), 'BatchMatMulV2' : _batch_matmul(), @@ -1310,9 +1403,18 @@ def _impl(inputs, attr, params): 'Mod' : _elemwise('mod'), 'Mul' : _elemwise('multiply'), 'Neg' : AttrCvt('negative'), + 'NoOp' : _no_op(), 'NotEqual' : _broadcast('not_equal'), 'OneHot' : _one_hot(), 'Pack' : _pack(), + 'TensorArrayV3' : _tensor_array(), + 'TensorArrayScatterV3' : _tensor_array_scatter(), + 'TensorArrayGatherV3' : _tensor_array_gather(), + 'TensorArraySizeV3' : _tensor_array_size(), + 'TensorArrayWriteV3' : _tensor_array_write(), + 'TensorArrayReadV3' : _tensor_array_read(), + 'TensorArraySplitV3' : _tensor_array_split(), + 'TensorArrayConcatV3' : _tensor_array_concat(), 'Pad' : _pad('Pad'), 'PadV2' : _pad('PadV2'), 'Pow' : _elemwise('power'), @@ -1335,7 +1437,7 @@ def _impl(inputs, attr, params): 'Shape' : _shape(), 'Sigmoid' : AttrCvt('sigmoid'), 'Sign' : AttrCvt('sign'), - 'Size' : AttrCvt('ndarray_size'), + 'Size' : _size(), 'Slice' : _slice(), 'Softmax' : _softmax(), 'Softplus' : _softplus(), @@ -1860,6 +1962,7 @@ def __init__(self): self._loops = {} self._branches = {} self._mod = _module.Module({}) + self._prelude = Prelude(self._mod) def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): """Construct relay nodes from tensorflow graph definition - GraphDef. @@ -2113,8 +2216,11 @@ def _parse_param(self, key, value, name, shape): if np_array.dtype == np.dtype(object): # Object types are generally tensorflow DT_STRING (DecodeJpeg op). # Just leave it as placeholder. - self._nodes[name] = [_expr.var(name, shape=shape[name], dtype='uint8')] - + if shape: + var_shape = shape[name] + else: + var_shape = tensor_util.TensorShapeProtoToList(value.tensor.tensor_shape) + self._nodes[name] = [_expr.var(name, shape=var_shape, dtype='uint8')] return array_ndim = len(np_array.shape) @@ -2335,7 +2441,11 @@ def _convert_operator(self, op_name, inputs, attrs, if op_name in identity_list: sym = get_relay_op(op_name)(*inputs, **attrs) elif op_name in convert_map: - sym = convert_map[op_name](inputs, attrs, self._params) + if 'TensorArray' in op_name: + sym = convert_map[op_name](inputs, attrs, self._params, self._prelude) + else: + sym = convert_map[op_name](inputs, attrs, self._params) + elif op_name in convert_map_rnn: sym = self._convert_rnn_operator(op_name, inputs, attrs, self._params, graph, diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index 35bc85e09fdd..b042af9fbe65 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -82,6 +82,7 @@ def __init__(self, model, subgraph, exp_tab): 'REDUCE_MAX': self._convert_reduce_max, 'MEAN': self._convert_reduce_mean, 'REDUCE_PROD': self._convert_reduce_prod, + 'SUM': self._convert_reduce_sum, 'FULLY_CONNECTED': self.convert_fully_connected, 'PAD': self.convert_pad, 'PACK': self.convert_pack, @@ -224,6 +225,18 @@ def has_same_qnn_params(self, lhs_tensor, rhs_tensor): return lhs_tensor.qnn_params['scale'] == rhs_tensor.qnn_params['scale'] and \ lhs_tensor.qnn_params['zero_point'] == rhs_tensor.qnn_params['zero_point'] + def is_quantized(self, op): + """Check if an input tensor is quantized.""" + try: + from tflite.Operator import Operator + except ImportError: + raise ImportError("The tflite package must be installed") + + 
assert isinstance(op, Operator) + input_tensors = self.get_input_tensors(op) + first_tensor = input_tensors[0] + return first_tensor.qnn_params is not None + def convert_conv2d(self, op): """Convert TFLite conv2d""" return self.convert_conv(op, "conv2d") @@ -498,7 +511,25 @@ def _convert_elemwise(self, relay_op, op): rhs_type_str = self.get_tensor_type_str(rhs_tensor.tensor.Type()) rhs_expr = self.exp_tab.new_const(self.get_tensor_value(rhs_tensor), dtype=rhs_type_str) - out = relay_op(lhs_expr, rhs_expr) + + output_tensors = self.get_output_tensors(op) + assert len(output_tensors) == 1, "output tensors length should be 1" + output_tensor = output_tensors[0] + + # If quantized, extracts qnn params and call QNN add operator. + if lhs_tensor.qnn_params: + assert rhs_tensor.qnn_params, "Both tensors should be quantized." + assert output_tensor.qnn_params, "Output tensor should be quantized." + out = relay_op(lhs=lhs_expr, + rhs=rhs_expr, + lhs_scale=lhs_tensor.qnn_params['scale'], + lhs_zero_point=lhs_tensor.qnn_params['zero_point'], + rhs_scale=rhs_tensor.qnn_params['scale'], + rhs_zero_point=rhs_tensor.qnn_params['zero_point'], + output_scale=output_tensor.qnn_params['scale'], + output_zero_point=output_tensor.qnn_params['zero_point']) + else: + out = relay_op(lhs_expr, rhs_expr) # Options (fused_activation_function) options = None @@ -517,36 +548,70 @@ def _convert_elemwise(self, relay_op, op): fused_activation_fn = options.FusedActivationFunction() # if we have activation fn if fused_activation_fn != ActivationFunctionType.NONE: + if output_tensor.qnn_params: + raise tvm.error.OpNotImplemented( + 'Elemwise operators with fused activation are not supported yet.') out = self.convert_fused_activation_function(out, fused_activation_fn) return out def convert_add(self, op): """Convert TFLite ADD""" + # Check if the input tensor is quantized, call QNN op + if self.is_quantized(op): + return self._convert_elemwise(_qnn.op.add, op) return self._convert_elemwise(_op.add, op) def convert_sub(self, op): """Convert TFLite SUB""" + # Check if the input tensor is quantized, call QNN op + if self.is_quantized(op): + raise tvm.error.OpNotImplemented( + 'TFlite quantized sub operator is not supported yet.') return self._convert_elemwise(_op.subtract, op) def convert_mul(self, op): """Convert TFLite MUL""" + # Check if the input tensor is quantized, call QNN op + if self.is_quantized(op): + raise tvm.error.OpNotImplemented( + 'TFlite quantized mul operator is not supported yet.') return self._convert_elemwise(_op.multiply, op) def convert_div(self, op): """Convert TFLite DIV""" + # Check if the input tensor is quantized, call QNN op + if self.is_quantized(op): + raise tvm.error.OpNotImplemented( + 'TFlite quantized div operator is not supported yet.') return self._convert_elemwise(_op.divide, op) def convert_pow(self, op): + # Check if the input tensor is quantized, call QNN op + if self.is_quantized(op): + raise tvm.error.OpNotImplemented( + 'TFlite quantized pow operator is not supported yet.') return self._convert_elemwise(_op.power, op) def convert_maximum(self, op): + # Check if the input tensor is quantized, call QNN op + if self.is_quantized(op): + raise tvm.error.OpNotImplemented( + 'TFlite quantized maximum operator is not supported yet.') return self._convert_elemwise(_op.maximum, op) def convert_minimum(self, op): + # Check if the input tensor is quantized, call QNN op + if self.is_quantized(op): + raise tvm.error.OpNotImplemented( + 'TFlite quantized minimum operator is not supported yet.') 
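For concreteness, this is roughly the relay.qnn.op.add call that _convert_elemwise emits on the quantized path; the tensor shapes and quantization parameters below are invented for illustration.

from tvm import relay

lhs = relay.var("lhs", shape=(1, 8), dtype="uint8")
rhs = relay.var("rhs", shape=(1, 8), dtype="uint8")
out = relay.qnn.op.add(lhs=lhs, rhs=rhs,
                       lhs_scale=0.0039, lhs_zero_point=0,
                       rhs_scale=0.0039, rhs_zero_point=0,
                       output_scale=0.0078, output_zero_point=0)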
return self._convert_elemwise(_op.minimum, op) def convert_greater(self, op): + # Check if the input tensor is quantized, call QNN op + if self.is_quantized(op): + raise tvm.error.OpNotImplemented( + 'TFlite quantized greater operator is not supported yet.') return self._convert_elemwise(_op.greater, op) def convert_zeros_like(self, op): @@ -608,6 +673,9 @@ def _convert_reduce_mean(self, op): def _convert_reduce_prod(self, op): return self._convert_reduce(_op.reduce.prod, op) + def _convert_reduce_sum(self, op): + return self._convert_reduce(_op.reduce.sum, op) + def convert_fully_connected(self, op): """Convert TFLite fully connected""" try: diff --git a/python/tvm/relay/op/_reduce.py b/python/tvm/relay/op/_reduce.py index f6b699f1e9cc..845ec4b9ba87 100644 --- a/python/tvm/relay/op/_reduce.py +++ b/python/tvm/relay/op/_reduce.py @@ -37,3 +37,4 @@ def _schedule_reduce(_, outs, target): _reg.register_schedule("mean", _schedule_reduce) _reg.register_schedule("variance", _schedule_reduce) _reg.register_schedule("nn.cross_entropy", _schedule_reduce) +_reg.register_schedule("nn.cross_entropy_with_logits", _schedule_reduce) diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py index da5804906269..188b3bb15956 100644 --- a/python/tvm/relay/op/_tensor.py +++ b/python/tvm/relay/op/_tensor.py @@ -108,6 +108,29 @@ def clip_compute(attrs, inputs, output_type, target): register_schedule("clip", schedule_elemwise) +@script +def _cast_shape_function(x): + out_ndim = len(x) + out = output_tensor((out_ndim,), "int64") + for i in const_range(out_ndim): + out[i] = x[i] + return out + +def cast_shape_func(attrs, inputs, out_ndims): + return [_cast_shape_function(*inputs)] + +@script +def _expand_dims_shape_func(x): + ndim = len(x.shape) + out = output_tensor((ndim+1,), "int64") + out[0] = int64(1) + for i in const_range(0, ndim): + out[i+1] = int64(x.shape[i]) + return out + +def expand_dims_shape_func(attrs, inputs, out_ndims): + return [_expand_dims_shape_func(*inputs)] + # shape func @script def _broadcast_shape_func(x, y, ndim): @@ -140,6 +163,9 @@ def _broadcast_shape_func(x, y, ndim): def broadcast_shape_func(attrs, inputs, out_ndims): return [_broadcast_shape_func(*inputs, out_ndims[0])] +register_shape_func("expand_dims", False, expand_dims_shape_func) +register_shape_func("cast", False, cast_shape_func) + register_shape_func("add", False, broadcast_shape_func) register_shape_func("subtract", False, broadcast_shape_func) register_shape_func("multiply", False, broadcast_shape_func) diff --git a/python/tvm/relay/op/_tensor_grad.py b/python/tvm/relay/op/_tensor_grad.py index 3a82e46e6a7d..d55cad7c7a2d 100644 --- a/python/tvm/relay/op/_tensor_grad.py +++ b/python/tvm/relay/op/_tensor_grad.py @@ -48,6 +48,9 @@ tile, transpose, where, + repeat, + expand_dims, + full_like ) @@ -198,6 +201,7 @@ def clip_grad(orig, grad): @register_gradient("nn.max_pool2d") def max_pool2d_grad(orig, grad): + """Returns the gradient of max_pool2d.""" attrs = orig.attrs pool_grad = _nn.max_pool2d_grad(grad, orig.args[0], pool_size=attrs.pool_size, strides=attrs.strides, padding=attrs.padding, @@ -207,6 +211,7 @@ def max_pool2d_grad(orig, grad): @register_gradient("nn.avg_pool2d") def avg_pool2d_grad(orig, grad): + """Returns the gradient of avg_pool2d.""" attrs = orig.attrs pool_grad = _nn.avg_pool2d_grad(grad, orig.args[0], pool_size=attrs.pool_size, strides=attrs.strides, padding=attrs.padding, @@ -215,6 +220,26 @@ def avg_pool2d_grad(orig, grad): return [pool_grad] 
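To pin down the contract of the expand_dims and cast shape functions registered earlier, here is a plain-Python analogue (an illustration, not the hybrid-script code itself): each maps an input shape to the corresponding output shape.

def expand_dims_shape(in_shape):
    # expand_dims with axis=0, num_newaxis=1 prepends a unit dimension
    return [1] + list(in_shape)

def cast_shape(in_shape):
    # cast changes only the dtype; the shape passes through unchanged
    return list(in_shape)

assert expand_dims_shape([2, 3]) == [1, 2, 3]
assert cast_shape([4, 5]) == [4, 5]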
+@register_gradient("nn.global_avg_pool2d") +def global_avg_pool2d_grad(orig, grad): + """Returns the gradient of global_avg_pool2d.""" + data = orig.args[0] + shape = data.checked_type.shape + layout = orig.attrs.layout + + # we assume NCHW or NHWC layout for now, but easy to add more + assert layout in ["NCHW", "NHWC"] + if layout == "NCHW": + pool_size = shape[2], shape[3] + elif layout == "NHWC": + pool_size = shape[1], shape[2] + + pool_grad = _nn.avg_pool2d_grad(grad, data, pool_size=pool_size, + strides=(1, 1), padding=(0, 0), + layout=layout) + return [pool_grad] + + # not implemented, this is only for testing. @register_gradient("concatenate") def concatenate_grad(orig, grad): @@ -287,16 +312,53 @@ def conv2d_grad(orig, grad): return [backward_data, backward_weight] +def _get_reduce_axis(call): + """Helper function that returns the reduce axis of the call as plain python ints.""" + x, axis = call.args[0], call.attrs.axis + shape = x.checked_type.concrete_shape + + # should never exclude when axis is None + assert not (axis is None and call.attrs.exclude) + + if axis is None: + return None + + # convert to nonnegative integers and sort + axis = sorted([ax if ax >= 0 else len(shape) + ax for ax in map(int, axis)]) + if call.attrs.exclude: + axis = [ax for ax in range(len(shape)) if ax not in axis] + return axis + + +def _unreduce_expand(x, axis): + """Helper function that returns x expanded on the reduced dimensions in axis.""" + # assume axis is sorted nonnegative ints + for ax in axis: + x = expand_dims(x, ax) + return x + + @register_gradient("max") def max_grad(orig, grad): """Returns the gradient of max""" - # Only support axis=0, since broadcasting orig to x behaves incorrectly - x, axis = orig.args[0], orig.attrs.axis - assert(axis is not None and len(axis) == 1 and int(axis[0]) == 0) - orig = broadcast_to_like(orig, x) - grad = broadcast_to_like(grad, x) - indicators = cast_like(equal(orig, x), grad) - return [indicators * grad] + x, axis = orig.args[0], _get_reduce_axis(orig) + shape = x.checked_type.concrete_shape + + repeated = orig + if axis is None: + repeated = full_like(x, repeated) + else: + # expand dims (if necessary) and repeat along each axis + if not orig.attrs.keepdims: + repeated = _unreduce_expand(repeated, axis) + grad = _unreduce_expand(grad, axis) + for ax in axis: + repeated = repeat(repeated, shape[ax], ax) + + indicators = cast_like(equal(repeated, x), grad) + num_selected = _sum(indicators, axis, keepdims=True) + # spread error across all max weights + return [indicators * grad / num_selected] @register_gradient("nn.softmax") @@ -372,7 +434,11 @@ def negative_grad(orig, grad): @register_gradient("sum") def sum_grad(orig, grad): """Returns grad broadcasted to data dims""" - data = orig.args[0] + data, axis = orig.args[0], _get_reduce_axis(orig) + if not orig.attrs.keepdims: + if axis is None: + axis = list(range(len(data.checked_type.concrete_shape))) + grad = _unreduce_expand(grad, axis) return [broadcast_to_like(grad, data)] @@ -383,3 +449,12 @@ def cross_entropy_grad(orig, grad): batch_size = take(shape, const(0, dtype='int32'), axis=0) grad = grad / batch_size.astype('float32') return [-grad * y / x, -grad * log(x)] + + +@register_gradient("nn.cross_entropy_with_logits") +def cross_entropy_with_logits_grad(orig, grad): + x, y = orig.args + shape = shape_of(x) + batch_size = take(shape, const(0, dtype='int32'), axis=0) + grad = grad / batch_size.astype('float32') + return [-grad * y, -grad * x] diff --git 
a/python/tvm/relay/op/annotation/annotation.py b/python/tvm/relay/op/annotation/annotation.py index 10c898538596..2b9d4bcd81bc 100644 --- a/python/tvm/relay/op/annotation/annotation.py +++ b/python/tvm/relay/op/annotation/annotation.py @@ -17,10 +17,10 @@ """Annotation operations.""" from __future__ import absolute_import as _abs from . import _make +from ..op import register_schedule, schedule_injective from .... import nd as _nd from .... import TVMContext as _TVMContext - def on_device(data, device): """Annotate an expression with a certain device type. @@ -61,3 +61,20 @@ def stop_fusion(data): The annotated expression. """ return _make.stop_fusion(data) + +def checkpoint(data): + """Annotate an expression to be a checkpoint for the checkpointing memory optimization. + + Parameters + ---------- + data : tvm.relay.Expr + The expression to be annotated. + + Returns + ------- + result : tvm.relay.Expr + The annotated expression. + """ + return _make.checkpoint(data) + +register_schedule("annotation.checkpoint", schedule_injective) diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index b8572349fb9d..5786c228abc0 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -153,14 +153,14 @@ def compute_conv2d(attrs, inputs, out_type, target): out_dtype = (inputs[0].dtype if out_dtype in ("same", "") else out_dtype) - assert layout in ["NCHW", "NHWC", "NCHW4c"] + assert layout in ["NCHW", "NHWC", "NCHW4c", "HWCN"] (dilation_h, dilation_w) = dilation if dilation_h < 1 or dilation_w < 1: raise ValueError("dilation should be positive value") def _get_out_depth(): weight_shape = get_const_tuple(inputs[1].shape) - if kernel_layout == "HWOI": + if kernel_layout.startswith("HW"): return weight_shape[2] * weight_shape[3] return weight_shape[0] * weight_shape[1] @@ -192,11 +192,13 @@ def schedule_conv2d(attrs, outs, target): with target: if groups == 1 and layout == "NCHW": return topi.generic.schedule_conv2d_nchw(outs) - if groups == 1 and layout == "NCHW4c": + elif groups == 1 and layout == "NCHW4c": return topi.generic.schedule_conv2d_nchw(outs) - if groups == 1 and layout == "NHWC": + elif groups == 1 and layout == "NHWC": return topi.generic.schedule_conv2d_nhwc(outs) - if groups != 1: + elif groups == 1 and layout == "HWCN": + return topi.generic.schedule_conv2d_hwcn(outs) + elif groups != 1: # collect in_channels to distinguish depthwise and group conv2d op = _find_conv2d_op(outs[0].op) assert op is not None @@ -768,3 +770,12 @@ def schedule_bitserial_dense(attrs, outputs, target): def compute_cross_entropy(attrs, inputs, out_dtype, target): x, y = inputs return [-topi.sum(topi.log(x) * y) / x.shape[0]] + + +reg.register_pattern("nn.cross_entropy_with_logits", OpPattern.OPAQUE) + + +@reg.register_compute("nn.cross_entropy_with_logits") +def compute_cross_entropy_with_logits(attrs, inputs, out_dtype, target): + x, y = inputs + return [-topi.sum(x * y) / x.shape[0]] diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index 9ddb3ece4ce2..1f289d1bd27a 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -1807,3 +1807,22 @@ def cross_entropy(predictions, targets): The computed result. """ return _make.cross_entropy(predictions, targets) + + +def cross_entropy_with_logits(predictions, targets): + """CrossEntropy with logits. + + Parameters + ---------- + predictions : tvm.relay.Expr + The predictions. + + targets : tvm.relay.Expr + The targets. 
+ + Returns + ------- + result : tvm.relay.Expr + The computed result. + """ + return _make.cross_entropy_with_logits(predictions, targets) diff --git a/python/tvm/relay/prelude.py b/python/tvm/relay/prelude.py index 803d8ef50db5..d27ffe512617 100644 --- a/python/tvm/relay/prelude.py +++ b/python/tvm/relay/prelude.py @@ -16,8 +16,513 @@ # under the License. # pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name """A prelude containing useful global functions and ADT definitions.""" +from .ty import GlobalTypeVar, TensorType, Any, scalar_type +from .expr import Var, Function, GlobalVar, If, const +from .op.tensor import add, subtract, equal +from .adt import Constructor, TypeData, Clause, Match +from .adt import PatternConstructor, PatternVar, PatternWildcard +from . import op from .module import Module +class TensorArrayOps(object): + """Contains tensor array related ops""" + + def __init__(self, prelude, dtype): + """Create tensor array ops registry""" + self.prelude = prelude + self.dtype = dtype + + def get_name(self, canonical): + """Get name corresponding to the canonical name""" + return self.prelude.get_name(canonical, self.dtype) + + def get_var(self, canonical): + """Get var corresponding to the canonical name""" + return self.prelude.get_var(canonical, self.dtype) + + def define_tensor_adt(self): + """Defines the dynamic tensor ADT, which is the container for tensors + with variable shapes.""" + tensor_type_name = self.get_name('tensor_t') + tensor_type_var = GlobalTypeVar(tensor_type_name) + setattr(self.prelude, tensor_type_name, tensor_type_var) + tensor0_type = TensorType([], self.dtype) + tensor1_type = TensorType([Any()], self.dtype) + tensor2_type = TensorType([Any(), Any()], self.dtype) + tensor3_type = TensorType([Any(), Any(), Any()], self.dtype) + tensor4_type = TensorType([Any(), Any(), Any(), Any()], self.dtype) + tensor5_type = TensorType([Any(), Any(), Any(), Any(), Any()], self.dtype) + tensor6_type = TensorType([Any(), Any(), Any(), Any(), Any(), Any()], self.dtype) + tensor_nil_name = self.get_name('tensor_nil') + tensor0_name = self.get_name('tensor0') + tensor1_name = self.get_name('tensor1') + tensor2_name = self.get_name('tensor2') + tensor3_name = self.get_name('tensor3') + tensor4_name = self.get_name('tensor4') + tensor5_name = self.get_name('tensor5') + tensor6_name = self.get_name('tensor6') + tensor_nil_case = Constructor(tensor_nil_name, [], tensor_type_var) + tensor0_case = Constructor(tensor0_name, [tensor0_type], tensor_type_var) + tensor1_case = Constructor(tensor1_name, [tensor1_type], tensor_type_var) + tensor2_case = Constructor(tensor2_name, [tensor2_type], tensor_type_var) + tensor3_case = Constructor(tensor3_name, [tensor3_type], tensor_type_var) + tensor4_case = Constructor(tensor4_name, [tensor4_type], tensor_type_var) + tensor5_case = Constructor(tensor5_name, [tensor5_type], tensor_type_var) + tensor6_case = Constructor(tensor6_name, [tensor6_type], tensor_type_var) + setattr(self.prelude, tensor_nil_name, tensor_nil_case) + setattr(self.prelude, tensor0_name, tensor0_case) + setattr(self.prelude, tensor1_name, tensor1_case) + setattr(self.prelude, tensor2_name, tensor2_case) + setattr(self.prelude, tensor3_name, tensor3_case) + setattr(self.prelude, tensor4_name, tensor4_case) + setattr(self.prelude, tensor5_name, tensor5_case) + setattr(self.prelude, tensor6_name, tensor6_case) + self.prelude.mod[tensor_type_var] = TypeData(tensor_type_var, [], [tensor_nil_case, + tensor0_case, + tensor1_case, + tensor2_case, +
tensor3_case, + tensor4_case, + tensor5_case, + tensor6_case]) + + def define_tensor_take(self): + """Defines a function to return a range of tensor_t on axis 0. + tensor_take(t, lower, upper) : + tensor_t -> Tensor[(), int32] -> Tensor[(), int32] -> tensor_t + """ + take_name = self.get_name("tensor_take") + take_var = GlobalVar(take_name) + setattr(self.prelude, take_name, take_var) + tensor_t = self.get_var('tensor_t') + tensor1_var = self.get_var('tensor1') + tensor2_var = self.get_var('tensor2') + tensor3_var = self.get_var('tensor3') + tensor4_var = self.get_var('tensor4') + tensor5_var = self.get_var('tensor5') + tensor6_var = self.get_var('tensor6') + t = Var('tensor', tensor_t()) + lower = Var('lower', scalar_type('int32')) + upper = Var('upper', scalar_type('int32')) + t1 = Var('t1') + t2 = Var('t2') + t3 = Var('t3') + t4 = Var('t4') + t5 = Var('t5') + t6 = Var('t6') + tensor1_case =\ + Clause(PatternConstructor(tensor1_var, [PatternVar(t1)]), + tensor1_var(op.take(t1, op.arange(lower, upper, dtype='int32')))) + tensor2_case =\ + Clause(PatternConstructor(tensor2_var, [PatternVar(t2)]), + tensor2_var(op.take(t2, op.arange(lower, upper, dtype='int32'), axis=0))) + tensor3_case =\ + Clause(PatternConstructor(tensor3_var, [PatternVar(t3)]), + tensor3_var(op.take(t3, op.arange(lower, upper, dtype='int32'), axis=0))) + tensor4_case =\ + Clause(PatternConstructor(tensor4_var, [PatternVar(t4)]), + tensor4_var(op.take(t4, op.arange(lower, upper, dtype='int32'), axis=0))) + tensor5_case =\ + Clause(PatternConstructor(tensor5_var, [PatternVar(t5)]), + tensor5_var(op.take(t5, op.arange(lower, upper, dtype='int32'), axis=0))) + tensor6_case =\ + Clause(PatternConstructor(tensor6_var, [PatternVar(t6)]), + tensor6_var(op.take(t6, op.arange(lower, upper, dtype='int32'), axis=0))) + self.prelude.mod[take_var] =\ + Function([t, lower, upper], + Match(t, [tensor1_case, + tensor2_case, + tensor3_case, + tensor4_case, + tensor5_case, + tensor6_case], False), + tensor_t(), []) + + def define_tensor_expand_dims(self): + """Defines a function to grow a tensor_t's rank by adding one dimension in front + of the original tensor_t. 
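In numpy terms, tensor_take behaves like a slice along axis 0 for every supported rank (1 through 6); a small analogue, illustrative only:

import numpy as np

def tensor_take(t, lower, upper):
    # mirrors op.take(t, arange(lower, upper, dtype='int32'), axis=0)
    return np.take(t, np.arange(lower, upper), axis=0)

t = np.arange(12).reshape(4, 3)
assert tensor_take(t, 1, 3).shape == (2, 3)   # rows 1 and 2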
+ tensor_expand_dims(t) : tensor_t -> tensor_t + """ + expand_dims_name = self.get_name("tensor_expand_dims") + expand_dims_var = GlobalVar(expand_dims_name) + setattr(self.prelude, expand_dims_name, expand_dims_var) + tensor_type_var = self.get_var('tensor_t') + x = Var("x", tensor_type_var()) + t0 = Var("t0") + t1 = Var("t1") + t2 = Var("t2") + t3 = Var("t3") + t4 = Var("t4") + t5 = Var("t5") + tensor0_var = self.get_var('tensor0') + tensor1_var = self.get_var('tensor1') + tensor2_var = self.get_var('tensor2') + tensor3_var = self.get_var('tensor3') + tensor4_var = self.get_var('tensor4') + tensor5_var = self.get_var('tensor5') + tensor6_var = self.get_var('tensor6') + tensor0_case = Clause(PatternConstructor(tensor0_var, [PatternVar(t0)]), + tensor1_var(op.expand_dims(t0, 0, 1))) + tensor1_case = Clause(PatternConstructor(tensor1_var, [PatternVar(t1)]), + tensor2_var(op.expand_dims(t1, 0, 1))) + tensor2_case = Clause(PatternConstructor(tensor2_var, [PatternVar(t2)]), + tensor3_var(op.expand_dims(t2, 0, 1))) + tensor3_case = Clause(PatternConstructor(tensor3_var, [PatternVar(t3)]), + tensor4_var(op.expand_dims(t3, 0, 1))) + tensor4_case = Clause(PatternConstructor(tensor4_var, [PatternVar(t4)]), + tensor5_var(op.expand_dims(t4, 0, 1))) + tensor5_case = Clause(PatternConstructor(tensor5_var, [PatternVar(t5)]), + tensor6_var(op.expand_dims(t5, 0, 1))) + self.prelude.mod[expand_dims_var] =\ + Function([x], + Match(x, [tensor0_case, + tensor1_case, + tensor2_case, + tensor3_case, + tensor4_case, + tensor5_case], False)) + + def define_tensor_concat(self): + """Defines a function to concatenate two tensor_t on the first axis + + tensor_concatenate(t) : tensor_t -> tensor_t -> tensor_t + """ + concat_name = self.get_name("tensor_concatenate") + concat_var = GlobalVar(concat_name) + setattr(self.prelude, concat_name, concat_var) + tensor_type_var = self.get_var('tensor_t') + x = Var("x", tensor_type_var()) + y = Var("y", tensor_type_var()) + + tensor1_var = self.get_var('tensor1') + tensor2_var = self.get_var('tensor2') + tensor3_var = self.get_var('tensor3') + tensor4_var = self.get_var('tensor4') + t11 = Var("t11") + t12 = Var("t12") + t21 = Var("t21") + t22 = Var("t22") + t31 = Var("t31") + t32 = Var("t32") + t41 = Var("t41") + t42 = Var("t42") + tensor1_case = Clause(PatternConstructor(tensor1_var, [PatternVar(t11)]), + Match(y, [Clause(PatternConstructor(tensor1_var, [PatternVar(t12)]), + tensor1_var(op.concatenate([t11, t12], axis=0)))], + False)) + tensor2_case = Clause(PatternConstructor(tensor2_var, [PatternVar(t21)]), + Match(y, [Clause(PatternConstructor(tensor2_var, [PatternVar(t22)]), + tensor2_var(op.concatenate([t21, t22], axis=0)))], + False)) + tensor3_case = Clause(PatternConstructor(tensor3_var, [PatternVar(t31)]), + Match(y, [Clause(PatternConstructor(tensor3_var, [PatternVar(t32)]), + tensor3_var(op.concatenate([t31, t32], axis=0)))], + False)) + tensor4_case = Clause(PatternConstructor(tensor4_var, [PatternVar(t41)]), + Match(y, [Clause(PatternConstructor(tensor4_var, [PatternVar(t42)]), + tensor4_var(op.concatenate([t41, t42], axis=0)))], + False)) + # op.concatenate does not support tensor with rank higher than 4 + self.prelude.mod[concat_var] =\ + Function([x, y], Match(x, [tensor1_case, + tensor2_case, + tensor3_case, + tensor4_case], False)) + + def define_tensor_array(self): + """Defines a function to create a tensor array with size n. 
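tensor_expand_dims and tensor_concatenate compose into a stack, which is exactly how tensor_array_stack is assembled later in this file (map the expand, then fold the concat); in numpy terms, illustratively:

import numpy as np

a, b = np.ones((2, 3)), np.zeros((2, 3))
expanded = [t[np.newaxis, ...] for t in (a, b)]   # tensor_expand_dims on each element
stacked = np.concatenate(expanded, axis=0)        # tensor_concatenate, folded pairwise
assert stacked.shape == (2, 2, 3)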
+ tensor_array(n) : Tensor[(), int32] -> list[tensor_t] + """ + tensor_array_constructor_name = self.get_name("tensor_array") + tensor_array_constructor_var = GlobalVar(tensor_array_constructor_name) + setattr(self.prelude, tensor_array_constructor_name, tensor_array_constructor_var) + tensor_nil_var = self.get_var('tensor_nil') + tensor_type_var = self.get_var('tensor_t') + n = Var("x", scalar_type('int32')) + body = If(equal(n, const(0)), + self.prelude.nil(), + self.prelude.cons(tensor_nil_var(), + tensor_array_constructor_var(subtract(n, const(1))))) + self.prelude.mod[tensor_array_constructor_var] = \ + Function([n], body, self.prelude.l(tensor_type_var()), []) + + def define_tensor_array_read(self): + """Defines a function to read the n-th element of a tensor array. Assumes the + array holds more than n elements. + + tensor_array_read(ta, n) : list[tensor_t] -> Tensor[(), int32] -> tensor_t + """ + read_name = self.get_name("tensor_array_read") + read_var = GlobalVar(read_name) + setattr(self.prelude, read_name, read_var) + tensor_type_var = self.get_var('tensor_t') + + tensor_array = Var("tensor_array", self.prelude.l(tensor_type_var())) + n = Var("x", scalar_type('int32')) + self.prelude.mod[read_var] =\ + Function([tensor_array, n], self.prelude.nth(tensor_array, n), tensor_type_var(), []) + + def define_tensor_array_write(self): + """Defines a function to update a tensor array at index n with value v. + tensor_array_write(ta, n, v) : + list[tensor_t] -> Tensor[(), int32] -> tensor_t -> list[tensor_t] + """ + write_name = self.get_name("tensor_array_write") + write_var = GlobalVar(write_name) + setattr(self.prelude, write_name, write_var) + tensor_type_var = self.get_var('tensor_t') + tensor_array = Var("tensor_array", self.prelude.l(tensor_type_var())) + n = Var("x", scalar_type('int32')) + v = Var("v", tensor_type_var()) + self.prelude.mod[write_var] =\ + Function([tensor_array, n, v], self.prelude.update(tensor_array, n, v), + self.prelude.l(tensor_type_var()), []) + + def define_tensor_array_unstack_tensor1(self): + """Defines a function to unstack the values of a tensor_t with rank 1 in a tensor array. + tensor_array_unstack_tensor1(t) : tensor_t -> list[tensor_t] + """ + helper_name = self.get_name("tensor_array_unstack_tensor1_helper") + helper_var = GlobalVar(helper_name) + setattr(self.prelude, helper_name, helper_var) + tensor = Var("t", TensorType([Any()], self.dtype)) + up = Var("up", scalar_type('int32')) + i = Var("i", scalar_type('int32')) + tensor_type_var = self.get_var('tensor_t') + tensor0_var = self.get_var('tensor0') + helper_body =\ + If(equal(i, up), + self.prelude.nil(), + self.prelude.cons(tensor0_var(op.take(tensor, i)), + helper_var(add(i, const(1)), up, tensor))) + self.prelude.mod[helper_var] =\ + Function([i, up, tensor], helper_body, self.prelude.l(tensor_type_var()), []) + unstack_name = self.get_name("tensor_array_unstack_tensor1") + unstack_var = GlobalVar(unstack_name) + setattr(self.prelude, unstack_name, unstack_var) + tensor1 = Var("tensor", TensorType([Any()], self.dtype)) + shape = op.shape_of(tensor1) + ndim = op.take(shape, const(0)) + self.prelude.mod[unstack_var] =\ + Function([tensor1], helper_var(const(0), ndim, tensor1), + self.prelude.l(tensor_type_var()), []) + + def define_tensor_array_unstack_tensor2(self): + """Defines a function to unstack the values of a tensor_t with rank 2 in a tensor array.
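Viewed as ordinary Python (an analogue of the ADT definitions above, not the real code), the tensor array is a persistent list and the helpers defined so far behave like:

def tensor_array(n):
    return [None] * n                 # n tensor_nil placeholders

def tensor_array_read(ta, n):
    return ta[n]                      # prelude.nth

def tensor_array_write(ta, n, v):
    return ta[:n] + [v] + ta[n + 1:]  # functional update; ta itself is unchanged

def tensor_array_unstack_tensor1(t):
    return list(t)                    # rank-1 tensor -> list of rank-0 tensors

ta = tensor_array_write(tensor_array(3), 1, 42)
assert tensor_array_read(ta, 1) == 42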
+ + tensor_array_unstack_tensor2(t) : tensor_t -> list[tensor_t] + """ + helper_name = self.get_name("tensor_array_unstack_tensor2_helper") + helper_var = GlobalVar(helper_name) + setattr(self.prelude, helper_name, helper_var) + tensor = Var("t", TensorType([Any(), Any()], self.dtype)) + up = Var("up", scalar_type('int32')) + i = Var("i", scalar_type('int32')) + + helper_body = If(equal(i, up), + self.prelude.nil(), + self.prelude.cons(self.get_var('tensor1')(op.take(tensor, i, axis=0)), + helper_var(add(i, const(1)), up, tensor))) + self.prelude.mod[helper_var] =\ + Function([i, up, tensor], helper_body, self.prelude.l(self.get_var('tensor_t')()), []) + + tensor_array_unstack_tensor2_name = self.get_name("tensor_array_unstack_tensor2") + tensor_array_unstack_tensor2_var = GlobalVar(tensor_array_unstack_tensor2_name) + setattr(self.prelude, tensor_array_unstack_tensor2_name, tensor_array_unstack_tensor2_var) + tensor2 = Var("tensor", TensorType([Any(), Any()], self.dtype)) + shape = op.shape_of(tensor2) + ndim = op.take(shape, const(0)) + self.prelude.mod[tensor_array_unstack_tensor2_var] =\ + Function([tensor2], helper_var(const(0), ndim, tensor2), + self.prelude.l(self.get_var('tensor_t')()), []) + + def define_tensor_array_scatter(self): + """Defines a function to scatter the values of a tensor_t in indices of a tensor array. + tensor_array_scatter(ta, indices, value) : + list[tensor_t] -> Tensor[(Any), int32] -> tensor_t -> list[tensor_t] + """ + tensor_array_scatter_helper_name = self.get_name("tensor_array_scatter_helper") + tensor_array_scatter_helper_var = GlobalVar(tensor_array_scatter_helper_name) + tensor_t = self.get_var('tensor_t') + ta = Var("ta", self.prelude.l(tensor_t())) + current = Var("current", scalar_type('int32')) + limit = Var("limit", scalar_type('int32')) + indices_ = Var('indices_', TensorType([Any()], 'int32')) + values_ = Var('values_', self.prelude.l(tensor_t())) + write_var = self.get_var('tensor_array_write') + read_var = self.get_var('tensor_array_read') + helper_body = If(equal(current, limit), + ta, + tensor_array_scatter_helper_var( + write_var(ta, op.take(indices_, current), + read_var(values_, current)), + add(current, const(1)), + limit, indices_, values_)) + self.prelude.mod[tensor_array_scatter_helper_var] =\ + Function([ta, current, limit, indices_, values_], + helper_body, self.prelude.l(tensor_t()), []) + tensor_array_scatter_name = self.get_name("tensor_array_scatter") + tensor_array_scatter_var = GlobalVar(tensor_array_scatter_name) + setattr(self.prelude, tensor_array_scatter_name, tensor_array_scatter_var) + tensor_array = Var("tensor_array", self.prelude.l(tensor_t())) + indices = Var('indices', TensorType([Any()], 'int32')) + values = Var('values', self.prelude.l(tensor_t())) + indices_shape = op.shape_of(indices) + limit = op.take(indices_shape, const(0)) + body = tensor_array_scatter_helper_var(tensor_array, const(0), limit, indices, values) + self.prelude.mod[tensor_array_scatter_var] =\ + Function([tensor_array, indices, values], body, self.prelude.l(tensor_t()), []) + + def define_tensor_array_split(self): + """Defines a function to split the values of a tensor_t into a tensor array. 
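tensor_array_scatter is a recursive chain of tensor_array_write calls, one per index; as a plain list analogue (illustrative):

def tensor_array_scatter(ta, indices, values):
    out = list(ta)                     # stand-in for the persistent update chain
    for i, idx in enumerate(indices):  # write values[i] at position indices[i]
        out[idx] = values[i]
    return out

assert tensor_array_scatter([None] * 4, [2, 0], ['a', 'b']) == ['b', None, 'a', None]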
+ tensor_array_split(ta, value, lengths) : + list[tensor_t] -> tensor_t -> Tensor[(Any), int32] -> list[tensor_t] + """ + tensor_t = self.get_var('tensor_t') + tensor_array_split_helper_name = self.get_name("ta_split_helper") + tensor_array_split_helper_var = GlobalVar(tensor_array_split_helper_name) + setattr(self.prelude, tensor_array_split_helper_name, tensor_array_split_helper_var) + ta1 = Var("tensor_array", self.prelude.l(tensor_t())) + value1 = Var('value1', tensor_t()) + offset1 = Var('offset1', scalar_type('int32')) + current1 = Var('current1', scalar_type('int32')) + limit1 = Var('limit1', scalar_type('int32')) + lengths1 = Var('lengths', TensorType([Any()], 'int32')) + write_var = self.get_var('tensor_array_write') + take_var = self.get_var('tensor_take') + helper1_body = If(equal(current1, limit1), + ta1, + write_var( + tensor_array_split_helper_var( + ta1, + value1, + add(offset1, op.take(lengths1, current1)), + add(current1, const(1)), + limit1, + lengths1 + ), + current1, + take_var(value1, + offset1, + add(op.take(lengths1, current1), offset1)))) + self.prelude.mod[tensor_array_split_helper_var] = \ + Function([ta1, value1, offset1, current1, limit1, lengths1], + helper1_body, self.prelude.l(tensor_t()), []) + split_name = self.get_name("tensor_array_split") + split_var = GlobalVar(split_name) + setattr(self.prelude, split_name, split_var) + tensor_array = Var("tensor_array", self.prelude.l(tensor_t())) + value = Var('value', tensor_t()) + lengths = Var('lengths', TensorType([Any()], 'int32')) + lengths_shape = op.shape_of(lengths) + lengths_limit = op.take(lengths_shape, const(0)) + body = tensor_array_split_helper_var( + tensor_array, + value, + const(0), + const(0), + lengths_limit, + lengths) + self.prelude.mod[split_var] =\ + Function([tensor_array, value, lengths], body, self.prelude.l(tensor_t()), []) + + def define_tensor_array_concat(self): + """Defines a function to return the values in the tensor array as concatenated tensor_t. + tensor_array_concat(ta) : list[tensor_t] -> tensor_t + """ + concat_name = self.get_name("tensor_array_concat") + concat_var = GlobalVar(concat_name) + setattr(self.prelude, concat_name, concat_var) + tensor_concat_var = self.get_var('tensor_concatenate') + tensor_t = self.get_var('tensor_t') + tensor_nil_var = self.get_var('tensor_nil') + tensor_array = Var("tensor_array", self.prelude.l(tensor_t())) + hd = Var("hd") + tl = Var("tl") + nil_case = Clause(PatternConstructor(self.prelude.nil), tensor_nil_var()) + cons_case = Clause(PatternConstructor(self.prelude.cons, [PatternVar(hd), PatternVar(tl)]), + Match(tl, [ + Clause(PatternConstructor(self.prelude.nil), hd), + Clause(PatternWildcard(), + tensor_concat_var(hd, concat_var(tl))) + ], False)) + self.prelude.mod[concat_var] =\ + Function([tensor_array], + Match(tensor_array, [nil_case, cons_case], False), tensor_t(), []) + + def define_tensor_array_gather(self): + """Defines a function to return the selected values in a tensor array as tensor_t. 
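tensor_array_split and tensor_array_concat are intended to be near-inverses along axis 0; a numpy sketch of that contract (illustrative, ignoring the write positions the real helper uses inside the target array):

import numpy as np

def tensor_array_split(value, lengths):
    offsets = np.cumsum([0] + list(lengths))   # slice boundaries along axis 0
    return [value[offsets[i]:offsets[i + 1]] for i in range(len(lengths))]

def tensor_array_concat(ta):
    return np.concatenate(ta, axis=0)

v = np.arange(10)
assert np.array_equal(tensor_array_concat(tensor_array_split(v, [3, 7])), v)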
+ tensor_array_gather(ta, indices) : list[tensor_t] -> Tensor[(Any), int32] -> tensor_t + """ + helper_name = self.get_name("tensor_array_gather_helper") + helper_var = GlobalVar(helper_name) + setattr(self.prelude, helper_name, helper_var) + tensor_type_var = self.get_var('tensor_t') + stack_var = self.get_var('tensor_array_stack') + read_var = self.get_var('tensor_array_read') + ta = Var("ta", self.prelude.l(tensor_type_var())) + accu = Var("accu", self.prelude.l(tensor_type_var())) + current = Var("current", scalar_type('int32')) + limit = Var("limit", scalar_type('int32')) + indices_ = Var('indices_', TensorType([Any()], 'int32')) + helper_body =\ + If(equal(current, const(0)), + stack_var(accu), + helper_var( + ta, + self.prelude.cons( + read_var( + ta, op.take(indices_, subtract(current, const(1)))), accu), + subtract(current, const(1)), + limit, indices_)) + self.prelude.mod[helper_var] = \ + Function([ta, accu, current, limit, indices_], helper_body, tensor_type_var(), []) + gather_name = self.get_name("tensor_array_gather") + gather_var = GlobalVar(gather_name) + setattr(self.prelude, gather_name, gather_var) + tensor_array = Var("tensor_array", self.prelude.l(tensor_type_var())) + indices = Var('indices', TensorType([Any()], 'int32')) + indices_shape = op.shape_of(indices) + limit = op.take(indices_shape, const(0)) + body = helper_var(tensor_array, self.prelude.nil(), limit, limit, indices) + self.prelude.mod[gather_var] =\ + Function([tensor_array, indices], body, tensor_type_var(), []) + + def define_tensor_array_stack(self): + """Defines a function to get the values in the tensor array as a stack tensor_t. + tensor_array_stack(l) : list[tensor_t] -> tensor_t + """ + stack_name = self.get_name("tensor_array_stack") + stack_var = GlobalVar(stack_name) + setattr(self.prelude, stack_name, stack_var) + tensor_type_var = self.get_var('tensor_t') + tensor_array = Var("tensor_array", self.prelude.l(tensor_type_var())) + expand_dims_var = self.get_var('tensor_expand_dims') + concat_var = self.get_var('tensor_concatenate') + tensor_array_expand_dims = self.prelude.map(expand_dims_var, tensor_array) + tensors = self.prelude.foldl(concat_var, + self.prelude.hd(tensor_array_expand_dims), + self.prelude.tl(tensor_array_expand_dims)) + self.prelude.mod[stack_var] = Function([tensor_array], tensors, tensor_type_var(), []) + + def register(self): + """Register all tensor array ops in Prelude""" + self.define_tensor_adt() + self.define_tensor_take() + self.define_tensor_expand_dims() + self.define_tensor_concat() + self.define_tensor_array() + self.define_tensor_array_read() + self.define_tensor_array_write() + self.define_tensor_array_unstack_tensor1() + self.define_tensor_array_unstack_tensor2() + self.define_tensor_array_scatter() + self.define_tensor_array_split() + self.define_tensor_array_concat() + self.define_tensor_array_stack() + # TODO(wweic): Gather fails in PartialEvaluate + # self.define_tensor_array_gather() + class Prelude: """Contains standard definitions.""" @@ -27,6 +532,17 @@ def __init__(self, mod=None): self.mod = mod self.load_prelude() + def get_name(self, canonical, dtype): + """Get name corresponding to the canonical name""" + if canonical == 'tensor_t': + return 'tensor_{}_t'.format(dtype) + return "{}_{}".format(canonical, dtype) + + def get_var(self, canonical, dtype): + """Get var corresponding to the canonical name""" + name = self.get_name(canonical, dtype) + return getattr(self, name) + def load_prelude(self): """Parses the Prelude from Relay's text format into 
a module.""" # TODO(@jroesch): we should remove this helper when we port over prelude @@ -74,3 +590,7 @@ def load_prelude(self): ] for global_def in GLOBAL_DEFS: setattr(self, global_def, self.mod.get_global_var(global_def)) + + for dtype in ['float32', 'int32']: + tensor_array_ops = TensorArrayOps(self, dtype) + tensor_array_ops.register() diff --git a/python/tvm/relay/qnn/op/legalizations.py b/python/tvm/relay/qnn/op/legalizations.py index 0fdc0f3a3231..6b2e073822f1 100644 --- a/python/tvm/relay/qnn/op/legalizations.py +++ b/python/tvm/relay/qnn/op/legalizations.py @@ -100,7 +100,7 @@ def _is_int8_hw_support(target): Checks to ensure that we can use Intel DLBoost instructions - Check if the target is skylake and above. """ - supported_arches = {'-mcpu=skylake-avx512',} + supported_arches = {'-mcpu=skylake-avx512', '-mcpu=cascadelake'} return supported_arches.intersection(set(target.options)) # Collect the dtypes. diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py index ed443abb5293..7faf62b4be14 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -27,7 +27,7 @@ def requantize(data, input_zero_point, output_scale, output_zero_point, - rounding="TONEAREST", + rounding="UPWARD", out_dtype="int8"): r"""Requantized operator. @@ -349,3 +349,45 @@ def dense(data, input_zero_point, kernel_zero_point, out_dtype) + + +def mul(lhs, rhs, lhs_scale, lhs_zero_point, rhs_scale, rhs_zero_point, + output_scale, output_zero_point): + """Quantized multiplication with numpy-style broadcasting. + + Parameters + ---------- + lhs : relay.Expr + The left hand side quantized input data. + + rhs : relay.Expr + The right hand side quantized input data. + + lhs_scale: float + The scale of the lhs quantized expr. + + lhs_zero_point: int + The zero point of lhs quantized expr. + + rhs_scale: float + The scale of the rhs quantized expr. + + rhs_zero_point: int + The zero point of rhs quantized expr. + + output_scale: float + The scale of the output quantized expr. + + output_zero_point: int + The zero point of output quantized expr. + + Returns + ------- + result : relay.Expr + The computed result. 
+ + """ + return _make.mul(lhs, rhs, + lhs_scale, lhs_zero_point, + rhs_scale, rhs_zero_point, + output_scale, output_zero_point) diff --git a/python/tvm/relay/testing/py_converter.py b/python/tvm/relay/testing/py_converter.py index d661be73ad02..d7b59922b89d 100644 --- a/python/tvm/relay/testing/py_converter.py +++ b/python/tvm/relay/testing/py_converter.py @@ -203,8 +203,12 @@ def convert_module(self): for var, func in self.mod.functions.items(): # optimize the definition so any operators used are lowered opt_func = self.optimize(func) - converted_func, _ = self.convert_func_node(opt_func, var) - defs.append(converted_func) + try: + converted_func, _ = self.convert_func_node(opt_func, var) + defs.append(converted_func) + except TypeError: + # TODO(wweic): fix conversion for Any + pass return defs diff --git a/python/tvm/rpc/tracker.py b/python/tvm/rpc/tracker.py index f31625cd34ed..b9b29a7fe4a1 100644 --- a/python/tvm/rpc/tracker.py +++ b/python/tvm/rpc/tracker.py @@ -230,7 +230,7 @@ def call_handler(self, args): port, matchkey = args[2] self.pending_matchkeys.add(matchkey) # got custom address (from rpc server) - if args[3] is not None: + if len(args) >= 4 and args[3] is not None: value = (self, args[3], port, matchkey) else: value = (self, self._addr[0], port, matchkey) diff --git a/python/tvm/target.py b/python/tvm/target.py index 4548ffac4c88..087c9b47fd7a 100644 --- a/python/tvm/target.py +++ b/python/tvm/target.py @@ -128,6 +128,16 @@ def model(self): return opt.value[7:] return 'unknown' + @property + def mcpu(self): + """Returns the mcpu from the target if it exists.""" + mcpu = '' + if self.options is not None: + for opt in self.options: + if 'mcpu' in opt: + mcpu = opt.split('=')[1] + return mcpu + def __enter__(self): _api_internal._EnterTargetScope(self) return self @@ -496,6 +506,19 @@ def vta(model='unknown', options=None): return ret +def bifrost(model='unknown', options=None): + """Return an ARM Mali GPU target (Bifrost architecture). + + Parameters + ---------- + options : str or list of str + Additional options + """ + opts = ["-device=bifrost", '-model=%s' % model] + opts = _merge_opts(opts, options) + return _api_internal._TargetCreate("opencl", *opts) + + def create(target_str): """Get a target given target string. diff --git a/rust/common/src/packed_func.rs b/rust/common/src/packed_func.rs index d9399492264b..848d5c00ab3f 100644 --- a/rust/common/src/packed_func.rs +++ b/rust/common/src/packed_func.rs @@ -71,7 +71,7 @@ macro_rules! TVMPODValue { Context(TVMContext), Handle(*mut c_void), ArrayHandle(TVMArrayHandle), - NodeHandle(*mut c_void), + ObjectHandle(*mut c_void), ModuleHandle(TVMModuleHandle), FuncHandle(TVMFunctionHandle), NDArrayContainer(*mut c_void), @@ -92,7 +92,7 @@ macro_rules! TVMPODValue { TVMTypeCode_kTVMContext => Context($value.v_ctx), TVMTypeCode_kHandle => Handle($value.v_handle), TVMTypeCode_kArrayHandle => ArrayHandle($value.v_handle as TVMArrayHandle), - TVMTypeCode_kNodeHandle => NodeHandle($value.v_handle), + TVMTypeCode_kObjectHandle => ObjectHandle($value.v_handle), TVMTypeCode_kModuleHandle => ModuleHandle($value.v_handle), TVMTypeCode_kFuncHandle => FuncHandle($value.v_handle), TVMTypeCode_kNDArrayContainer => NDArrayContainer($value.v_handle), @@ -124,7 +124,7 @@ macro_rules! 
TVMPODValue { TVMTypeCode_kArrayHandle, ) }, - NodeHandle(val) => (TVMValue { v_handle: *val }, TVMTypeCode_kNodeHandle), + ObjectHandle(val) => (TVMValue { v_handle: *val }, TVMTypeCode_kObjectHandle), ModuleHandle(val) => (TVMValue { v_handle: *val }, TVMTypeCode_kModuleHandle), FuncHandle(val) => ( diff --git a/rust/frontend/src/function.rs b/rust/frontend/src/function.rs index 948711276304..01d0c58cfc5d 100644 --- a/rust/frontend/src/function.rs +++ b/rust/frontend/src/function.rs @@ -264,7 +264,7 @@ unsafe extern "C" fn tvm_callback( for i in 0..len { value = args_list[i]; tcode = type_codes_list[i]; - if tcode == ffi::TVMTypeCode_kNodeHandle as c_int + if tcode == ffi::TVMTypeCode_kObjectHandle as c_int || tcode == ffi::TVMTypeCode_kFuncHandle as c_int || tcode == ffi::TVMTypeCode_kModuleHandle as c_int { diff --git a/src/README.md b/src/README.md index 0c6f30a881b8..599f41dfdc5f 100644 --- a/src/README.md +++ b/src/README.md @@ -22,6 +22,8 @@ There can be internal header files within each module that sit in src. ## Modules - common: Internal common utilities. +- runtime: Minimum runtime related codes. +- node: base infra for IR/AST nodes that is dialect independent. - api: API function registration. - lang: The definition of DSL related data structure. - arithmetic: Arithmetic expression and set simplification. @@ -29,7 +31,6 @@ There can be internal header files within each module that sit in src. - schedule: The operations on the schedule graph before converting to IR. - pass: The optimization pass on the IR structure. - codegen: The code generator. -- runtime: Minimum runtime related codes. - autotvm: The auto-tuning module. - relay: Implementation of Relay. The second generation of NNVM, a new IR for deep learning frameworks. - contrib: Contrib extension libraries. diff --git a/src/api/api_arith.cc b/src/api/api_arith.cc index f31f02b1eaf4..c57e2afaa8eb 100644 --- a/src/api/api_arith.cc +++ b/src/api/api_arith.cc @@ -117,8 +117,7 @@ TVM_REGISTER_API("arith._CreateAnalyzer") }); } else if (name == "bind") { return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { - auto& sptr = args[1].node_sptr(); - if (sptr->is_type()) { + if (args[1].IsObjectRef()) { self->Bind(args[0], args[1].operator Range()); } else { self->Bind(args[0], args[1].operator Expr()); diff --git a/src/api/api_base.cc b/src/api/api_base.cc index 28ebb4d65005..42367efb15bb 100644 --- a/src/api/api_base.cc +++ b/src/api/api_base.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -26,11 +26,12 @@ #include #include #include +#include namespace tvm { TVM_REGISTER_API("_format_str") .set_body([](TVMArgs args, TVMRetValue *ret) { - CHECK(args[0].type_code() == kNodeHandle); + CHECK(args[0].type_code() == kObjectHandle); std::ostringstream os; os << args[0].operator NodeRef(); *ret = os.str(); @@ -38,16 +39,15 @@ TVM_REGISTER_API("_format_str") TVM_REGISTER_API("_raw_ptr") .set_body([](TVMArgs args, TVMRetValue *ret) { - CHECK(args[0].type_code() == kNodeHandle); - *ret = reinterpret_cast( - args[0].node_sptr().get()); + CHECK(args[0].type_code() == kObjectHandle); + *ret = reinterpret_cast(args[0].value().v_handle); }); TVM_REGISTER_API("_save_json") -.set_body_typed(SaveJSON); +.set_body_typed(SaveJSON); TVM_REGISTER_API("_load_json") -.set_body_typed(LoadJSON); +.set_body_typed(LoadJSON); TVM_REGISTER_API("_TVMSetStream") .set_body_typed(TVMSetStream); diff --git a/src/api/api_codegen.cc b/src/api/api_codegen.cc index 73e26719cf15..f2ca67e6e2f9 100644 --- a/src/api/api_codegen.cc +++ b/src/api/api_codegen.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -33,7 +33,7 @@ namespace codegen { TVM_REGISTER_API("codegen._Build") .set_body([](TVMArgs args, TVMRetValue *ret) { - if (args[0].IsNodeType()) { + if (args[0].IsObjectRef()) { *ret = Build({args[0]}, args[1]); } else { *ret = Build(args[0], args[1]); diff --git a/src/api/api_ir.cc b/src/api/api_ir.cc index b8ee1441fe12..9312c5532302 100644 --- a/src/api/api_ir.cc +++ b/src/api/api_ir.cc @@ -18,7 +18,6 @@ */ /*! 
- * Copyright (c) 2016 by Contributors * Implementation of API functions related to IR build * \file api_ir.cc */ diff --git a/src/api/api_lang.cc b/src/api/api_lang.cc index aa0ce47b4a37..f3d6c5f6ab62 100644 --- a/src/api/api_lang.cc +++ b/src/api/api_lang.cc @@ -57,25 +57,26 @@ TVM_REGISTER_API("_str") TVM_REGISTER_API("_Array") .set_body([](TVMArgs args, TVMRetValue* ret) { - std::vector > data; + std::vector data; for (int i = 0; i < args.size(); ++i) { if (args[i].type_code() != kNull) { - data.push_back(args[i].node_sptr()); + data.push_back(args[i].operator ObjectRef()); } else { - data.push_back(NodePtr(nullptr)); + data.push_back(ObjectRef(nullptr)); } } auto node = make_node(); node->data = std::move(data); - *ret = node; + *ret = runtime::ObjectRef(node); }); TVM_REGISTER_API("_ArrayGetItem") .set_body([](TVMArgs args, TVMRetValue* ret) { int64_t i = args[1]; - auto& sptr = args[0].node_sptr(); - CHECK(sptr->is_type()); - auto* n = static_cast(sptr.get()); + CHECK_EQ(args[0].type_code(), kObjectHandle); + Object* ptr = static_cast(args[0].value().v_handle); + CHECK(ptr->IsInstance()); + auto* n = static_cast(ptr); CHECK_LT(static_cast(i), n->data.size()) << "out of bound of array"; *ret = n->data[static_cast(i)]; @@ -83,10 +84,11 @@ TVM_REGISTER_API("_ArrayGetItem") TVM_REGISTER_API("_ArraySize") .set_body([](TVMArgs args, TVMRetValue* ret) { - auto& sptr = args[0].node_sptr(); - CHECK(sptr->is_type()); + CHECK_EQ(args[0].type_code(), kObjectHandle); + Object* ptr = static_cast(args[0].value().v_handle); + CHECK(ptr->IsInstance()); *ret = static_cast( - static_cast(sptr.get())->data.size()); + static_cast(ptr)->data.size()); }); TVM_REGISTER_API("_Map") @@ -98,10 +100,10 @@ TVM_REGISTER_API("_Map") for (int i = 0; i < args.num_args; i += 2) { CHECK(args[i].type_code() == kStr) << "key of str map need to be str"; - CHECK(args[i + 1].type_code() == kNodeHandle) + CHECK(args[i + 1].type_code() == kObjectHandle) << "value of the map to be NodeRef"; data.emplace(std::make_pair(args[i].operator std::string(), - args[i + 1].node_sptr())); + args[i + 1].operator ObjectRef())); } auto node = make_node(); node->data = std::move(data); @@ -110,12 +112,12 @@ TVM_REGISTER_API("_Map") // Container node. 
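These container entry points back tvm.convert on the Python side, and the NodeHandle-to-ObjectHandle migration is meant to leave that behavior untouched; a quick check one might run against a built tree (a sketch, assuming a standard build of this branch):

import tvm

arr = tvm.convert([1, 2, 3])             # exercises _Array / _ArrayGetItem / _ArraySize
assert len(arr) == 3 and arr[2].value == 3

m = tvm.convert({"k": tvm.convert(7)})   # string-keyed map, the StrMapNode path
assert m["k"].value == 7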
MapNode::ContainerType data; for (int i = 0; i < args.num_args; i += 2) { - CHECK(args[i].type_code() == kNodeHandle) + CHECK(args[i].type_code() == kObjectHandle) << "key of str map need to be str"; - CHECK(args[i + 1].type_code() == kNodeHandle) + CHECK(args[i + 1].type_code() == kObjectHandle) << "value of map to be NodeRef"; - data.emplace(std::make_pair(args[i].node_sptr(), - args[i + 1].node_sptr())); + data.emplace(std::make_pair(args[i].operator ObjectRef(), + args[i + 1].operator ObjectRef())); } auto node = make_node(); node->data = std::move(data); @@ -125,31 +127,33 @@ TVM_REGISTER_API("_Map") TVM_REGISTER_API("_MapSize") .set_body([](TVMArgs args, TVMRetValue* ret) { - auto& sptr = args[0].node_sptr(); - if (sptr->is_type()) { - auto* n = static_cast(sptr.get()); + CHECK_EQ(args[0].type_code(), kObjectHandle); + Object* ptr = static_cast(args[0].value().v_handle); + if (ptr->IsInstance()) { + auto* n = static_cast(ptr); *ret = static_cast(n->data.size()); } else { - CHECK(sptr->is_type()); - auto* n = static_cast(sptr.get()); + CHECK(ptr->IsInstance()); + auto* n = static_cast(ptr); *ret = static_cast(n->data.size()); } }); TVM_REGISTER_API("_MapGetItem") .set_body([](TVMArgs args, TVMRetValue* ret) { - CHECK(args[0].type_code() == kNodeHandle); - auto& sptr = args[0].node_sptr(); - if (sptr->is_type()) { - CHECK(args[1].type_code() == kNodeHandle); - auto* n = static_cast(sptr.get()); - auto it = n->data.find(args[1].node_sptr()); + CHECK_EQ(args[0].type_code(), kObjectHandle); + Object* ptr = static_cast(args[0].value().v_handle); + + if (ptr->IsInstance()) { + CHECK(args[1].type_code() == kObjectHandle); + auto* n = static_cast(ptr); + auto it = n->data.find(args[1].operator ObjectRef()); CHECK(it != n->data.end()) << "cannot find the corresponding key in the Map"; *ret = (*it).second; } else { - CHECK(sptr->is_type()); - auto* n = static_cast(sptr.get()); + CHECK(ptr->IsInstance()); + auto* n = static_cast(ptr); auto it = n->data.find(args[1].operator std::string()); CHECK(it != n->data.end()) << "cannot find the corresponding key in the Map"; @@ -159,16 +163,17 @@ TVM_REGISTER_API("_MapGetItem") TVM_REGISTER_API("_MapCount") .set_body([](TVMArgs args, TVMRetValue* ret) { - CHECK(args[0].type_code() == kNodeHandle); - auto& sptr = args[0].node_sptr(); - if (sptr->is_type()) { - auto* n = static_cast(sptr.get()); - CHECK(args[1].type_code() == kNodeHandle); + CHECK_EQ(args[0].type_code(), kObjectHandle); + Object* ptr = static_cast(args[0].value().v_handle); + + if (ptr->IsInstance()) { + auto* n = static_cast(ptr); + CHECK_EQ(args[0].type_code(), kObjectHandle); *ret = static_cast( - n->data.count(args[1].node_sptr())); + n->data.count(args[1].operator ObjectRef())); } else { - CHECK(sptr->is_type()); - auto* n = static_cast(sptr.get()); + CHECK(ptr->IsInstance()); + auto* n = static_cast(ptr); *ret = static_cast( n->data.count(args[1].operator std::string())); } @@ -176,9 +181,11 @@ TVM_REGISTER_API("_MapCount") TVM_REGISTER_API("_MapItems") .set_body([](TVMArgs args, TVMRetValue* ret) { - auto& sptr = args[0].node_sptr(); - if (sptr->is_type()) { - auto* n = static_cast(sptr.get()); + CHECK_EQ(args[0].type_code(), kObjectHandle); + Object* ptr = static_cast(args[0].value().v_handle); + + if (ptr->IsInstance()) { + auto* n = static_cast(ptr); auto rkvs = make_node(); for (const auto& kv : n->data) { rkvs->data.push_back(kv.first); @@ -186,10 +193,10 @@ TVM_REGISTER_API("_MapItems") } *ret = rkvs; } else { - auto* n = static_cast(sptr.get()); + auto* n = static_cast(ptr); 
auto rkvs = make_node(); for (const auto& kv : n->data) { - rkvs->data.push_back(ir::StringImm::make(kv.first).node_); + rkvs->data.push_back(ir::StringImm::make(kv.first)); rkvs->data.push_back(kv.second); } *ret = rkvs; @@ -426,7 +433,7 @@ TVM_REGISTER_API("_ScheduleCacheRead") TVM_REGISTER_API("_ScheduleCacheWrite") .set_body([](TVMArgs args, TVMRetValue* ret) { - if (args[1].IsNodeType()) { + if (args[1].IsObjectRef()) { *ret = args[0].operator Schedule() .cache_write(args[1].operator Tensor(), args[2]); } else { diff --git a/src/api/api_pass.cc b/src/api/api_pass.cc index 25cd5838385f..d7f621f3ade1 100644 --- a/src/api/api_pass.cc +++ b/src/api/api_pass.cc @@ -35,7 +35,7 @@ namespace ir { TVM_REGISTER_API("ir_pass.Simplify") .set_body([](TVMArgs args, TVMRetValue *ret) { - if (args[0].IsNodeType()) { + if (args[0].IsObjectRef()) { if (args.size() > 1) { *ret = Simplify(args[0].operator Stmt(), args[1]); } else { @@ -52,7 +52,7 @@ TVM_REGISTER_API("ir_pass.Simplify") TVM_REGISTER_API("ir_pass.CanonicalSimplify") .set_body([](TVMArgs args, TVMRetValue *ret) { - if (args[0].IsNodeType()) { + if (args[0].IsObjectRef()) { if (args.size() > 1) { *ret = CanonicalSimplify(args[0].operator Stmt(), args[1]); } else { @@ -69,7 +69,7 @@ TVM_REGISTER_API("ir_pass.CanonicalSimplify") TVM_REGISTER_API("ir_pass.Substitute") .set_body([](TVMArgs args, TVMRetValue *ret) { - if (args[0].IsNodeType()) { + if (args[0].IsObjectRef()) { *ret = Substitute(args[0].operator Stmt(), args[1].operator Map()); } else { *ret = Substitute(args[0].operator Expr(), args[1].operator Map()); @@ -78,7 +78,7 @@ TVM_REGISTER_API("ir_pass.Substitute") TVM_REGISTER_API("ir_pass.Equal") .set_body([](TVMArgs args, TVMRetValue *ret) { - if (args[0].IsNodeType()) { + if (args[0].IsObjectRef()) { *ret = Equal(args[0].operator Stmt(), args[1].operator Stmt()); } else { *ret = Equal(args[0].operator Expr(), args[1].operator Expr()); @@ -118,6 +118,14 @@ TVM_REGISTER_API("ir_pass.PostOrderVisit") }); }); +TVM_REGISTER_API("ir_pass.LowerStorageAccess") +.set_body([](TVMArgs args, TVMRetValue *ret) { + LoweredFunc f = args[0]; + auto n = make_node(*f.operator->()); + n->body = LowerStorageAccessInfo(f->body); + *ret = LoweredFunc(n); +}); + // make from two arguments #define REGISTER_PASS(PassName) \ TVM_REGISTER_API("ir_pass."#PassName) \ @@ -140,6 +148,7 @@ REGISTER_PASS(SplitHostDevice); REGISTER_PASS(StorageRewrite); REGISTER_PASS(CoProcSync); REGISTER_PASS(LowerStorageAccessInfo); +REGISTER_PASS(LowerDeviceStorageAccessInfo) REGISTER_PASS(InjectVirtualThread); REGISTER_PASS(InjectPrefetch); REGISTER_PASS(InjectDoubleBuffer); @@ -160,5 +169,7 @@ REGISTER_PASS(VerifyGPUCode); REGISTER_PASS(DecorateDeviceScope); REGISTER_PASS(InstrumentBoundCheckers); REGISTER_PASS(VerifyCompactBuffer); +REGISTER_PASS(HoistIfThenElse); +REGISTER_PASS(InferFragment) } // namespace ir } // namespace tvm diff --git a/src/api/api_schedule.cc b/src/api/api_schedule.cc index 177360bf2ebb..cf0e0f3c6b7a 100644 --- a/src/api/api_schedule.cc +++ b/src/api/api_schedule.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -18,7 +18,6 @@ */ /*! 
/*! - * Copyright (c) 2017 by Contributors * Implementation of API functions related to schedule pass. * \file api_schedule.cc */
diff --git a/src/api/dsl_api.cc b/src/api/dsl_api.cc
deleted file mode 100644
index 89e999f73edb..000000000000
--- a/src/api/dsl_api.cc
+++ /dev/null
@@ -1,230 +0,0 @@
-/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * Copyright (c) 2016 by Contributors - * Implementation of DSL API - * \file dsl_api.cc - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include "../runtime/dsl_api.h" - -namespace tvm { -namespace runtime { -/*! \brief entry to to easily hold returning information */ -struct TVMAPIThreadLocalEntry { - /*! \brief result holder for returning strings */ - std::vector<std::string> ret_vec_str; - /*! \brief result holder for returning string pointers */ - std::vector<const char*> ret_vec_charp; - /*! \brief result holder for retruning string */ - std::string ret_str; -}; - -/*! \brief Thread local store that can be used to hold return values.
*/ -typedef dmlc::ThreadLocalStore<TVMAPIThreadLocalEntry> TVMAPIThreadLocalStore; - -using TVMAPINode = NodePtr<Node>; - -struct APIAttrGetter : public AttrVisitor { - std::string skey; - TVMRetValue* ret; - bool found_ref_object{false}; - - void Visit(const char* key, double* value) final { - if (skey == key) *ret = value[0]; - } - void Visit(const char* key, int64_t* value) final { - if (skey == key) *ret = value[0]; - } - void Visit(const char* key, uint64_t* value) final { - CHECK_LE(value[0], static_cast<uint64_t>(std::numeric_limits<int64_t>::max())) - << "cannot return too big constant"; - if (skey == key) *ret = static_cast<int64_t>(value[0]); - } - void Visit(const char* key, int* value) final { - if (skey == key) *ret = static_cast<int64_t>(value[0]); - } - void Visit(const char* key, bool* value) final { - if (skey == key) *ret = static_cast<int64_t>(value[0]); - } - void Visit(const char* key, void** value) final { - if (skey == key) *ret = static_cast<void*>(value[0]); - } - void Visit(const char* key, Type* value) final { - if (skey == key) *ret = value[0]; - } - void Visit(const char* key, std::string* value) final { - if (skey == key) *ret = value[0]; - } - void Visit(const char* key, NodeRef* value) final { - if (skey == key) { - *ret = value[0]; - found_ref_object = true; - } - } - void Visit(const char* key, runtime::NDArray* value) final { - if (skey == key) { - *ret = value[0]; - found_ref_object = true; - } - } - void Visit(const char* key, runtime::ObjectRef* value) final { - if (skey == key) { - *ret = value[0]; - found_ref_object = true; - } - } -}; - -struct APIAttrDir : public AttrVisitor { - std::vector<std::string>* names; - - void Visit(const char* key, double* value) final { - names->push_back(key); - } - void Visit(const char* key, int64_t* value) final { - names->push_back(key); - } - void Visit(const char* key, uint64_t* value) final { - names->push_back(key); - } - void Visit(const char* key, bool* value) final { - names->push_back(key); - } - void Visit(const char* key, int* value) final { - names->push_back(key); - } - void Visit(const char* key, void** value) final { - names->push_back(key); - } - void Visit(const char* key, Type* value) final { - names->push_back(key); - } - void Visit(const char* key, std::string* value) final { - names->push_back(key); - } - void Visit(const char* key, NodeRef* value) final { - names->push_back(key); - } - void Visit(const char* key, runtime::NDArray* value) final { - names->push_back(key); - } - void Visit(const char* key, runtime::ObjectRef* value) final { - names->push_back(key); - } -}; - -class DSLAPIImpl : public DSLAPI { - public: - void NodeFree(NodeHandle handle) const final { - delete static_cast<TVMAPINode*>(handle); - } - void NodeTypeKey2Index(const char* type_key, - int* out_index) const final { - *out_index = static_cast<int>(Node::TypeKey2Index(type_key)); - } - void NodeGetTypeIndex(NodeHandle handle, - int* out_index) const final { - *out_index = static_cast<int>( - (*static_cast<TVMAPINode*>(handle))->type_index()); - } - void NodeGetAttr(NodeHandle handle, - const char* key, - TVMValue* ret_val, - int* ret_type_code, - int* ret_success) const final { - TVMRetValue rv; - APIAttrGetter getter; - TVMAPINode* tnode = static_cast<TVMAPINode*>(handle); - getter.skey = key; - getter.ret = &rv; - if (getter.skey == "type_key") { - ret_val->v_str = (*tnode)->type_key(); - *ret_type_code = kStr; - *ret_success = 1; - return; - } else if (!(*tnode)->is_type<DictAttrsNode>()) { - (*tnode)->VisitAttrs(&getter); - *ret_success = getter.found_ref_object || rv.type_code() != kNull; - } else { - // specially handle dict attr - DictAttrsNode* dnode =
static_cast<DictAttrsNode*>(tnode->get()); - auto it = dnode->dict.find(key); - if (it != dnode->dict.end()) { - *ret_success = 1; - rv = (*it).second; - } else { - *ret_success = 0; - } - } - if (*ret_success) { - if (rv.type_code() == kStr || - rv.type_code() == kTVMType) { - TVMAPIThreadLocalEntry *e = TVMAPIThreadLocalStore::Get(); - e->ret_str = rv.operator std::string(); - *ret_type_code = kStr; - ret_val->v_str = e->ret_str.c_str(); - } else { - rv.MoveToCHost(ret_val, ret_type_code); - } - } - } - void NodeListAttrNames(NodeHandle handle, - int *out_size, - const char*** out_array) const final { - TVMAPIThreadLocalEntry *ret = TVMAPIThreadLocalStore::Get(); - ret->ret_vec_str.clear(); - TVMAPINode* tnode = static_cast<TVMAPINode*>(handle); - APIAttrDir dir; - dir.names = &(ret->ret_vec_str); - - if (!(*tnode)->is_type<DictAttrsNode>()) { - (*tnode)->VisitAttrs(&dir); - } else { - // specially handle dict attr - DictAttrsNode* dnode = static_cast<DictAttrsNode*>(tnode->get()); - for (const auto& kv : dnode->dict) { - ret->ret_vec_str.push_back(kv.first); - } - } - ret->ret_vec_charp.clear(); - for (size_t i = 0; i < ret->ret_vec_str.size(); ++i) { - ret->ret_vec_charp.push_back(ret->ret_vec_str[i].c_str()); - } - *out_array = dmlc::BeginPtr(ret->ret_vec_charp); - *out_size = static_cast<int>(ret->ret_vec_str.size()); - } -}; - -TVM_REGISTER_GLOBAL("dsl_api.singleton") -.set_body([](TVMArgs args, TVMRetValue* rv) { - static DSLAPIImpl impl; - void* ptr = &impl; - *rv = ptr; - }); -} // namespace runtime -} // namespace tvm
diff --git a/src/arithmetic/analyzer.cc b/src/arithmetic/analyzer.cc
index acd964935c25..98e25742592d 100644
--- a/src/arithmetic/analyzer.cc
+++ b/src/arithmetic/analyzer.cc
@@ -36,9 +36,7 @@ Analyzer::Analyzer()
int_set(this) { } -void Analyzer::Bind(const VarExpr& v, const Expr& expr) { - Var var(v.node_); - +void Analyzer::Bind(const VarExpr& var, const Expr& expr) { Expr new_expr = expr; new_expr = this->canonical_simplify(new_expr); new_expr = this->rewrite_simplify(new_expr);
@@ -49,9 +47,8 @@ void Analyzer::Bind(const VarExpr& v, const Expr& expr) {
this->canonical_simplify.Update(var, new_expr); } -void Analyzer::Bind(const VarExpr& v, const Range& range) { +void Analyzer::Bind(const VarExpr& var, const Range& range) { CHECK(range.defined()); - Var var(v.node_); if (is_one(range->extent)) { this->Bind(var, range->min); } else {
diff --git a/src/arithmetic/bound_deducer.cc b/src/arithmetic/bound_deducer.cc
index 6f7b4d78da05..9c3a706e2ad0 100644
--- a/src/arithmetic/bound_deducer.cc
+++ b/src/arithmetic/bound_deducer.cc
@@ -53,17 +53,17 @@ class VariablePathFinder: public IRVisitor {
if (!found_) path_.pop_back(); } - std::vector<const Node*> path_; + std::vector<const Object*> path_; private: bool found_{false}; Expr target_; - std::unordered_set<const Node*> visited_; + std::unordered_set<const Object*> visited_; }; // get the path to the variable, // return empty vector to represent failure -std::vector<const Node*> GetPath(Expr target, Expr expr) { +std::vector<const Object*> GetPath(Expr target, Expr expr) { VariablePathFinder v(target); v.Visit(expr); return v.path_;
@@ -189,7 +189,7 @@ class BoundDeducer: public IRVisitor {
const std::unordered_map<const Variable*, IntSet>& hint_map_; const std::unordered_map<const Variable*, IntSet>& relax_map_; ExprIntSetMap expr_map_; - std::vector<const Node*> path_; + std::vector<const Object*> path_; size_t iter_{0}; // internal analzyer Analyzer analyzer_;
diff --git a/src/arithmetic/canonical_simplify.cc b/src/arithmetic/canonical_simplify.cc
index d80e4969d5c2..1b576a645824 100644
--- a/src/arithmetic/canonical_simplify.cc
+++ b/src/arithmetic/canonical_simplify.cc
@@ -43,6 +43,7 @@ class SplitExpr;
*/ class CanonicalExprNode :
public BaseExprNode { public: + virtual ~CanonicalExprNode() {} /*! * \brief Return the normal Expr that is equivalent to self. * \note Can mutate the internal data structure.
@@ -51,7 +52,7 @@ class CanonicalExprNode : public BaseExprNode {
virtual Expr Normalize() const = 0; // overrides - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { } static constexpr const char* _type_key = "arith.CanonicalExpr";
@@ -485,7 +486,7 @@ class CanonicalSimplifier::Impl : public RewriteSimplifier::Impl {
* \return Normalized expr. */ Expr Normalize(Expr expr) { - if (const auto* op = expr.as_derived<CanonicalExprNode>()) { + if (const auto* op = expr.as<CanonicalExprNode>()) { return op->Normalize(); } else { return expr;
@@ -503,7 +504,7 @@ class CanonicalSimplifier::Impl : public RewriteSimplifier::Impl {
if (const auto* op = expr.as<SumExprNode>()) { if (op->base == 0 && op->args.size() == 1) return op->args[0]; } - if (const auto* op = expr.as_derived<CanonicalExprNode>()) { + if (const auto* op = expr.as<CanonicalExprNode>()) { expr = op->Normalize(); } NodePtr<SplitExprNode> n = make_node<SplitExprNode>();
@@ -629,7 +630,7 @@ Mutate_(const Mul* op, const Expr& self) {
} if (const auto* bconst = b.as<IntImm>()) { if (a.as<SumExprNode>()) { - SumExpr ret(std::move(a.node_)); + SumExpr ret = Downcast<SumExpr>(std::move(a)); ret.CopyOnWrite()->MulToSelf(bconst->value); return std::move(ret); } else {
@@ -931,7 +932,7 @@ Mutate_(const Mod* op, const Expr& self) {
int64_t new_base = psum->base % cval; if (cbound->min_value >= 0 && cbound->min_value - psum->base + new_base >= 0) { - SumExpr sum_expr(std::move(a.node_)); + SumExpr sum_expr = Downcast<SumExpr>(a); sum_expr.CopyOnWrite()->base = new_base; return SplitModConst(ToSplitExpr(std::move(sum_expr)), cval, kTruncDiv); }
@@ -992,7 +993,7 @@ Mutate_(const FloorMod* op, const Expr& self) {
// Simplify the offset constant if necessary.
// floormod(x - 5, 3) => floormod(x + 1, 3) int64_t new_base = floormod(psum->base, cval); - SumExpr sum_expr(std::move(a.node_)); + SumExpr sum_expr = Downcast<SumExpr>(std::move(a)); sum_expr.CopyOnWrite()->base = new_base; return SplitModConst(ToSplitExpr(std::move(sum_expr)), cval, kFloorDiv); } else {
diff --git a/src/arithmetic/const_fold.h b/src/arithmetic/const_fold.h
index 57f90534fbb4..86f1927f2abe 100644
--- a/src/arithmetic/const_fold.h
+++ b/src/arithmetic/const_fold.h
@@ -28,6 +28,7 @@
#include #include #include +#include #include "int_operator.h" namespace tvm {
diff --git a/src/arithmetic/const_int_bound.cc b/src/arithmetic/const_int_bound.cc
index d5c012d302dc..168486ee0018 100644
--- a/src/arithmetic/const_int_bound.cc
+++ b/src/arithmetic/const_int_bound.cc
@@ -39,7 +39,7 @@ ConstIntBound::ConstIntBound(
auto node = make_node<ConstIntBoundNode>(); node->min_value = min_value; node->max_value = max_value; - node_ = std::move(node); + data_ = std::move(node); } inline void PrintBoundValue(std::ostream& os, int64_t val) {
diff --git a/src/arithmetic/detect_linear_equation.cc b/src/arithmetic/detect_linear_equation.cc
index 3c5f12a7379e..7da020efc42a 100644
--- a/src/arithmetic/detect_linear_equation.cc
+++ b/src/arithmetic/detect_linear_equation.cc
@@ -176,7 +176,7 @@ bool DetectClipBound(
if (const Variable* v = n.as<Variable>()) { if (bmap->count(v)) { if (flag == 0) { - var = Var(n.node_); + var = Downcast<Var>(n); flag = 1; } else if (flag == 1) { if (!var.same_as(n)) {
diff --git a/src/arithmetic/int_set.cc b/src/arithmetic/int_set.cc
index 0e24714daf1f..409477578758 100644
--- a/src/arithmetic/int_set.cc
+++ b/src/arithmetic/int_set.cc
@@ -40,7 +40,7 @@ IntervalSet::IntervalSet(Expr min_value, Expr max_value) {
auto node = make_node<IntervalSetNode>(); node->min_value = std::move(min_value); node->max_value = std::move(max_value); - node_ = std::move(node); + data_ = std::move(node); } IntervalSet MakeIntervalSet(Expr min_value, Expr max_value) {
@@ -506,7 +506,7 @@ class IntervalSetEvaluator :
} IntervalSet VisitExprDefault_(const Node* op) final { - DLOG(WARNING) << "cannot evaluate set type " << op->type_key(); + DLOG(WARNING) << "cannot evaluate set type " << op->GetTypeKey(); return IntervalSet::Everything(); }
@@ -807,6 +807,8 @@ IntSet EvalSet(Range r,
return EvalSet(r, ConvertDomMap(dom_map)); } +TVM_REGISTER_NODE_TYPE(IntervalSetNode); + TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) .set_dispatch<IntervalSetNode>([](const IntervalSetNode *op, IRPrinter *p) { p->stream << "IntervalSet"
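The FloorMod branch above rebases the additive constant modulo the divisor, per the comment `floormod(x - 5, 3) => floormod(x + 1, 3)`. A quick numeric check of that identity, with `floormod` defined by floor-division semantics as TVM uses them:

```cpp
// Verifies floormod(x + b, c) == floormod(x + floormod(b, c), c) over a range,
// e.g. floormod(x - 5, 3) == floormod(x + 1, 3).
#include <cassert>

long long floormod(long long a, long long b) {
  long long r = a % b;                                // C++ % truncates toward zero
  return (r != 0 && (r < 0) != (b < 0)) ? r + b : r;  // shift into [0, b) for b > 0
}

int main() {
  for (long long x = -10; x <= 10; ++x) {
    assert(floormod(x - 5, 3) == floormod(x + 1, 3));
  }
  return 0;
}
```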
diff --git a/src/arithmetic/int_set.h b/src/arithmetic/int_set.h
index 306361868759..831b44409030 100644
--- a/src/arithmetic/int_set.h
+++ b/src/arithmetic/int_set.h
@@ -47,7 +47,7 @@ class IntervalSetNode : public IntSetNode {
Expr max_value; // visitor overload. - void VisitAttrs(tvm::AttrVisitor* v) final { + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("min_value", &min_value); v->Visit("max_value", &max_value); }
diff --git a/src/arithmetic/ir_mutator_with_analyzer.cc b/src/arithmetic/ir_mutator_with_analyzer.cc
index 04e166ae52c0..cda9d585ace1 100644
--- a/src/arithmetic/ir_mutator_with_analyzer.cc
+++ b/src/arithmetic/ir_mutator_with_analyzer.cc
@@ -87,7 +87,7 @@ Stmt IRMutatorWithAnalyzer::
Mutate_(const AttrStmt* op, const Stmt& s) { if (op->attr_key == attr::thread_extent || op->attr_key == attr::virtual_thread) { - IterVar iv(op->node.node_); + IterVar iv = Downcast<IterVar>(op->node); CHECK_NE(iv->thread_tag.length(), 0U); analyzer_->Bind(iv->var, Range::make_by_min_extent(0, op->value));
diff --git a/src/arithmetic/ir_visitor_with_analyzer.h b/src/arithmetic/ir_visitor_with_analyzer.h
index 71eea50e4c72..918f2e89501f 100644
--- a/src/arithmetic/ir_visitor_with_analyzer.h
+++ b/src/arithmetic/ir_visitor_with_analyzer.h
@@ -47,7 +47,7 @@ class IRVisitorWithAnalyzer final : public IRVisitor {
void Visit_(const AttrStmt* op) { if (op->attr_key == attr::thread_extent || op->attr_key == attr::virtual_thread) { - IterVar iv(op->node.node_); + IterVar iv = Downcast<IterVar>(op->node); CHECK_NE(iv->thread_tag.length(), 0U); analyzer_.Bind(iv->var, Range::make_by_min_extent(0, op->value));
diff --git a/src/arithmetic/modular_set.cc b/src/arithmetic/modular_set.cc
index 08454dd0ef5a..9e363e7cf99a 100644
--- a/src/arithmetic/modular_set.cc
+++ b/src/arithmetic/modular_set.cc
@@ -41,7 +41,7 @@ ModularSet::ModularSet(int64_t coeff, int64_t base) {
node->coeff = coeff; node->base = base; // finish construction. - node_ = std::move(node); + data_ = std::move(node); } TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc
index 3f1c32243a23..cfcb0607858f 100644
--- a/src/codegen/build_module.cc
+++ b/src/codegen/build_module.cc
@@ -34,6 +34,7 @@ namespace tvm {
TVM_REGISTER_NODE_TYPE(TargetNode); +TVM_REGISTER_NODE_TYPE(GenericFuncNode); TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) .set_dispatch<TargetNode>([](const TargetNode *op, IRPrinter *p) {
@@ -51,9 +52,7 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
*/ Target CreateTarget(const std::string& target_name, const std::vector<std::string>& options) { - auto target = Target(make_node<TargetNode>()); - auto t = static_cast<TargetNode*>(target.node_.get()); - + auto t = make_node<TargetNode>(); t->target_name = target_name; std::string libs_flag = "-libs=";
@@ -137,7 +136,7 @@ Target CreateTarget(const std::string& target_name,
return target::stackvm(); } - return target; + return Target(t); } TVM_REGISTER_API("_TargetCreate")
@@ -423,7 +422,6 @@ Stmt BuildStmt(Schedule sch,
// Phase 2 stmt = ir::Simplify(stmt); - stmt = ir::LowerStorageAccessInfo(stmt); stmt = ir::RemoveNoOp(stmt); if (!(config->disable_select_rewriting))
@@ -518,6 +516,7 @@ Array<Array<LoweredFunc> > split_dev_host_funcs(const Array<LoweredFunc>& funcs,
for (size_t i = 0; i < fhost.size(); ++i) { auto func = fhost[i]; func = ir::BindDeviceType(func, target->device_type); + func = ir::LowerDeviceStorageAccessInfo(func); func = ir::LowerTVMBuiltin(func); fhost.Set(i, func); }
@@ -525,6 +524,7 @@ Array<Array<LoweredFunc> > split_dev_host_funcs(const Array<LoweredFunc>& funcs,
for (size_t i = 0; i < fhost.size(); ++i) { auto func = fhost[i]; func = ir::LowerIntrin(func, target_host->target_name); + func = ir::LowerDeviceStorageAccessInfo(func); func = ir::CombineContextCall(func); fhost.Set(i, func); }
@@ -674,7 +674,7 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
}); struct GenericFunc::Manager { - std::unordered_map<std::string, NodePtr<Node> > fmap; +
std::unordered_map<std::string, GenericFunc> fmap; // mutex std::mutex mutex;
@@ -694,10 +694,11 @@ GenericFunc GenericFunc::Get(const std::string& name) {
if (it == m->fmap.end()) { auto f = make_node<GenericFuncNode>(); f->name_ = name; - m->fmap[name] = f; - return GenericFunc(f); + auto gf = GenericFunc(f); + m->fmap[name] = gf; + return gf; } else { - return GenericFunc(it->second); + return it->second; } }
@@ -707,12 +708,12 @@ void GenericFunc::RegisterGenericFunc(GenericFunc func, const std::string& name)
auto it = m->fmap.find(name); CHECK(it == m->fmap.end()) << "GenericFunc already registered " << name; func->name_ = name; - m->fmap[name] = func.node_; + m->fmap[name] = func; } GenericFunc& GenericFunc::set_default(const PackedFunc value, - bool allow_override) { - auto node = static_cast<GenericFuncNode*>(node_.get()); + bool allow_override) { + auto node = static_cast<GenericFuncNode*>(operator->()); if (!allow_override) { CHECK(node->generic_func_ == nullptr) << "Generic function already registered for " << node->name_;
@@ -736,7 +737,7 @@ GenericFunc& GenericFunc::register_func(const std::vector<std::string>& tags,
} void GenericFunc::CallPacked(TVMArgs args, TVMRetValue* ret) const { - auto node = static_cast<const GenericFuncNode*>(node_.get()); + auto node = static_cast<const GenericFuncNode*>(get()); auto target = Target::Current(true); PackedFunc func;
diff --git a/src/codegen/codegen_c.cc b/src/codegen/codegen_c.cc
index ecf62ab0cfac..ab203f2aa28a 100644
--- a/src/codegen/codegen_c.cc
+++ b/src/codegen/codegen_c.cc
@@ -806,7 +806,7 @@ void CodeGenC::VisitStmt_(const Allocate* op) {
void CodeGenC::VisitStmt_(const AttrStmt* op) { if (op->attr_key == ir::attr::thread_extent) { - IterVar iv(op->node.node_); + IterVar iv = Downcast<IterVar>(op->node); if (iv->thread_tag.length() != 0) { if (!var_idmap_.count(iv->var.get())) { BindThreadIndex(iv);
diff --git a/src/codegen/codegen_cuda.cc b/src/codegen/codegen_cuda.cc
index 241310fd00d4..39a3ab7df0cc 100644
--- a/src/codegen/codegen_cuda.cc
+++ b/src/codegen/codegen_cuda.cc
@@ -24,6 +24,7 @@
#include #include #include +#include #include #include #include "codegen_cuda.h"
@@ -74,6 +75,10 @@ std::string CodeGenCUDA::Finish() {
decl_stream << "#include <math_constants.h>\n"; } + if (need_mma_h_) { + decl_stream << "#include <mma.h>\n"; + } + return CodeGenC::Finish(); }
@@ -102,14 +107,22 @@ void CodeGenCUDA::PrintType(Type t, std::ostream& os) {  // NOLINT(*)
bool fail = false; if (t.is_float()) { switch (t.bits()) { - case 16: os << "half"; + case 16: enable_fp16_ = true; + if (lanes == 1) { + os << "half"; + } else if (lanes <= 8) { + CHECK_EQ(lanes % 2, 0) << "only support even lane for half type"; + os << "float" << lanes / 2; + } else { + fail = true; + } break; case 32: os << "float"; break; case 64: os << "double"; break; default: fail = true; break; } - if (!fail && lanes == 1) return; + if (!fail && (lanes == 1 || t.bits() == 16)) return; if (!fail && (lanes >= 2 && lanes <= 4)) { os << lanes; return; }
@@ -290,6 +303,113 @@ void CodeGenCUDA::PrintStorageScope(
} } +void CodeGenCUDA::VisitExpr_(const Call *op, std::ostream& os) { + if (op->is_intrinsic(intrinsic::tvm_fill_fragment)) { + need_mma_h_ = true; + CHECK_EQ(op->args.size(), 6U); + os << "nvcuda::wmma::fill_fragment("; + this->PrintExpr(op->args[0], os); + os << "["; + this->PrintExpr(op->args[4], os); + os << "], "; + this->PrintExpr(op->args[5], os); + os << ")"; + } else if (op->is_intrinsic(intrinsic::tvm_load_matrix_sync)) { + need_mma_h_ = true; + CHECK_EQ(op->args.size(), 8U); + os << "nvcuda::wmma::load_matrix_sync("; + this->PrintExpr(op->args[0], os); + os << "["; + this->PrintExpr(op->args[4], os); + os << "], ";
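The wmma intrinsic printers being added in this hunk all reduce to the same streaming idiom: emit the callee name, then interleave printed operand expressions with punctuation. A standalone sketch of that idiom (`PrintCall` is a made-up helper, not a CodeGenC method):

```cpp
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Streams name(arg0, arg1, ...) the way the codegen above does, closing the
// parenthesis after the last operand instead of emitting a trailing comma.
void PrintCall(const std::string& name, const std::vector<std::string>& args,
               std::ostream& os) {
  os << name << "(";
  for (size_t i = 0; i < args.size(); ++i) {
    os << args[i] << (i + 1 < args.size() ? ", " : ")");
  }
}

int main() {
  std::ostringstream os;
  PrintCall("nvcuda::wmma::fill_fragment", {"a[0]", "0.000000e+00f"}, os);
  std::cout << os.str() << "\n";  // nvcuda::wmma::fill_fragment(a[0], 0.000000e+00f)
}
```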
+ this->PrintExpr(op->args[5], os); + os << ", "; + this->PrintExpr(op->args[6], os); + os << ")"; + } else if (op->is_intrinsic(intrinsic::tvm_store_matrix_sync)) { + need_mma_h_ = true; + CHECK_EQ(op->args.size(), 8U); + os << "nvcuda::wmma::store_matrix_sync("; + this->PrintExpr(op->args[5], os); + os << ", "; + this->PrintExpr(op->args[0], os); + os << "["; + this->PrintExpr(op->args[4], os); + os << "], "; + this->PrintExpr(op->args[6], os); + if (const StringImm *str = op->args[7].as<StringImm>()) { + os << ", nvcuda::wmma::mem_" << str->value; + } else { + LOG(FATAL) << "Invalid parameters"; + } + os << ")"; + } else if (op->is_intrinsic(intrinsic::tvm_mma_sync)) { + need_mma_h_ = true; + CHECK_EQ(op->args.size(), 8U); + os << "nvcuda::wmma::mma_sync("; + for (int i = 0; i < 4; ++i) { + this->PrintExpr(op->args[i * 2], os); + os << "["; + this->PrintExpr(op->args[i * 2 + 1], os); + os << "]" << ((i < 3) ? ", ": ")"); + } + } else { + CodeGenC::VisitExpr_(op, os); + } +} + +void CodeGenCUDA::VisitStmt_(const AttrStmt* op) { + if (op->attr_key == attr::fragment_shape) { + const Variable* buffer = op->node.as<Variable>(); + const StringImm* shape_str = op->value.as<StringImm>(); + fragment_shapes[buffer] = shape_str->value; + } else if (op->attr_key == attr::fragment_layout) { + const Variable* buffer = op->node.as<Variable>(); + const StringImm* layout_str = op->value.as<StringImm>(); + fragment_layouts[buffer] = layout_str->value; + } + CodeGenC::VisitStmt_(op); +} + +void CodeGenCUDA::VisitStmt_(const Allocate* op) { + CHECK(!is_zero(op->condition)); + std::string vid = AllocVarID(op->buffer_var.get()); + if (op->new_expr.defined()) { + // Prefer global static allocation for the program + CHECK_EQ(op->free_function, "nop"); + std::string new_data = PrintExpr(op->new_expr); + this->PrintIndent(); + PrintType(op->type, stream); + stream << "* "<< vid << '=' << new_data << ";\n"; + } else { + this->PrintIndent(); + int32_t constant_size = op->constant_allocation_size(); + CHECK_GT(constant_size, 0) + << "Can only handle constant size stack allocation for now"; + const Variable* buffer = op->buffer_var.as<Variable>(); + std::string scope = alloc_storage_scope_.at(buffer); + if (scope.find("wmma.") == 0) { + if (scope == "wmma.matrix_a" || scope == "wmma.matrix_b") { + CHECK(op->type == Float(16) || op->type == Int(8) || op->type == UInt(8)) + << "Matrix_a and matrix_b only support half or char or unsigned char type for now"; + } else { + CHECK(op->type == Float(16) || op->type == Float(32) || op->type == Int(32)) + << "Accumulator only support half, float and int type for now"; + } + constant_size = GetWmmaFragmentSize(scope, buffer, constant_size); + PrintWmmaScope(scope, op->type, buffer, stream); + } else { + PrintStorageScope(scope, stream); + stream << ' '; + PrintType(op->type, stream); + } + stream << ' '<< vid << '[' + << constant_size << "];\n"; + } + RegisterHandleType(op->buffer_var.get(), op->type); + this->PrintStmt(op->body); +} + void CodeGenCUDA::VisitStmt_(const Evaluate *op) { if (is_const(op->value)) return; const Call* call = op->value.as<Call>();
@@ -392,5 +512,49 @@ void CodeGenCUDA::VisitExpr_(const FloatImm *op, std::ostream& os) { // NOLINT(*
PrintConst(op, os, this); } +void CodeGenCUDA::PrintWmmaScope(const std::string &scope, Type t, + const Variable* variable, std::ostream &os) { + std::stringstream type; + PrintType(t, type); + std::string shape_str = fragment_shapes[variable]; + if (scope == "wmma.matrix_a") { + need_mma_h_ = true; + std::string layout_str = fragment_layouts[variable]; + os << "nvcuda::wmma::fragment<nvcuda::wmma::matrix_a, " << shape_str << ", " << type.str() << ", nvcuda::wmma::" << layout_str << ">"; + }
else if (scope == "wmma.matrix_b") { + need_mma_h_ = true; + std::string layout_str = fragment_layouts[variable]; + os << "nvcuda::wmma::fragment<nvcuda::wmma::matrix_b, " << shape_str << ", " << type.str() << ", nvcuda::wmma::" << layout_str << ">"; + } else if (scope == "wmma.accumulator") { + need_mma_h_ = true; + os << "nvcuda::wmma::fragment<nvcuda::wmma::accumulator, " << shape_str << ", " << type.str() << ">"; + } +} + +int32_t CodeGenCUDA::GetWmmaFragmentSize(const std::string &scope, + const Variable* variable, int32_t size) { + std::string shape_str = fragment_shapes[variable]; + size_t m, n, k; + size_t last_pos = 0, pos = 0; + pos = shape_str.find(", ", last_pos); + m = std::stoi(shape_str.substr(last_pos, pos - last_pos)); + last_pos = pos + 2; + pos = shape_str.find(", ", last_pos); + n = std::stoi(shape_str.substr(last_pos, pos - last_pos)); + last_pos = pos + 2; + k = std::stoi(shape_str.substr(last_pos, shape_str.length() - last_pos)); + if (scope == "wmma.matrix_a") { + return size / m / k; + } else if (scope == "wmma.matrix_b") { + return size / n / k; + } else if (scope == "wmma.accumulator") { + return size / m / n; + } + return 0; +} + } // namespace codegen } // namespace tvm
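`GetWmmaFragmentSize` above recovers m, n and k from a fragment shape string of the form `"16, 16, 16"`. A standalone check of that parsing (`ParseShape` is a hypothetical helper mirroring the same find/substr logic):

```cpp
#include <cassert>
#include <string>

// Splits "m, n, k" on the ", " delimiter, exactly as GetWmmaFragmentSize does.
void ParseShape(const std::string& s, size_t* m, size_t* n, size_t* k) {
  size_t last = 0, pos = s.find(", ", last);
  *m = std::stoul(s.substr(last, pos - last));
  last = pos + 2;
  pos = s.find(", ", last);
  *n = std::stoul(s.substr(last, pos - last));
  last = pos + 2;
  *k = std::stoul(s.substr(last));
}

int main() {
  size_t m, n, k;
  ParseShape("16, 16, 16", &m, &n, &k);
  assert(m == 16 && n == 16 && k == 16);
  // e.g. a wmma.matrix_a allocation of 16*16 half elements then holds
  // size / m / k == 1 fragment.
  return 0;
}
```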
diff --git a/src/codegen/codegen_cuda.h b/src/codegen/codegen_cuda.h
index 61c6fa3a5170..53e7db45efc6 100644
--- a/src/codegen/codegen_cuda.h
+++ b/src/codegen/codegen_cuda.h
@@ -28,6 +28,7 @@
#include #include #include +#include <unordered_map> #include "codegen_c.h" namespace tvm {
@@ -40,7 +41,7 @@ class CodeGenCUDA final : public CodeGenC {
void AddFunction(LoweredFunc f); std::string Finish(); bool need_include_path() { - return (enable_fp16_ || enable_int8_ || need_math_constants_h_); + return (enable_fp16_ || enable_int8_ || need_math_constants_h_ || need_mma_h_); } // override behavior void VisitStmt_(const ir::For* op) final;
@@ -60,7 +61,10 @@ class CodeGenCUDA final : public CodeGenC {
void VisitExpr_(const Shuffle* op, std::ostream& os) final; // NOLINT(*) void VisitExpr_(const Broadcast* op, std::ostream& os) final; // NOLINT(*) void VisitExpr_(const FloatImm *op, std::ostream& os) final; + void VisitExpr_(const Call *op, std::ostream& os) final; void VisitStmt_(const Evaluate *op) final; + void VisitStmt_(const Allocate *op) final; + void VisitStmt_(const AttrStmt *op) final; private: // Whether global barrier is needed.
@@ -75,7 +79,14 @@ class CodeGenCUDA final : public CodeGenC {
bool enable_int8_{false}; // whether need math_constants.h bool need_math_constants_h_{false}; + // whether need mma.h + bool need_mma_h_{false}; + + std::unordered_map<const Variable*, std::string> fragment_shapes; + std::unordered_map<const Variable*, std::string> fragment_layouts; friend void PrintConst(const FloatImm* op, std::ostream& os, CodeGenCUDA* p); + void PrintWmmaScope(const std::string& scope, Type t, const Variable* variable, std::ostream& os); + int32_t GetWmmaFragmentSize(const std::string &scope, const Variable* variable, int32_t size); };
} // namespace codegen
diff --git a/src/codegen/codegen_opencl.cc b/src/codegen/codegen_opencl.cc
index 0b33bf43c151..3120bb543aea 100644
--- a/src/codegen/codegen_opencl.cc
+++ b/src/codegen/codegen_opencl.cc
@@ -22,6 +22,7 @@
* \file codegen_opencl.cc */ #include +#include #include #include "codegen_opencl.h"
diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc
index d009290bb2fe..de54e242ff40 100644
--- a/src/codegen/llvm/codegen_llvm.cc
+++ b/src/codegen/llvm/codegen_llvm.cc
@@ -1173,7 +1173,7 @@ void CodeGenLLVM::VisitStmt_(const Allocate* op) {
void CodeGenLLVM::VisitStmt_(const AttrStmt* op) { if (op->attr_key == attr::thread_extent) { - IterVar iv(op->node.node_); + IterVar iv = Downcast<IterVar>(op->node); if (iv->thread_tag.length() != 0) { if (!var_map_.count(iv->var.get())) { var_map_[iv->var.get()] = GetThreadIndex(iv);
diff --git a/src/codegen/spirv/codegen_spirv.cc b/src/codegen/spirv/codegen_spirv.cc
index 7caf3a258b6f..6a3b0571c9ab 100644
--- a/src/codegen/spirv/codegen_spirv.cc
+++ b/src/codegen/spirv/codegen_spirv.cc
@@ -606,7 +606,7 @@ void CodeGenSPIRV::VisitStmt_(const Allocate* op) {
void CodeGenSPIRV::VisitStmt_(const AttrStmt* op) { if (op->attr_key == attr::thread_extent) { - IterVar iv(op->node.node_); + IterVar iv = Downcast<IterVar>(op->node); if (iv->thread_tag.length() != 0) { if (!var_map_.count(iv->var.get())) { var_map_[iv->var.get()] = GetThreadIndex(iv, op->value);
diff --git a/src/codegen/spirv/intrin_rule_spirv.cc b/src/codegen/spirv/intrin_rule_spirv.cc
index a046cc4f458c..fca9aa203f80 100644
--- a/src/codegen/spirv/intrin_rule_spirv.cc
+++ b/src/codegen/spirv/intrin_rule_spirv.cc
@@ -6,9 +6,9 @@
* to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -18,9 +18,9 @@
*/ /*!
- * Copyright (c) 2017 by Contributors * \file intrin_rule_spirv.cc */ +#include #include #include #include
diff --git a/src/common/socket.h b/src/common/socket.h
index 2a2d9166a134..39bcff863c10 100644
--- a/src/common/socket.h
+++ b/src/common/socket.h
@@ -27,8 +27,10 @@
#define TVM_COMMON_SOCKET_H_ #if defined(_WIN32) +#define NOMINMAX #include <winsock2.h> #include <ws2tcpip.h> +#undef NOMINMAX using ssize_t = int; #ifdef _MSC_VER #pragma comment(lib, "Ws2_32.lib")
diff --git a/src/contrib/hybrid/codegen_hybrid.cc b/src/contrib/hybrid/codegen_hybrid.cc
index 54616adc214e..778b6b1a7811 100644
--- a/src/contrib/hybrid/codegen_hybrid.cc
+++ b/src/contrib/hybrid/codegen_hybrid.cc
@@ -300,7 +300,7 @@ void CodeGenHybrid::VisitStmt_(const AttrStmt* op) {
PrintStmt(op->body); indent_ -= tab_; } else if (op->attr_key == ir::attr::realize_scope) { - auto v = FunctionRef(op->node.node_); + auto v = Downcast<FunctionRef>(op->node); alloc_storage_scope_[v] = op->value.as<StringImm>()->value; PrintStmt(op->body); } else {
@@ -408,7 +408,7 @@ void CodeGenHybrid::PrintIndent() {
std::string CodeGenHybrid::GetVarID(const Variable *v) { if (binds_.count(v)) return binds_[v]; - auto key = std::make_pair(v->GetNodePtr().get(), 0); + auto key = std::make_pair(static_cast(v), 0); if (id_map_.count(key)) { return id_map_[key]; }
diff --git a/src/contrib/hybrid/codegen_hybrid.h b/src/contrib/hybrid/codegen_hybrid.h
index 498838fc908f..866756996f8d 100644
--- a/src/contrib/hybrid/codegen_hybrid.h
+++ b/src/contrib/hybrid/codegen_hybrid.h
@@ -18,7 +18,6 @@
*/ /*! - * Copyright (c) 2019 by Contributors * \file codegen_hybrid.h * \brief Common utilities to generated C style code. */
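The src/common/socket.h hunk above wraps the Windows headers in NOMINMAX because `<windows.h>` and friends otherwise define `min`/`max` macros that break `std::min`, `std::max`, and `std::numeric_limits<T>::max()`. The same guard in isolation:

```cpp
#if defined(_WIN32)
#define NOMINMAX          // stop windows headers from defining min()/max() macros
#include <windows.h>
#undef NOMINMAX
#endif
#include <algorithm>
#include <iostream>

int main() {
  // With the macros suppressed, the std versions resolve normally on Windows too.
  std::cout << std::min(1, 2) << "\n";  // prints 1
}
```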
diff --git a/src/contrib/sort/sort.cc b/src/contrib/sort/sort.cc
index a87ce07cb602..0ccaee515acb 100644
--- a/src/contrib/sort/sort.cc
+++ b/src/contrib/sort/sort.cc
@@ -75,7 +75,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.sort.argsort_nms")
// Currently only supports input dtype to be float32. CHECK_EQ(dtype.code, 2) << "Currently only supports input dtype " "to be float."; -#if (__ARM_FP16_FORMAT_IEEE != 1) +#if (__ARM_FEATURE_FP16_SCALAR_ARITHMETIC != 1) CHECK_EQ(dtype.bits, 32) << "Currently only supports input dtype " "to be float32."; #endif
@@ -100,23 +100,23 @@ TVM_REGISTER_GLOBAL("tvm.contrib.sort.argsort_nms")
sorter.emplace_back(std::make_pair(k, *(data_ptr + full_idx))); } if (is_ascend) { -#if (__ARM_FP16_FORMAT_IEEE == 1) +#if (__ARM_FEATURE_FP16_SCALAR_ARITHMETIC == 1) if (dtype.bits == 16) { std::stable_sort(sorter.begin(), sorter.end(), CompareAscend<__fp16>); } else { #endif std::stable_sort(sorter.begin(), sorter.end(), CompareAscend<float>); -#if (__ARM_FP16_FORMAT_IEEE == 1) +#if (__ARM_FEATURE_FP16_SCALAR_ARITHMETIC == 1) } #endif } else { -#if (__ARM_FP16_FORMAT_IEEE == 1) +#if (__ARM_FEATURE_FP16_SCALAR_ARITHMETIC == 1) if (dtype.bits == 16) { std::stable_sort(sorter.begin(), sorter.end(), CompareDescend<__fp16>); } else { #endif std::stable_sort(sorter.begin(), sorter.end(), CompareDescend<float>); -#if (__ARM_FP16_FORMAT_IEEE == 1) +#if (__ARM_FEATURE_FP16_SCALAR_ARITHMETIC == 1) } #endif }
@@ -210,7 +210,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.sort.argsort")
} else { LOG(FATAL) << "Unsupported output dtype: " << out_dtype; } -#if (__ARM_FP16_FORMAT_IEEE == 1) +#if (__ARM_FEATURE_FP16_SCALAR_ARITHMETIC == 1) } else if (data_dtype == "float16") { if (out_dtype == "float16") { argsort<__fp16, __fp16>(input, output, axis, is_ascend);
diff --git a/src/lang/api_registry.cc b/src/lang/api_registry.cc
index e041f3a2dd2d..cd3d43b7dcf3 100644
--- a/src/lang/api_registry.cc
+++ b/src/lang/api_registry.cc
@@ -6,9 +6,9 @@
* to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -62,7 +62,7 @@ TVM_REGISTER_API("_EnvFuncGetPackedFunc")
TVM_REGISTER_NODE_TYPE(EnvFuncNode) .set_creator(CreateEnvNode) -.set_global_key([](const Node* n) { +.set_global_key([](const Object* n) { return static_cast<const EnvFuncNode*>(n)->name; });
diff --git a/src/lang/attr_functor.h b/src/lang/attr_functor.h
index 995dfb392e87..b9391e4895b9 100644
--- a/src/lang/attr_functor.h
+++ b/src/lang/attr_functor.h
@@ -44,17 +44,17 @@ class AttrFunctor;
#define ATTR_FUNCTOR_DISPATCH(OP) \ vtable.template set_dispatch<OP>( \ - [](const NodeRef& n, TSelf* self, Args... args) { \ - return self->VisitAttr_(static_cast<const OP*>(n.node_.get()), \ + [](const ObjectRef& n, TSelf* self, Args... args) { \ + return self->VisitAttr_(static_cast<const OP*>(n.get()), \ std::forward<Args>(args)...); \ }); \ // A functor for common attribute information. template<typename R, typename ...Args> -class AttrFunctor<R(const NodeRef& n, Args...)> { +class AttrFunctor<R(const ObjectRef& n, Args...)> { private: - using TSelf = AttrFunctor<R(const NodeRef& n, Args...)>; - using FType = tvm::IRFunctor<R(const NodeRef& n, TSelf* self, Args...)>; + using TSelf = AttrFunctor<R(const ObjectRef& n, Args...)>; + using FType = tvm::IRFunctor<R(const ObjectRef& n, TSelf* self, Args...)>; public: /*! \brief the result type of this functor */
@@ -65,7 +65,7 @@ class AttrFunctor {
* \param args Additional arguments. * \return The result of the call */ - virtual R VisitAttr(const NodeRef& n, Args... args) { + virtual R VisitAttr(const ObjectRef& n, Args...
args) { static FType vtable = InitVTable(); if (vtable.can_dispatch(n)) { return vtable(n, this, std::forward<Args>(args)...);
@@ -73,7 +73,7 @@ class AttrFunctor {
return VisitAttrDefault_(n.get(), std::forward<Args>(args)...); } } - virtual R VisitAttrDefault_(const Node* node, Args... args) = 0; + virtual R VisitAttrDefault_(const Object* node, Args... args) = 0; virtual R VisitAttr_(const ArrayNode* op, Args... args) ATTR_FUNCTOR_DEFAULT; virtual R VisitAttr_(const StrMapNode* op, Args... args) ATTR_FUNCTOR_DEFAULT; virtual R VisitAttr_(const ir::IntImm* op, Args... args) ATTR_FUNCTOR_DEFAULT;
@@ -143,60 +143,60 @@ class AttrFunctor {
}; class AttrsEqualHandler : - protected AttrFunctor<bool(const NodeRef&, const NodeRef&)> { + protected AttrFunctor<bool(const ObjectRef&, const ObjectRef&)> { public: /*! * \brief Check if lhs equals rhs * \param lhs The left operand. * \param rhs The right operand. */ - bool Equal(const NodeRef& lhs, const NodeRef& rhs); + bool Equal(const ObjectRef& lhs, const ObjectRef& rhs); protected: - bool VisitAttrDefault_(const Node* lhs, const NodeRef& other) final; - bool VisitAttr_(const ArrayNode* lhs, const NodeRef& other) final; - bool VisitAttr_(const StrMapNode* lhs, const NodeRef& other) final; - bool VisitAttr_(const ir::IntImm* lhs, const NodeRef& other) final; - bool VisitAttr_(const ir::UIntImm* lhs, const NodeRef& other) final; - bool VisitAttr_(const ir::FloatImm* lhs, const NodeRef& other) final; - bool VisitAttr_(const ir::StringImm* lhs, const NodeRef& other) final; - bool VisitAttr_(const ir::Add* lhs, const NodeRef& other) final; - bool VisitAttr_(const ir::Sub* lhs, const NodeRef& other) final; - bool VisitAttr_(const ir::Mul* lhs, const NodeRef& other) final; - bool VisitAttr_(const ir::Div* lhs, const NodeRef& other) final; - bool VisitAttr_(const ir::Mod* lhs, const NodeRef& other) final; - bool VisitAttr_(const ir::FloorDiv* lhs, const NodeRef& other) final; - bool VisitAttr_(const ir::FloorMod* lhs, const NodeRef& other) final; - bool VisitAttr_(const ir::Min* lhs, const NodeRef& other) final; - bool VisitAttr_(const ir::Max* lhs, const NodeRef& other) final; - bool VisitAttr_(const ir::GE* lhs, const NodeRef& other) final; - bool VisitAttr_(const ir::GT* lhs, const NodeRef& other) final; - bool VisitAttr_(const ir::LT* lhs, const NodeRef& other) final; - bool VisitAttr_(const ir::LE* lhs, const NodeRef& other) final; - bool VisitAttr_(const ir::EQ* lhs, const NodeRef& other) final; - bool VisitAttr_(const ir::NE* lhs, const NodeRef& other) final; - bool VisitAttr_(const ir::And* lhs, const NodeRef& other) final; - bool VisitAttr_(const ir::Or* lhs, const NodeRef& other) final; - bool VisitAttr_(const ir::Not* lhs, const NodeRef& other) final; - bool VisitAttr_(const ir::Cast* lhs, const NodeRef& other) final; - bool VisitAttr_(const ir::Call* lhs, const NodeRef& other) final; - bool VisitAttr_(const ir::Select* lhs, const NodeRef& other) final; + bool VisitAttrDefault_(const Object* lhs, const ObjectRef& other) final; + bool VisitAttr_(const ArrayNode* lhs, const ObjectRef& other) final; + bool VisitAttr_(const StrMapNode* lhs, const ObjectRef& other) final; + bool VisitAttr_(const ir::IntImm* lhs, const ObjectRef& other) final; + bool VisitAttr_(const ir::UIntImm* lhs, const ObjectRef& other) final; + bool VisitAttr_(const ir::FloatImm* lhs, const ObjectRef& other) final; + bool VisitAttr_(const ir::StringImm* lhs, const ObjectRef& other) final; + bool VisitAttr_(const ir::Add* lhs, const ObjectRef& other) final; + bool VisitAttr_(const ir::Sub* lhs, const ObjectRef& other) final; + bool VisitAttr_(const ir::Mul* lhs, const
ObjectRef& other) final; + bool VisitAttr_(const ir::Div* lhs, const ObjectRef& other) final; + bool VisitAttr_(const ir::Mod* lhs, const ObjectRef& other) final; + bool VisitAttr_(const ir::FloorDiv* lhs, const ObjectRef& other) final; + bool VisitAttr_(const ir::FloorMod* lhs, const ObjectRef& other) final; + bool VisitAttr_(const ir::Min* lhs, const ObjectRef& other) final; + bool VisitAttr_(const ir::Max* lhs, const ObjectRef& other) final; + bool VisitAttr_(const ir::GE* lhs, const ObjectRef& other) final; + bool VisitAttr_(const ir::GT* lhs, const ObjectRef& other) final; + bool VisitAttr_(const ir::LT* lhs, const ObjectRef& other) final; + bool VisitAttr_(const ir::LE* lhs, const ObjectRef& other) final; + bool VisitAttr_(const ir::EQ* lhs, const ObjectRef& other) final; + bool VisitAttr_(const ir::NE* lhs, const ObjectRef& other) final; + bool VisitAttr_(const ir::And* lhs, const ObjectRef& other) final; + bool VisitAttr_(const ir::Or* lhs, const ObjectRef& other) final; + bool VisitAttr_(const ir::Not* lhs, const ObjectRef& other) final; + bool VisitAttr_(const ir::Cast* lhs, const ObjectRef& other) final; + bool VisitAttr_(const ir::Call* lhs, const ObjectRef& other) final; + bool VisitAttr_(const ir::Select* lhs, const ObjectRef& other) final; }; class AttrsHashHandler : - protected AttrFunctor<size_t(const NodeRef&)> { + protected AttrFunctor<size_t(const ObjectRef&)> { public: /*! * \brief Get hash value of node * \param node The node to be hashed. */ - size_t Hash(const NodeRef& node) { + size_t Hash(const ObjectRef& node) { if (!node.defined()) return 0; return this->VisitAttr(node); } protected: - size_t VisitAttrDefault_(const Node* lhs) final; + size_t VisitAttrDefault_(const Object* lhs) final; size_t VisitAttr_(const ir::IntImm* lhs) final; size_t VisitAttr_(const ir::UIntImm* lhs) final; size_t VisitAttr_(const ir::FloatImm* lhs) final;
diff --git a/src/lang/attrs.cc b/src/lang/attrs.cc
index c5b14ac577ec..a299e17996e0 100644
--- a/src/lang/attrs.cc
+++ b/src/lang/attrs.cc
@@ -40,7 +40,7 @@ void DictAttrsNode::InitByPackedArgs(
for (int i = 0; i < args.size(); i += 2) { std::string key = args[i]; runtime::TVMArgValue val = args[i + 1]; - if (val.type_code() == kNodeHandle) { + if (val.type_code() == kObjectHandle) { dict.Set(key, val.operator NodeRef()); } else if (val.type_code() == kStr) { dict.Set(key, Expr(val.operator std::string()));
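The AttrFunctor/ATTR_FUNCTOR_DISPATCH machinery that this file now instantiates over ObjectRef is, at heart, a per-type handler table: each concrete node type registers a trampoline that downcasts the base reference and forwards to the typed overload. A standalone sketch using std::type_index where TVM uses its own type indices (`AttrPrinter` and `Register` are illustrative names, not TVM APIs):

```cpp
#include <functional>
#include <iostream>
#include <string>
#include <typeindex>
#include <unordered_map>

struct Object { virtual ~Object() = default; };
struct IntImm : Object { long value; };
struct StringImm : Object { std::string value; };

class AttrPrinter {
 public:
  AttrPrinter() {
    Register<IntImm>([](const IntImm* op) { std::cout << op->value << "\n"; });
    Register<StringImm>([](const StringImm* op) { std::cout << op->value << "\n"; });
  }
  // Single entry point, like VisitAttr: look up the handler by dynamic type.
  void Visit(const Object& n) {
    auto it = vtable_.find(std::type_index(typeid(n)));
    if (it != vtable_.end()) it->second(&n);
  }

 private:
  template <typename OP>
  void Register(std::function<void(const OP*)> f) {
    // The trampoline downcasts, mirroring static_cast<const OP*>(n.get())
    // in ATTR_FUNCTOR_DISPATCH.
    vtable_[std::type_index(typeid(OP))] = [f](const Object* n) {
      f(static_cast<const OP*>(n));
    };
  }
  std::unordered_map<std::type_index, std::function<void(const Object*)>> vtable_;
};

int main() {
  IntImm i;
  i.value = 7;
  AttrPrinter().Visit(i);  // prints 7
}
```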
@@ -72,14 +72,14 @@ TVM_REGISTER_NODE_TYPE(AttrFieldInfoNode);
using namespace ir; // Equal handler. -bool AttrsEqualHandler::Equal(const NodeRef& lhs, const NodeRef& rhs) { +bool AttrsEqualHandler::Equal(const ObjectRef& lhs, const ObjectRef& rhs) { if (lhs.same_as(rhs)) return true; if (!lhs.defined() || !rhs.defined()) return false; return this->VisitAttr(lhs, rhs); } -bool AttrsEqualHandler::VisitAttrDefault_(const Node* lhs, const NodeRef& other) { - if (lhs->derived_from<BaseAttrsNode>()) { +bool AttrsEqualHandler::VisitAttrDefault_(const Object* lhs, const ObjectRef& other) { + if (lhs->IsInstance<BaseAttrsNode>()) { AttrsEqual equal; equal.handler_ = this; return static_cast<const BaseAttrsNode*>(lhs)->ContentEqual(
@@ -88,58 +88,58 @@ bool AttrsEqualHandler::VisitAttrDefault_(const Node* lhs, const NodeRef& other)
return lhs == other.get(); } -bool AttrsEqualHandler::VisitAttr_(const IntImm* lhs, const NodeRef& other) { +bool AttrsEqualHandler::VisitAttr_(const IntImm* lhs, const ObjectRef& other) { if (const auto* rhs = other.as<IntImm>()) { return lhs->value == rhs->value; } return false; } -bool AttrsEqualHandler::VisitAttr_(const UIntImm* lhs, const NodeRef& other) { +bool AttrsEqualHandler::VisitAttr_(const UIntImm* lhs, const ObjectRef& other) { if (const auto* rhs = other.as<UIntImm>()) { return lhs->value == rhs->value; } return false; } -bool AttrsEqualHandler::VisitAttr_(const FloatImm* lhs, const NodeRef& other) { +bool AttrsEqualHandler::VisitAttr_(const FloatImm* lhs, const ObjectRef& other) { if (const auto* rhs = other.as<FloatImm>()) { return lhs->value == rhs->value; } return false; } -bool AttrsEqualHandler::VisitAttr_(const StringImm* lhs, const NodeRef& other) { +bool AttrsEqualHandler::VisitAttr_(const StringImm* lhs, const ObjectRef& other) { if (const auto* rhs = other.as<StringImm>()) { return lhs->value == rhs->value; } return false; } -bool AttrsEqualHandler::VisitAttr_(const ArrayNode* lhs, const NodeRef& other) { +bool AttrsEqualHandler::VisitAttr_(const ArrayNode* lhs, const ObjectRef& other) { if (const auto* rhs = other.as<ArrayNode>()) { if (rhs->data.size() != lhs->data.size()) return false; for (size_t i = 0; i < lhs->data.size(); ++i) { - if (!Equal(NodeRef(lhs->data[i]), NodeRef(rhs->data[i]))) return false; + if (!Equal(lhs->data[i], rhs->data[i])) return false; } } return true; } -bool AttrsEqualHandler::VisitAttr_(const StrMapNode* lhs, const NodeRef& other) { +bool AttrsEqualHandler::VisitAttr_(const StrMapNode* lhs, const ObjectRef& other) { if (const auto* rhs = other.as<StrMapNode>()) { if (rhs->data.size() != lhs->data.size()) return false; for (const auto& kv : lhs->data) { auto it = rhs->data.find(kv.first); if (it == rhs->data.end()) return false; - if (!Equal(NodeRef(kv.second), NodeRef(it->second))) return false; + if (!Equal(kv.second, it->second)) return false; } } return true; } #define TVM_DEFINE_ATTRS_BINOP_EQUAL(NodeName) \ - bool AttrsEqualHandler::VisitAttr_(const NodeName* lhs, const NodeRef& other) { \ + bool AttrsEqualHandler::VisitAttr_(const NodeName* lhs, const ObjectRef& other) { \ if (const auto* rhs = other.as<NodeName>()) { \ if (!Equal(lhs->a, rhs->a)) return false; \ if (!Equal(lhs->b, rhs->b)) return false; \
@@ -167,7 +167,7 @@ TVM_DEFINE_ATTRS_BINOP_EQUAL(NE);
TVM_DEFINE_ATTRS_BINOP_EQUAL(And); TVM_DEFINE_ATTRS_BINOP_EQUAL(Or); -bool AttrsEqualHandler::VisitAttr_(const Not* lhs, const NodeRef& other) { +bool AttrsEqualHandler::VisitAttr_(const Not* lhs, const ObjectRef& other) { if (const auto* rhs = other.as<Not>()) { return Equal(lhs->a, rhs->a); } else {
@@ -175,7 +175,7 @@ bool AttrsEqualHandler::VisitAttr_(const Not* lhs, const NodeRef& other) {
} } -bool AttrsEqualHandler::VisitAttr_(const Cast* lhs, const NodeRef& other) {
+bool AttrsEqualHandler::VisitAttr_(const Cast* lhs, const ObjectRef& other) { if (const auto* rhs = other.as<Cast>()) { if (lhs->type != rhs->type) return false; return Equal(lhs->value, rhs->value);
@@ -184,7 +184,7 @@ bool AttrsEqualHandler::VisitAttr_(const Cast* lhs, const NodeRef& other) {
} } -bool AttrsEqualHandler::VisitAttr_(const Call* lhs, const NodeRef& other) { +bool AttrsEqualHandler::VisitAttr_(const Call* lhs, const ObjectRef& other) { if (const auto* rhs = other.as<Call>()) { return lhs->name == rhs->name &&
@@ -196,7 +196,7 @@ bool AttrsEqualHandler::VisitAttr_(const Call* lhs, const NodeRef& other) {
} } -bool AttrsEqualHandler::VisitAttr_(const Select* lhs, const NodeRef& other) { +bool AttrsEqualHandler::VisitAttr_(const Select* lhs, const ObjectRef& other) { if (const auto* rhs = other.as
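Most of the VisitAttr_ overloads above are generated by TVM_DEFINE_ATTRS_BINOP_EQUAL, since every binary op compares the same two operand fields. The pattern in miniature (`DEFINE_BINOP_EQUAL` is a stand-in macro, with plain structs instead of IR nodes):

```cpp
#include <iostream>

struct Add { int a, b; };
struct Sub { int a, b; };

// One macro stamps out an identical "compare both operands" body per node type,
// exactly as TVM_DEFINE_ATTRS_BINOP_EQUAL does for Add, Sub, Mul, Div, ...
#define DEFINE_BINOP_EQUAL(NodeName)                      \
  bool Equal(const NodeName& lhs, const NodeName& rhs) {  \
    return lhs.a == rhs.a && lhs.b == rhs.b;              \
  }

DEFINE_BINOP_EQUAL(Add)
DEFINE_BINOP_EQUAL(Sub)

int main() {
  std::cout << Equal(Add{1, 2}, Add{1, 2}) << "\n";  // 1
  std::cout << Equal(Sub{1, 2}, Sub{2, 1}) << "\n";  // 0
}
```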