diff --git a/benchmark/python/control_flow/rnn.py b/benchmark/python/control_flow/rnn.py index 24e326c9afd1..fe016dd0293d 100644 --- a/benchmark/python/control_flow/rnn.py +++ b/benchmark/python/control_flow/rnn.py @@ -24,8 +24,8 @@ from time import time import mxnet as mx -import numpy as np -from mxnet import gluon +import numpy as onp +from mxnet import gluon, np, npx _parser = argparse.ArgumentParser(description='Benchmark foreach and while_loop on RNN tasks.') @@ -42,8 +42,8 @@ def __init__(self, cell, length, prefix=None, params=None): self.length = length self.cell = cell - def hybrid_forward(self, F, inputs, states): - out, states = F.contrib.foreach(self.cell, inputs, states) + def forward(self, inputs, states): + out, states = npx.foreach(self.cell, inputs, states) return out @@ -53,15 +53,15 @@ def __init__(self, cell, length, prefix=None, params=None): self.length = length self.cell = cell - def hybrid_forward(self, F, inputs, states): + def forward(self, inputs, states): def _func(*states): i = states[0] s = states[1: ] - data = inputs.take(i).squeeze(axis=0) + data = np.squeeze(np.take(inputs, i), axis=0) out, new_s = self.cell(data, s) new_s = [i + 1] + new_s return out, new_s - out, states = F.contrib.while_loop( + out, states = npx.while_loop( cond=lambda i, *_: i < self.length, func=_func, loop_vars=states, @@ -71,11 +71,11 @@ def _func(*states): def _zeros(shape, ctx): - return mx.nd.zeros(shape=shape, ctx=ctx) + return mx.np.zeros(shape=shape, ctx=ctx) def _array(shape, ctx): - return mx.nd.normal(loc=0.0, scale=1.0, shape=shape, ctx=ctx) + return mx.np.random.normal(loc=0.0, scale=1.0, size=shape, ctx=ctx) def _get_gpus(): @@ -107,11 +107,11 @@ def run_benchmark(cell_type, ctx, seq_len, batch_size, hidden_dim): res = layer(inputs, states) if is_train: res.backward() - mx.nd.waitall() + mx.npx.waitall() tock = time() times.append((tock - tick) * 1000.0) times = times[args.warmup_rounds: ] - print("Time used: mean = %.3f ms, std = %.3f ms" % (np.mean(times), np.std(times))) + print("Time used: mean = %.3f ms, std = %.3f ms" % (onp.mean(times), onp.std(times))) def main(): diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/custom-layer.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/custom-layer.md index 2e378264aeab..4f0de0df200c 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/custom-layer.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/custom-layer.md @@ -35,7 +35,7 @@ In the example below, we define a new layer and implement `forward()` method to # Do some initial imports used throughout this tutorial from __future__ import print_function import mxnet as mx -from mxnet import nd, gluon, autograd +from mxnet import np, npx, gluon, autograd from mxnet.gluon.nn import Dense mx.random.seed(1) # Set seed for reproducable results ``` @@ -47,7 +47,7 @@ class NormalizationLayer(gluon.Block): super(NormalizationLayer, self).__init__() def forward(self, x): - return (x - nd.min(x)) / (nd.max(x) - nd.min(x)) + return (x - np.min(x)) / (np.max(x) - np.min(x)) ``` The rest of methods of the `Block` class are already implemented, and majority of them are used to work with parameters of a block. There is one very special method named [hybridize()](https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/gluon/block.py#L384), though, which I am going to cover before moving to a more complex example of a custom layer. 
@@ -56,17 +56,15 @@ The rest of methods of the `Block` class are already implemented, and majority o Looking into implementation of [existing layers](https://mxnet.apache.org/api/python/gluon/nn.html), one may find that more often a block inherits from a [HybridBlock](https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/gluon/block.py#L428), instead of directly inheriting from `Block`. -The reason for that is that `HybridBlock` allows to write custom layers that can be used in imperative programming as well as in symbolic programming. It is convinient to support both ways, because the imperative programming eases the debugging of the code and the symbolic one provides faster execution speed. You can learn more about the difference between symbolic vs. imperative programming from [this article](https://mxnet.apache.org/api/architecture/overview.html). +The reason for that is that `HybridBlock` allows you to write custom layers in an imperative programming style while computing in a symbolic way. It unifies the flexibility of imperative programming with the performance benefits of symbolic programming. You can learn more about the difference between symbolic and imperative programming from [this article](https://mxnet.apache.org/api/architecture/overview.html). Hybridization is a process that Apache MxNet uses to create a symbolic graph of a forward computation. This allows to increase computation performance by optimizing the computational symbolic graph. Once the symbolic graph is created, Apache MxNet caches and reuses it for subsequent computations. -To simplify support of both imperative and symbolic programming, Apache MxNet introduce the `HybridBlock` class. Compare to the `Block` class, `HybridBlock` already has its [forward()](https://mxnet.apache.org/api/python/gluon/gluon.html#mxnet.gluon.HybridBlock.forward) method implemented, but it defines a [hybrid_forward()](https://mxnet.apache.org/api/python/gluon/gluon.html#mxnet.gluon.HybridBlock.hybrid_forward) method that needs to be implemented. +Hybridization of `HybridBlock.forward` is based on a deferred computation mode in the MXNet backend, which enables recording the computation via tracing in the mxnet.nd and mxnet.np interfaces. The recorded computation can be exported to a symbolic representation and is used for optimized execution with the CachedOp. -The main difference between `forward()` and `hybrid_forward()` is an `F` argument. This argument sometimes is refered as a `backend` in the Apache MxNet community. Depending on if hybridization has been done or not, `F` can refer either to [mxnet.ndarray API](https://mxnet.apache.org/api/python/ndarray/ndarray.html) or [mxnet.symbol API](https://mxnet.apache.org/api/python/symbol/symbol.html). The former is used for imperative programming, and the latter for symbolic programming. +As tracing is based on the imperative APIs, users can access shape information of the arrays. Because `x.shape` for some array `x` is a Python tuple, any use of that shape becomes a constant in the recorded graph and may limit the recorded graph to inputs of the same shape only. -To support hybridization, it is important to use only methods avaible directly from `F` parameter. Usually, there are equivalent methods in both APIs, but sometimes there are mismatches or small variations. For example, by default, subtraction and division of NDArrays support broadcasting, while in Symbol API broadcasting is supported in a separate operators.
- -Knowing this, we can can rewrite our example layer, using HybridBlock: +Knowing this, we can rewrite our example layer using HybridBlock: ```{.python .input} @@ -74,8 +72,8 @@ class NormalizationHybridLayer(gluon.HybridBlock): def __init__(self): super(NormalizationHybridLayer, self).__init__() - def hybrid_forward(self, F, x): - return F.broadcast_div(F.broadcast_sub(x, F.min(x)), (F.broadcast_sub(F.max(x), F.min(x)))) + def forward(self, x): + return (x - np.min(x)) / (np.max(x) - np.min(x)) ``` Thanks to inheriting from HybridBlock, one can easily do forward pass on a given ndarray, either on CPU or GPU: @@ -83,14 +81,13 @@ Thanks to inheriting from HybridBlock, one can easily do forward pass on a given ```{.python .input} layer = NormalizationHybridLayer() -layer(nd.array([1, 2, 3], ctx=mx.cpu())) +layer(np.array([1, 2, 3], ctx=mx.cpu())) ``` Output: ```bash [0. 0.5 1. ] - ``` @@ -112,13 +109,13 @@ Below is an example of how to create a simple neural network with a custom layer ```{.python .input} net = gluon.nn.HybridSequential() # Define a Neural Network as a sequence of hybrid blocks net.add(Dense(5)) # Add Dense layer with 5 neurons -net.add(NormalizationHybridLayer()) # Add our custom layer +net.add(NormalizationHybridLayer()) # Add a custom layer net.add(Dense(1)) # Add Dense layer with 1 neurons net.initialize(mx.init.Xavier(magnitude=2.24)) # Initialize parameters of all layers net.hybridize() # Create, optimize and cache computational graph -input = nd.random_uniform(low=-10, high=10, shape=(5, 2)) # Create 5 random examples with 2 feature each in range [-10, 10] +input = np.random.uniform(low=-10, high=10, size=(5, 2)) # Create 5 random examples with 2 features each in range [-10, 10] net(input) ``` @@ -131,36 +128,36 @@ Output: [[-0.13601446] [ 0.26103732] [-0.05046433] [-1.2375476 ] [-0.15506986]] - ``` ## Parameters of a custom layer -Usually, a layer has a set of associated parameters, sometimes also referred as weights. This is an internal state of a layer. Most often, these parameters are the ones, that we want to learn during backpropogation step, but sometimes these parameters might be just constants we want to use during forward pass. - -All parameters of a block are stored and accessed via [ParameterDict](https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/gluon/parameter.py#L508) class. This class helps with initialization, updating, saving and loading of the parameters. Each layer can have multiple set of parameters, and all of them can be stored in a single instance of the `ParameterDict` class. On a block level, the instance of the `ParameterDict` class is accessible via `self.params` field, and outside of a block one can access all parameters of the network via [collect_params()](https://mxnet.apache.org/api/python/gluon/gluon.html#mxnet.gluon.Block.collect_params) method called on a `container`. `ParameterDict` uses [Parameter](https://mxnet.apache.org/api/python/gluon/gluon.html#mxnet.gluon.Parameter) class to represent parameters inside of Apache MxNet neural network. If parameter doesn't exist, trying to get a parameter via `self.params` will create it automatically. +Usually, a layer has a set of associated parameters, sometimes also referred to as weights. This is the internal state of a layer. Most often, these parameters are the ones that we want to learn during the backpropagation step, but sometimes these parameters might be just constants we want to use during the forward pass.
The parameters are usually represented by the [Parameter](https://mxnet.apache.org/api/python/gluon/gluon.html#mxnet.gluon.Parameter) class inside of an Apache MXNet neural network. ```{.python .input} class NormalizationHybridLayer(gluon.HybridBlock): def __init__(self, hidden_units, scales): super(NormalizationHybridLayer, self).__init__() - + self.hidden_units = hidden_units self.weights = gluon.Parameter('weights', - shape=(hidden_units, 0), + shape=(hidden_units, -1), allow_deferred_init=True) self.scales = gluon.Parameter('scales', shape=scales.shape, - init=mx.init.Constant(scales.asnumpy().tolist()), # Convert to regular list to make this object serializable + init=mx.init.Constant(scales), # Initialize with a constant array differentiable=False) - def hybrid_forward(self, F, x, weights, scales): - normalized_data = F.broadcast_div(F.broadcast_sub(x, F.min(x)), (F.broadcast_sub(F.max(x), F.min(x)))) - weighted_data = F.FullyConnected(normalized_data, weights, num_hidden=self.weights.shape[0], no_bias=True) - scaled_data = F.broadcast_mul(scales, weighted_data) + def forward(self, x): + normalized_data = (x - np.min(x)) / (np.max(x) - np.min(x)) + weighted_data = npx.fully_connected(normalized_data, self.weights.data(), num_hidden=self.hidden_units, no_bias=True) + scaled_data = np.multiply(self.scales.data(), weighted_data) return scaled_data + + def infer_shape(self, x, *args): + self.weights.shape = (self.hidden_units, x.shape[x.ndim-1]) ``` In the example above 2 set of parameters are defined: @@ -168,13 +165,8 @@ In the example above 2 set of parameters are defined: 1. Parameter `scale` is a constant that doesn't change. Its shape is defined during construction. Notice a few aspects of this code: -* Shape is not provided when creating `weights`. Instead it is going to be infered from the shape of the input +* Shape is not provided when creating `weights`. Instead, it is going to be inferred from the shape of the input by the `infer_shape` method. * `Scales` parameter is initialized and marked as `differentiable=False`. -* `F` backend is used for all calculations -* The calculation of dot product is done using `F.FullyConnected()` method instead of `F.dot()` method. The one was chosen over another because the former supports automatic infering shapes of inputs while the latter doesn't. This is extremely important to know, if one doesn't want to hard code all the shapes. The best way to learn what operators supports automatic inference of input shapes at the moment is browsing C++ implementation of operators to see if one uses a method `SHAPE_ASSIGN_CHECK(*in_shape, fullc::kWeight, Shape2(param.num_hidden, num_input));` -* `hybrid_forward()` method signature has changed. It accepts two new arguments: `weights` and `scales`. - -The last peculiarity is due to support of imperative and symbolic programming by `HybridBlock`. During training phase, parameters are passed to the layer by Apache MxNet framework as additional arguments to the method, because they might need to be converted to a `Symbol` depending on if the layer was hybridized. One shouldn't use `self.weights` and `self.scales` in `hybrid_forward` except to get shapes of parameters. Running forward pass on this network is very similar to the previous example, so instead of just doing one forward pass, let's run whole training for a few epochs to show that `scales` parameter doesn't change during the training while `weights` parameter is changing.
@@ -185,7 +177,7 @@ def print_params(title, net): Helper function to print out the state of parameters of NormalizationHybridLayer """ print(title) - hybridlayer_params = {k: v for k, v in net.collect_params().items() if 'normalizationhybridlayer' in k } + hybridlayer_params = {k: v for k, v in net.collect_params().items()} for key, value in hybridlayer_params.items(): print('{} = {}\n'.format(key, value.data())) @@ -193,15 +185,15 @@ def print_params(title, net): net = gluon.nn.HybridSequential() # Define a Neural Network as a sequence of hybrid blocks net.add(Dense(5)) # Add Dense layer with 5 neurons net.add(NormalizationHybridLayer(hidden_units=5, - scales = nd.array([2]))) # Add our custom layer + scales = np.array([2]))) # Add a custom layer net.add(Dense(1)) # Add Dense layer with 1 neurons net.initialize(mx.init.Xavier(magnitude=2.24)) # Initialize parameters of all layers net.hybridize() # Create, optimize and cache computational graph -input = nd.random_uniform(low=-10, high=10, shape=(5, 2)) # Create 5 random examples with 2 feature each in range [-10, 10] -label = nd.random_uniform(low=-1, high=1, shape=(5, 1)) +input = np.random.uniform(low=-10, high=10, size=(5, 2)) # Create 5 random examples with 2 feature each in range [-10, 10] +label = np.random.uniform(low=-1, high=1, size=(5, 1)) mse_loss = gluon.loss.L2Loss() # Mean squared error between output and label trainer = gluon.Trainer(net.collect_params(), # Init trainer with Stochastic Gradient Descent (sgd) optimization method and parameters for it @@ -231,11 +223,9 @@ hybridsequential94_normalizationhybridlayer0_weights = [ 0.6077331 0.03922009 0.13425875 0.5729856 -0.14446527] [-0.3572498 0.18545026 -0.09098256 0.5106366 -0.35151464] [-0.39846328 0.22245121 0.13075739 0.33387476 -0.10088372]] - hybridsequential94_normalizationhybridlayer0_scales = [2.] - =========== Parameters after backward pass =========== @@ -245,11 +235,9 @@ hybridsequential94_normalizationhybridlayer0_weights = [ 0.48665082 -0.00144657 0.00376363 0.47501418 -0.23885089] [-0.22626656 0.22944227 0.05018325 0.6166192 -0.24941102] [-0.44946212 0.20532274 0.07579394 0.29261002 -0.14063817]] - hybridsequential94_normalizationhybridlayer0_scales = [2.] - ``` diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/custom_layer_beginners.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/custom_layer_beginners.md deleted file mode 100644 index 644a95f1f7a3..000000000000 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/custom_layer_beginners.md +++ /dev/null @@ -1,243 +0,0 @@ - - - - - - - - - - - - - - - - - -# Customer Layers (Beginners) - -While Gluon API for Apache MxNet comes with [a decent number of pre-defined layers](https://mxnet.apache.org/api/python/gluon/nn.html), at some point one may find that a new layer is needed. Adding a new layer in Gluon API is straightforward, yet there are a few things that one needs to keep in mind. - -In this article, I will cover how to create a new layer from scratch, how to use it, what are possible pitfalls and how to avoid them. - -## The simplest custom layer - -To create a new layer in Gluon API, one must create a class that inherits from [Block](https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/gluon/block.py#L123) class. This class provides the most basic functionality, and all pre-defined layers inherit from it directly or via other subclasses. 
Because each layer in Apache MxNet inherits from `Block` words “layer” and “block” are used interchangeable inside of the Apache MxNet community. - -The only instance method needed to be implemented is [forward(self, x)](https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/gluon/block.py#L415) which defines what exactly your layer is going to do during forward propagation. Notice, that it doesn’t require to provide what the block should do during back propogation. Back propogation pass for blocks is done by Apache MxNet for you. - -In the example below, we define a new layer and implement `forward()` method to normalize input data by fitting it into a range of [0, 1]. - -```{.python .input} -# Do some initial imports used throughout this tutorial -from __future__ import print_function -import mxnet as mx -from mxnet import nd, gluon, autograd -from mxnet.gluon.nn import Dense -mx.random.seed(1) # Set seed for reproducable results -``` - -```{.python .input} -class NormalizationLayer(gluon.Block): - def __init__(self): - super(NormalizationLayer, self).__init__() - - def forward(self, x): - return (x - nd.min(x)) / (nd.max(x) - nd.min(x)) -``` - -The rest of methods of the `Block` class are already implemented, and majority of them are used to work with parameters of a block. There is one very special method named [hybridize()](https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/gluon/block.py#L384), though, which I am going to cover before moving to a more complex example of a custom layer. - -## Hybridization and the difference between Block and HybridBlock - -Looking into implementation of [existing layers](https://mxnet.apache.org/api/python/gluon/nn.html), one may find that more often a block inherits from a [HybridBlock](https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/gluon/block.py#L428), instead of directly inheriting from `Block`. - -The reason for that is that `HybridBlock` allows to write custom layers that can be used in imperative programming as well as in symbolic programming. It is convenient to support both ways, because the imperative programming eases the debugging of the code and the symbolic one provides faster execution speed. You can learn more about the difference between symbolic vs. imperative programming from this [deep learning programming paradigm](https://mxnet.apache.org/api/architecture/overview.html) article. - -Hybridization is a process that Apache MxNet uses to create a symbolic graph of a forward computation. This allows to increase computation performance by optimizing the computational symbolic graph. Once the symbolic graph is created, Apache MxNet caches and reuses it for subsequent computations. - -To simplify support of both imperative and symbolic programming, Apache MxNet introduce the `HybridBlock` class. Compare to the `Block` class, `HybridBlock` already has its [forward()](https://mxnet.apache.org/api/python/gluon/gluon.html#mxnet.gluon.HybridBlock.forward) method implemented, but it defines a [hybrid_forward()](https://mxnet.apache.org/api/python/gluon/gluon.html#mxnet.gluon.HybridBlock.hybrid_forward) method that needs to be implemented. - -The main difference between `forward()` and `hybrid_forward()` is an `F` argument. This argument sometimes is refered as a `backend` in the Apache MxNet community. 
Depending on if hybridization has been done or not, `F` can refer either to [mxnet.ndarray API](https://mxnet.apache.org/api/python/ndarray/ndarray.html) or [mxnet.symbol API](https://mxnet.apache.org/api/python/symbol/symbol.html). The former is used for imperative programming, and the latter for symbolic programming. - -To support hybridization, it is important to use only methods available directly from `F` parameter.Usually, there are equivalent methods in both APIs, but sometimes there are mismatches or small variations. For example, by default, subtraction and division of NDArrays support broadcasting, while in Symbol API broadcasting is supported in a separate operators. - -Knowing this, we can can rewrite our example layer, using HybridBlock: - -```{.python .input} -class NormalizationHybridLayer(gluon.HybridBlock): - def __init__(self): - super(NormalizationHybridLayer, self).__init__() - - def hybrid_forward(self, F, x): - return F.broadcast_div(F.broadcast_sub(x, F.min(x)), (F.broadcast_sub(F.max(x), F.min(x)))) -``` - -Thanks to inheriting from HybridBlock, one can easily do forward pass on a given ndarray, either on CPU or GPU: - -```{.python .input} -layer = NormalizationHybridLayer() -layer(nd.array([1, 2, 3], ctx=mx.cpu())) -``` - -```{.python .input} -[0. 0.5 1. ] - -``` - -As a rule of thumb, one should always implement custom layers by inheriting from `HybridBlock`. This allows to have more flexibility, and doesn’t affect execution speed once hybridization is done. - -Unfortunately, at the moment of writing this tutorial, NLP related layers such as [RNN](https://mxnet.apache.org/api/python/gluon/rnn.html#mxnet.gluon.rnn.RNN), [GRU](https://mxnet.apache.org/api/python/gluon/rnn.html#mxnet.gluon.rnn.GRU), and [LSTM](https://mxnet.apache.org/api/python/gluon/rnn.html#mxnet.gluon.rnn.LSTM) are directly inhereting from the `Block` class via common `_RNNLayer` class. That means that networks with such layers cannot be hybridized. But this might change in the future, so stay tuned. - -It is important to notice that hybridization has nothing to do with computation on GPU. One can train both hybridized and non-hybridized networks on both CPU and GPU, though hybridized networks would work faster. Though, it is hard to say in advance how much faster it is going to be. - -## Adding a custom layer to a network - -While it is possible, custom layers are rarely used separately. Most often they are used with predefined layers to create a neural network. Output of one layer is used as an input of another layer. - -Depending on which class you used as a base one, you can use either [Sequential](https://mxnet.apache.org/api/python/gluon/gluon.html#mxnet.gluon.nn.Sequential) or [HybridSequential](https://mxnet.apache.org/api/python/gluon/gluon.html#mxnet.gluon.nn.HybridSequential) container to form a sequential neural network. By adding layers one by one, one adds dependencies of one layer’s input from another layer’s output. It is worth noting, that both `Sequential` and `HybridSequential` containers inherit from `Block` and `HybridBlock` respectively. - -Below is an example of how to create a simple neural network with a custom layer. In this example, `NormalizationHybridLayer` gets as an input the output from `Dense(5)` layer and pass its output as an input to `Dense(1)` layer. 
- -```{.python .input} -net = gluon.nn.HybridSequential() # Define a Neural Network as a sequence of hybrid blocks -net.add(Dense(5)) # Add Dense layer with 5 neurons -net.add(NormalizationHybridLayer()) # Add our custom layer -net.add(Dense(1)) # Add Dense layer with 1 neurons - - -net.initialize(mx.init.Xavier(magnitude=2.24)) # Initialize parameters of all layers -net.hybridize() # Create, optimize and cache computational graph -input = nd.random_uniform(low=-10, high=10, shape=(5, 2)) # Create 5 random examples with 2 feature each in range [-10, 10] -net(input) -``` - -```{.python .input} -[[-0.13601446] - [ 0.26103732] - [-0.05046433] - [-1.2375476 ] - [-0.15506986]] - -``` - -## Parameters of a custom layer - -Usually, a layer has a set of associated parameters, sometimes also referred as weights. This is an internal state of a layer. Most often, these parameters are the ones, that we want to learn during backpropogation step, but sometimes these parameters might be just constants we want to use during forward pass. - -All parameters of a block are stored and accessed via [ParameterDict](https://github.com/apache/incubator-mxnet/blob/master/python/mxnet/gluon/parameter.py#L508) class. This class helps with initialization, updating, saving and loading of the parameters. Each layer can have multiple set of parameters, and all of them can be stored in a single instance of the `ParameterDict` class. On a block level, the instance of the `ParameterDict` class is accessible via `self.params` field, and outside of a block one can access all parameters of the network via [collect_params()](https://mxnet.apache.org/api/python/gluon/gluon.html#mxnet.gluon.Block.collect_params) method called on a `container`. `ParamterDict` uses [Parameter](https://mxnet.apache.org/api/python/gluon/gluon.html#mxnet.gluon.Parameter) class to represent parameters inside of Apache MxNet neural network. If parameter doesn’t exist, trying to get a parameter via `self.params` will create it automatically. - -```{.python .input} -class NormalizationHybridLayer(gluon.HybridBlock): - def __init__(self, hidden_units, scales): - super(NormalizationHybridLayer, self).__init__() - - self.weights = gluon.Parameter('weights', - shape=(hidden_units, 0), - allow_deferred_init=True) - - self.scales = gluon.Parameter('scales', - shape=scales.shape, - init=mx.init.Constant(scales.asnumpy()), - differentiable=False) - - def hybrid_forward(self, F, x, weights, scales): - normalized_data = F.broadcast_div(F.broadcast_sub(x, F.min(x)), (F.broadcast_sub(F.max(x), F.min(x)))) - weighted_data = F.FullyConnected(normalized_data, weights, num_hidden=self.weights.shape[0], no_bias=True) - scaled_data = F.broadcast_mul(scales, weighted_data) - return scaled_data -``` - -In the example above 2 set of parameters are defined: -1. Parameter `weights` is trainable. Its shape is unknown during construction phase and will be infered on the first run of forward propogation. -2. Parameter `scale` is a constant that doesn’t change. Its shape is defined during construction. - -Notice a few aspects of this code: - -+ Shape is not provided when creating `weights`. Instead it is going to be infered from the shape of the input -+ `Scales` parameter is initialized and marked as `differentiable=False`. -+ `F` backend is used for all calculations -+ The calculation of dot product is done using `F.FullyConnected()` method instead of `F.dot()` method. 
The one was chosen over another because the former supports automatic infering shapes of inputs while the latter doesn’t. This is extremely important to know, if one doesn’t want to hard code all the shapes. The best way to learn what operators supports automatic inference of input shapes at the moment is browsing C++ implementation of operators to see if one uses a method `SHAPE_ASSIGN_CHECK(*in_shape, fullc::kWeight, Shape2(param.num_hidden, num_input));` -+ `hybrid_forward()` method signature has changed. It accepts two new arguments: `weights` and `scales`. - -The last peculiarity is due to support of imperative and symbolic programming by `HybridBlock`. During training phase, parameters are passed to the layer by Apache MxNet framework as additional arguments to the method, because they might need to be converted to a `Symbol` depending on if the layer was hybridized. One shouldn’t use `self.weights` and `self.scales` in `hybrid_forward` except to get shapes of parameters. - -Running forward pass on this network is very similar to the previous example, so instead of just doing one forward pass, let’s run whole training for a few epochs to show that `scales` parameter doesn’t change during the training while `weights` parameter is changing. - -```{.python .input} -def print_params(title, net): - """ - Helper function to print out the state of parameters of NormalizationHybridLayer - """ - print(title) - hybridlayer_params = {k: v for k, v in net.collect_params().items() if 'normalizationhybridlayer' in k } - - for key, value in hybridlayer_params.items(): - print('{} = {}\n'.format(key, value.data())) - -net = gluon.nn.HybridSequential() # Define a Neural Network as a sequence of hybrid blocks -net.add(Dense(5)) # Add Dense layer with 5 neurons -net.add(NormalizationHybridLayer(hidden_units=5, - scales = nd.array([2]))) # Add our custom layer -net.add(Dense(1)) # Add Dense layer with 1 neurons - - -net.initialize(mx.init.Xavier(magnitude=2.24)) # Initialize parameters of all layers -net.hybridize() # Create, optimize and cache computational graph - -input = nd.random_uniform(low=-10, high=10, shape=(5, 2)) # Create 5 random examples with 2 feature each in range [-10, 10] -label = nd.random_uniform(low=-1, high=1, shape=(5, 1)) - -mse_loss = gluon.loss.L2Loss() # Mean squared error between output and label -trainer = gluon.Trainer(net.collect_params(), # Init trainer with Stochastic Gradient Descent (sgd) optimization method and parameters for it - 'sgd', - {'learning_rate': 0.1, 'momentum': 0.9 }) - -with autograd.record(): # Autograd records computations done on NDArrays inside "with" block - output = net(input) # Run forward propogation - - print_params("=========== Parameters after forward pass ===========\n", net) - loss = mse_loss(output, label) # Calculate MSE - -loss.backward() # Backward computes gradients and stores them as a separate array within each NDArray in .grad field -trainer.step(input.shape[0]) # Trainer updates parameters of every block, using .grad field using oprimization method (sgd in this example) - # We provide batch size that is used as a divider in cost function formula -print_params("=========== Parameters after backward pass ===========\n", net) -``` - -```{.python .input} -=========== Parameters after forward pass =========== - -hybridsequential94_normalizationhybridlayer0_weights = -[[-0.3983642 -0.505708 -0.02425683 -0.3133553 -0.35161012] - [ 0.6467543 0.3918715 -0.6154656 -0.20702496 -0.4243446 ] - [ 0.6077331 0.03922009 0.13425875 0.5729856 
-0.14446527] - [-0.3572498 0.18545026 -0.09098256 0.5106366 -0.35151464] - [-0.39846328 0.22245121 0.13075739 0.33387476 -0.10088372]] - - -hybridsequential94_normalizationhybridlayer0_scales = -[2.] - - -=========== Parameters after backward pass =========== - -hybridsequential94_normalizationhybridlayer0_weights = -[[-0.29839832 -0.47213346 0.08348035 -0.2324698 -0.27368504] - [ 0.76268613 0.43080837 -0.49052125 -0.11322092 -0.3339738 ] - [ 0.48665082 -0.00144657 0.00376363 0.47501418 -0.23885089] - [-0.22626656 0.22944227 0.05018325 0.6166192 -0.24941102] - [-0.44946212 0.20532274 0.07579394 0.29261002 -0.14063817]] - - -hybridsequential94_normalizationhybridlayer0_scales = -[2.] - -``` - -As it is seen from the output above, `weights` parameter has been changed by the training and `scales` not. - -## Conclusion - -One important quality of a Deep learning framework is extensibility. Empowered by flexible abstractions, like `Block` and `HybridBlock`, one can easily extend Apache MxNet functionality to match its needs. diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/hybridize.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/hybridize.md index 5bf206a97fb4..f104e2cdd92b 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/hybridize.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/hybridize.md @@ -98,7 +98,7 @@ Through the use of experiments, this section will demonstrate the benefits of hy Previously, we learned how to use the Sequential class to concatenate multiple layers. Next, we will replace the Sequential class with the HybridSequential class in order to make use of hybrid programming. ```{.python .input} -from mxnet import nd, sym +from mxnet import np, npx from mxnet.gluon import nn import time @@ -110,7 +110,7 @@ def get_net(): net.initialize() return net -x = nd.random.normal(shape=(1, 512)) +x = np.random.normal(size=(1, 512)) net = get_net() net(x) ``` @@ -133,7 +133,7 @@ def benchmark(net, x): start = time.time() for i in range(1000): _ = net(x) - nd.waitall() # To facilitate timing, we wait for all computations to be completed. + npx.waitall() # To facilitate timing, we wait for all computations to be completed. return time.time() - start net = get_net() @@ -163,7 +163,7 @@ net(x) ## Constructing Models Using the HybridBlock Class -Similar to the correlation between the Sequential Block classes, the HybridSequential class is a HybridBlock subclass. Contrary to the Block instance, which needs to use the `forward` function, for a HybridBlock instance we need to use the `hybrid_forward` function. +Similar to the correlation between the Sequential Block classes, the HybridSequential class is a HybridBlock subclass. Earlier, we demonstrated that, after calling the `hybridize` function, the model is able to achieve superior computing performance and portability. In addition, model flexibility can be affected after calling the `hybridize` function. We will demonstrate this by constructing a model using the HybridBlock class. @@ -174,22 +174,17 @@ class HybridNet(nn.HybridBlock): self.hidden = nn.Dense(10) self.output = nn.Dense(2) - def hybrid_forward(self, F, x): - print('F: ', F) + def forward(self, x): print('x: ', x) - x = F.relu(self.hidden(x)) + x = npx.relu(self.hidden(x)) print('hidden: ', x) return self.output(x) ``` -We need to add the additional input `F` to the `hybrid_forward` function when inheriting the HybridBlock class. 
We already know that MXNet uses both an NDArray class and a Symbol class, which are based on imperative programming and symbolic programming, respectively. Since these two classes perform very similar functions, MXNet will determine whether `F` will call NDArray or Symbol based on the input provided. - -The following creates a HybridBlock instance. As we can see, by default, `F` uses NDArray. We also printed out the `x` input as well as the hidden layer’s output using the ReLU activation function. - ```{.python .input} net = HybridNet() net.initialize() -x = nd.random.normal(shape=(1, 4)) +x = np.random.normal(size=(1, 4)) net(x) ``` @@ -206,103 +201,13 @@ net.hybridize() net(x) ``` -We can see that `F` turns into a Symbol. Moreover, even though the input data is still NDArray, the same input and intermediate output will all be converted to Symbol type in the `hybrid_forward` function. - Now, we repeat the forward computation. ```{.python .input} net(x) ``` -We can see that the three lines of print statements defined in the `hybrid_forward` function will not print anything. This is because a symbolic program has been produced since the last time `net(x)` was run by calling the `hybridize` function. Afterwards, when we run `net(x)` again, MXNet will no longer need to access Python code, but can directly perform symbolic programming at the C++ backend. This is another reason why model computing performance will be improve after the `hybridize` function is called. However, there is always the potential that any programs we write will suffer a loss in flexibility. If we want to use the three lines of print statements to debug the code in the above example, they will be skipped over and we would not be able to print when the symbolic program is executed. Additionally, in the case of a few functions not supported by Symbol (like `asnumpy`), and operations in-place like `a += b` and `a[:] = a + b` (must be rewritten as `a = a + b`). Therefore, we will not be able to use the `hybrid_forward` function or perform forward computation after the `hybridize` function has been called. - -## Key differences and limitations of hybridization - -The difference between a purely imperative `Block` and hybridizable `HybridBlock` can superficially appear to be simply the injection of the `F` function space (resolving to [mx.nd](../../../../api/legacy/ndarray/ndarray.rst) or [mx.sym](../../../../api/legacy/symbol/index.rst)) in the forward function that is renamed from `forward` to `hybrid_forward`. However there are some limitations that apply when using hybrid blocks. In the following section we will review the main differences, giving example of code snippets that generate errors when such blocks get hybridized. 
- -### Indexing - -When trying to access specific elements in a tensor like this: - -```{.python .input} -def hybrid_forward(self, F, x): - return x[0,0] -``` - -Would generate the following error: - -`TypeError: Symbol only support integer index to fetch i-th output` - -There are however several operators that can help you with array manipulations like: [F.split](../../../../api/legacy/ndarray/ndarray.rst#mxnet.ndarray.split), [F.slice](../../../../api/legacy/ndarray/ndarray.rst#mxnet.ndarray.slice), [F.take](../../../../api/legacy/ndarray/ndarray.rst#mxnet.ndarray.take),[F.pick](../../../../api/legacy/ndarray/ndarray.rst#mxnet.ndarray.pick), [F.where](../../../../api/legacy/ndarray/ndarray.rst#mxnet.ndarray.where), [F.reshape](../../../../api/legacy/ndarray/ndarray.rst#mxnet.ndarray.reshape) or [F.reshape_like](../../../../api/legacy/ndarray/ndarray.rst#mxnet.ndarray.reshape_like). - -### Data Type - -Sometimes one can be tempted to use conditional logic on the type of the input tensors however the following block: - -```{.python .input} -def hybrid_forward(self, F, x): - if x.dtype =='float16': - return x - return x*2 -``` - -Would generate a `AttributeError: 'Symbol' object has no attribute 'dtype'` - -You cannot use the `dtype` of the symbol at runtime. Symbols only describe operations and not the underlying data they operate on. One workaround is to pass the type as a constructor argument of your network and hence build the appropriate compute graph for each situation. - -### Compute Context - -Similarly you cannot use the compute context of symbol for the same reason that symbols only describe the operations on the data and not the data (or context). You cannot do this: - -```{.python .input} -def hybrid_forward(self, F, x): - if x.context == mx.cpu(): - return x - return x*2 -``` - -Without getting a `AttributeError: 'Symbol' object has no attribute 'context'` - -Accessing the current compute context is not possible with symbols. Consider passing this information in the constructor if you require it to create the appropriate compute graph. - -### Shape - -Accessing shape information of tensors is very often used for example when trying to flatten a tensor and then reshape it back to its original shape. - -```{.python .input} -def hybrid_forward(self, F, x): - return x*x.shape[0] -``` - -Trying to access the shape of a tensor in a hybridized block would result in this error: `AttributeError: 'Symbol' object has no attribute 'shape'`. - -Again, you cannot use the shape of the symbol at runtime as symbols only describe operations and not the underlying data they operate on. -Note: This will change in the future as Apache MXNet will support [dynamic shape inference](https://cwiki.apache.org/confluence/display/MXNET/Dynamic+shape), and the shapes of symbols will be symbols themselves - -There are also a lot of operators that support special indices to help with most of the use-cases where you would want to access the shape information. For example, `F.reshape(x, (0,0,-1))` will keep the first two dimensions unchanged and collapse all further dimensions into the third dimension. See the documentation of the [F.reshape](../../../../api/legacy/ndarray/ndarray.rst#mxnet.ndarray.reshape) for more details. - -### Item assignment - -Last but not least, you cannot directly assign values in tensor in a symbolic graph, the resulting tensors always needs to be the results of operations performed on the inputs of the computational graph. 
The following code: -```{.python .input} -def hybrid_forward(self, F, x): - x[0] = 2 - return x -``` - -Would get you this error `TypeError: 'Symbol' object does not support item assignment`. - -Direct item assignment is not possible in symbolic graph since it needs to be part of a computational graph. One way is to use add more inputs to your graph and use masking or the [F.where](../../../../api/legacy/ndarray/ndarray.rst#mxnet.ndarray.where) operator. - -e.g to set the first element to 2 you can do: - -```{.python .input} -x = mx.nd.array([1,2,3]) -value = mx.nd.ones_like(x)*2 -condition = mx.nd.array([0,1,1]) -mx.nd.where(condition=condition, x=x, y=value) -``` +We can see that the three lines of print statements defined in the `forward` function will not print anything. This is because a symbolic computing graph has been recorded since the last time `net(x)` was run by calling the `hybridize` function. Afterwards, when we run `net(x)` again, MXNet will no longer need to access Python code, but can directly perform symbolic programming at the C++ backend. This is another reason why model computing performance will improve after the `hybridize` function is called. However, there is always the potential that any programs we write will suffer a loss in flexibility. If we want to use the three lines of print statements to debug the code in the above example, they will be skipped over and we would not be able to print when the symbolic program is executed. Additionally, a few functions (like `asnumpy`) are not supported, and in-place operations like `a += b` and `a[:] = a + b` must be rewritten as `a = a + b`. Therefore, we cannot use such operations in the `forward` function or perform forward computation with them after the `hybridize` function has been called.
## Disabling Hybridization diff --git a/docs/python_docs/python/tutorials/packages/gluon/image/info_gan.md b/docs/python_docs/python/tutorials/packages/gluon/image/info_gan.md index 26dcb11d78ed..ff6293964543 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/image/info_gan.md +++ b/docs/python_docs/python/tutorials/packages/gluon/image/info_gan.md @@ -32,12 +32,12 @@ import sys import tarfile import time -import numpy as np +import numpy as onp from matplotlib import pyplot as plt from mxboard import SummaryWriter import mxnet as mx from mxnet import gluon -from mxnet import ndarray as nd +from mxnet import np, npx from mxnet.gluon import nn, utils from mxnet import autograd @@ -72,10 +72,10 @@ if not os.path.exists(data_path): ```{.python .input} def transform(data, width=64, height=64): data = mx.image.imresize(data, width, height) - data = nd.transpose(data, (2,0,1)) - data = data.astype(np.float32)/127.5 - 1 + data = np.transpose(data, (2,0,1)) + data = data.astype(onp.float32)/127.5 - 1 if data.shape[0] == 1: - data = nd.tile(data, (3, 1, 1)) + data = np.tile(data, (3, 1, 1)) return data.reshape((1,) + data.shape) ``` @@ -108,7 +108,7 @@ test_filenames = filenames[split:] train_images = images[:split] train_filenames = filenames[:split] -train_data = gluon.data.ArrayDataset(nd.concatenate(train_images)) +train_data = gluon.data.ArrayDataset(np.concatenate(train_images)) train_dataloader = gluon.data.DataLoader(train_data, batch_size=batch_size, shuffle=True, last_batch='rollover', num_workers=multiprocessing.cpu_count()-1) ``` @@ -139,9 +139,9 @@ class Generator(gluon.HybridBlock): self.G.add(nn.Conv2DTranspose(3, 4, 2, 1, use_bias=False)) self.G.add(nn.Activation('tanh')) - def hybrid_forward(self, F, x): + def forward(self, x): x = self.prev(x) - x = F.reshape(x, (0, -1, 1, 1)) + x = np.reshape(x, (0, -1, 1, 1)) return self.G(x) ``` @@ -176,7 +176,7 @@ class Discriminator(gluon.HybridBlock): self.Q = nn.HybridSequential() self.Q.add(self.feat, self.category_prob, self.continuous_mean) - def hybrid_forward(self, F, x): + def forward(self, x): x = self.D(x) prob = self.prob(x) feat = self.feat(x) @@ -216,8 +216,8 @@ Create vectors with real (=1) and fake labels (=0). ```{.python .input} -real_label = nd.ones((batch_size,), ctx=ctx) -fake_label = nd.zeros((batch_size,),ctx=ctx) +real_label = np.ones((batch_size,), ctx=ctx) +fake_label = np.zeros((batch_size,),ctx=ctx) ``` Load a pretrained model. @@ -256,13 +256,13 @@ This function samples `c`, `z`, and concatenates them to create the generator in def create_generator_input(): #create random noise - z = nd.random_normal(0, 1, shape=(batch_size, z_dim), ctx=ctx) - label = nd.array(np.random.randint(n_categories, size=batch_size)).as_in_context(ctx) - c1 = nd.one_hot(label, depth=n_categories).as_in_context(ctx) - c2 = nd.random.uniform(-1, 1, shape=(batch_size, n_continuous)).as_in_context(ctx) + z = np.random.normal(0, 1, size=(batch_size, z_dim), ctx=ctx) + label = np.array(onp.random.randint(n_categories, size=batch_size)).as_in_context(ctx) + c1 = npx.one_hot(label, depth=n_categories).as_in_context(ctx) + c2 = np.random.uniform(-1, 1, size=(batch_size, n_continuous)).as_in_context(ctx) # concatenate random noise with c which will be the input of the generator - return nd.concat(z, c1, c2, dim=1), label, c2 + return np.concatenate([z, c1, c2], axis=1), label, c2 ``` Define the training loop. 
@@ -282,8 +282,8 @@ with SummaryWriter(logdir='./logs/') as sw: print("Epoch", epoch) starttime = time.time() - d_error_epoch = nd.zeros((1,), ctx=ctx) - g_error_epoch = nd.zeros((1,), ctx=ctx) + d_error_epoch = np.zeros((1,), ctx=ctx) + g_error_epoch = np.zeros((1,), ctx=ctx) for idx, data in enumerate(train_dataloader): @@ -329,16 +329,16 @@ with SummaryWriter(logdir='./logs/') as sw: count = idx + 1 logging.info('speed: {} samples/s'.format(batch_size / (time.time() - starttime))) logging.info('discriminator loss = %f, generator loss = %f at iter %d epoch %d' - %(d_error_epoch.asscalar()/count,g_error_epoch.asscalar()/count, count, epoch)) + %(d_error_epoch.item()/count,g_error_epoch.item()/count, count, epoch)) g_input,_,_ = create_generator_input() # create some fake image for logging in MXBoard fake_image = generator(g_input) - sw.add_scalar(tag='Loss_D', value={'test':d_error_epoch.asscalar()/count}, global_step=counter) - sw.add_scalar(tag='Loss_G', value={'test':d_error_epoch.asscalar()/count}, global_step=counter) - sw.add_image(tag='data_image', image=((fake_image[0]+ 1.0) * 127.5).astype(np.uint8) , global_step=counter) + sw.add_scalar(tag='Loss_D', value={'test':d_error_epoch.item()/count}, global_step=counter) + sw.add_scalar(tag='Loss_G', value={'test':d_error_epoch.item()/count}, global_step=counter) + sw.add_image(tag='data_image', image=((fake_image[0]+ 1.0) * 127.5).astype(onp.uint8) , global_step=counter) sw.flush() discriminator.save_parameters("infogan_d_latest.params") @@ -366,9 +366,9 @@ Nearest neighbor function, which takes a matrix of features and an input feature ```{.python .input} def get_knn(features, input_vector, k=3): - dist = (nd.square(features - input_vector).sum(axis=1))/features.shape[0] + dist = (np.square(features - input_vector).sum(axis=1))/features.shape[0] indices = dist.asnumpy().argsort()[:k] - return [(index, dist[index].asscalar()) for index in indices] + return [(index, dist[index].item()) for index in indices] ``` A helper function to visualize image data. @@ -376,7 +376,7 @@ A helper function to visualize image data. 
```{.python .input} def visualize(img_array): - plt.imshow(((img_array.asnumpy().transpose(1, 2, 0) + 1.0) * 127.5).astype(np.uint8)) + plt.imshow(((img_array.asnumpy().transpose(1, 2, 0) + 1.0) * 127.5).astype(onp.uint8)) plt.axis('off') ``` @@ -386,18 +386,18 @@ Take some images from the test data, obtain its feature vector from `discriminat ```{.python .input} feature_size = 8192 -features = nd.zeros((len(test_images), feature_size), ctx=ctx) +features = np.zeros((len(test_images), feature_size), ctx=ctx) for idx, image in enumerate(test_images): - feature = discriminator(nd.array(image, ctx=ctx)) + feature = discriminator(np.array(image, ctx=ctx)) feature = feature.reshape(feature_size,) features[idx,:] = feature.copyto(ctx) for image in test_images[:100]: - feature = discriminator(mx.nd.array(image, ctx=ctx)) + feature = discriminator(np.array(image, ctx=ctx)) feature = feature.reshape((feature_size,)) image = image.reshape((3,64,64)) @@ -438,7 +438,7 @@ data = [] counter = 0 for i,f in enumerate(test_filenames): - point = [float((tsne[i,k] - np.min(tsne[:,k]))/(np.max(tsne[:,k]) - np.min(tsne[:,k]))) for k in range(2) ] + point = [float((tsne[i,k] - onp.min(tsne[:,k]))/(onp.max(tsne[:,k]) - onp.min(tsne[:,k]))) for k in range(2) ] data.append({"path": os.path.abspath(os.path.join(os.getcwd(),f)), "point": point}) with open("imagetsne.json", 'w') as outfile: diff --git a/docs/python_docs/python/tutorials/packages/gluon/loss/custom-loss.md b/docs/python_docs/python/tutorials/packages/gluon/loss/custom-loss.md index 21e545d26cb9..3baaf732fe23 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/loss/custom-loss.md +++ b/docs/python_docs/python/tutorials/packages/gluon/loss/custom-loss.md @@ -28,7 +28,7 @@ However, we may sometimes want to solve problems that require customized loss fu ```{.python .input} import matplotlib.pyplot as plt import mxnet as mx -from mxnet import autograd, gluon, nd +from mxnet import autograd, gluon, np, npx from mxnet.gluon.loss import Loss import random ``` @@ -45,7 +45,7 @@ import random The loss function uses a margin *m* which is has the effect that dissimlar pairs only contribute if their loss is within a certain margin. -In order to implement such a customized loss function in Gluon, we only need to define a new class that is inheriting from the [Loss](../../../../api/gluon/loss/index.rst#mxnet.gluon.loss.Loss) base class. We then define the contrastive loss logic in the [hybrid_forward](../../../../api/gluon/hybrid_block.rst#mxnet.gluon.HybridBlock.hybrid_forward) method. This method takes the images `image1`, `image2` and the label which defines whether `image1` and `image2` are similar (=0) or dissimilar (=1). The input F is an `mxnet.ndarry` or an `mxnet.symbol` if we hybridize the network. Gluon's `Loss` base class is in fact a [HybridBlock](../../../../api/gluon/hybrid_block.rst#mxnet.gluon.HybridBlock). This means we can either run imperatively or symbolically. When we hybridize our custom loss function, we can get performance speedups. +In order to implement such a customized loss function in Gluon, we just need to define a new class that is inheriting from the [Loss](../../../../api/gluon/loss/index.rst#mxnet.gluon.loss.Loss) base class. We then define the contrastive loss logic in the [forward](../../../../api/gluon/hybrid_block.rst#mxnet.gluon.HybridBlock.forward) method. This method takes the images `image1`, `image2` and the label which defines whether `image1` and `image2` are similar (=0) or dissimilar (=1). 
Gluon's `Loss` base class is in fact a [HybridBlock](../../../../api/gluon/hybrid_block.rst#mxnet.gluon.HybridBlock), and we hybridize our custom loss function, we can get performance speedups. ```{.python .input} @@ -54,12 +54,12 @@ class ContrastiveLoss(Loss): super(ContrastiveLoss, self).__init__(weight, batch_axis, **kwargs) self.margin = margin - def hybrid_forward(self, F, image1, image2, label): + def forward(self, image1, image2, label): distances = image1 - image2 - distances_squared = F.sum(F.square(distances), 1, keepdims=True) - euclidean_distances = F.sqrt(distances_squared + 0.0001) - d = F.clip(self.margin - euclidean_distances, 0, self.margin) - loss = (1 - label) * distances_squared + label * F.square(d) + distances_squared = np.sum(np.square(distances), 1, keepdims=True) + euclidean_distances = np.sqrt(distances_squared + 0.0001) + d = np.clip(self.margin - euclidean_distances, 0, self.margin) + loss = (1 - label) * distances_squared + label * np.square(d) loss = 0.5*loss return loss loss = ContrastiveLoss(margin=6.0) @@ -83,7 +83,7 @@ class Siamese(gluon.HybridBlock): self.cnn.add(gluon.nn.Dense(256, activation='relu')) self.cnn.add(gluon.nn.Dense(2, activation='softrelu')) - def hybrid_forward(self, F, input0, input1): + def forward(self, input0, input1): out0 = self.cnn(input0) out1 = self.cnn(input1) return out0, out1 diff --git a/docs/python_docs/python/tutorials/packages/legacy/ndarray/sparse/csr.md b/docs/python_docs/python/tutorials/packages/legacy/ndarray/sparse/csr.md index 3a8bae357a83..bd3b939b932e 100644 --- a/docs/python_docs/python/tutorials/packages/legacy/ndarray/sparse/csr.md +++ b/docs/python_docs/python/tutorials/packages/legacy/ndarray/sparse/csr.md @@ -554,9 +554,6 @@ except mx.MXNetError as err: sys.stderr.write(str(err)) ``` -## Next - -[Train a Linear Regression Model with Gluon Sparse](./train_gluon.ipynb) diff --git a/docs/python_docs/python/tutorials/packages/legacy/ndarray/sparse/row_sparse.md b/docs/python_docs/python/tutorials/packages/legacy/ndarray/sparse/row_sparse.md index 6003e4f55a0f..66fda6b4225f 100644 --- a/docs/python_docs/python/tutorials/packages/legacy/ndarray/sparse/row_sparse.md +++ b/docs/python_docs/python/tutorials/packages/legacy/ndarray/sparse/row_sparse.md @@ -576,9 +576,6 @@ except mx.MXNetError as err: sys.stderr.write(str(err)) ``` -## Next - -[Train a Linear Regression Model with Gluon Sparse](./train_gluon.ipynb) diff --git a/docs/python_docs/python/tutorials/packages/legacy/ndarray/sparse/train_gluon.md b/docs/python_docs/python/tutorials/packages/legacy/ndarray/sparse/train_gluon.md deleted file mode 100644 index cf99b5313e85..000000000000 --- a/docs/python_docs/python/tutorials/packages/legacy/ndarray/sparse/train_gluon.md +++ /dev/null @@ -1,474 +0,0 @@ - - - - - - - - - - - - - - - - - - -# Sparse NDArrays with Gluon - -When working on machine learning problems, you may encounter situations where the input data is sparse (i.e. the majority of values are zero). One example of this is in recommendation systems. You could have millions of user and product features, but only a few of these features are present for each sample. Without special treatment, the sheer magnitude of the feature space can lead to out-of-memory situations and cause significant slowdowns when training and making predictions. - -MXNet supports a number of sparse storage types (often called `stype` for short) for these situations. 
In this tutorial, we'll start by generating some sparse data, write it to disk in the LibSVM format and then read back using the [LibSVMIter](../../../../../api/legacy/io/index.rst#mxnet.io.LibSVMIter) for training. We use the Gluon API to train the model and leverage sparse storage types such as [CSRNDArray](../../../../../api/legacy/ndarray/sparse/index.rst#mxnet.ndarray.sparse.CSRNDArray) and [RowSparseNDArray](../../../../../api/legacy/ndarray/sparse/index.rst#mxnet.ndarray.sparse.RowSparseNDArray) to maximise performance and memory efficiency. - - -```{.python .input} -import mxnet as mx -import numpy as np -import time -``` - -### Generating Sparse Data - -You will most likely have a sparse dataset in mind already if you're reading this tutorial, but let's create a dummy dataset to use in the examples that follow. Using `rand_ndarray` we will generate 1000 samples, each with 1,000,000 features of which 99.999% of values will be zero (i.e. 10 non-zero features for each sample). We take this as our input data for training and calculate a label based on an arbitrary rule: whether the feature sum is higher than average. - - -```{.python .input} -num_samples = 1000 -num_features = 1000000 -data = mx.test_utils.rand_ndarray((num_samples, num_features), stype='csr', density=0.00001) -# generate label: 1 if row sum above average, 0 otherwise. -label = data.sum(axis=1) > data.sum(axis=1).mean() -``` - - -```{.python .input} -print(type(data)) -print(data[:10].asnumpy()) -print('{:,.0f} elements'.format(np.product(data.shape))) -print('{:,.0f} non-zero elements'.format(data.data.size)) -``` - -``` - -[[0. 0. 0. ... 0. 0. 0.] - [0. 0. 0. ... 0. 0. 0.] - [0. 0. 0. ... 0. 0. 0.] - ... - [0. 0. 0. ... 0. 0. 0.] - [0. 0. 0. ... 0. 0. 0.] - [0. 0. 0. ... 0. 0. 0.]] -1,000,000,000 elements -10,000 non-zero elements -``` - -Our storage type is CSR (Compressed Sparse Row) which is the ideal type for sparse data along multiple axes. See [this in-depth tutorial](./csr.ipynb) for more information. Just to confirm the generation process ran correctly, we can see that the vast majority of values are indeed zero. One of the first questions to ask would be how much memory is saved by storing this data in a [CSRNDArray](../../../../../api/legacy/ndarray/sparse/index.rst#mxnet.ndarray.sparse.CSRNDArray) versus a standard [NDArray](../../../../../api/legacy/ndarray/ndarray.rst). Since sparse arrays are constructed from many components (e.g. `data`, `indices` and `indptr`) we define a function called `get_nbytes` to calculate the number of bytes taken in memory to store an array. We compare the same data stored in a standard [NDArray](../../../../../api/legacy/ndarray/ndarray.rst) (with `data.tostype('default')`) to the [CSRNDArray](../../../../../api/legacy/ndarray/sparse/index.rst#mxnet.ndarray.sparse.CSRNDArray). 
- - -```{.python .input} -def get_nbytes(array): - fn = lambda a: a.size * np.dtype(a).itemsize - if isinstance(array, mx.ndarray.sparse.CSRNDArray): - return fn(array.data) + fn(array.indices) + fn(array.indptr) - elif isinstance(array, mx.ndarray.sparse.RowSparseNDArray): - return fn(array.data) + fn(array.indices) - elif isinstance(array, mx.ndarray.NDArray): - return fn(array) - else: - TypeError('{} not supported'.format(type(array))) -``` - - -```{.python .input} -print('NDarray:', get_nbytes(data.tostype('default'))/1000000, 'MBs') -print('CSRNDArray', get_nbytes(data)/1000000, 'MBs') -``` - -``` -NDarray: 4000.0 MBs -CSRNDArray 0.128008 MBs -``` - -Given the extremely high sparsity of the data, we observe a huge memory saving here! 0.13 MBs versus 4 GBs: ~30,000 times smaller. You can experiment with the amount of sparsity and see how these two storage types compare. When the number of non-zero values increases, this difference will reduce. And when the number of non-zero values exceeds ~1/3 you will find that this sparse storage type take more memory than dense! So use wisely. - -### Writing Sparse Data - -Since there is such a large size difference between dense and sparse storage formats here, we ideally want to store the data on disk in a sparse storage format too. MXNet supports a format called LibSVM and has a data iterator called [LibSVMIter](../../../../../api/legacy/io/index.rst#mxnet.io.LibSVMIter) specifically for data formatted this way. - -A LibSVM file has a row for each sample, and each row starts with the label: in this case `0.0` or `1.0` since we have a classification task. After this we have a variable number of `key:value` pairs separated by spaces, where the key is column/feature index and the value is the value of that feature. When working with your own sparse data in a custom format you should try to convert your data into this format. We define a `save_as_libsvm` function to save the `data` ([CSRNDArray](../../../../../api/legacy/ndarray/sparse/index.rst#mxnet.ndarray.sparse.CSRNDArray)) and `label` (`NDArray`) to disk in LibSVM format. - - -```{.python .input} -def save_as_libsvm(filepath, data, label): - with open(filepath, 'w') as openfile: - for row_idx in range(data.shape[0]): - data_sample = data[row_idx] - label_sample = label[row_idx] - col_idxs = data_sample.indices.asnumpy().tolist() - values = data_sample.data.asnumpy().tolist() - label_str = str(label_sample.asscalar()) - value_strs = ['{}:{}'.format(idx, value) for idx, value in zip(col_idxs, values)] - value_str = " ".join(value_strs) - sample_str = '{} {}\n'.format(label_str, value_str) - openfile.write(sample_str) -``` - - -```{.python .input} -filepath = 'dataset.libsvm' -save_as_libsvm(filepath, data, label) -``` - -We have now written the `data` and `label` to disk, and can inspect the first 10 lines of the file: - - -```{.python .input} -with open(filepath, 'r') as openfile: - lines = [openfile.readline() for _ in range(10)] -for line in lines: - print(line[:80] + '...' if len(line) > 80 else line) -``` - -``` -0.0 35454:0.22486156225204468 80954:0.39130592346191406 81941:0.1988530308008194... -1.0 37029:0.5980494618415833 52916:0.15797750651836395 71623:0.32251599431037903... -1.0 89962:0.47770974040031433 216426:0.21326342225074768 271027:0.18589609861373... -1.0 7071:0.9432336688041687 81664:0.7788773775100708 117459:0.8166475296020508 4... -0.0 380966:0.16906292736530304 394363:0.7987179756164551 458442:0.56873309612274... 
-0.0 89361:0.9099966287612915 141813:0.5927085280418396 282489:0.293381005525589 ... -0.0 150427:0.4747847020626068 169376:0.2603490948677063 179377:0.237988427281379... -0.0 49774:0.2822582423686981 91245:0.5794865489006042 102970:0.7004560232162476 ... -1.0 97133:0.0024336236529052258 109855:0.9895315766334534 116765:0.2465638816356... -0.0 803440:0.4020800292491913 -``` - - -Some storage overhead is introduced by serializing the data as characters (with spaces and colons). `dataset.libsvm` is 250 KBs but the original `data` and `label` were 132 KBs combined. Compared with the 4GB dense `NDArray` though, this isn't a huge issue. - -### Reading Sparse Data - -Using [LibSVMIter](../../../../../api/legacy/io/index.rst#mxnet.io.LibSVMIter), we can quickly and easily load data into batches ready for training. Although Gluon [Dataset](../../../../../api/gluon/data/index.rst#mxnet.gluon.data.Dataset)s can be written to return sparse arrays, Gluon [DataLoader](../../../../../api/gluon/data/index.rst#mxnet.gluon.data.DataLoader)s currently convert each sample to dense before stacking up to create the batch. As a result, [LibSVMIter](../../../../../api/legacy/io/index.rst#mxnet.io.LibSVMIter) is the recommended method of loading sparse data in batches. - -Similar to using a [DataLoader](../../../../../api/gluon/data/index.rst#mxnet.gluon.data.DataLoader), you must specify the required `batch_size`. Since we're dealing with sparse data and the column shape isn't explicitly stored in the LibSVM file, we additionally need to provide the shape of the data and label. Our [LibSVMIter](../../../../../api/legacy/io/index.rst#mxnet.io.LibSVMIter) returns batches in a slightly different form to a [DataLoader](../../../../../api/gluon/data/index.rst#mxnet.gluon.data.DataLoader). We get `DataBatch` objects instead of `tuple`. - -```{.python .input} -data_iter = mx.io.LibSVMIter(data_libsvm=filepath, data_shape=(num_features,), label_shape=(1,), batch_size=10) -for batch in data_iter: - data = batch.data[0] - print('data.stype: {}'.format(data.stype)) - label = batch.label[0] - print('label.stype: {}'.format(label.stype)) - break -``` - -``` -data.stype: csr -label.stype: default -``` - -We can see that `data` and `label` are in the appropriate storage formats, given their sparse and dense values respectively. We can avoid out-of-memory issues that might have occurred if `data` was in dense storage format. Another benefit of storing the data efficiently is the reduced data transfer time when using GPUs. Although the transfer time for a single batch is small, we transfer `data` and `label` to the GPU every iteration so this time can become significant. We will time the transfer of the sparse `data` to GPU (if available) and compare to the time for its dense counterpart. - - -```{.python .input} -ctx = mx.gpu() if mx.test_utils.list_gpus() else mx.cpu() -``` - - -```{.python .input} -%%timeit -data_on_ctx = data.as_in_context(ctx) -data_on_ctx.wait_to_read() -``` - -``` -192 microseconds +- 51.1 microseconds per loop (mean +- std. dev. 
of 7 runs, 1 loop each) -``` - - -```{.python .input} -print('sparse batch: {} MBs'.format(get_nbytes(data)/1000000)) -data = data.tostype('default') # avoid timing this sparse to dense conversion -print('dense batch: {} MBs'.format(get_nbytes(data)/1000000)) -``` - -``` -sparse batch: 0.001348 MBs -dense batch: 40.0 MBs -``` - - -```{.python .input} -%%timeit -data_on_ctx = data.as_in_context(ctx) -data_on_ctx.wait_to_read() -``` - -``` -4 ms +- 36.8 microseconds per loop (mean +- std. dev. of 7 runs, 100 loops each) -``` - -Although results will change depending on system specifications and degree of sparsity, the sparse array can be transferred from CPU to GPU significantly faster than the dense array. We see a ~25x speed up for sparse vs dense for this specific batch of data. - -## Gluon Models for Sparse Data - -Our next step is to define a network. We have an input of 1,000,000 features and we want to make a binary prediction. We don't have any spatial or temporal relationships between features, so we'll use a 3 layer fully-connected network where the last layer has 1 output unit (with sigmoid activation). Since we're working with sparse data, we'd ideally like to use network operators that can exploit this sparsity for improved performance and memory efficiency. - -Gluon's [nn.Dense](../../../../../api/gluon/nn/index.rst#mxnet.gluon.nn.Dense) block can used with [CSRNDArray](../../../../../api/legacy/ndarray/sparse/index.rst#mxnet.ndarray.sparse.CSRNDArray) input arrays but it doesn't exploit the sparsity. Under the hood, [Dense](../../../../../api/gluon/nn/index.rst#mxnet.gluon.nn.Dense) uses the [FullyConnected](../../../../../api/legacy/ndarray/ndarray.rst#mxnet.ndarray.FullyConnected) operator which isn't optimized for [CSRNDArray](../../../../../api/legacy/ndarray/sparse/index.rst#mxnet.ndarray.sparse.CSRNDArray) arrays. We'll implement a `Block` that does exploit this sparsity, *but first*, let's just remind ourselves of the [Dense](../../../../../api/gluon/nn/index.rst#mxnet.gluon.nn.Dense) implementation by creating an equivalent `Block` called `FullyConnected`. - - -```{.python .input} -class FullyConnected(mx.gluon.HybridBlock): - def __init__(self, in_units, units): - super(FullyConnected, self).__init__() - self._units = units - self.weight = mx.gluon.Parameter('weight', shape=(units, in_units), - init=None, allow_deferred_init=True, - dtype='float32', stype='default', grad_stype='default') - self.bias = mx.gluon.Parameter('bias', shape=(units), - init='zeros', allow_deferred_init=True, - dtype='float32', stype='default', grad_stype='default') - - def hybrid_forward(self, F, x, weight, bias): - return F.FullyConnected(x, weight, bias, num_hidden=self._units) -``` - -Our `weight` and `bias` parameters are dense (see `stype='default'`) and so are their gradients (see `grad_stype='default'`). Our `weight` parameter has shape `(units, in_units)` because the [FullyConnected](../../../../../api/legacy/ndarray/ndarray.rst#mxnet.ndarray.FullyConnected) operator performs the following calculation: - -$$Y = XW^T + b$$ - -We could instead have created our parameter with shape `(in_units, units)` and avoid the transpose of the weight matrix. We'll see why this is so important later on. 
And instead of [FullyConnected](../../../../../api/legacy/ndarray/ndarray.rst#mxnet.ndarray.FullyConnected) we could have used [mx.sparse.dot](../../../../../api/legacy/ndarray/sparse/index.rst#mxnet.ndarray.sparse.dot) to fully exploit the sparsity of the [CSRNDArray](../../../../../api/legacy/ndarray/sparse/index.rst#mxnet.ndarray.sparse.CSRNDArray) input arrays. We'll now implement an alternative `Block` called `FullyConnectedSparse` using these ideas. We take `grad_stype` of the `weight` as an argument (called `weight_grad_stype`), since we're going to change this later on. - - -```{.python .input} -class FullyConnectedSparse(mx.gluon.HybridBlock): - def __init__(self, in_units, units, weight_grad_stype='default'): - super(FullyConnectedSparse, self).__init__() - self._units = units - self.weight = gluon.Parameter('weight', shape=(in_units, units), - init=None, allow_deferred_init=True, - dtype='float32', stype='default', grad_stype=weight_grad_stype) - self.bias = gluon.Parameter('bias', shape=(units), - init='zeros', allow_deferred_init=True, - dtype='float32', stype='default', grad_stype='default') - - def hybrid_forward(self, F, x, weight, bias): - return F.sparse.dot(x, weight) + bias -``` - -Once again, we're using a dense `weight`, so both `FullyConnected` and `FullyConnectedSparse` will return dense array outputs. When constructing a multi-layer network therefore, only the first layer needs to be optimized for sparse inputs. Our first layer is often responsible for reducing the feature dimension dramatically (e.g. 1,000,000 features down to 128 features). We'll set the number of units in our 3 layers to be 128, 8 and 1. - -We will use [timeit](https://docs.python.org/2/library/timeit.html) to check the performance of these two variants, and analyse some [MXNet Profiler](../../../../performance/backend/profiler.rst) traces that have been created from these benchmarks. Additionally, we will inspect the memory usage of the weights (and gradients) using the `print_memory_allocation` function defined below: - - -```{.python .input} -def print_memory_allocation(net, block_idxs): - blocks = [net[block_idx] for block_idx in block_idxs] - weight_nbytes = [get_nbytes(b.weight.data()) for b in blocks] - weight_nbytes_pct = [b/sum(weight_nbytes) for b in weight_nbytes] - weight_grad_nbytes = [get_nbytes(b.weight.grad()) for b in blocks] - weight_grad_nbytes_pct = [b/sum(weight_grad_nbytes) for b in weight_grad_nbytes] - print("Memory Allocation for Weight:") - for i in range(len(block_idxs)): - print('{:7.3f} MBs ({:7.3f}%) for {:<40}'.format(weight_nbytes[i]/1000000, - weight_nbytes_pct[i]*100, - blocks[i].name)) - print("Memory Allocation for Weight Gradient:") - for i in range(len(block_idxs)): - print('{:7.3f} MBs ({:7.3f}%) for {:<40}'.format(weight_grad_nbytes[i]/1000000, - weight_grad_nbytes_pct[i]*100, - blocks[i].name)) -``` - -### Benchmark: `FullyConnected` - -We'll create a network using `nn.Dense` and benchmark the training. 
- -```{.python .input} -net = mx.gluon.nn.Sequential() -net.add( - mx.gluon.nn.Dense(in_units=num_features, units=128), - mx.gluon.nn.Activation('sigmoid'), - mx.gluon.nn.Dense(in_units=128, units=8), - mx.gluon.nn.Activation('sigmoid'), - mx.gluon.nn.Dense(in_units=8, units=1), - mx.gluon.nn.Activation('sigmoid'), -) -net.initialize(ctx=ctx) -trainer = mx.gluon.Trainer(net.collect_params(), 'sgd') -loss_fn = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss() -``` - - -```{.python .input} -%%timeit -data_iter.reset() -for batch in data_iter: - data = batch.data[0] - data = data.as_in_context(ctx) - label = batch.label[0].as_in_context(ctx) - with mx.autograd.record(): - pred = net(data) - loss = loss_fn(pred, label) - loss.backward() - trainer.step(data.shape[0]) - loss.wait_to_read() -``` - -``` -532 ms +- 3.47 ms per loop (mean +- std. dev. of 7 runs, 1 loop each) -``` - -![fully connected](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/ndarray/sparse/fully_connected.png) - -We can see the first [FullyConnected](../../../../../api/legacy/ndarray/ndarray.rst#mxnet.ndarray.FullyConnected) operator takes a significant proportion of time to execute (~25% of the iteration) because there are 1,000,000 input features (to 128). After this, the other [FullyConnected](../../../../../api/legacy/ndarray/ndarray.rst#mxnet.ndarray.FullyConnected) operators are much faster because they have input features of 128 (to 8) and 8 (to 1). On the backward pass, we see the same pattern (but in reverse). And finally, the parameter update step takes a large amount of time on the weight matrix of the first `FullyConnected` `Block`. When checking the memory allocations below, we can see the weight matrix of the first `FullyConnected` `Block` is responsible for 99.999% of the memory compared to other [FullyConnected](../../../../../api/legacy/ndarray/ndarray.rst#mxnet.ndarray.FullyConnected) weight matrices. - - -```{.python .input} -print_memory_allocation(net, block_idxs=[0, 2, 4]) -``` - -``` -Memory Allocation for Weight: -512.000 MBs ( 99.999%) for dense0 - 0.004 MBs ( 0.001%) for dense1 - 0.000 MBs ( 0.000%) for dense2 -Memory Allocation for Weight Gradient: -512.000 MBs ( 99.999%) for dense0 - 0.004 MBs ( 0.001%) for dense1 - 0.000 MBs ( 0.000%) for dense2 -``` - -### Benchmark: `FullyConnectedSparse` - -We will now switch the first layer from `FullyConnected` to `FullyConnectedSparse`. - - -```{.python .input} -net = mx.gluon.nn.Sequential() -net.add( - FullyConnectedSparse(in_units=num_features, units=128), - mx.gluon.nn.Activation('sigmoid'), - FullyConnected(in_units=128, units=8), - mx.gluon.nn.Activation('sigmoid'), - FullyConnected(in_units=8, units=1), - mx.gluon.nn.Activation('sigmoid'), -) -net.initialize(ctx=ctx) -trainer = mx.gluon.Trainer(net.collect_params(), 'sgd') -loss_fn = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss() -``` - - -```{.python .input} -%%timeit -data_iter.reset() -for batch in data_iter: - data = batch.data[0] - data = data.as_in_context(ctx) - label = batch.label[0].as_in_context(ctx) - with mx.autograd.record(): - pred = net(data) - loss = loss_fn(pred, label) - loss.backward() - trainer.step(data.shape[0]) - loss.wait_to_read() -``` - -``` -528 ms +- 22.7 ms per loop (mean +- std. dev. 
of 7 runs, 1 loop each) - ``` - -![fully connected sparse](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/ndarray/sparse/fully_connected_sparse.png) - -We see the forward pass of `dot` and `add` (equivalent to [FullyConnected](../../../../../api/legacy/ndarray/ndarray.rst#mxnet.ndarray.FullyConnected) operator) is much faster now: 1.54ms vs 0.26ms. And this explains the reduction in overall time for the epoch. We didn't gain any benefit on the backward pass or parameter updates though. - -![fully connected sparse backward](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/ndarray/sparse/fully_connected_sparse_backward.png) - -Our first weight matrix and its gradients still take up the same amount of memory as before. - - ```{.python .input} -print_memory_allocation(net, block_idxs=[0, 2, 4]) -``` - - ``` -Memory Allocation for Weight: -512.000 MBs ( 99.999%) for fullyconnectedsparse0 - 0.004 MBs ( 0.001%) for fullyconnected0 - 0.000 MBs ( 0.000%) for fullyconnected1 -Memory Allocation for Weight Gradient: -512.000 MBs ( 99.999%) for fullyconnectedsparse0 - 0.004 MBs ( 0.001%) for fullyconnected0 - 0.000 MBs ( 0.000%) for fullyconnected1 -``` - -### Benchmark: `FullyConnectedSparse` with `grad_stype=row_sparse` - -One useful outcome of sparsity in our [CSRNDArray](../../../../../api/legacy/ndarray/sparse/index.rst#mxnet.ndarray.sparse.CSRNDArray) input is that our gradients will be row sparse. We can exploit this fact to give us potentially huge memory savings and speed improvements. Creating our `weight` parameter with shape `(in_units, units)` and not transposing in the forward pass are important prerequisites for obtaining row sparse gradients. Using [nn.Dense](../../../../../api/gluon/nn/index.rst#mxnet.gluon.nn.Dense) would have led to column sparse gradients which are not supported in MXNet. We previously had `grad_stype` of the `weight` parameter in the first layer set to `'default'` so we were handling the gradient as a dense array. Switching this to `'row_sparse'` can give us these potential improvements. - - ```{.python .input} -net = mx.gluon.nn.Sequential() -net.add( - FullyConnectedSparse(in_units=num_features, units=128, weight_grad_stype='row_sparse'), - mx.gluon.nn.Activation('sigmoid'), - FullyConnected(in_units=128, units=8), - mx.gluon.nn.Activation('sigmoid'), - FullyConnected(in_units=8, units=1), - mx.gluon.nn.Activation('sigmoid'), -) -net.initialize(ctx=ctx) -trainer = mx.gluon.Trainer(net.collect_params(), 'sgd') -loss_fn = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss() -``` - - -```{.python .input} -%%timeit -data_iter.reset() -for batch in data_iter: - data = batch.data[0] - data = data.as_in_context(ctx) - label = batch.label[0].as_in_context(ctx) - with mx.autograd.record(): - pred = net(data) - loss = loss_fn(pred, label) - loss.backward() - trainer.step(data.shape[0]) - loss.wait_to_read() -``` - -``` -334 ms +- 16.9 ms per loop (mean +- std. dev. of 7 runs, 1 loop each) -``` - -![fully connected sparse grad backward](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/ndarray/sparse/fully_connected_sparse_grad_backward.png) - -We can see a huge reduction in the time taken for the backward pass and parameter update step: 3.99ms vs 0.18ms. And this reduces the overall time of the epoch significantly. Our gradient consumes a much smaller amount of memory and means only a subset of parameters need updating as part of the `sgd_update` step.
Some optimizers don't support sparse gradients however, so reference the specific optimizer's documentation for more details. - - -```{.python .input} -print_memory_allocation(net, block_idxs=[0, 2, 4]) -``` - -``` -Memory Allocation for Weight: -512.000 MBs ( 99.999%) for fullyconnectedsparse1 - 0.004 MBs ( 0.001%) for fullyconnected2 - 0.000 MBs ( 0.000%) for fullyconnected3 -Memory Allocation for Weight Gradient: - 0.059 MBs ( 93.490%) for fullyconnectedsparse1 - 0.004 MBs ( 6.460%) for fullyconnected2 - 0.000 MBs ( 0.050%) for fullyconnected3 -``` - -## Conclusion - -As part of this tutorial, we learned how to write sparse data to disk in LibSVM format and load it back in sparse batches with the [LibSVMIter](../../../../../api/legacy/io/index.rst#mxnet.io.LibSVMIter). We learned how to improve the performance of Gluon's [nn.Dense](../../../../../api/gluon/nn/index.rst#mxnet.gluon.nn.Dense) on sparse arrays using `mx.nd.sparse`. And lastly, we set `grad_stype` to `'row_sparse'` to reduce the size of the gradient and speed up the parameter update step. - -## Recommended Next Steps - -* More detail on the [CSRNDArray](../../../../../api/legacy/ndarray/sparse/index.rst#mxnet.ndarray.sparse.CSRNDArray) sparse array format can be found in [this tutorial](./csr.ipynb). -* More detail on the [RowSparseNDArray](../../../../../api/legacy/ndarray/sparse/index.rst#mxnet.ndarray.sparse.RowSparseNDArray) sparse array format can be found in [this tutorial](./row_sparse.ipynb). -* Users of the Gluon API can see a Gluon example in [this tutorial](./train_gluon.ipynb). diff --git a/docs/tutorial_utils/vision/cnn_visualization/gradcam.py b/docs/tutorial_utils/vision/cnn_visualization/gradcam.py index 54cb65eef11b..433d67a7a1e1 100644 --- a/docs/tutorial_utils/vision/cnn_visualization/gradcam.py +++ b/docs/tutorial_utils/vision/cnn_visualization/gradcam.py @@ -18,13 +18,12 @@ from __future__ import print_function import mxnet as mx -import mxnet.ndarray as nd -from mxnet import gluon +from mxnet import gluon, np, npx from mxnet import autograd from mxnet.gluon import nn -import numpy as np +import numpy as onp import cv2 class ReluOp(mx.operator.CustomOp): @@ -38,7 +37,7 @@ class ReluOp(mx.operator.CustomOp): def forward(self, is_train, req, in_data, out_data, aux): x = in_data[0] - y = nd.maximum(x, nd.zeros_like(x)) + y = np.maximum(x, np.zeros_like(x)) self.assign(out_data[0], req[0], y) def backward(self, req, out_grad, in_data, out_data, in_grad, aux): @@ -47,7 +46,7 @@ def backward(self, req, out_grad, in_data, out_data, in_grad, aux): y = out_data[0] dy = out_grad[0] # Zero out the negatives in the gradients of the output - dy_positives = nd.maximum(dy, nd.zeros_like(dy)) + dy_positives = np.maximum(dy, np.zeros_like(dy)) # What output values were greater than 0? 
y_ones = y.__gt__(0) # Mask out the values for which at least one of dy or y is negative @@ -85,8 +84,8 @@ def __init__(self, act_type, **kwargs): assert act_type == 'relu' super(Activation, self).__init__(**kwargs) - def hybrid_forward(self, F, x): - return F.Custom(x, op_type='relu') + def forward(self, x): + return npx.Custom(x, op_type='relu') class Conv2D(mx.gluon.HybridBlock): """Wrapper on top of gluon.nn.Conv2D to capture the output and gradients of output of a Conv2D @@ -108,7 +107,7 @@ def __init__(self, channels, kernel_size, strides=(1, 1), padding=(0, 0), activation=activation, use_bias=use_bias, weight_initializer=weight_initializer, bias_initializer=bias_initializer, in_channels=in_channels) - def hybrid_forward(self, F, x): + def forward(self, x): out = self.conv(x) name = self._prefix[:-1] if name == Conv2D.capture_layer_name: @@ -153,10 +152,10 @@ def _get_grad(net, image, class_id=None, conv_layer_name=None, image_grad=False) # If user didn't provide a class id, we'll use the class that the network predicted if class_id == None: model_output = out.asnumpy() - class_id = np.argmax(model_output) + class_id = onp.argmax(model_output) # Create a one-hot target with class_id and backprop with the created target - one_hot_target = mx.nd.one_hot(mx.nd.array([class_id]), 1000) + one_hot_target = mx.npx.one_hot(mx.np.array([class_id]), 1000) out.backward(one_hot_target, train_mode=False) if image_grad: @@ -202,46 +201,46 @@ def grad_to_image(gradient): the output neurons.""" gradient = gradient - gradient.min() gradient /= gradient.max() - gradient = np.uint8(gradient * 255).transpose(1, 2, 0) + gradient = onp.uint8(gradient * 255).transpose(1, 2, 0) gradient = gradient[..., ::-1] return gradient def get_cam(imggrad, conv_out): """Compute CAM. Refer section 3 of https://arxiv.org/abs/1610.02391 for details""" - weights = np.mean(imggrad, axis=(1, 2)) - cam = np.ones(conv_out.shape[1:], dtype=np.float32) + weights = onp.mean(imggrad, axis=(1, 2)) + cam = onp.ones(conv_out.shape[1:], dtype=onp.float32) for i, w in enumerate(weights): cam += w * conv_out[i, :, :] cam = cv2.resize(cam, (imggrad.shape[1], imggrad.shape[2])) - cam = np.maximum(cam, 0) - cam = (cam - np.min(cam)) / (np.max(cam) - np.min(cam)) - cam = np.uint8(cam * 255) + cam = onp.maximum(cam, 0) + cam = (cam - onp.min(cam)) / (onp.max(cam) - onp.min(cam)) + cam = onp.uint8(cam * 255) return cam def get_guided_grad_cam(cam, imggrad): """Compute Guided Grad-CAM. Refer section 3 of https://arxiv.org/abs/1610.02391 for details""" - return np.multiply(cam, imggrad) + return onp.multiply(cam, imggrad) def get_img_heatmap(orig_img, activation_map): """Draw a heatmap on top of the original image using intensities from activation_map""" heatmap = cv2.applyColorMap(activation_map, cv2.COLORMAP_COOL) heatmap = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB) - img_heatmap = np.float32(heatmap) + np.float32(orig_img) - img_heatmap = img_heatmap / np.max(img_heatmap) + img_heatmap = onp.float32(heatmap) + onp.float32(orig_img) + img_heatmap = img_heatmap / onp.max(img_heatmap) img_heatmap *= 255 return img_heatmap.astype(int) def to_grayscale(cv2im): """Convert gradients to grayscale. 
This gives a saliency map.""" # How strongly does each position activate the output - grayscale_im = np.sum(np.abs(cv2im), axis=0) + grayscale_im = onp.sum(onp.abs(cv2im), axis=0) # Normalize between min and 99th percentile - im_max = np.percentile(grayscale_im, 99) - im_min = np.min(grayscale_im) - grayscale_im = np.clip((grayscale_im - im_min) / (im_max - im_min), 0, 1) + im_max = onp.percentile(grayscale_im, 99) + im_min = onp.min(grayscale_im) + grayscale_im = onp.clip((grayscale_im - im_min) / (im_max - im_min), 0, 1) - grayscale_im = np.expand_dims(grayscale_im, axis=0) + grayscale_im = onp.expand_dims(grayscale_im, axis=0) return grayscale_im def visualize(net, preprocessed_img, orig_img, conv_layer_name): @@ -257,7 +256,7 @@ def visualize(net, preprocessed_img, orig_img, conv_layer_name): img_heatmap = get_img_heatmap(orig_img, cam) ggcam_gray = to_grayscale(ggcam) - img_ggcam_gray = np.squeeze(grad_to_image(ggcam_gray)) + img_ggcam_gray = onp.squeeze(grad_to_image(ggcam_gray)) return img_heatmap, img_ggcam, img_ggcam_gray diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 335ab90b3cb1..b68765eedff8 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -1399,6 +1399,14 @@ MXNET_DLL int MXNDArrayGetDeferredComputeSymbol(NDArrayHandle *output_handles, int num_outputs, SymbolHandle *out); +/*! + * \brief Clear the deferred compute info associated with the ndarrays. + * \param arrays ndarray handles of deferred compute outputs + * \param num number of ndarrays + * \return 0 when success, -1 otherwise + */ +MXNET_DLL int MXNDArrayClearDeferredCompute(NDArrayHandle *arrays, int num); + //-------------------------------------------- // Part 3: symbolic configuration generation //-------------------------------------------- diff --git a/include/mxnet/imperative.h b/include/mxnet/imperative.h index c3e2697b3d8d..d998a74fde48 100644 --- a/include/mxnet/imperative.h +++ b/include/mxnet/imperative.h @@ -118,6 +118,11 @@ class Imperative { const std::vector &inputs, const std::vector &outputs); + static void Clear(const nnvm::ObjectPtr& node) { + if (node == nullptr || node->info.empty()) return; + node->info.clear(); + } + private: friend class Imperative; @@ -248,6 +253,8 @@ class Imperative { nnvm::Symbol GetDeferredComputeSymbol(const std::vector &outputs); /*! \brief associate arrays with variables for deferred compute */ void SetDeferredComputeVariable(NDArrayHandle *arrays, SymbolHandle *variables, const int num); + /*! \brief clear info node associated with array */ + void DeferredComputeClear(NDArrayHandle *arrays, const int num); /*! \brief */ OpStatePtr Invoke(const Context& default_ctx, const nnvm::NodeAttrs& attrs, diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py index 77c0a4badfe7..7d5bff7f1e85 100644 --- a/python/mxnet/__init__.py +++ b/python/mxnet/__init__.py @@ -108,3 +108,5 @@ from . import _api_internal from . import api from . import container + +npx.set_np() diff --git a/python/mxnet/_deferred_compute.py b/python/mxnet/_deferred_compute.py index 4cb1725579b1..d1a2c30ea2aa 100644 --- a/python/mxnet/_deferred_compute.py +++ b/python/mxnet/_deferred_compute.py @@ -104,3 +104,22 @@ def set_variable(arrays, variables): variables = variables_type(*[symbol.handle for symbol in variables]) check_call(_LIB.MXNDArraySetDeferredComputeVariable(arrays, variables, len(arrays))) + + +def clear(arrays): + """Clear the dc info node associated with output variables. 
+ + Parameters + ---------- + arrays: NDArray or List[NDArray] + """ + + arrays = _as_list(arrays) + + # Prepare ctypes array types + arrays_type = ctypes.c_void_p * len(arrays) + + # Convert handles + arrays = arrays_type(*[array.handle for array in arrays]) + + check_call(_LIB.MXNDArrayClearDeferredCompute(arrays, len(arrays))) diff --git a/python/mxnet/amp/lists/symbol_fp16.py b/python/mxnet/amp/lists/symbol_fp16.py index d8869738f0a4..9d8ff4e975d8 100644 --- a/python/mxnet/amp/lists/symbol_fp16.py +++ b/python/mxnet/amp/lists/symbol_fp16.py @@ -317,6 +317,9 @@ '_npx_relu', '_npx_reshape', '_npx_sigmoid', + '_npx_cond', + '_npx_foreach', + '_npx_while_loop', '_onehot_encode', '_ones', '_plus_scalar', diff --git a/python/mxnet/base.py b/python/mxnet/base.py index bd4587a46110..1f9f37d04d88 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -819,7 +819,8 @@ def write_all_str(module_file, module_all_list): '_npx_convolution', '_npx_deconvolution', '_npx_pooling', '_npx_dropout', '_npx_one_hot', '_npx_rnn', '_npx_embedding', '_npx_topk', '_npx_layer_norm', '_npx_leaky_relu', '_npx_batch_dot', - '_npx_broadcast_like', '_npx_arange_like'} + '_npx_broadcast_like', '_npx_arange_like', '_npx_group_norm', + '_npx_foreach', '_npx_while_loop', '_npx_cond'} _NP_INTERNAL_OP_PREFIX = '_npi_' diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 299df1843b53..5be7a51c4d96 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -21,7 +21,6 @@ __all__ = ['Block', 'HybridBlock', 'SymbolBlock'] import copy -import inspect import warnings import weakref from collections import OrderedDict, defaultdict @@ -40,7 +39,7 @@ from ..ndarray import NDArray from .parameter import Parameter, DeferredInitializationError from .utils import _indent, _brief_print_list, HookHandle, shape_is_known -from .utils import _check_same_symbol_type, _check_all_np_ndarrays +from .utils import _check_same_symbol_type, _check_all_np_ndarrays, _check_block_input_np_ndarrays from .. import numpy_extension as _mx_npx from .. import numpy as _mx_np, ndarray as nd from .. util import is_np_array, np_shape, np_array @@ -217,12 +216,12 @@ def __init__(self, **kwargs): self.dense1 = nn.Dense(20) def forward(self, x): - x = mx.nd.relu(self.dense0(x)) - return mx.nd.relu(self.dense1(x)) + x = mx.npx.relu(self.dense0(x)) + return mx.npx.relu(self.dense1(x)) model = Model() model.initialize(ctx=mx.cpu(0)) - model(mx.nd.zeros((10, 10), ctx=mx.cpu(0))) + model(mx.np.zeros((10, 10), ctx=mx.cpu(0))) Child :py:class:`Block` assigned this way will be registered and :py:meth:`collect_params` @@ -1010,13 +1009,13 @@ def __init__(self, **kwargs): self.dense1 = nn.Dense(20) def forward(self, x): - x = nd.relu(self.dense0(x)) - return nd.relu(self.dense1(x)) + x = mx.npx.relu(self.dense0(x)) + return mx.npx.relu(self.dense1(x)) model = Model() model.initialize(ctx=mx.cpu(0)) model.hybridize() - model(mx.nd.zeros((10, 10), ctx=mx.cpu(0))) + model(mx.np.zeros((10, 10), ctx=mx.cpu(0))) Forward computation in :py:class:`HybridBlock` must be static to work with :py:class:`Symbol` s, i.e. you cannot call :py:meth:`NDArray.asnumpy`, :py:attr:`NDArray.shape`, @@ -1028,7 +1027,7 @@ def forward(self, x): Before activating with :py:meth:`hybridize()`, :py:class:`HybridBlock` works just like normal :py:class:`Block`. After activation, :py:class:`HybridBlock` will create a symbolic graph representing the forward computation and cache it. 
On subsequent forwards, - the cached graph will be used instead of :py:meth:`hybrid_forward`. + the cached graph will be used instead of :py:meth:`forward`. Please see references for detailed tutorial. @@ -1039,7 +1038,9 @@ def forward(self, x): """ def __init__(self): super(HybridBlock, self).__init__() - self._v2 = inspect.unwrap(self.hybrid_forward.__func__) is HybridBlock.hybrid_forward + assert hasattr(self, "hybrid_forward") is False, ( + "'forward' instead of 'hybrid_forward' interface needs to be used starting from Gluon2.0." + "Please follow MXNet2.0 Migration Guide to use new APIs.") self._cached_graph = () self._cached_op = None self._out_format = None @@ -1064,41 +1065,7 @@ def __setattr__(self, name, value): self._active = False self._clear_cached_op() - def _get_graph_v1(self, *args): - if not self._cached_graph: - flatten_args, self._in_format = _flatten(args, "input") - flatten_inputs = [] - symbol_inputs = [] - cnt = 0 - real_arg_num = sum([ele is not None for ele in flatten_args]) - if real_arg_num == 0: - raise ValueError('All args are None and we do not support such a case.' - ' Received args={}'.format(args)) - for arg in flatten_args: - if arg is not None: - if real_arg_num > 1: - arg_sym = symbol.var('data{}'.format(cnt)) - else: - arg_sym = symbol.var('data') - if isinstance(arg, _mx_np.ndarray): - arg_sym = arg_sym.as_np_ndarray() - cnt += 1 - flatten_inputs.append(arg_sym) - symbol_inputs.append(arg_sym) - else: - flatten_inputs.append(None) - grouped_inputs = _regroup(flatten_inputs, self._in_format) - - with _block_scope(self): - params = {i: j.var() for i, j in self._reg_params.items()} - out = self.hybrid_forward(symbol, *grouped_inputs, **params) # pylint: disable=no-value-for-parameter - out, self._out_format = _flatten(out, "output") - - self._cached_graph = symbol_inputs, symbol.Group(out, _check_same_symbol_type(out)) - - return self._cached_graph - - def _get_graph_v2(self, *args): + def _get_graph(self, *args): if not self._cached_graph: flatten_args, self._in_format = _flatten(args, "input") flatten_args = [ele.detach() if ele is not None else None for ele in flatten_args] @@ -1121,17 +1088,10 @@ def _get_graph_v2(self, *args): out = super().__call__(*args) flatten_out, self._out_format = _flatten(out, "output") symbol_outputs = dc.get_symbol(flatten_out, sym_cls=type(symbol_inputs[0])) + dc.clear(flatten_out) self._cached_graph = symbol_inputs, symbol_outputs return self._cached_graph - def _get_graph(self, *args): - if not self._cached_graph: - if not self._v2: - return self._get_graph_v1(*args) - else: # Gluon 2 based on deferred compute mode - return self._get_graph_v2(*args) - return self._cached_graph - def _build_cache(self, *args, update_graph=True): data, out = self._get_graph(*args) data_names = {data.name: i for i, data in enumerate(data)} @@ -1492,20 +1452,17 @@ def _infer_attrs(self, infer_fn, attr, *args): def infer_shape(self, *args): """Infers shape of Parameters from inputs.""" - if not self._v2: - # Gluon 1 based on F: hybrid_forward is defined by user - self._infer_attrs('infer_shape', 'shape', *args) - else: - # In Gluon 2, users must implement infer_shape, if any deferred - # initialized parameters are associated with the HybridBlock - params = [p for p in self._reg_params.values() if not shape_is_known(p.shape)] - if params: - params_str = ", ".join("{} ({})".format(p.name, p.shape) for p in params) - raise RuntimeError( - "{name} has parameters with unknown shape. 
You need to either specify the shape " - "in __init__ or implement {name}.infer_shape to set the parameter shapes " - "based on the first input. Parameters with unknown shapes are {params}".format( - name=type(self).__name__, params=params_str)) + # pylint: disable=unused-argument + # In Gluon 2, users must implement infer_shape, if any deferred + # initialized parameters are associated with the HybridBlock + params = [p for p in self._reg_params.values() if not shape_is_known(p.shape)] + if params: + params_str = ", ".join("{} ({})".format(p.name, p.shape) for p in params) + raise RuntimeError( + "{name} has parameters with unknown shape. You need to either specify the shape " + "in __init__ or implement {name}.infer_shape to set the parameter shapes " + "based on the first input. Parameters with unknown shapes are {params}".format( + name=type(self).__name__, params=params_str)) def infer_type(self, *args): """Infers data type of Parameters from inputs.""" @@ -1618,81 +1575,46 @@ def c_callback(name, op_name, array): cld()._monitor_all = monitor_all def __call__(self, x, *args): - if not self._v2: - # Gluon 1 based on F: hybrid_forward is defined by user + _check_block_input_np_ndarrays([x, *args]) + assert self.forward is not HybridBlock.forward, ( + 'Must define {name}.forward. ' + 'Defining {name}.hybrid_forward is deprecated.'.format(name=type(self).__name__)) + + _, has_ndarray, ctx_set, first_ctx = _gather_type_ctx_info([x] + list(args)) + if not has_ndarray: + raise ValueError('In HybridBlock, there must be one NDArray in the input.' + ' Please check the type of the args.\n') + if self._active and not dc.is_deferred_compute(): + # Do not call CachedOp if not hybridized or inside deferred compute mode. + if len(ctx_set) > 1: + raise ValueError('Find multiple contexts in the input, ' + 'After hybridized, the HybridBlock only supports one input ' + 'context. You can print the ele.ctx in the ' + 'input arguments to inspect their contexts. ' + 'Find all contexts = {}'.format(ctx_set)) + + if not self._called_infer_shape_already: + self.infer_shape(x, *args) + for p in self._reg_params.values(): + p._finish_deferred_init() + self._called_infer_shape_already = True + + if not self._active: + # Normal imperative computation of forward() return super().__call__(x, *args) - else: # Gluon 2 based on deferred compute mode - assert self.forward is not HybridBlock.forward, ( - 'Must either define {name}.forward or {name}.hybrid_forward. ' - 'Defining {name}.hybrid_forward is deprecated.'.format(name=type(self).__name__)) - - if not self._called_infer_shape_already: - self.infer_shape(x, *args) - for p in self._reg_params.values(): - p._finish_deferred_init() - self._called_infer_shape_already = True - - if not self._active: - # Normal imperative computation of forward() - return super().__call__(x, *args) - - if dc.is_deferred_compute(): - # Deferred compute is already enabled. This typically means that the current - # HybridBlock is a child block of a HybridBlock that has been hybridized. - return super().__call__(x, *args) - with x.ctx: - return self._call_cached_op(x, *args) + if dc.is_deferred_compute(): + # Deferred compute is already enabled. This typically means that the current + # HybridBlock is a child block of a HybridBlock that has been hybridized. + return super().__call__(x, *args) - def forward(self, x, *args): - """Defines the forward computation. 
Arguments can be either - :py:class:`NDArray` or :py:class:`Symbol`.""" + with first_ctx: + return self._call_cached_op(x, *args) - has_symbol, has_ndarray, ctx_set, first_ctx = _gather_type_ctx_info([x] + list(args)) - if has_symbol and has_ndarray: - raise ValueError('In HybridBlock, we do not support mixed NDArrays and Symbols' - ' types for the input. Please check the type of the args.\n') - if not has_symbol and not has_ndarray: - raise ValueError('In HybridBlock, there must be one NDArray or one Symbol in the input.' - ' Please check the type of the args.\n') - if has_ndarray: - ctx = first_ctx - if self._active and not dc.is_deferred_compute(): - # Do not call CachedOp if not hybridized or inside deferred compute mode. - if len(ctx_set) > 1: - raise ValueError('Find multiple contexts in the input, ' - 'After hybridized, the HybridBlock only supports one input ' - 'context. You can print the ele.ctx in the ' - 'input arguments to inspect their contexts. ' - 'Find all contexts = {}'.format(ctx_set)) - with ctx: - return self._call_cached_op(x, *args) - with ctx: - try: - params = {k: v.data(ctx) for k, v in self._reg_params.items()} - except DeferredInitializationError: - self._deferred_infer_shape(x, *args) - for _, v in self.params.items(): - v._finish_deferred_init() - params = {k: v.data(ctx) for k, v in self._reg_params.items()} - - return self.hybrid_forward(ndarray, x, *args, **params) - - with _block_scope(self): - params = {i: j.var() for i, j in self._reg_params.items()} - return self.hybrid_forward(symbol, x, *args, **params) - - def hybrid_forward(self, F, x, *args, **kwargs): - """Overrides to construct symbolic graph for this `Block`. + def forward(self, x, *args): + """Overrides the forward computation. Arguments must be + :py:class:`mxnet.numpy.ndarray`.""" - Parameters - ---------- - x : Symbol or NDArray - The first input tensor. - *args : list of Symbol or list of NDArray - Additional input tensors. - """ - # pylint: disable= invalid-name raise NotImplementedError def reset_ctx(self, ctx): @@ -1873,6 +1795,23 @@ def __init__(self, outputs, inputs, params=None): self._cached_graph = syms, out + def infer_shape(self, *args): + """Infers shape of Parameters from inputs.""" + self._infer_attrs('infer_shape', 'shape', *args) + + def __call__(self, x, *args): + """Calls forward. Only accepts positional arguments.""" + for hook in self._forward_pre_hooks.values(): + hook(self, [x] + args) + + out = self.forward(x, *args) + + for hook in self._forward_hooks.values(): + hook(self, [x] + args, out) + if _mx_npx.is_np_array(): + _check_all_np_ndarrays(out) + return out + def forward(self, x, *args): if dc.is_deferred_compute(): raise RuntimeError('Calling a SymbolBlock from within HybridBlock ' @@ -1922,9 +1861,6 @@ def cast(self, dtype): for sib in sibs: self.params.get(sib).cast('float32') - def hybrid_forward(self, F, x, *args, **kwargs): - raise NotImplementedError - def _infer_param_types(in_params, out_params, arg_params, aux_params, default_dtype=mx_real_t): """Utility function that helps in inferring DType of args and auxs params from given input param. 
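To illustrate the interface the `block.py` changes above enforce (a user-defined `forward` plus `infer_shape` for parameters whose shape is deferred), here is a minimal, hypothetical sketch; the block name, shapes, and the bias-free dot-product body are illustrative assumptions, not code from this patch.

```python
import mxnet as mx
from mxnet.gluon import HybridBlock, Parameter

class TinyDense(HybridBlock):
    """Illustrative Gluon 2.0 style block: deferred-shape weight, no bias."""
    def __init__(self, units):
        super().__init__()
        self._units = units
        # in_units is unknown (-1) until the first input is seen.
        self.weight = Parameter('weight', shape=(units, -1),
                                allow_deferred_init=True)

    def infer_shape(self, x, *args):
        # Gluon 2.0: complete unknown parameter shapes from the first input.
        self.weight.shape = (self._units, x.shape[-1])

    def forward(self, x):
        # forward() operates on mxnet.numpy arrays; no F argument, no hybrid_forward.
        return mx.np.dot(x, self.weight.data().T)

net = TinyDense(4)
net.initialize()
net.hybridize()                       # forward is traced via deferred compute
print(net(mx.np.ones((2, 8))).shape)  # (2, 4)
```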
diff --git a/python/mxnet/gluon/data/vision/transforms/__init__.py b/python/mxnet/gluon/data/vision/transforms/__init__.py index c32bac14c0e4..54bb29f9a523 100644 --- a/python/mxnet/gluon/data/vision/transforms/__init__.py +++ b/python/mxnet/gluon/data/vision/transforms/__init__.py @@ -24,7 +24,8 @@ from ....block import Block, HybridBlock from ....nn import Sequential, HybridSequential -from .....util import is_np_array +from .....util import use_np +from ..... import np, npx from . image import * from .image import _append_return @@ -110,6 +111,7 @@ def __init__(self, transforms): self.hybridize() +@use_np class Cast(HybridBlock): """Cast inputs to a specific data type @@ -129,10 +131,8 @@ def __init__(self, dtype='float32'): super(Cast, self).__init__() self._dtype = dtype - def hybrid_forward(self, F, *args): - if is_np_array(): - F = F.npx - return tuple([F.cast(x, self._dtype) for x in args]) + def forward(self, *args): + return tuple(x.astype(self._dtype) for x in args) class RandomApply(Sequential): @@ -189,9 +189,6 @@ def __init__(self, transforms, p=0.5): self.transforms = transforms self.p = p - def hybrid_forward(self, F, x, *args): - if is_np_array(): - cond = self.p < F.random.uniform(low=0, high=1, size=1) - return F.npx.cond(cond, x, self.transforms(x)) - cond = self.p < F.random.uniform(low=0, high=1, shape=1) - return _append_return(F.contrib.cond(cond, x, self.transforms(x)), *args) + def forward(self, x, *args): + cond = lambda p: p < np.random.uniform(low=0, high=1, size=1) + return npx.cond(cond, x, self.transforms(x), self.p) diff --git a/python/mxnet/gluon/data/vision/transforms/image.py b/python/mxnet/gluon/data/vision/transforms/image.py index 37b2a061b0df..92c085d6804d 100644 --- a/python/mxnet/gluon/data/vision/transforms/image.py +++ b/python/mxnet/gluon/data/vision/transforms/image.py @@ -18,12 +18,13 @@ # coding: utf-8 # pylint: disable= arguments-differ "Image transforms." -import numpy as np +import numpy as onp from ....block import Block, HybridBlock from ..... import image from .....base import numeric_types -from .....util import is_np_array +from .....util import use_np +from ..... import np, npx __all__ = ['ToTensor', 'Normalize', 'Rotate', 'RandomRotation', 'RandomResizedCrop', 'CropResize', 'CropResize', 'RandomCrop', @@ -42,6 +43,7 @@ def _append_return(*args): return None +@use_np class ToTensor(HybridBlock): """Converts an image NDArray or batch of image NDArray to a tensor NDArray. @@ -80,12 +82,11 @@ class ToTensor(HybridBlock): def __init__(self): super(ToTensor, self).__init__() - def hybrid_forward(self, F, x, *args): - if is_np_array(): - F = F.npx - return _append_return(F.image.to_tensor(x), *args) + def forward(self, x, *args): + return _append_return(npx.image.to_tensor(x), *args) +@use_np class Normalize(HybridBlock): """Normalize an tensor of shape (C x H x W) or (N x C x H x W) with mean and standard deviation. @@ -135,12 +136,11 @@ def __init__(self, mean=0.0, std=1.0): self._mean = mean self._std = std - def hybrid_forward(self, F, x, *args): - if is_np_array(): - F = F.npx - return _append_return(F.image.normalize(x, self._mean, self._std), *args) + def forward(self, x, *args): + return _append_return(npx.image.normalize(x, self._mean, self._std), *args) +@use_np class Rotate(Block): """Rotate the input image by a given angle. Keeps the original image shape. 
@@ -165,12 +165,13 @@ def __init__(self, rotation_degrees, zoom_in=False, zoom_out=False): self._args = (rotation_degrees, zoom_in, zoom_out) def forward(self, x, *args): - if np.dtype(x.dtype) is not np.dtype(np.float32): + if onp.dtype(x.dtype) is not onp.dtype(onp.float32): raise TypeError("This transformation only supports float32. " "Consider calling it after ToTensor, given: {}".format(x.dtype)) return _append_return(image.imrotate(x, *self._args), *args) +@use_np class RandomRotation(Block): """Random rotate the input image by a random angle. Keeps the original image shape and aspect ratio. @@ -204,14 +205,15 @@ def __init__(self, angle_limits, zoom_in=False, zoom_out=False, rotate_with_prob self._rotate_with_proba = rotate_with_proba def forward(self, x, *args): - if np.random.random() > self._rotate_with_proba: + if onp.random.random() > self._rotate_with_proba: return x - if np.dtype(x.dtype) is not np.dtype(np.float32): + if onp.dtype(x.dtype) is not onp.dtype(onp.float32): raise TypeError("This transformation only supports float32. " "Consider calling it after ToTensor") return _append_return(image.random_rotate(x, *self._args), *args) +@use_np class RandomResizedCrop(HybridBlock): """Crop the input image with random scale and aspect ratio. @@ -250,12 +252,11 @@ def __init__(self, size, scale=(0.08, 1.0), ratio=(3.0/4.0, 4.0/3.0), 'area': scale, 'ratio': ratio, 'interp': interpolation, 'max_trial': 10} - def hybrid_forward(self, F, x, *args): - if is_np_array(): - F = F.npx - return _append_return(F.image.random_resized_crop(x, **self._kwargs), *args) + def forward(self, x, *args): + return _append_return(npx.image.random_resized_crop(x, **self._kwargs), *args) +@use_np class CropResize(HybridBlock): r"""Crop the input image with and optionally resize it. @@ -309,16 +310,13 @@ def __init__(self, x, y, width, height, size=None, interpolation=None): self._size = size self._interpolation = interpolation - def hybrid_forward(self, F, x, *args): - if is_np_array(): - _image = F.npx.image - else: - _image = F.image - out = _image.crop(x, self._x, self._y, self._width, self._height) + def forward(self, x, *args): + out = npx.image.crop(x, self._x, self._y, self._width, self._height) if self._size: - out = _image.resize(out, self._size, False, self._interpolation) + out = npx.image.resize(out, self._size, False, self._interpolation) return _append_return(out, *args) +@use_np class RandomCrop(HybridBlock): """Randomly crop `src` with `size` (width, height). Padding is optional. 
@@ -362,19 +360,13 @@ def __init__(self, size, pad=None, pad_value=0, interpolation=1): self.nd_pad = pad self.np_pad = pad - def hybrid_forward(self, F, x, *args): - if is_np_array(): - if self.np_pad: - x = F.np.pad(x, pad_width=self.np_pad, mode='constant', constant_values=self._pad_value) - return _append_return(F.npx.image.random_crop(x, *self._args), *args) - else: - if self.nd_pad: - x = F.cast(F.expand_dims(F.expand_dims(x, 0), 0), 'float32') - x_pad = F.pad(x, pad_width=self.nd_pad, mode='constant', constant_value=self._pad_value) - x = F.cast(x_pad.squeeze(0).squeeze(0), 'uint8') - return _append_return(F.image.random_crop(x, *self._args), *args) - + def forward(self, x, *args): + if self.np_pad: + x = np.pad(x, pad_width=self.np_pad, mode='constant', constant_values=self._pad_value) + # pylint: disable=too-many-function-args + return _append_return(npx.image.random_crop(x, *self._args), *args) +@use_np class CenterCrop(HybridBlock): """Crops the image `src` to the given `size` by trimming on all four sides and preserving the center of the image. Upsamples if `src` is @@ -408,12 +400,12 @@ def __init__(self, size, interpolation=1): size = (size, size) self._args = (size[0], size[1], interpolation) - def hybrid_forward(self, F, x, *args): - if is_np_array(): - F = F.npx - return _append_return(F.image.random_crop(x, (0.5, 0.5), (0.5, 0.5), *self._args), *args) + def forward(self, x, *args): + # pylint: disable=too-many-function-args + return _append_return(npx.image.random_crop(x, (0.5, 0.5), (0.5, 0.5), *self._args), *args) +@use_np class Resize(HybridBlock): """Resize an image or a batch of image NDArray to the given size. Should be applied before `mxnet.gluon.data.vision.transforms.ToTensor`. @@ -454,11 +446,10 @@ def __init__(self, size, keep_ratio=False, interpolation=1): self._size = size self._interpolation = interpolation - def hybrid_forward(self, F, x, *args): - if is_np_array(): - F = F.npx - return _append_return(F.image.resize(x, self._size, self._keep, self._interpolation), *args) + def forward(self, x, *args): + return _append_return(npx.image.resize(x, self._size, self._keep, self._interpolation), *args) +@use_np class RandomFlipLeftRight(HybridBlock): """Randomly flip the input image left to right with a probability of p(0.5 by default). @@ -473,20 +464,16 @@ def __init__(self, p=0.5): super(RandomFlipLeftRight, self).__init__() self.p = p - def hybrid_forward(self, F, x, *args): + def forward(self, x, *args): if self.p <= 0: return _append_return(x, *args) - if is_np_array(): - if self.p >= 1: - return _append_return(F.npx.image.flip_left_right(x), *args) - return _append_return(F.npx.image.random_flip_left_right(x, p=self.p), *args) - else: - if self.p >= 1: - return _append_return(F.image.flip_left_right(x), *args) - return _append_return(F.image.random_flip_left_right(x, p=self.p), *args) + if self.p >= 1: + return _append_return(npx.image.flip_left_right(x), *args) + return _append_return(npx.image.random_flip_left_right(x, p=self.p), *args) +@use_np class RandomFlipTopBottom(HybridBlock): """Randomly flip the input image top to bottom with a probability of p(0.5 by default). 
@@ -501,20 +488,16 @@ def __init__(self, p=0.5): super(RandomFlipTopBottom, self).__init__() self.p = p - def hybrid_forward(self, F, x, *args): + def forward(self, x, *args): if self.p <= 0: return _append_return(x, *args) - if is_np_array(): - if self.p >= 1: - return _append_return(F.npx.image.flip_top_bottom(x), *args) - return _append_return(F.npx.image.random_flip_top_bottom(x, p=self.p), *args) - else: - if self.p >= 1: - return _append_return(F.image.flip_top_bottom(x), *args) - return _append_return(F.image.random_flip_top_bottom(x, p=self.p), *args) + if self.p >= 1: + return _append_return(npx.image.flip_top_bottom(x), *args) + return _append_return(npx.image.random_flip_top_bottom(x, p=self.p), *args) +@use_np class RandomBrightness(HybridBlock): """Randomly jitters image brightness with a factor chosen from `[max(0, 1 - brightness), 1 + brightness]`. @@ -536,12 +519,11 @@ def __init__(self, brightness): super(RandomBrightness, self).__init__() self._args = (max(0, 1-brightness), 1+brightness) - def hybrid_forward(self, F, x, *args): - if is_np_array(): - F = F.npx - return _append_return(F.image.random_brightness(x, *self._args), *args) + def forward(self, x, *args): + return _append_return(npx.image.random_brightness(x, *self._args), *args) +@use_np class RandomContrast(HybridBlock): """Randomly jitters image contrast with a factor chosen from `[max(0, 1 - contrast), 1 + contrast]`. @@ -563,12 +545,11 @@ def __init__(self, contrast): super(RandomContrast, self).__init__() self._args = (max(0, 1-contrast), 1+contrast) - def hybrid_forward(self, F, x, *args): - if is_np_array(): - F = F.npx - return _append_return(F.image.random_contrast(x, *self._args), *args) + def forward(self, x, *args): + return _append_return(npx.image.random_contrast(x, *self._args), *args) +@use_np class RandomSaturation(HybridBlock): """Randomly jitters image saturation with a factor chosen from `[max(0, 1 - saturation), 1 + saturation]`. @@ -590,12 +571,11 @@ def __init__(self, saturation): super(RandomSaturation, self).__init__() self._args = (max(0, 1-saturation), 1+saturation) - def hybrid_forward(self, F, x, *args): - if is_np_array(): - F = F.npx - return _append_return(F.image.random_saturation(x, *self._args), *args) + def forward(self, x, *args): + return _append_return(npx.image.random_saturation(x, *self._args), *args) +@use_np class RandomHue(HybridBlock): """Randomly jitters image hue with a factor chosen from `[max(0, 1 - hue), 1 + hue]`. @@ -617,12 +597,11 @@ def __init__(self, hue): super(RandomHue, self).__init__() self._args = (max(0, 1-hue), 1+hue) - def hybrid_forward(self, F, x, *args): - if is_np_array(): - F = F.npx - return _append_return(F.image.random_hue(x, *self._args), *args) + def forward(self, x, *args): + return _append_return(npx.image.random_hue(x, *self._args), *args) +@use_np class RandomColorJitter(HybridBlock): """Randomly jitters the brightness, contrast, saturation, and hue of an image. @@ -653,12 +632,11 @@ def __init__(self, brightness=0, contrast=0, saturation=0, hue=0): super(RandomColorJitter, self).__init__() self._args = (brightness, contrast, saturation, hue) - def hybrid_forward(self, F, x, *args): - if is_np_array(): - F = F.npx - return _append_return(F.image.random_color_jitter(x, *self._args), *args) + def forward(self, x, *args): + return _append_return(npx.image.random_color_jitter(x, *self._args), *args) +@use_np class RandomLighting(HybridBlock): """Add AlexNet-style PCA-based noise to an image. 
@@ -678,12 +656,11 @@ def __init__(self, alpha): super(RandomLighting, self).__init__() self._alpha = alpha - def hybrid_forward(self, F, x, *args): - if is_np_array(): - F = F.npx - return _append_return(F.image.random_lighting(x, self._alpha), *args) + def forward(self, x, *args): + return _append_return(npx.image.random_lighting(x, self._alpha), *args) +@use_np class RandomGray(HybridBlock): """Randomly convert to gray image. @@ -696,18 +673,10 @@ def __init__(self, p=0.5): super(RandomGray, self).__init__() self.p = p - def hybrid_forward(self, F, x, *args): - if is_np_array(): - mat = F.np.concatenate((F.np.full((3, 1), 0.2989), - F.np.full((3, 1), 0.5870), - F.np.full((3, 1), 0.114)), axis=1) - x = F.npx.cast(x, dtype='float32') - gray = F.np.where(self.p < F.np.random.uniform(), x, F.np.dot(x, mat)) - else: - mat = F.concat(F.full((3, 1), 0.2989), - F.full((3, 1), 0.5870), - F.full((3, 1), 0.114), dim=1) - cond = self.p < F.random.uniform(shape=1) - x = F.cast(x, dtype='float32') - gray = F.contrib.cond(cond, lambda: x, lambda: F.dot(x, mat)) + def forward(self, x, *args): + mat = np.concatenate((np.full((3, 1), 0.2989), + np.full((3, 1), 0.5870), + np.full((3, 1), 0.114)), axis=1) + x = x.astype(dtype='float32') + gray = np.where(self.p < np.random.uniform(), x, np.dot(x, mat)) return _append_return(gray, *args) diff --git a/python/mxnet/gluon/loss.py b/python/mxnet/gluon/loss.py index d0ebcb0a3248..5bf2cd9be293 100644 --- a/python/mxnet/gluon/loss.py +++ b/python/mxnet/gluon/loss.py @@ -24,14 +24,14 @@ 'KLDivLoss', 'CTCLoss', 'HuberLoss', 'HingeLoss', 'SquaredHingeLoss', 'LogisticLoss', 'TripletLoss', 'PoissonNLLLoss', 'CosineEmbeddingLoss', 'SDMLLoss'] -import numpy as np -from .. import ndarray +import numpy as _np from ..base import numeric_types from .block import HybridBlock -from ..util import is_np_array +from ..util import use_np +from .. import np, npx -def _apply_weighting(F, loss, weight=None, sample_weight=None): +def _apply_weighting(loss, weight=None, sample_weight=None): """Apply weighting to loss. Parameters @@ -53,10 +53,7 @@ def _apply_weighting(F, loss, weight=None, sample_weight=None): Weighted loss """ if sample_weight is not None: - if is_np_array(): - loss = loss * sample_weight - else: - loss = F.broadcast_mul(loss, sample_weight) + loss = loss * sample_weight if weight is not None: assert isinstance(weight, numeric_types), "weight must be a number" @@ -65,45 +62,21 @@ def _apply_weighting(F, loss, weight=None, sample_weight=None): return loss -def _reshape_like(F, x, y): - """Reshapes x to the same shape as y.""" - if F is ndarray: - return x.reshape(y.shape) - elif is_np_array(): - F = F.npx - return F.reshape_like(x, y) - - -def _batch_mean(F, loss, batch_axis): +def _batch_mean(loss, batch_axis): """Return mean on the specified batch axis, not keeping the axis""" - if is_np_array(): - if F is ndarray: - axes = list(range(loss.ndim)) - del axes[batch_axis] - return F.np.mean(loss, axis=axes) - else: - assert batch_axis == 0, 'Currently, we have not supported the "exclude" ' \ - 'flag in mean. So we only support batch_axis=0.' 
- return F.npx.batch_flatten(loss).mean(axis=1) - else: - return F.mean(loss, axis=batch_axis, exclude=True) + axes = list(range(loss.ndim)) + del axes[batch_axis] + return np.mean(loss, axis=axes) -def _batch_sum(F, loss, batch_axis): +def _batch_sum(loss, batch_axis): """Return sum on the specified batch axis, not keeping the axis""" - if is_np_array(): - if F is ndarray: - axes = list(range(loss.ndim)) - del axes[batch_axis] - return F.np.sum(loss, axis=axes) - else: - assert batch_axis == 0, 'Currently, we have not supported the "exclude" ' \ - 'flag in mean. So we only support batch_axis=0.' - return F.npx.batch_flatten(loss).sum(axis=1) - else: - return F.sum(loss, axis=batch_axis, exclude=True) + axes = list(range(loss.ndim)) + del axes[batch_axis] + return np.sum(loss, axis=axes) +@use_np class Loss(HybridBlock): """Base class for loss. @@ -124,7 +97,7 @@ def __repr__(self): s = '{name}(batch_axis={_batch_axis}, w={_weight})' return s.format(name=self.__class__.__name__, **self.__dict__) - def hybrid_forward(self, F, x, *args, **kwargs): + def forward(self, x, *args): """Overrides to construct symbolic graph for this `Block`. Parameters @@ -139,6 +112,7 @@ def hybrid_forward(self, F, x, *args, **kwargs): raise NotImplementedError +@use_np class L2Loss(Loss): r"""Calculates the mean squared error between `label` and `pred`. @@ -171,14 +145,15 @@ class L2Loss(Loss): def __init__(self, weight=1., batch_axis=0, **kwargs): super(L2Loss, self).__init__(weight, batch_axis, **kwargs) - def hybrid_forward(self, F, pred, label, sample_weight=None): - square_fn = F.np.square if is_np_array() else F.square - label = _reshape_like(F, label, pred) - loss = square_fn(label - pred) - loss = _apply_weighting(F, loss, self._weight / 2, sample_weight) - return _batch_mean(F, loss, self._batch_axis) + def forward(self, pred, label, sample_weight=None): + label = npx.reshape_like(label, pred) + loss = np.square(label - pred) + loss = _apply_weighting(loss, self._weight / 2, sample_weight) + return _batch_mean(loss, self._batch_axis) + +@use_np class L1Loss(Loss): r"""Calculates the mean absolute error between `label` and `pred`. @@ -211,14 +186,15 @@ class L1Loss(Loss): def __init__(self, weight=None, batch_axis=0, **kwargs): super(L1Loss, self).__init__(weight, batch_axis, **kwargs) - def hybrid_forward(self, F, pred, label, sample_weight=None): - abs_fn = F.np.abs if is_np_array() else F.abs - label = _reshape_like(F, label, pred) - loss = abs_fn(label - pred) - loss = _apply_weighting(F, loss, self._weight, sample_weight) - return _batch_mean(F, loss, self._batch_axis) + def forward(self, pred, label, sample_weight=None): + label = npx.reshape_like(label, pred) + loss = np.abs(label - pred) + loss = _apply_weighting(loss, self._weight, sample_weight) + return _batch_mean(loss, self._batch_axis) + +@use_np class SigmoidBinaryCrossEntropyLoss(Loss): r"""The cross-entropy loss for binary classification. 
(alias: SigmoidBCELoss) @@ -281,46 +257,37 @@ def __init__(self, from_sigmoid=False, weight=None, batch_axis=0, **kwargs): weight, batch_axis, **kwargs) self._from_sigmoid = from_sigmoid - def hybrid_forward(self, F, pred, label, sample_weight=None, pos_weight=None): - if is_np_array(): - relu_fn = F.npx.relu - act_fn = F.npx.activation - abs_fn = F.np.abs - mul_fn = F.np.multiply - log_fn = F.np.log - else: - relu_fn = F.relu - act_fn = F.Activation - abs_fn = F.abs - mul_fn = F.broadcast_mul - log_fn = F.log - label = _reshape_like(F, label, pred) + def forward(self, pred, label, sample_weight=None, pos_weight=None): + label = npx.reshape_like(label, pred) if not self._from_sigmoid: if pos_weight is None: # We use the stable formula: max(x, 0) - x * z + log(1 + exp(-abs(x))) - loss = relu_fn(pred) - pred * label + \ - act_fn(-abs_fn(pred), act_type='softrelu') + loss = npx.relu(pred) - pred * label + \ + npx.activation(-np.abs(pred), act_type='softrelu') else: # We use the stable formula: x - x * z + (1 + z * pos_weight - z) * \ # (log(1 + exp(-abs(x))) + max(-x, 0)) - log_weight = 1 + mul_fn(pos_weight - 1, label) + log_weight = 1 + np.multiply(pos_weight - 1, label) loss = pred - pred * label + log_weight * \ - (act_fn(-abs_fn(pred), act_type='softrelu') + relu_fn(-pred)) + (npx.activation(-np.abs(pred), act_type='softrelu') + npx.relu(-pred)) else: eps = 1e-12 if pos_weight is None: - loss = -(log_fn(pred + eps) * label - + log_fn(1. - pred + eps) * (1. - label)) + loss = -(np.log(pred + eps) * label + + np.log(1. - pred + eps) * (1. - label)) else: - loss = -(mul_fn(log_fn(pred + eps) * label, pos_weight) - + log_fn(1. - pred + eps) * (1. - label)) - loss = _apply_weighting(F, loss, self._weight, sample_weight) - return _batch_mean(F, loss, self._batch_axis) + loss = -(np.multiply(np.log(pred + eps) * label, pos_weight) + + np.log(1. - pred + eps) * (1. - label)) + loss = _apply_weighting(loss, self._weight, sample_weight) + return _batch_mean(loss, self._batch_axis) SigmoidBCELoss = SigmoidBinaryCrossEntropyLoss + + +@use_np class SoftmaxCrossEntropyLoss(Loss): r"""Computes the softmax cross entropy loss. (alias: SoftmaxCELoss) @@ -391,27 +358,22 @@ def __init__(self, axis=-1, sparse_label=True, from_logits=False, weight=None, self._sparse_label = sparse_label self._from_logits = from_logits - def hybrid_forward(self, F, pred, label, sample_weight=None): - if is_np_array(): - log_softmax_fn = F.npx.log_softmax - pick_fn = F.npx.pick - else: - log_softmax_fn = F.log_softmax - pick_fn = F.pick + def forward(self, pred, label, sample_weight=None): if not self._from_logits: - pred = log_softmax_fn(pred, self._axis) + pred = npx.log_softmax(pred, axis=self._axis) if self._sparse_label: - loss = -pick_fn(pred, label, axis=self._axis, keepdims=True) + loss = -npx.pick(pred, label, axis=self._axis, keepdims=True) else: - label = _reshape_like(F, label, pred) + label = npx.reshape_like(label, pred) loss = -(pred * label).sum(axis=self._axis, keepdims=True) - loss = _apply_weighting(F, loss, self._weight, sample_weight) - return _batch_mean(F, loss, self._batch_axis) + loss = _apply_weighting(loss, self._weight, sample_weight) + return _batch_mean(loss, self._batch_axis) SoftmaxCELoss = SoftmaxCrossEntropyLoss +@use_np class KLDivLoss(Loss): r"""The Kullback-Leibler divergence loss. 
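A usage sketch of the migrated SoftmaxCrossEntropyLoss above, assuming MXNet 2.x; the shapes and label values are illustrative:

import mxnet as mx
from mxnet.gluon import loss as gloss

mx.npx.set_np()
pred = mx.np.random.uniform(size=(4, 10))   # unnormalized class scores
label = mx.np.array([1, 3, 5, 7])           # sparse class indices (default sparse_label=True)
l = gloss.SoftmaxCrossEntropyLoss()(pred, label)
print(l.shape)                              # (4,): one loss value per sample after _batch_mean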
@@ -479,20 +441,15 @@ def __init__(self, from_logits=True, axis=-1, weight=None, batch_axis=0, self._from_logits = from_logits self._axis = axis - def hybrid_forward(self, F, pred, label, sample_weight=None): - if is_np_array(): - log_softmax_fn = F.npx.log_softmax - log_fn = F.np.log - else: - log_softmax_fn = F.log_softmax - log_fn = F.log + def forward(self, pred, label, sample_weight=None): if not self._from_logits: - pred = log_softmax_fn(pred, self._axis) - loss = label * (log_fn(label + 1e-12) - pred) - loss = _apply_weighting(F, loss, self._weight, sample_weight) - return _batch_mean(F, loss, self._batch_axis) + pred = npx.log_softmax(pred, self._axis) + loss = label * (np.log(label + 1e-12) - pred) + loss = _apply_weighting(loss, self._weight, sample_weight) + return _batch_mean(loss, self._batch_axis) +@use_np class CTCLoss(Loss): r"""Connectionist Temporal Classification Loss. @@ -561,25 +518,19 @@ def __init__(self, layout='NTC', label_layout='NT', weight=None, **kwargs): batch_axis = label_layout.find('N') super(CTCLoss, self).__init__(weight, batch_axis, **kwargs) - def hybrid_forward(self, F, pred, label, - pred_lengths=None, label_lengths=None, sample_weight=None): - if is_np_array(): - swapaxes_fn = F.np.swapaxes - ctc_fn = F.npx.ctc_loss - else: - swapaxes_fn = F.swapaxes - ctc_fn = F.ctc_loss + def forward(self, pred, label, pred_lengths=None, label_lengths=None, sample_weight=None): if self._layout == 'NTC': - pred = swapaxes_fn(pred, 0, 1) + pred = np.swapaxes(pred, 0, 1) if self._batch_axis == 1: - label = swapaxes_fn(label, 0, 1) - loss = ctc_fn(pred, label, pred_lengths, label_lengths, - use_data_lengths=pred_lengths is not None, - use_label_lengths=label_lengths is not None, - blank_label='last') - return _apply_weighting(F, loss, self._weight, sample_weight) + label = np.swapaxes(label, 0, 1) + loss = npx.ctc_loss(pred, label, pred_lengths, label_lengths, + use_data_lengths=pred_lengths is not None, + use_label_lengths=label_lengths is not None, + blank_label='last') + return _apply_weighting(loss, self._weight, sample_weight) +@use_np class HuberLoss(Loss): r"""Calculates smoothed L1 loss that is equal to L1 loss if absolute error exceeds rho but is equal to L2 loss otherwise. Also called SmoothedL1 loss. 
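A usage sketch of the migrated KLDivLoss above; with the default from_logits=True the predictions are log-probabilities. MXNet 2.x is assumed and the shapes are illustrative:

import mxnet as mx
from mxnet import np, npx
from mxnet.gluon import loss as gloss

npx.set_np()
pred = npx.log_softmax(np.random.uniform(size=(4, 10)), axis=-1)    # log-probabilities
label = npx.softmax(np.random.uniform(size=(4, 10)), axis=-1)       # target distribution
l = gloss.KLDivLoss()(pred, label)
print(l.shape)   # (4,): per-sample loss (KL averaged over the class axis)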
@@ -621,23 +572,16 @@ def __init__(self, rho=1, weight=None, batch_axis=0, **kwargs): super(HuberLoss, self).__init__(weight, batch_axis, **kwargs) self._rho = rho - def hybrid_forward(self, F, pred, label, sample_weight=None): - if is_np_array(): - abs_fn = F.np.abs - where_fn = F.np.where - square_fn = F.np.square - else: - abs_fn = F.abs - where_fn = F.where - square_fn = F.square - label = _reshape_like(F, label, pred) - loss = abs_fn(label - pred) - loss = where_fn(loss > self._rho, loss - 0.5 * self._rho, - (0.5 / self._rho) * square_fn(loss)) - loss = _apply_weighting(F, loss, self._weight, sample_weight) - return _batch_mean(F, loss, self._batch_axis) + def forward(self, pred, label, sample_weight=None): + label = npx.reshape_like(label, pred) + loss = np.abs(label - pred) + loss = np.where(loss > self._rho, loss - 0.5 * self._rho, + (0.5 / self._rho) * np.square(loss)) + loss = _apply_weighting(loss, self._weight, sample_weight) + return _batch_mean(loss, self._batch_axis) +@use_np class HingeLoss(Loss): r"""Calculates the hinge loss function often used in SVMs: @@ -676,14 +620,14 @@ def __init__(self, margin=1, weight=None, batch_axis=0, **kwargs): super(HingeLoss, self).__init__(weight, batch_axis, **kwargs) self._margin = margin - def hybrid_forward(self, F, pred, label, sample_weight=None): - relu_fn = F.npx.relu if is_np_array() else F.relu - label = _reshape_like(F, label, pred) - loss = relu_fn(self._margin - pred * label) - loss = _apply_weighting(F, loss, self._weight, sample_weight) - return _batch_mean(F, loss, self._batch_axis) + def forward(self, pred, label, sample_weight=None): + label = npx.reshape_like(label, pred) + loss = npx.relu(self._margin - pred * label) + loss = _apply_weighting(loss, self._weight, sample_weight) + return _batch_mean(loss, self._batch_axis) +@use_np class SquaredHingeLoss(Loss): r"""Calculates the soft-margin loss function used in SVMs: @@ -722,19 +666,14 @@ def __init__(self, margin=1, weight=None, batch_axis=0, **kwargs): super(SquaredHingeLoss, self).__init__(weight, batch_axis, **kwargs) self._margin = margin - def hybrid_forward(self, F, pred, label, sample_weight=None): - if is_np_array(): - relu_fn = F.npx.relu - square_fn = F.np.square - else: - relu_fn = F.relu - square_fn = F.square - label = _reshape_like(F, label, pred) - loss = square_fn(relu_fn(self._margin - pred * label)) - loss = _apply_weighting(F, loss, self._weight, sample_weight) - return _batch_mean(F, loss, self._batch_axis) + def forward(self, pred, label, sample_weight=None): + label = npx.reshape_like(label, pred) + loss = np.square(npx.relu(self._margin - pred * label)) + loss = _apply_weighting(loss, self._weight, sample_weight) + return _batch_mean(loss, self._batch_axis) +@use_np class LogisticLoss(Loss): r"""Calculates the logistic loss (for binary losses only): @@ -777,25 +716,18 @@ def __init__(self, weight=None, batch_axis=0, label_format='signed', **kwargs): raise ValueError("label_format can only be signed or binary, received %s." 
% label_format) - def hybrid_forward(self, F, pred, label, sample_weight=None): - if is_np_array(): - relu_fn = F.npx.relu - act_fn = F.npx.activation - abs_fn = F.np.abs - else: - relu_fn = F.relu - act_fn = F.Activation - abs_fn = F.abs - label = _reshape_like(F, label, pred) + def forward(self, pred, label, sample_weight=None): + label = npx.reshape_like(label, pred) if self._label_format == 'signed': label = (label + 1.0) / 2.0 # Transform label to be either 0 or 1 # Use a stable formula in computation - loss = relu_fn(pred) - pred * label + \ - act_fn(-abs_fn(pred), act_type='softrelu') - loss = _apply_weighting(F, loss, self._weight, sample_weight) - return _batch_mean(F, loss, self._batch_axis) + loss = npx.relu(pred) - pred * label + \ + npx.activation(-np.abs(pred), act_type='softrelu') + loss = _apply_weighting(loss, self._weight, sample_weight) + return _batch_mean(loss, self._batch_axis) +@use_np class TripletLoss(Loss): r"""Calculates triplet loss given three input tensors and a positive margin. Triplet loss measures the relative similarity between a positive @@ -833,20 +765,16 @@ def __init__(self, margin=1, weight=None, batch_axis=0, **kwargs): super(TripletLoss, self).__init__(weight, batch_axis, **kwargs) self._margin = margin - def hybrid_forward(self, F, pred, positive, negative, sample_weight=None): - if is_np_array(): - relu_fn = F.npx.relu - square_fn = F.np.square - else: - relu_fn = F.relu - square_fn = F.square - positive = _reshape_like(F, positive, pred) - negative = _reshape_like(F, negative, pred) - loss = _batch_sum(F, square_fn(positive - pred) - square_fn(negative - pred), self._batch_axis) - loss = relu_fn(loss + self._margin) - return _apply_weighting(F, loss, self._weight, sample_weight) + @use_np + def forward(self, pred, positive, negative, sample_weight=None): + positive = npx.reshape_like(positive, pred) + negative = npx.reshape_like(negative, pred) + loss = _batch_sum(np.square(positive - pred) - np.square(negative - pred), self._batch_axis) + loss = npx.relu(loss + self._margin) + return _apply_weighting(loss, self._weight, sample_weight) +@use_np class PoissonNLLLoss(Loss): r"""For a target (Random Variable) in a Poisson distribution, the function calculates the Negative Log likelihood loss. 
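A usage sketch of the migrated TripletLoss above, assuming MXNet 2.x; batch size and embedding width are illustrative:

import mxnet as mx
from mxnet.gluon import loss as gloss

mx.npx.set_np()
anchor = mx.np.random.uniform(size=(4, 16))
positive = mx.np.random.uniform(size=(4, 16))
negative = mx.np.random.uniform(size=(4, 16))
l = gloss.TripletLoss(margin=1)(anchor, positive, negative)
print(l.shape)   # (4,): relu(margin + ||a - p||^2 - ||a - n||^2) per sample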
@@ -892,29 +820,24 @@ def __init__(self, weight=None, from_logits=True, batch_axis=0, compute_full=Fal self._from_logits = from_logits self._compute_full = compute_full - def hybrid_forward(self, F, pred, target, sample_weight=None, epsilon=1e-08): - if is_np_array(): - exp_fn = F.np.exp - log_fn = F.np.log - else: - exp_fn = F.exp - log_fn = F.log - target = _reshape_like(F, target, pred) + def forward(self, pred, target, sample_weight=None, epsilon=1e-08): + target = npx.reshape_like(target, pred) if self._from_logits: - loss = exp_fn(pred) - target * pred + loss = np.exp(pred) - target * pred else: - loss = pred - target * log_fn(pred + epsilon) + loss = pred - target * np.log(pred + epsilon) if self._compute_full: # Using numpy's pi value stirling_factor = target * \ - log_fn(target) - target + 0.5 * log_fn(2 * target * np.pi) + np.log(target) - target + 0.5 * np.log(2 * target * _np.pi) target_gt_1 = target > 1 stirling_factor = stirling_factor * target_gt_1 loss = loss + stirling_factor - loss = _apply_weighting(F, loss, self._weight, sample_weight) - return _batch_mean(F, loss, self._batch_axis) + loss = _apply_weighting(loss, self._weight, sample_weight) + return _batch_mean(loss, self._batch_axis) +@use_np class CosineEmbeddingLoss(Loss): r"""For a target label 1 or -1, vectors input1 and input2, the function computes the cosine distance between the vectors. This can be interpreted as how similar/dissimilar two input vectors are. @@ -955,45 +878,27 @@ def __init__(self, weight=None, batch_axis=0, margin=0, **kwargs): super(CosineEmbeddingLoss, self).__init__(weight, batch_axis, **kwargs) self._margin = margin - def hybrid_forward(self, F, input1, input2, label, sample_weight=None): - if is_np_array(): - where_fn = F.np.where - clip_fn = F.np.clip - else: - where_fn = F.where - clip_fn = F.clip - - input1 = _reshape_like(F, input1, input2) - cos_sim = self._cosine_similarity(F, input1, input2) - label = _reshape_like(F, label, cos_sim) - loss = where_fn(label == 1, + def forward(self, input1, input2, label, sample_weight=None): + input1 = npx.reshape_like(input1, input2) + cos_sim = self._cosine_similarity(input1, input2) + label = npx.reshape_like(label, cos_sim) + loss = np.where(label == 1, 1 - cos_sim, - clip_fn(cos_sim - self._margin, 0, 1 - self._margin)) - - loss = _apply_weighting(F, loss, self._weight, sample_weight) - return _batch_mean(F, loss, self._batch_axis) - - def _cosine_similarity(self, F, x, y, axis=-1): - if is_np_array(): - reshape_fn = F.npx.reshape - norm_fn = F.npx.norm - sum_fn = F.np.sum - full_fn = F.np.full - max_fn = F.np.maximum - else: - reshape_fn = F.reshape - norm_fn = F.norm - sum_fn = F.sum - full_fn = F.full - max_fn = F.broadcast_maximum + np.clip(cos_sim - self._margin, 0, 1 - self._margin)) + + loss = _apply_weighting(loss, self._weight, sample_weight) + return _batch_mean(loss, self._batch_axis) + + def _cosine_similarity(self, x, y, axis=-1): # Calculates the cosine similarity between 2 vectors - x_norm = reshape_fn(norm_fn(x, axis=axis), (-1, 1)) - y_norm = reshape_fn(norm_fn(y, axis=axis), (-1, 1)) - x_dot_y = reshape_fn(sum_fn(x * y, axis=axis), (-1, 1)) - eps_arr = full_fn((1, 1), 1e-12) - return (x_dot_y / max_fn(x_norm * y_norm, eps_arr)) + x_norm = npx.reshape(npx.norm(x, axis=axis), (-1, 1)) + y_norm = npx.reshape(npx.norm(y, axis=axis), (-1, 1)) + x_dot_y = npx.reshape(np.sum(x * y, axis=axis), (-1, 1)) + eps_arr = np.full((1, 1), 1e-12) + return (x_dot_y / np.maximum(x_norm * y_norm, eps_arr)) +@use_np class SDMLLoss(Loss): 
r"""Calculates Batchwise Smoothed Deep Metric Learning (SDML) Loss given two input tensors and a smoothing weight SDM Loss learns similarity between paired samples by using unpaired samples in the minibatch @@ -1036,27 +941,22 @@ def __init__(self, smoothing_parameter=0.3, weight=1., batch_axis=0, **kwargs): # Smoothing probability mass self.smoothing_parameter = smoothing_parameter - def _compute_distances(self, F, x1, x2): + def _compute_distances(self, x1, x2): """ This function computes the euclidean distance between every vector in the two batches in input. """ - if is_np_array(): - expand_dims_fn = F.np.expand_dims - else: - expand_dims_fn = F.expand_dims - # expanding x1 form [batch_size, dim] to [batch_size, 1, dim] # and x2 to [1, batch_size, dim] - x1_ = expand_dims_fn(x1, 1) - x2_ = expand_dims_fn(x2, 0) + x1_ = np.expand_dims(x1, 1) + x2_ = np.expand_dims(x2, 0) # pointwise squared differences squared_diffs = (x1_ - x2_)**2 # sum of squared differences distance return squared_diffs.sum(axis=2) - def _compute_labels(self, F, batch_size): + def _compute_labels(self, batch_size): """ The function creates the label matrix for the loss. It is an identity matrix of size [BATCH_SIZE x BATCH_SIZE] @@ -1076,11 +976,11 @@ def _compute_labels(self, F, batch_size): confident output distributions." arXiv preprint arXiv:1701.06548 (2017). """ - gold = F.eye(batch_size) + gold = np.eye(batch_size) labels = gold * (1 - self.smoothing_parameter) + (1 - gold) * self.smoothing_parameter / (batch_size - 1) return labels - def hybrid_forward(self, F, x1, x2): + def forward(self, x1, x2): """ the function computes the kl divergence between the negative distances (internally it compute a softmax casting into probabilities) and the @@ -1098,16 +998,11 @@ def hybrid_forward(self, F, x1, x2): learn to predict french president comparing it with all the other vectors in batch 2 """ - assert F is ndarray, 'SDMLLoss does not support symbolic ' - if is_np_array(): - log_softmax_fn = F.npx.log_softmax - else: - log_softmax_fn = F.log_softmax batch_size = x1.shape[0] - labels = self._compute_labels(F, batch_size) - distances = self._compute_distances(F, x1, x2) - log_probabilities = log_softmax_fn(-distances, axis=1) + labels = self._compute_labels(batch_size) + distances = self._compute_distances(x1, x2) + log_probabilities = npx.log_softmax(-distances, axis=1) # multiply for the number of labels to obtain the correct loss (gluon kl_loss averages instead of sum) # PR#18423:multiply for the number of labels should multiply x1.shape[1] rather than x1.shape[0]) # After PR#18423, it is no need to multiply it anymore. - return self.kl_loss(log_probabilities, labels.as_in_context(distances.context)) + return self.kl_loss(log_probabilities, labels.as_in_ctx(distances.ctx)) diff --git a/python/mxnet/gluon/metric.py b/python/mxnet/gluon/metric.py index de0beed7b353..b99a902bcfdb 100644 --- a/python/mxnet/gluon/metric.py +++ b/python/mxnet/gluon/metric.py @@ -26,7 +26,7 @@ from ..util import use_np from ..base import numeric_types, string_types -from .. import ndarray +from .. import ndarray, npx from .. 
import registry @@ -254,15 +254,15 @@ class CompositeEvalMetric(EvalMetric): Examples -------- - >>> predicts = [mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])] - >>> labels = [mx.nd.array([0, 1, 1])] + >>> predicts = [mx.np.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])] + >>> labels = [mx.np.array([0, 1, 1])] >>> eval_metrics_1 = mx.gluon.metric.Accuracy() >>> eval_metrics_2 = mx.gluon.metric.F1() >>> eval_metrics = mx.gluon.metric.CompositeEvalMetric() >>> for child_metric in [eval_metrics_1, eval_metrics_2]: >>> eval_metrics.add(child_metric) >>> eval_metrics.update(labels = labels, preds = predicts) - >>> print eval_metrics.get() + >>> eval_metrics.get() (['accuracy', 'f1'], [0.6666666666666666, 0.8]) """ @@ -392,11 +392,11 @@ class Accuracy(EvalMetric): Examples -------- - >>> predicts = [mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])] - >>> labels = [mx.nd.array([0, 1, 1])] + >>> predicts = [mx.np.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])] + >>> labels = [mx.np.array([0, 1, 1])] >>> acc = mx.gluon.metric.Accuracy() >>> acc.update(preds = predicts, labels = labels) - >>> print acc.get() + >>> acc.get() ('accuracy', 0.6666666666666666) """ def __init__(self, axis=1, name='accuracy', @@ -467,11 +467,11 @@ class TopKAccuracy(EvalMetric): -------- >>> np.random.seed(999) >>> top_k = 3 - >>> labels = [mx.nd.array([2, 6, 9, 2, 3, 4, 7, 8, 9, 6])] - >>> predicts = [mx.nd.array(np.random.rand(10, 10))] + >>> labels = [mx.np.array([2, 6, 9, 2, 3, 4, 7, 8, 9, 6])] + >>> predicts = [mx.np.array(np.random.rand(10, 10))] >>> acc = mx.gluon.metric.TopKAccuracy(top_k=top_k) >>> acc.update(labels, predicts) - >>> print acc.get() + >>> acc.get() ('top_k_accuracy', 0.3) """ @@ -504,7 +504,7 @@ def update(self, labels, preds): # much faster, which is important since that computation is # single-threaded due to Python GIL. 
pred_label = pred_label.as_np_ndarray().as_in_ctx(label.ctx).astype('float32') - pred_label = numpy.argpartition(pred_label, -self.top_k) + pred_label = numpy.argpartition(pred_label, -self.top_k).as_in_ctx(label.ctx) label = label.as_np_ndarray().astype('int32') check_label_shapes(label, pred_label) num_samples = pred_label.shape[0] @@ -765,11 +765,11 @@ class F1(EvalMetric): Examples -------- - >>> predicts = [mx.nd.array([[0.3, 0.7], [0., 1.], [0.4, 0.6]])] - >>> labels = [mx.nd.array([0., 1., 1.])] + >>> predicts = [mx.np.array([[0.3, 0.7], [0., 1.], [0.4, 0.6]])] + >>> labels = [mx.np.array([0., 1., 1.])] >>> f1 = mx.gluon.metric.F1() >>> f1.update(preds = predicts, labels = labels) - >>> print f1.get() + >>> f1.get() ('f1', 0.8) """ @@ -856,11 +856,11 @@ class Fbeta(F1): Examples -------- - >>> predicts = [mx.nd.array([[0.3, 0.7], [0., 1.], [0.4, 0.6]])] - >>> labels = [mx.nd.array([0., 1., 1.])] + >>> predicts = [mx.np.array([[0.3, 0.7], [0., 1.], [0.4, 0.6]])] + >>> labels = [mx.np.array([0., 1., 1.])] >>> fbeta = mx.gluon.metric.Fbeta(beta=2) >>> fbeta.update(preds = predicts, labels = labels) - >>> print fbeta.get() + >>> fbeta.get() ('fbeta', 0.9090909090909091) """ @@ -892,11 +892,11 @@ class BinaryAccuracy(EvalMetric): Examples -------- - >>> predicts = [mx.nd.array([0.7, 1, 0.55])] - >>> labels = [mx.nd.array([0., 1., 0.])] + >>> predicts = [mx.np.array([0.7, 1, 0.55])] + >>> labels = [mx.np.array([0., 1., 0.])] >>> bacc = mx.gluon.metric.BinaryAccuracy(threshold=0.6) >>> bacc.update(preds = predicts, labels = labels) - >>> print bacc.get() + >>> bacc.get() ('binary_accuracy', 0.6666666666666666) """ @@ -976,13 +976,13 @@ class MCC(EvalMetric): >>> false_negatives = 1 >>> true_positives = 10000 >>> true_negatives = 1 - >>> predicts = [mx.nd.array( + >>> predicts = [mx.np.array( [[.3, .7]]*false_positives + [[.7, .3]]*true_negatives + [[.7, .3]]*false_negatives + [[.3, .7]]*true_positives )] - >>> labels = [mx.nd.array( + >>> labels = [mx.np.array( [0.]*(false_positives + true_negatives) + [1.]*(false_negatives + true_positives) )] @@ -990,9 +990,9 @@ class MCC(EvalMetric): >>> f1.update(preds = predicts, labels = labels) >>> mcc = mx.gluon.metric.MCC() >>> mcc.update(preds = predicts, labels = labels) - >>> print f1.get() + >>> f1.get() ('f1', 0.95233560306652054) - >>> print mcc.get() + >>> mcc.get() ('mcc', 0.01917751877733392) """ @@ -1057,11 +1057,11 @@ class MAE(EvalMetric): Examples -------- - >>> predicts = [mx.nd.array([3, -0.5, 2, 7])] - >>> labels = [mx.nd.array([2.5, 0.0, 2, 8])] + >>> predicts = [mx.np.array([3, -0.5, 2, 7])] + >>> labels = [mx.np.array([2.5, 0.0, 2, 8])] >>> mean_absolute_error = mx.gluon.metric.MAE() >>> mean_absolute_error.update(labels = labels, preds = predicts) - >>> print mean_absolute_error.get() + >>> mean_absolute_error.get() ('mae', 0.5) """ @@ -1118,11 +1118,11 @@ class MSE(EvalMetric): Examples -------- - >>> predicts = [mx.nd.array([3, -0.5, 2, 7])] - >>> labels = [mx.nd.array([2.5, 0.0, 2, 8])] + >>> predicts = [mx.np.array([3, -0.5, 2, 7])] + >>> labels = [mx.np.array([2.5, 0.0, 2, 8])] >>> mean_squared_error = mx.gluon.metric.MSE() >>> mean_squared_error.update(labels = labels, preds = predicts) - >>> print mean_squared_error.get() + >>> mean_squared_error.get() ('mse', 0.375) """ def __init__(self, name='mse', @@ -1178,11 +1178,11 @@ class RMSE(MSE): Examples -------- - >>> predicts = [mx.nd.array([3, -0.5, 2, 7])] - >>> labels = [mx.nd.array([2.5, 0.0, 2, 8])] + >>> predicts = [mx.np.array([3, -0.5, 2, 7])] + >>> labels = 
[mx.np.array([2.5, 0.0, 2, 8])] >>> root_mean_squared_error = mx.gluon.metric.RMSE() >>> root_mean_squared_error.update(labels = labels, preds = predicts) - >>> print root_mean_squared_error.get() + >>> root_mean_squared_error.get() ('rmse', 0.612372457981) """ def __init__(self, name='rmse', @@ -1223,11 +1223,11 @@ class MeanPairwiseDistance(EvalMetric): Examples -------- - >>> predicts = [mx.nd.array([[1., 2.], [3., 4.]])] - >>> labels = [mx.nd.array([[1., 0.], [4., 2.]])] + >>> predicts = [mx.np.array([[1., 2.], [3., 4.]])] + >>> labels = [mx.np.array([[1., 0.], [4., 2.]])] >>> mpd = mx.gluon.metric.MeanPairwiseDistance() >>> mpd.update(labels = labels, preds = predicts) - >>> print mpd.get() + >>> mpd.get() ('mpd', 2.1180338859558105) """ def __init__(self, name='mpd', @@ -1292,11 +1292,11 @@ class MeanCosineSimilarity(EvalMetric): Examples -------- - >>> predicts = [mx.nd.array([[1., 0.], [1., 1.]])] - >>> labels = [mx.nd.array([[3., 4.], [2., 2.]])] + >>> predicts = [mx.np.array([[1., 0.], [1., 1.]])] + >>> labels = [mx.np.array([[3., 4.], [2., 2.]])] >>> mcs = mx.gluon.metric.MeanCosineSimilarity() >>> mcs.update(labels = labels, preds = predicts) - >>> print mcs.get() + >>> mcs.get() ('cos_sim', 0.8) """ def __init__(self, name='cos_sim', @@ -1378,11 +1378,11 @@ class :math:`k`. Examples -------- - >>> predicts = [mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])] - >>> labels = [mx.nd.array([0, 1, 1])] + >>> predicts = [mx.np.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])] + >>> labels = [mx.np.array([0, 1, 1])] >>> ce = mx.gluon.metric.CrossEntropy() >>> ce.update(labels, predicts) - >>> print ce.get() + >>> ce.get() ('cross-entropy', 0.57159948348999023) """ def __init__(self, eps=1e-12, ignore_label=None, axis=-1, from_logits=False, @@ -1414,10 +1414,8 @@ def update(self, labels, preds): "shape mismatch: %s vs. 
%s"%(label.shape, pred.shape) label = label.reshape((label.size,)) if self.from_logits: - pred = ndarray.softmax(pred, axis=self.axis) - pred = ndarray.pick(pred.as_in_context(label.ctx), label.astype(dtype='int32'), axis=self.axis) - label = label.as_np_ndarray() - pred = pred.as_np_ndarray() + pred = npx.softmax(pred, axis=self.axis) + pred = npx.pick(pred.as_in_ctx(label.ctx), label.astype(dtype='int32'), axis=self.axis) if self.ignore_label is not None: ignore = (label == self.ignore_label).astype(pred.dtype) num -= ignore.sum() @@ -1478,11 +1476,11 @@ class Perplexity(CrossEntropy): Examples -------- - >>> predicts = [mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])] - >>> labels = [mx.nd.array([0, 1, 1])] + >>> predicts = [mx.np.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])] + >>> labels = [mx.np.array([0, 1, 1])] >>> perp = mx.gluon.metric.Perplexity(ignore_label=None) >>> perp.update(labels, predicts) - >>> print perp.get() + >>> perp.get() ('Perplexity', 1.7710976285155853) """ def __init__(self, eps=1e-12, ignore_label=None, axis=-1, from_logits=False, @@ -1523,11 +1521,11 @@ class PearsonCorrelation(EvalMetric): Examples -------- - >>> predicts = [mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])] - >>> labels = [mx.nd.array([[1, 0], [0, 1], [0, 1]])] + >>> predicts = [mx.np.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])] + >>> labels = [mx.np.array([[1, 0], [0, 1], [0, 1]])] >>> pr = mx.gluon.metric.PearsonCorrelation() >>> pr.update(labels, predicts) - >>> print pr.get() + >>> pr.get() ('pearsonr', 0.42163704544016178) """ def __init__(self, name='pearsonr', @@ -1628,13 +1626,13 @@ class PCC(EvalMetric): >>> false_negatives = 1 >>> true_positives = 10000 >>> true_negatives = 1 - >>> predicts = [mx.nd.array( + >>> predicts = [mx.np.array( [[.3, .7]]*false_positives + [[.7, .3]]*true_negatives + [[.7, .3]]*false_negatives + [[.3, .7]]*true_positives )] - >>> labels = [mx.nd.array( + >>> labels = [mx.np.array( [0]*(false_positives + true_negatives) + [1]*(false_negatives + true_positives) )] @@ -1642,9 +1640,9 @@ class PCC(EvalMetric): >>> f1.update(preds = predicts, labels = labels) >>> pcc = mx.gluon.metric.PCC() >>> pcc.update(preds = predicts, labels = labels) - >>> print f1.get() + >>> f1.get() ('f1', 0.95233560306652054) - >>> print pcc.get() + >>> pcc.get() ('pcc', 0.01917751877733392) """ def __init__(self, name='pcc', @@ -1712,6 +1710,7 @@ def reset(self): @register +@use_np class Loss(EvalMetric): """Dummy metric for directly printing loss. 
@@ -1737,7 +1736,7 @@ def update(self, _, preds): preds = [preds] for pred in preds: - loss = ndarray.sum(pred).asscalar() + loss = pred.sum().item() self.sum_metric += loss self.num_inst += pred.size @@ -1780,12 +1779,12 @@ class CustomMetric(EvalMetric): Examples -------- - >>> predicts = [mx.nd.array(np.array([3, -0.5, 2, 7]).reshape(4,1))] - >>> labels = [mx.nd.array(np.array([2.5, 0.0, 2, 8]).reshape(4,1))] + >>> predicts = [mx.np.array(np.array([3, -0.5, 2, 7]).reshape(4,1))] + >>> labels = [mx.np.array(np.array([2.5, 0.0, 2, 8]).reshape(4,1))] >>> feval = lambda x, y : (x + y).mean() >>> eval_metrics = mx.gluon.metric.CustomMetric(feval=feval) >>> eval_metrics.update(labels, predicts) - >>> print eval_metrics.get() + >>> eval_metrics.get() ('custom()', 6.0) """ def __init__(self, feval, name=None, allow_extra_outputs=False, diff --git a/python/mxnet/gluon/model_zoo/vision/__init__.py b/python/mxnet/gluon/model_zoo/vision/__init__.py index 8ca7a34ba363..cbce5190a757 100644 --- a/python/mxnet/gluon/model_zoo/vision/__init__.py +++ b/python/mxnet/gluon/model_zoo/vision/__init__.py @@ -59,8 +59,8 @@ image = image/255 normalized = mx.image.color_normalize(image, - mean=mx.nd.array([0.485, 0.456, 0.406]), - std=mx.nd.array([0.229, 0.224, 0.225])) + mean=mx.np.array([0.485, 0.456, 0.406]), + std=mx.np.array([0.229, 0.224, 0.225])) .. _AlexNet: https://arxiv.org/abs/1404.5997 .. _DenseNet: https://arxiv.org/abs/1608.06993 diff --git a/python/mxnet/gluon/model_zoo/vision/alexnet.py b/python/mxnet/gluon/model_zoo/vision/alexnet.py index 7bdacc915fb3..713ed9a82329 100644 --- a/python/mxnet/gluon/model_zoo/vision/alexnet.py +++ b/python/mxnet/gluon/model_zoo/vision/alexnet.py @@ -26,8 +26,10 @@ from ...block import HybridBlock from ... import nn from .... import base +from ....util import use_np # Net +@use_np class AlexNet(HybridBlock): r"""AlexNet model from the `"One weird trick..." `_ paper. @@ -60,7 +62,7 @@ def __init__(self, classes=1000, **kwargs): self.output = nn.Dense(classes) - def hybrid_forward(self, F, x): + def forward(self, x): x = self.features(x) x = self.output(x) return x diff --git a/python/mxnet/gluon/model_zoo/vision/densenet.py b/python/mxnet/gluon/model_zoo/vision/densenet.py index 21d984dcfd21..ffa06c03637d 100644 --- a/python/mxnet/gluon/model_zoo/vision/densenet.py +++ b/python/mxnet/gluon/model_zoo/vision/densenet.py @@ -26,6 +26,7 @@ from ...block import HybridBlock from ... import nn from .... import base +from ....util import use_np # Helpers def _make_dense_block(num_layers, bn_size, growth_rate, dropout): @@ -60,6 +61,7 @@ def _make_transition(num_output_features): return out # Net +@use_np class DenseNet(HybridBlock): r"""Densenet-BC model from the `"Densely Connected Convolutional Networks" `_ paper. @@ -105,7 +107,7 @@ def __init__(self, num_init_features, growth_rate, block_config, self.output = nn.Dense(classes) - def hybrid_forward(self, F, x): + def forward(self, x): x = self.features(x) x = self.output(x) return x diff --git a/python/mxnet/gluon/model_zoo/vision/inception.py b/python/mxnet/gluon/model_zoo/vision/inception.py index c2db7f9a2148..ca62d4eb69ef 100644 --- a/python/mxnet/gluon/model_zoo/vision/inception.py +++ b/python/mxnet/gluon/model_zoo/vision/inception.py @@ -26,6 +26,7 @@ from ...block import HybridBlock from ... import nn from .... 
import base +from ....util import use_np # Helpers def _make_basic_conv(**kwargs): @@ -149,6 +150,7 @@ def make_aux(classes): return out # Net +@use_np class Inception3(HybridBlock): r"""Inception v3 model from `"Rethinking the Inception Architecture for Computer Vision" @@ -186,7 +188,7 @@ def __init__(self, classes=1000, **kwargs): self.output = nn.Dense(classes) - def hybrid_forward(self, F, x): + def forward(self, x): x = self.features(x) x = self.output(x) return x diff --git a/python/mxnet/gluon/model_zoo/vision/mobilenet.py b/python/mxnet/gluon/model_zoo/vision/mobilenet.py index 69cd1c03ba10..40dc40d340ef 100644 --- a/python/mxnet/gluon/model_zoo/vision/mobilenet.py +++ b/python/mxnet/gluon/model_zoo/vision/mobilenet.py @@ -30,18 +30,20 @@ from ... import nn from ....context import cpu from ...block import HybridBlock -from .... import base +from .... import base, np +from ....util import use_np # Helpers +@use_np class RELU6(nn.HybridBlock): """Relu6 used in MobileNetV2.""" def __init__(self, **kwargs): super(RELU6, self).__init__(**kwargs) - def hybrid_forward(self, F, x): - return F.clip(x, 0, 6, name="relu6") + def forward(self, x): + return np.clip(x, 0, 6) # pylint: disable= too-many-arguments @@ -59,6 +61,7 @@ def _add_conv_dw(out, dw_channels, channels, stride, relu6=False): _add_conv(out, channels=channels, relu6=relu6) +@use_np class LinearBottleneck(nn.HybridBlock): r"""LinearBottleneck used in MobileNetV2 model from the `"Inverted Residuals and Linear Bottlenecks: @@ -87,14 +90,15 @@ def __init__(self, in_channels, channels, t, stride, **kwargs): pad=1, num_group=in_channels * t, relu6=True) _add_conv(self.out, channels, active=False, relu6=True) - def hybrid_forward(self, F, x): + def forward(self, x): out = self.out(x) if self.use_shortcut: - out = F.elemwise_add(out, x) + out = np.add(out, x) return out # Net +@use_np class MobileNet(HybridBlock): r"""MobileNet model from the `"MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" @@ -126,12 +130,13 @@ def __init__(self, multiplier=1.0, classes=1000, **kwargs): self.output = nn.Dense(classes) - def hybrid_forward(self, F, x): + def forward(self, x): x = self.features(x) x = self.output(x) return x +@use_np class MobileNetV2(nn.HybridBlock): r"""MobileNetV2 model from the `"Inverted Residuals and Linear Bottlenecks: @@ -175,7 +180,7 @@ def __init__(self, multiplier=1.0, classes=1000, **kwargs): nn.Flatten() ) - def hybrid_forward(self, F, x): + def forward(self, x): x = self.features(x) x = self.output(x) return x diff --git a/python/mxnet/gluon/model_zoo/vision/resnet.py b/python/mxnet/gluon/model_zoo/vision/resnet.py index 147ad6af2d9e..33d921bb398a 100644 --- a/python/mxnet/gluon/model_zoo/vision/resnet.py +++ b/python/mxnet/gluon/model_zoo/vision/resnet.py @@ -32,7 +32,8 @@ from ...block import HybridBlock from ... import nn from .... import base -from .... util import is_np_array +from .... util import use_np +from .... import npx # Helpers def _conv3x3(channels, stride, in_channels): @@ -41,6 +42,7 @@ def _conv3x3(channels, stride, in_channels): # Blocks +@use_np class BasicBlockV1(HybridBlock): r"""BasicBlock V1 from `"Deep Residual Learning for Image Recognition" `_ paper. 
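Once converted, the model_zoo networks are driven with mx.np inputs through plain forward() calls; a usage sketch assuming MXNet 2.x, with resnet18_v1 and a 224x224 input chosen purely as an example:

import mxnet as mx
from mxnet.gluon.model_zoo import vision

mx.npx.set_np()
net = vision.resnet18_v1(pretrained=False)
net.initialize()
x = mx.np.random.uniform(size=(1, 3, 224, 224))
y = net(x)
print(y.shape)   # (1, 1000) class scores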
@@ -73,7 +75,7 @@ def __init__(self, channels, stride, downsample=False, in_channels=0, **kwargs): else: self.downsample = None - def hybrid_forward(self, F, x): + def forward(self, x): residual = x x = self.body(x) @@ -81,12 +83,12 @@ def hybrid_forward(self, F, x): if self.downsample: residual = self.downsample(residual) - act = F.npx.activation if is_np_array() else F.Activation - x = act(residual+x, act_type='relu') + x = npx.activation(residual+x, act_type='relu') return x +@use_np class BottleneckV1(HybridBlock): r"""Bottleneck V1 from `"Deep Residual Learning for Image Recognition" `_ paper. @@ -122,7 +124,7 @@ def __init__(self, channels, stride, downsample=False, in_channels=0, **kwargs): else: self.downsample = None - def hybrid_forward(self, F, x): + def forward(self, x): residual = x x = self.body(x) @@ -130,11 +132,11 @@ def hybrid_forward(self, F, x): if self.downsample: residual = self.downsample(residual) - act = F.npx.activation if is_np_array() else F.Activation - x = act(x + residual, act_type='relu') + x = npx.activation(x + residual, act_type='relu') return x +@use_np class BasicBlockV2(HybridBlock): r"""BasicBlock V2 from `"Identity Mappings in Deep Residual Networks" @@ -164,22 +166,22 @@ def __init__(self, channels, stride, downsample=False, in_channels=0, **kwargs): else: self.downsample = None - def hybrid_forward(self, F, x): + def forward(self, x): residual = x x = self.bn1(x) - act = F.npx.activation if is_np_array() else F.Activation - x = act(x, act_type='relu') + x = npx.activation(x, act_type='relu') if self.downsample: residual = self.downsample(x) x = self.conv1(x) x = self.bn2(x) - x = act(x, act_type='relu') + x = npx.activation(x, act_type='relu') x = self.conv2(x) return x + residual +@use_np class BottleneckV2(HybridBlock): r"""Bottleneck V2 from `"Identity Mappings in Deep Residual Networks" @@ -211,27 +213,27 @@ def __init__(self, channels, stride, downsample=False, in_channels=0, **kwargs): else: self.downsample = None - def hybrid_forward(self, F, x): + def forward(self, x): residual = x x = self.bn1(x) - act = F.npx.activation if is_np_array() else F.Activation - x = act(x, act_type='relu') + x = npx.activation(x, act_type='relu') if self.downsample: residual = self.downsample(x) x = self.conv1(x) x = self.bn2(x) - x = act(x, act_type='relu') + x = npx.activation(x, act_type='relu') x = self.conv2(x) x = self.bn3(x) - x = act(x, act_type='relu') + x = npx.activation(x, act_type='relu') x = self.conv3(x) return x + residual # Nets +@use_np class ResNetV1(HybridBlock): r"""ResNet V1 model from `"Deep Residual Learning for Image Recognition" @@ -277,13 +279,14 @@ def _make_layer(self, block, layers, channels, stride, in_channels=0): layer.add(block(channels, 1, False, in_channels=channels)) return layer - def hybrid_forward(self, F, x): + def forward(self, x): x = self.features(x) x = self.output(x) return x +@use_np class ResNetV2(HybridBlock): r"""ResNet V2 model from `"Identity Mappings in Deep Residual Networks" @@ -335,7 +338,7 @@ def _make_layer(self, block, layers, channels, stride, in_channels=0): layer.add(block(channels, 1, False, in_channels=channels)) return layer - def hybrid_forward(self, F, x): + def forward(self, x): x = self.features(x) x = self.output(x) return x diff --git a/python/mxnet/gluon/model_zoo/vision/squeezenet.py b/python/mxnet/gluon/model_zoo/vision/squeezenet.py index 6d3336cefd18..ac04129369c9 100644 --- a/python/mxnet/gluon/model_zoo/vision/squeezenet.py +++ b/python/mxnet/gluon/model_zoo/vision/squeezenet.py @@ 
-26,6 +26,7 @@ from ...block import HybridBlock from ... import nn from .... import base +from ....util import use_np # Helpers def _make_fire(squeeze_channels, expand1x1_channels, expand3x3_channels): @@ -46,6 +47,7 @@ def _make_fire_conv(channels, kernel_size, padding=0): return out # Net +@use_np class SqueezeNet(HybridBlock): r"""SqueezeNet model from the `"SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and <0.5MB model size" `_ paper. @@ -102,7 +104,7 @@ def __init__(self, version, classes=1000, **kwargs): self.output.add(nn.AvgPool2D(13)) self.output.add(nn.Flatten()) - def hybrid_forward(self, F, x): + def forward(self, x): x = self.features(x) x = self.output(x) return x diff --git a/python/mxnet/gluon/model_zoo/vision/vgg.py b/python/mxnet/gluon/model_zoo/vision/vgg.py index 4dd04f8b03b2..0734bd370169 100644 --- a/python/mxnet/gluon/model_zoo/vision/vgg.py +++ b/python/mxnet/gluon/model_zoo/vision/vgg.py @@ -30,8 +30,10 @@ from ...block import HybridBlock from ... import nn from .... import base +from ....util import use_np +@use_np class VGG(HybridBlock): r"""VGG model from the `"Very Deep Convolutional Networks for Large-Scale Image Recognition" `_ paper. @@ -78,7 +80,7 @@ def _make_features(self, layers, filters, batch_norm): featurizer.add(nn.MaxPool2D(strides=2)) return featurizer - def hybrid_forward(self, F, x): + def forward(self, x): x = self.features(x) x = self.output(x) return x diff --git a/python/mxnet/gluon/nn/activations.py b/python/mxnet/gluon/nn/activations.py index afa5a223d06b..02f62821223b 100644 --- a/python/mxnet/gluon/nn/activations.py +++ b/python/mxnet/gluon/nn/activations.py @@ -20,12 +20,13 @@ """Basic neural network layers.""" __all__ = ['Activation', 'LeakyReLU', 'PReLU', 'ELU', 'SELU', 'Swish', 'GELU', 'SiLU'] -from ... import initializer +from ... import initializer, npx from ..block import HybridBlock from ..parameter import Parameter -from ...util import is_np_array +from ...util import use_np +@use_np class Activation(HybridBlock): r"""Applies an activation function to input. @@ -49,9 +50,8 @@ def __init__(self, activation, **kwargs): def _alias(self): return self._act_type - def hybrid_forward(self, F, x): - act = F.npx.activation if is_np_array() else F.Activation - return act(x, act_type=self._act_type, name='fwd') + def forward(self, x): + return npx.activation(x, act_type=self._act_type, name='fwd') def __repr__(self): s = '{name}({_act_type})' @@ -59,6 +59,7 @@ def __repr__(self): **self.__dict__) +@use_np class LeakyReLU(HybridBlock): r"""Leaky version of a Rectified Linear Unit. @@ -90,9 +91,8 @@ def __init__(self, alpha, **kwargs): super(LeakyReLU, self).__init__(**kwargs) self._alpha = alpha - def hybrid_forward(self, F, x): - leaky_relu = F.npx.leaky_relu if is_np_array() else F.LeakyReLU - return leaky_relu(x, act_type='leaky', slope=self._alpha, name='fwd') + def forward(self, x): + return npx.leaky_relu(x, act_type='leaky', slope=self._alpha, name='fwd') def __repr__(self): s = '{name}({alpha})' @@ -100,6 +100,7 @@ def __repr__(self): alpha=self._alpha) +@use_np class PReLU(HybridBlock): r"""Parametric leaky version of a Rectified Linear Unit. `_ paper. 
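A usage sketch of the activation blocks after the switch to npx, assuming MXNet 2.x; the input values are illustrative:

import mxnet as mx
from mxnet.gluon import nn

mx.npx.set_np()
x = mx.np.array([[-2.0, -0.5, 0.0, 0.5, 2.0]])
act = nn.LeakyReLU(alpha=0.1)
print(act(x))   # negative entries scaled by 0.1 via npx.leaky_relu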
@@ -137,11 +138,12 @@ def __init__(self, alpha_initializer=initializer.Constant(0.25), super(PReLU, self).__init__(**kwargs) self.alpha = Parameter('alpha', shape=(in_channels,), init=alpha_initializer) - def hybrid_forward(self, F, x, alpha): - leaky_relu = F.npx.leaky_relu if is_np_array() else F.LeakyReLU - return leaky_relu(x, gamma=alpha, act_type='prelu', name='fwd') + def forward(self, x): + ctx = x.ctx + return npx.leaky_relu(x, gamma=self.alpha.data(ctx), act_type='prelu', name='fwd') +@use_np class ELU(HybridBlock): r""" Exponential Linear Unit (ELU) @@ -166,11 +168,11 @@ def __init__(self, alpha=1.0, **kwargs): super(ELU, self).__init__(**kwargs) self._alpha = alpha - def hybrid_forward(self, F, x): - leaky_relu = F.npx.leaky_relu if is_np_array() else F.LeakyReLU - return leaky_relu(x, act_type='elu', slope=self._alpha) + def forward(self, x): + return npx.leaky_relu(x, act_type='elu', slope=self._alpha) +@use_np class SELU(HybridBlock): r""" Scaled Exponential Linear Unit (SELU) @@ -187,11 +189,11 @@ class SELU(HybridBlock): def __init__(self, **kwargs): super(SELU, self).__init__(**kwargs) - def hybrid_forward(self, F, x): - leaky_relu = F.npx.leaky_relu if is_np_array() else F.LeakyReLU - return leaky_relu(x, act_type='selu', name='fwd') + def forward(self, x): + return npx.leaky_relu(x, act_type='selu', name='fwd') +@use_np class GELU(HybridBlock): r""" Gaussian Exponential Linear Unit (GELU) @@ -208,11 +210,11 @@ class GELU(HybridBlock): def __init__(self, **kwargs): super(GELU, self).__init__(**kwargs) - def hybrid_forward(self, F, x): - leaky_relu = F.npx.leaky_relu if is_np_array() else F.LeakyReLU - return leaky_relu(x, act_type='gelu', name='fwd') + def forward(self, x): + return npx.leaky_relu(x, act_type='gelu', name='fwd') +@use_np class Swish(HybridBlock): r""" Swish Activation function (SiLU with a hyperparameter) @@ -235,13 +237,11 @@ def __init__(self, beta=1.0, **kwargs): super(Swish, self).__init__(**kwargs) self._beta = beta - def hybrid_forward(self, F, x): - if is_np_array(): - return x * F.npx.sigmoid(self._beta * x) - else: - return x * F.sigmoid(self._beta * x, name='fwd') + def forward(self, x): + return x * npx.sigmoid(self._beta * x) +@use_np class SiLU(HybridBlock): r""" Sigmoid Linear Units @@ -264,8 +264,5 @@ class SiLU(HybridBlock): def __init__(self, **kwargs): super(SiLU, self).__init__(**kwargs) - def hybrid_forward(self, F, x): - if is_np_array(): - return x * F.npx.sigmoid(x) - else: - return x * F.sigmoid(x, name='fwd') + def forward(self, x): + return x * npx.sigmoid(x) diff --git a/python/mxnet/gluon/nn/basic_layers.py b/python/mxnet/gluon/nn/basic_layers.py index 0922f573ab4b..c542544cfd29 100644 --- a/python/mxnet/gluon/nn/basic_layers.py +++ b/python/mxnet/gluon/nn/basic_layers.py @@ -23,14 +23,13 @@ 'Flatten', 'Lambda', 'HybridLambda', 'Concatenate', 'HybridConcatenate', 'Identity'] import warnings import uuid -import inspect -import numpy as np +import numpy as _np from .activations import Activation from ..block import Block, HybridBlock from ..utils import _indent -from ... import ndarray as nd, np as mxnp, symbol as sym, context, _deferred_compute as dc -from ...util import is_np_array +from ... import np, npx, context +from ...util import use_np from ..parameter import Parameter @@ -101,6 +100,7 @@ def hybridize(self, active=True, **kwargs): super(Sequential, self).hybridize(active, **kwargs) +@use_np class HybridSequential(HybridBlock): """Stacks HybridBlocks sequentially. 
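Before the container hunks that follow, a usage sketch of HybridSequential in the new scheme, assuming MXNet 2.x; layer sizes are illustrative and hybridize() is optional:

import mxnet as mx
from mxnet.gluon import nn

mx.npx.set_np()
net = nn.HybridSequential()
net.add(nn.Dense(64, activation='relu'),
        nn.Dense(10))
net.initialize()
net.hybridize()
y = net(mx.np.random.uniform(size=(2, 32)))
print(y.shape)   # (2, 10)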
@@ -114,7 +114,6 @@ class HybridSequential(HybridBlock): def __init__(self): super().__init__() self._layers = [] - self._v2_checked = False def add(self, *blocks): """Adds block on top of the stack.""" @@ -122,32 +121,7 @@ def add(self, *blocks): self._layers.append(block) self.register_child(block) - def __call__(self, *args, **kwargs): - if self._active and not self._v2_checked and not dc.is_deferred_compute(): - # If any of the child Blocks implements the Gluon 2 interface, the - # container must not pass a Symbol to them - if any(inspect.unwrap(chld().hybrid_forward.__func__) is - HybridBlock.hybrid_forward for chld in self._children.values()): - self._v2 = True - self._v2_checked = True - self.forward = self._forward - - return super().__call__(*args, **kwargs) - - - def _forward(self, x, *args): - for block in self._children.values(): - x = block()(x, *args) - args = [] - if isinstance(x, (tuple, list)): - args = x[1:] - x = x[0] - if args: - x = tuple([x] + list(args)) - return x - - - def hybrid_forward(self, F, x, *args): + def forward(self, x, *args): for block in self._children.values(): x = block()(x, *args) args = [] @@ -178,6 +152,7 @@ def __len__(self): return len(self._children) +@use_np class Dense(HybridBlock): r"""Just your regular densely-connected NN layer. @@ -247,14 +222,25 @@ def __init__(self, units, activation=None, use_bias=True, flatten=True, else: self.act = None - def hybrid_forward(self, F, x, weight, bias=None): - fc = F.npx.fully_connected if is_np_array() else F.FullyConnected - act = fc(x, weight, bias, no_bias=bias is None, num_hidden=self._units, - flatten=self._flatten, name='fwd') + def forward(self, x): + ctx = x.ctx + act = npx.fully_connected(x, self.weight.data(ctx), + self.bias.data(ctx) if self.bias is not None else None, + no_bias=self.bias is None, + num_hidden=self._units, flatten=self._flatten, name='fwd') if self.act is not None: act = self.act(act) return act + def infer_shape(self, x, *args): + if self._flatten: + num_input = 1 + for i in range(1, x.ndim): + num_input *= x.shape[i] + self.weight.shape = (self.weight.shape[0], num_input) + else: + self.weight.shape = (self.weight.shape[0], x.shape[x.ndim - 1]) + def __repr__(self): s = '{name}({layout}, {act})' shape = self.weight.shape @@ -263,6 +249,7 @@ def __repr__(self): layout='{0} -> {1}'.format(shape[1] if shape[1] else None, shape[0])) +@use_np class Dropout(HybridBlock): """Applies Dropout to the input. @@ -293,13 +280,11 @@ def __init__(self, rate, axes=(), **kwargs): self._rate = rate self._axes = axes - def hybrid_forward(self, F, x): + def forward(self, x): if self._rate > 0: - dropout = F.npx.dropout if is_np_array() else F.Dropout - return dropout(x, p=self._rate, axes=self._axes, name='fwd', cudnn_off=False) + return npx.dropout(x, p=self._rate, axes=self._axes, name='fwd', cudnn_off=False) else: - copy = F.np.copy if is_np_array() else F.identity - return copy(x) + return np.copy(x) def __repr__(self): s = '{name}(p = {_rate}, axes={_axes})' @@ -307,6 +292,7 @@ def __repr__(self): **self.__dict__) +@use_np class _BatchNorm(HybridBlock): """Abstract BatchNorm layer (private, used as implementation base). Batch normalization layer (Ioffe and Szegedy, 2014). 
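With the infer_shape hook added to Dense above, parameter shapes are resolved from the shape of the first input; a sketch with illustrative sizes, assuming MXNet 2.x:

import mxnet as mx
from mxnet.gluon import nn

mx.npx.set_np()
layer = nn.Dense(4)            # in_units left unspecified
layer.initialize()
print(layer.weight.shape)      # (4, 0): input width unknown until the first call
layer(mx.np.ones((2, 7)))
print(layer.weight.shape)      # (4, 7) after infer_shape ran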
@@ -367,6 +353,7 @@ def __init__(self, axis=1, momentum=0.9, epsilon=1e-5, center=True, scale=True, self._kwargs = {'axis': axis, 'eps': epsilon, 'momentum': momentum, 'fix_gamma': not scale, 'use_global_stats': use_global_stats} self.fuse_relu = fuse_relu + self._axis = axis if in_channels != 0: self.in_channels = in_channels @@ -390,16 +377,30 @@ def __init__(self, axis=1, momentum=0.9, epsilon=1e-5, center=True, scale=True, differentiable=False) def cast(self, dtype): - if np.dtype(dtype).name == 'float16': + if _np.dtype(dtype).name == 'float16': dtype = 'float32' super(_BatchNorm, self).cast(dtype) - def hybrid_forward(self, F, x, gamma, beta, running_mean, running_var): - batch_norm = F.npx.batch_norm if is_np_array() else F.BatchNorm - if (not is_np_array()) and self.fuse_relu: - batch_norm = F.contrib.BatchNormWithReLU - return batch_norm(x, gamma, beta, running_mean, running_var, - name='fwd', **self._kwargs) + def forward(self, x): + ctx = x.ctx + if self.fuse_relu: + return npx.batch_norm_with_relu(x, self.gamma.data(ctx), self.beta.data(ctx), + self.running_mean.data(ctx), + self.running_var.data(ctx), + name='fwd', **self._kwargs) + else: + return npx.batch_norm(x, self.gamma.data(ctx), self.beta.data(ctx), + self.running_mean.data(ctx), + self.running_var.data(ctx), + name='fwd', **self._kwargs) + + def infer_shape(self, x, *args): + channel_axis = self._axis if self._axis >= 0 else self._axis + x.ndim + channel_count = x.shape[channel_axis] + self.gamma.shape = (channel_count,) + self.beta.shape = (channel_count,) + self.running_mean.shape = (channel_count,) + self.running_var.shape = (channel_count,) def __repr__(self): s = '{name}({content}' @@ -538,6 +539,7 @@ def __init__(self, axis=1, momentum=0.9, epsilon=1e-5, center=True, scale=True, in_channels=in_channels, **kwargs) +@use_np class Embedding(HybridBlock): r"""Turns non-negative integers (indexes/tokens) into dense vectors of fixed size. eg. [4, 20] -> [[0.25, 0.1], [0.6, -0.2]] @@ -572,6 +574,7 @@ class Embedding(HybridBlock): def __init__(self, input_dim, output_dim, dtype='float32', weight_initializer=None, sparse_grad=False, **kwargs): super(Embedding, self).__init__(**kwargs) + assert not sparse_grad, "Currently, sparse feature is not supported in Gluon2.0" grad_stype = 'row_sparse' if sparse_grad else 'default' self._kwargs = {'input_dim': input_dim, 'output_dim': output_dim, 'dtype': dtype, 'sparse_grad': sparse_grad} @@ -579,9 +582,9 @@ def __init__(self, input_dim, output_dim, dtype='float32', init=weight_initializer, dtype=dtype, allow_deferred_init=True, grad_stype=grad_stype) - def hybrid_forward(self, F, x, weight): - embedding = F.npx.embedding if is_np_array() else F.Embedding - return embedding(x, weight, name='fwd', **self._kwargs) + def forward(self, x): + ctx = x.ctx + return npx.embedding(x, self.weight.data(ctx), name='fwd', **self._kwargs) def __repr__(self): s = '{block_name}({input_dim} -> {output_dim}, {dtype})' @@ -589,6 +592,7 @@ def __repr__(self): **self._kwargs) +@use_np class Flatten(HybridBlock): r"""Flattens the input to two dimensional. @@ -601,14 +605,14 @@ class Flatten(HybridBlock): def __init__(self, **kwargs): super(Flatten, self).__init__(**kwargs) - def hybrid_forward(self, F, x): - flatten = F.npx.batch_flatten if is_np_array() else F.flatten - return flatten(x) + def forward(self, x): + return npx.batch_flatten(x) def __repr__(self): return self.__class__.__name__ +@use_np class InstanceNorm(HybridBlock): r""" Applies instance normalization to the n-dimensional input array. 
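The batch-norm layers above now size gamma, beta, and the running statistics from the channel axis of the first input; a sketch with an illustrative NCHW batch, assuming MXNet 2.x:

import mxnet as mx
from mxnet.gluon import nn

mx.npx.set_np()
bn = nn.BatchNorm()            # default axis=1 (channels of NCHW)
bn.initialize()
bn(mx.np.random.uniform(size=(2, 3, 8, 8)))
print(bn.gamma.shape)          # (3,) after infer_shape ran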
@@ -663,7 +667,7 @@ class InstanceNorm(HybridBlock): Examples -------- >>> # Input of shape (2,1,2) - >>> x = mx.nd.array([[[ 1.1, 2.2]], + >>> x = mx.np.array([[[ 1.1, 2.2]], ... [[ 3.3, 4.4]]]) >>> # Instance normalization is calculated with the above formula >>> layer = InstanceNorm() @@ -671,7 +675,6 @@ class InstanceNorm(HybridBlock): >>> layer(x) [[[-0.99998355 0.99998331]] [[-0.99998319 0.99998361]]] - """ def __init__(self, axis=1, epsilon=1e-5, center=True, scale=False, beta_initializer='zeros', gamma_initializer='ones', @@ -687,13 +690,18 @@ def __init__(self, axis=1, epsilon=1e-5, center=True, scale=False, shape=(in_channels,), init=beta_initializer, allow_deferred_init=True) - def hybrid_forward(self, F, x, gamma, beta): + def forward(self, x): + ctx = x.ctx if self._axis == 1: - return F.InstanceNorm(x, gamma, beta, - name='fwd', eps=self._epsilon) + return npx.instance_norm(x, self.gamma.data(ctx), self.beta.data(ctx), + name='fwd', eps=self._epsilon) x = x.swapaxes(1, self._axis) - return F.InstanceNorm(x, gamma, beta, name='fwd', - eps=self._epsilon).swapaxes(1, self._axis) + return npx.instance_norm(x, self.gamma.data(ctx), self.beta.data(ctx), + name='fwd', eps=self._epsilon).swapaxes(1, self._axis) + + def infer_shape(self, x, *args): + self.gamma.shape = (x.shape[1],) + self.beta.shape = (x.shape[1],) def __repr__(self): s = '{name}({content}' @@ -705,6 +713,7 @@ def __repr__(self): for k, v in self._kwargs.items()])) +@use_np class LayerNorm(HybridBlock): r""" Applies layer normalization to the n-dimensional input array. @@ -750,14 +759,13 @@ class LayerNorm(HybridBlock): Examples -------- >>> # Input of shape (2, 5) - >>> x = mx.nd.array([[1, 2, 3, 4, 5], [1, 1, 2, 2, 2]]) + >>> x = mx.np.array([[1, 2, 3, 4, 5], [1, 1, 2, 2, 2]]) >>> # Layer normalization is calculated with the above formula >>> layer = LayerNorm() >>> layer.initialize(ctx=mx.cpu(0)) >>> layer(x) [[-1.41421 -0.707105 0. 0.707105 1.41421 ] [-1.2247195 -1.2247195 0.81647956 0.81647956 0.81647956]] - """ def __init__(self, axis=-1, epsilon=1e-5, center=True, scale=True, beta_initializer='zeros', gamma_initializer='ones', @@ -775,9 +783,16 @@ def __init__(self, axis=-1, epsilon=1e-5, center=True, scale=True, shape=(in_channels,), init=beta_initializer, allow_deferred_init=True) - def hybrid_forward(self, F, data, gamma, beta): - layer_norm = F.npx.layer_norm if is_np_array() else F.LayerNorm - return layer_norm(data, gamma=gamma, beta=beta, axis=self._axis, eps=self._epsilon) + def forward(self, data): + ctx = data.ctx + return npx.layer_norm(data, gamma=self.gamma.data(ctx), + beta=self.beta.data(ctx), axis=self._axis, eps=self._epsilon) + + def infer_shape(self, data, *args): + channel_axis = self._axis if self._axis >= 0 else self._axis + data.ndim + channel_count = data.shape[channel_axis] + self.gamma.shape = (channel_count,) + self.beta.shape = (channel_count,) def __repr__(self): s = '{name}({content}' @@ -789,6 +804,7 @@ def __repr__(self): for k, v in self._kwargs.items()])) +@use_np class GroupNorm(HybridBlock): r""" Applies group normalization to the n-dimensional input array. 
@@ -832,7 +848,7 @@ class GroupNorm(HybridBlock): Examples -------- >>> # Input of shape (2, 3, 4) - >>> x = mx.nd.array([[[ 0, 1, 2, 3], + >>> x = mx.np.array([[[ 0, 1, 2, 3], [ 4, 5, 6, 7], [ 8, 9, 10, 11]], [[12, 13, 14, 15], @@ -848,7 +864,6 @@ class GroupNorm(HybridBlock): [[-1.5932543 -1.3035717 -1.0138891 -0.7242065] [-0.4345239 -0.1448413 0.1448413 0.4345239] [ 0.7242065 1.0138891 1.3035717 1.5932543]]] - """ def __init__(self, num_groups=1, epsilon=1e-5, center=True, scale=True, beta_initializer='zeros', gamma_initializer='ones', @@ -866,10 +881,16 @@ def __init__(self, num_groups=1, epsilon=1e-5, center=True, scale=True, shape=(in_channels,), init=beta_initializer, allow_deferred_init=True) - def hybrid_forward(self, F, data, gamma, beta): - norm_data = F.GroupNorm(data, gamma=gamma, beta=beta, num_groups=self._num_groups, eps=self._epsilon) + def forward(self, data): + ctx = data.ctx + norm_data = npx.group_norm(data, gamma=self.gamma.data(ctx), beta=self.beta.data(ctx), + num_groups=self._num_groups, eps=self._epsilon) return norm_data + def infer_shape(self, data, *args): + self.gamma.shape = (data.shape[1],) + self.beta.shape = (data.shape[1],) + def __repr__(self): s = '{name}({content}' in_channels = self.gamma.shape[0] @@ -894,7 +915,7 @@ class Lambda(Block): 2) a function that conforms to ``def function(*args)``. For example:: - block = Lambda(lambda x: nd.LeakyReLU(x, slope=0.1)) + block = Lambda(lambda x: npx.leaky_relu(x, slope=0.1)) Inputs: - ** *args **: one or more input data. Their shapes depend on the function. @@ -905,9 +926,13 @@ class Lambda(Block): def __init__(self, function): super(Lambda, self).__init__() if isinstance(function, str): - assert hasattr(nd, function), \ - "Function name %s is not found in ndarray." % function - self._func_impl = getattr(nd, function) + if hasattr(np, function): + self._func_impl = getattr(np, function) + elif hasattr(npx, function): + self._func_impl = getattr(npx, function) + else: + raise Exception("Function name %s is not found in np/npx." % function) + self._func_name = function elif callable(function): self._func_impl = function else: @@ -923,6 +948,7 @@ def __repr__(self): function=self._func_impl.__name__) +@use_np class HybridLambda(HybridBlock): r"""Wraps an operator or an expression as a HybridBlock object. @@ -949,10 +975,12 @@ class HybridLambda(HybridBlock): def __init__(self, function): super(HybridLambda, self).__init__() if isinstance(function, str): - assert hasattr(nd, function) and hasattr(sym, function), \ - "Function name %s is not found in symbol/ndarray." % function - func_dict = {sym: getattr(sym, function), nd: getattr(nd, function)} - self._func = lambda F, *args: func_dict[F](*args) + if hasattr(np, function): + self._func = getattr(np, function) + elif hasattr(npx, function): + self._func = getattr(npx, function) + else: + raise Exception("Function name %s is not found in np/npx." % function) self._func_name = function elif callable(function): self._func = function @@ -962,14 +990,15 @@ def __init__(self, function): "Unrecognized function in lambda: {} of type {}" .format(function, type(function))) - def hybrid_forward(self, F, x, *args): - return self._func(F, x, *args) + def forward(self, x, *args): + return self._func(x, *args) def __repr__(self): return '{name}({function})'.format(name=self.__class__.__name__, function=self._func_name) +@use_np class Concatenate(Sequential): """Lays `Block` s concurrently. 
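With the change above, a string passed to `Lambda`/`HybridLambda` is now resolved against `mxnet.np` first and `mxnet.npx` second, and callables are used as-is. A small usage sketch; the particular function choices (`tanh`, `leaky_relu`) are just examples:

```python
import mxnet as mx
from mxnet import np, npx
from mxnet.gluon import nn

npx.set_np()

x = np.random.normal(size=(2, 3))

tanh_block = nn.HybridLambda('tanh')                                        # resolved via mxnet.np
relu_block = nn.HybridLambda(lambda data: npx.leaky_relu(data, slope=0.1))  # plain callable

print(tanh_block(x).shape, relu_block(x).shape)   # (2, 3) (2, 3)
```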
@@ -997,13 +1026,11 @@ def forward(self, x): out = [] for block in self._children.values(): out.append(block()(x)) - if is_np_array(): - out = np.concatenate(out, axis=self.axis) - else: - out = nd.concat(*out, dim=self.axis) + out = np.concatenate(out, axis=self.axis) return out +@use_np class HybridConcatenate(HybridSequential): """Lays `HybridBlock` s concurrently. @@ -1027,27 +1054,15 @@ def __init__(self, axis=-1): super().__init__() self.axis = axis - def _forward(self, x): - out = [] - for block in self._children.values(): - out.append(block()(x)) - if is_np_array(): - out = mxnp.concatenate(out, axis=self.axis) - else: - out = nd.concat(*out, dim=self.axis) - return out - - def hybrid_forward(self, F, x): + def forward(self, x): out = [] for block in self._children.values(): out.append(block()(x)) - if is_np_array(): - out = F.np.concatenate(out, axis=self.axis) - else: - out = F.concat(*out, dim=self.axis) + out = np.concatenate(out, axis=self.axis) return out +@use_np class Identity(HybridBlock): """Block that passes through the input directly. @@ -1064,10 +1079,11 @@ class Identity(HybridBlock): def __init__(self): super(Identity, self).__init__() - def hybrid_forward(self, F, x): + def forward(self, x): return x +@use_np class SyncBatchNorm(BatchNorm): """Cross-GPU Synchronized Batch normalization (SyncBN) @@ -1148,6 +1164,8 @@ def _get_num_devices(self): num_devices = num_devices if num_devices > 0 else 1 return num_devices - def hybrid_forward(self, F, x, gamma, beta, running_mean, running_var): - return F.contrib.SyncBatchNorm(x, gamma, beta, running_mean, running_var, - name='fwd', **self._kwargs) + def forward(self, x): + ctx = x.ctx + return npx.sync_batch_norm(x, self.gamma.data(ctx), self.beta.data(ctx), + self.running_mean.data(ctx), self.running_var.data(ctx), + name='fwd', **self._kwargs) diff --git a/python/mxnet/gluon/nn/conv_layers.py b/python/mxnet/gluon/nn/conv_layers.py index f0cb940bd575..65e22d82eded 100644 --- a/python/mxnet/gluon/nn/conv_layers.py +++ b/python/mxnet/gluon/nn/conv_layers.py @@ -29,23 +29,12 @@ from ..block import HybridBlock from ..parameter import Parameter -from ... import symbol +from ... import np, npx from ...base import numeric_types from .activations import Activation -from ...util import is_np_array, np_array - - -def _infer_weight_shape(op_name, data_shape, kwargs): - data = symbol.var('data', shape=data_shape) - if is_np_array(): - op = getattr(symbol.npx, op_name) - data = data.as_np_ndarray() - else: - op = getattr(symbol, op_name) - sym = op(data, **kwargs) - return sym.infer_shape_partial()[0] - +from ...util import use_np +@use_np class _Conv(HybridBlock): """Abstract nD convolution layer (private, used as implementation base). 
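The `_infer_weight_shape` helper, which built a throwaway symbol just to obtain the weight shape, is removed; `_Conv` below instead pre-fills what it already knows (output channels and kernel dims) via `pre_infer` and leaves the input-channel slot to `infer_shape`. A sketch of the resulting behaviour, assuming the updated `nn.Conv2D` from this diff:

```python
import mxnet as mx
from mxnet import np, npx
from mxnet.gluon import nn

npx.set_np()

conv = nn.Conv2D(channels=8, kernel_size=3, groups=2, layout='NCHW')
conv.initialize()
print(conv.weight.shape)   # (8, -1, 3, 3): input-channel slot still unknown

y = conv(np.random.uniform(size=(1, 4, 16, 16)))
print(conv.weight.shape)   # (8, 2, 3, 3): 4 in_channels / 2 groups
print(y.shape)             # (1, 8, 14, 14)
```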
@@ -98,10 +87,13 @@ class _Conv(HybridBlock): def __init__(self, channels, kernel_size, strides, padding, dilation, groups, layout, in_channels=0, activation=None, use_bias=True, weight_initializer=None, bias_initializer='zeros', - op_name='Convolution', adj=None): + op_name='convolution', adj=None): super(_Conv, self).__init__() self._channels = channels self._in_channels = in_channels + self._kernel_size = kernel_size + self._layout = layout + self._groups = groups if isinstance(strides, numeric_types): strides = (strides,)*len(kernel_size) if isinstance(padding, numeric_types): @@ -116,19 +108,11 @@ def __init__(self, channels, kernel_size, strides, padding, dilation, if adj is not None: self._kwargs['adj'] = adj - if is_np_array(): - dshape = [-1]*(len(kernel_size) + 2) - else: - dshape = [0]*(len(kernel_size) + 2) - - dshape[layout.find('N')] = 1 - dshape[layout.find('C')] = in_channels - wshapes = _infer_weight_shape(op_name, dshape, self._kwargs) - self.weight = Parameter('weight', shape=wshapes[1], + self.weight = Parameter('weight', shape=self.pre_infer(), init=weight_initializer, allow_deferred_init=True) if use_bias: - self.bias = Parameter('bias', shape=wshapes[2], + self.bias = Parameter('bias', shape=(channels,), init=bias_initializer, allow_deferred_init=True) else: @@ -139,17 +123,70 @@ def __init__(self, channels, kernel_size, strides, padding, dilation, else: self.act = None - def hybrid_forward(self, F, x, weight, bias=None): - if is_np_array(): - F = F.npx - if bias is None: - act = getattr(F, self._op_name)(x, weight, name='fwd', **self._kwargs) + def forward(self, x): + ctx = x.ctx + if self.bias is None: + act = getattr(npx, self._op_name)(x, self.weight.data(ctx), **self._kwargs) else: - act = getattr(F, self._op_name)(x, weight, bias, name='fwd', **self._kwargs) + act = getattr(npx, self._op_name)(x, self.weight.data(ctx), self.bias.data(ctx), + **self._kwargs) if self.act is not None: act = self.act(act) return act + def pre_infer(self): + """ + Pre-infer the shape of weight parameter based on kernel size, group size and channels + """ + wshape = [-1]*(len(self._kernel_size) + 2) + if self._op_name == "convolution": + if len(self._kernel_size) == 1: + wshape[self._layout.find('N')] = self._channels // self._groups + wshape[self._layout.find('W')] = self._kernel_size[0] + wshape[0] *= self._groups + elif len(self._kernel_size) == 2: + wshape[self._layout.find('N')] = self._channels // self._groups + wshape[self._layout.find('H')] = self._kernel_size[0] + wshape[self._layout.find('W')] = self._kernel_size[1] + wshape[0] *= self._groups + else: + assert len(self._kernel_size) == 3, "kernel_size must be 1, 2 or 3" + wshape[self._layout.find('N')] = self._channels // self._groups + wshape[self._layout.find('D')] = self._kernel_size[0] + wshape[self._layout.find('H')] = self._kernel_size[1] + wshape[self._layout.find('W')] = self._kernel_size[2] + wshape[0] *= self._groups + else: + assert self._op_name == "deconvolution", \ + "Only support operator name with convolution and deconvolution" + if len(self._kernel_size) == 1: + wshape[self._layout.find('C')] = self._channels // self._groups + wshape[self._layout.find('W')] = self._kernel_size[0] + elif len(self._kernel_size) == 2: + wshape[self._layout.find('C')] = self._channels // self._groups + wshape[self._layout.find('H')] = self._kernel_size[0] + wshape[self._layout.find('W')] = self._kernel_size[1] + else: + assert len(self._kernel_size) == 3, "kernel_size must be 1, 2 or 3" + wshape[self._layout.find('C')] = 
self._channels // self._groups + wshape[self._layout.find('D')] = self._kernel_size[0] + wshape[self._layout.find('H')] = self._kernel_size[1] + wshape[self._layout.find('W')] = self._kernel_size[2] + return tuple(wshape) + + def infer_shape(self, x): + dshape1 = x.shape[self._layout.find('C')] + wshape = self.weight.shape + if self._op_name == "convolution": + wshape_list = list(wshape) + wshape_list[self._layout.find('C')] = dshape1 // self._groups + else: + assert self._op_name == "deconvolution", \ + "Only support operator name with convolution and deconvolution" + wshape_list = list(wshape) + wshape_list[self._layout.find('N')] = dshape1 + self.weight.shape = tuple(wshape_list) + def _alias(self): return 'conv' @@ -252,9 +289,7 @@ def __init__(self, channels, kernel_size, strides=1, padding=0, dilation=1, if isinstance(kernel_size, numeric_types): kernel_size = (kernel_size,) assert len(kernel_size) == 1, "kernel_size must be a number or a list of 1 ints" - op_name = kwargs.pop('op_name', 'Convolution') - if is_np_array(): - op_name = 'convolution' + op_name = 'convolution' super(Conv1D, self).__init__( channels, kernel_size, strides, padding, dilation, groups, layout, in_channels, activation, use_bias, weight_initializer, bias_initializer, @@ -336,9 +371,7 @@ def __init__(self, channels, kernel_size, strides=(1, 1), padding=(0, 0), if isinstance(kernel_size, numeric_types): kernel_size = (kernel_size,)*2 assert len(kernel_size) == 2, "kernel_size must be a number or a list of 2 ints" - op_name = kwargs.pop('op_name', 'Convolution') - if is_np_array(): - op_name = 'convolution' + op_name = 'convolution' super(Conv2D, self).__init__( channels, kernel_size, strides, padding, dilation, groups, layout, in_channels, activation, use_bias, weight_initializer, bias_initializer, @@ -421,9 +454,7 @@ def __init__(self, channels, kernel_size, strides=(1, 1, 1), padding=(0, 0, 0), if isinstance(kernel_size, numeric_types): kernel_size = (kernel_size,)*3 assert len(kernel_size) == 3, "kernel_size must be a number or a list of 3 ints" - op_name = kwargs.pop('op_name', 'Convolution') - if is_np_array(): - op_name = 'convolution' + op_name = 'convolution' super(Conv3D, self).__init__( channels, kernel_size, strides, padding, dilation, groups, layout, in_channels, activation, use_bias, weight_initializer, bias_initializer, @@ -509,9 +540,7 @@ def __init__(self, channels, kernel_size, strides=1, padding=0, output_padding=0 output_padding = (output_padding,) assert len(kernel_size) == 1, "kernel_size must be a number or a list of 1 ints" assert len(output_padding) == 1, "output_padding must be a number or a list of 1 ints" - op_name = kwargs.pop('op_name', 'Deconvolution') - if is_np_array(): - op_name = 'deconvolution' + op_name = 'deconvolution' super(Conv1DTranspose, self).__init__( channels, kernel_size, strides, padding, dilation, groups, layout, in_channels, activation, use_bias, weight_initializer, @@ -603,9 +632,7 @@ def __init__(self, channels, kernel_size, strides=(1, 1), padding=(0, 0), output_padding = (output_padding,)*2 assert len(kernel_size) == 2, "kernel_size must be a number or a list of 2 ints" assert len(output_padding) == 2, "output_padding must be a number or a list of 2 ints" - op_name = kwargs.pop('op_name', 'Deconvolution') - if is_np_array(): - op_name = 'deconvolution' + op_name = 'deconvolution' super(Conv2DTranspose, self).__init__( channels, kernel_size, strides, padding, dilation, groups, layout, in_channels, activation, use_bias, weight_initializer, @@ -698,9 +725,7 @@ def 
__init__(self, channels, kernel_size, strides=(1, 1, 1), padding=(0, 0, 0), output_padding = (output_padding,)*3 assert len(kernel_size) == 3, "kernel_size must be a number or a list of 3 ints" assert len(output_padding) == 3, "output_padding must be a number or a list of 3 ints" - op_name = kwargs.pop('op_name', 'Deconvolution') - if is_np_array(): - op_name = 'deconvolution' + op_name = 'deconvolution' super(Conv3DTranspose, self).__init__( channels, kernel_size, strides, padding, dilation, groups, layout, in_channels, activation, use_bias, weight_initializer, bias_initializer, @@ -708,6 +733,7 @@ def __init__(self, channels, kernel_size, strides=(1, 1, 1), padding=(0, 0, 0), self.outpad = output_padding +@use_np class _Pooling(HybridBlock): """Abstract class for different pooling layers.""" def __init__(self, pool_size, strides, padding, ceil_mode, global_pool, @@ -730,9 +756,8 @@ def __init__(self, pool_size, strides, padding, ceil_mode, global_pool, def _alias(self): return 'pool' - def hybrid_forward(self, F, x): - pooling = F.npx.pooling if is_np_array() else F.Pooling - return pooling(x, name='fwd', **self._kwargs) + def forward(self, x): + return npx.pooling(x, name='fwd', **self._kwargs) def __repr__(self): s = '{name}(size={kernel}, stride={stride}, padding={pad}, ceil_mode={ceil_mode}' @@ -1204,6 +1229,7 @@ def __init__(self, layout='NCDHW', **kwargs): (1, 1, 1), None, 0, True, True, 'avg', layout, **kwargs) +@use_np class ReflectionPad2D(HybridBlock): r"""Pads the input tensor using the reflection of the input boundary. @@ -1229,7 +1255,7 @@ class ReflectionPad2D(HybridBlock): Examples -------- >>> m = nn.ReflectionPad2D(3) - >>> input = mx.nd.random.normal(shape=(16, 3, 224, 224)) + >>> input = mx.np.random.normal(size=(16, 3, 224, 224)) >>> output = m(input) """ def __init__(self, padding=0, **kwargs): @@ -1239,10 +1265,15 @@ def __init__(self, padding=0, **kwargs): assert(len(padding) == 8) self._padding = padding - def hybrid_forward(self, F, x): - return F.pad(x, mode='reflect', pad_width=self._padding) + def forward(self, x): + """ + Use pad operator in numpy extension module, + which has backward support for reflect mode + """ + return npx.pad(x, mode='reflect', pad_width=self._padding) +@use_np class DeformableConvolution(HybridBlock): """2-D Deformable Convolution v_1 (Dai, 2017). 
Normal Convolution uses sampling points in a regular grid, while the sampling @@ -1335,8 +1366,12 @@ def __init__(self, channels, kernel_size=(1, 1), strides=(1, 1), padding=(0, 0), if isinstance(dilation, numeric_types): dilation = (dilation,) * len(kernel_size) self._op_name = op_name + self._kernel_size = kernel_size + self._layout = layout + self._groups = groups offset_channels = 2 * kernel_size[0] * kernel_size[1] * num_deformable_group + self._offset_channels = offset_channels self._kwargs_offset = { 'kernel': kernel_size, 'stride': strides, 'dilate': dilation, 'pad': padding, 'num_filter': offset_channels, 'num_group': groups, @@ -1352,31 +1387,19 @@ def __init__(self, channels, kernel_size=(1, 1), strides=(1, 1), padding=(0, 0), self._kwargs_offset['adj'] = adj self._kwargs_deformable_conv['adj'] = adj - dshape = [0] * (len(kernel_size) + 2) - dshape[layout.find('N')] = 1 - dshape[layout.find('C')] = in_channels - - op_name = 'convolution' if is_np_array() else 'Convolution' - offsetshapes = _infer_weight_shape(op_name, dshape, self._kwargs_offset) - - self.offset_weight = Parameter('offset_weight', shape=offsetshapes[1], + self.offset_weight = Parameter('offset_weight', shape=self.pre_infer_offset_weight(), init=offset_weight_initializer, allow_deferred_init=True) if offset_use_bias: - self.offset_bias = Parameter('offset_bias', shape=offsetshapes[2], + self.offset_bias = Parameter('offset_bias', shape=(offset_channels,), init=offset_bias_initializer, allow_deferred_init=True) else: self.offset_bias = None - deformable_conv_weight_shape = [0] * (len(kernel_size) + 2) - deformable_conv_weight_shape[0] = channels - deformable_conv_weight_shape[2] = kernel_size[0] - deformable_conv_weight_shape[3] = kernel_size[1] - self.deformable_conv_weight = Parameter('deformable_conv_weight', - shape=deformable_conv_weight_shape, + shape=self.pre_infer_weight(), init=weight_initializer, allow_deferred_init=True) @@ -1392,32 +1415,62 @@ def __init__(self, channels, kernel_size=(1, 1), strides=(1, 1), padding=(0, 0), else: self.act = None - def hybrid_forward(self, F, x, offset_weight, deformable_conv_weight, offset_bias=None, deformable_conv_bias=None): - if not is_np_array(): - x = x.as_np_ndarray() - offset_weight = offset_weight.as_np_ndarray() - deformable_conv_weight = deformable_conv_weight.as_np_ndarray() - if offset_bias is not None: - offset_bias = offset_bias.as_np_ndarray() - if deformable_conv_bias is not None: - deformable_conv_bias = deformable_conv_bias.as_np_ndarray() - if offset_bias is None: - offset = F.npx.convolution(x, offset_weight, cudnn_off=True, **self._kwargs_offset) + def forward(self, x): + ctx = x.ctx + if self.offset_bias is None: + offset = npx.convolution(x, self.offset_weight.data(ctx), cudnn_off=True, **self._kwargs_offset) else: - offset = F.npx.convolution(x, offset_weight, offset_bias, cudnn_off=True, **self._kwargs_offset) + offset = npx.convolution(x, self.offset_weight.data(ctx), self.offset_bias.data(ctx), + cudnn_off=True, **self._kwargs_offset) - if deformable_conv_bias is None: - act = F.npx.deformable_convolution(data=x, offset=offset, weight=deformable_conv_weight, - name='fwd', **self._kwargs_deformable_conv) + if self.deformable_conv_bias is None: + act = npx.deformable_convolution(data=x, offset=offset, + weight=self.deformable_conv_weight.data(ctx), + name='fwd', **self._kwargs_deformable_conv) else: - act = F.npx.deformable_convolution(data=x, offset=offset, weight=deformable_conv_weight, - bias=deformable_conv_bias, name='fwd', - 
**self._kwargs_deformable_conv) + act = npx.deformable_convolution(data=x, offset=offset, + weight=self.deformable_conv_weight.data(ctx), + bias=self.deformable_conv_bias.data(ctx), name='fwd', + **self._kwargs_deformable_conv) if self.act: - with np_array(True): - act = self.act(act) - return act if is_np_array() else act.as_nd_ndarray() + act = self.act(act) + return act + + + def pre_infer_offset_weight(self): + """ + Pre-infer the shape of offset weight parameter based on kernel size, + group size and offset channels + """ + wshape = [-1]*(len(self._kernel_size) + 2) + wshape[self._layout.find('N')] = self._offset_channels // self._groups + wshape[self._layout.find('H')] = self._kernel_size[0] + wshape[self._layout.find('W')] = self._kernel_size[1] + wshape[0] *= self._groups + return tuple(wshape) + + def pre_infer_weight(self): + """ + Pre-infer the shape of weight parameter based on kernel size, group size and channels + """ + wshape = [-1]*(len(self._kernel_size) + 2) + wshape[self._layout.find('N')] = self._channels // self._groups + wshape[self._layout.find('H')] = self._kernel_size[0] + wshape[self._layout.find('W')] = self._kernel_size[1] + wshape[0] *= self._groups + return tuple(wshape) + + def infer_shape(self, x): + dshape1 = x.shape[self._layout.find('C')] + wshape = self.deformable_conv_weight.shape + wshape_offset = self.offset_weight.shape + wshape_list = list(wshape) + wshape_offset_list = list(wshape_offset) + wshape_list[self._layout.find('C')] = dshape1 // self._groups + wshape_offset_list[self._layout.find('C')] = dshape1 // self._groups + self.deformable_conv_weight.shape = tuple(wshape_list) + self.offset_weight.shape = tuple(wshape_offset_list) def _alias(self): return 'deformable_conv' @@ -1444,6 +1497,7 @@ def __repr__(self): **self._kwargs_deformable_conv) +@use_np class ModulatedDeformableConvolution(HybridBlock): """2-D Deformable Convolution v2 (Dai, 2018).
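The shape bookkeeping in `pre_infer_offset_weight`/`pre_infer_weight`/`infer_shape` above, written out in plain Python for the default `'NCHW'` layout: the offset branch predicts two offsets (dx, dy) per kernel position and deformable group, and its weight is shaped like an ordinary convolution weight. The helper below is a standalone illustration, not part of the diff:

```python
def deformable_weight_shapes(channels, in_channels, kernel_size,
                             groups=1, num_deformable_group=1):
    """Illustrative helper (not in the diff): weight shapes for layout 'NCHW'."""
    kh, kw = kernel_size
    # The offset branch predicts (dx, dy) per kernel position and deformable group.
    offset_channels = 2 * kh * kw * num_deformable_group
    conv_weight = (channels, in_channels // groups, kh, kw)
    offset_weight = (offset_channels, in_channels // groups, kh, kw)
    offset_bias = (offset_channels,)
    return conv_weight, offset_weight, offset_bias

# e.g. channels=64, in_channels=32, 3x3 kernel, one deformable group:
print(deformable_weight_shapes(64, 32, (3, 3)))
# ((64, 32, 3, 3), (18, 32, 3, 3), (18,))
```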
@@ -1537,6 +1591,10 @@ def __init__(self, channels, kernel_size=(1, 1), strides=(1, 1), padding=(0, 0), offset_channels = num_deformable_group * 3 * kernel_size[0] * kernel_size[1] self.offset_split_index = num_deformable_group * 2 * kernel_size[0] * kernel_size[1] + self._layout = layout + self._groups = groups + self._offset_channels = offset_channels + self._kernel_size = kernel_size self._kwargs_offset = { 'kernel': kernel_size, 'stride': strides, 'dilate': dilation, 'pad': padding, 'num_filter': offset_channels, 'num_group': groups, @@ -1552,13 +1610,8 @@ def __init__(self, channels, kernel_size=(1, 1), strides=(1, 1), padding=(0, 0), self._kwargs_offset['adj'] = adj self._kwargs_deformable_conv['adj'] = adj - deformable_conv_weight_shape = [0] * (len(kernel_size) + 2) - deformable_conv_weight_shape[0] = channels - deformable_conv_weight_shape[2] = kernel_size[0] - deformable_conv_weight_shape[3] = kernel_size[1] - self.deformable_conv_weight = Parameter('deformable_conv_weight', - shape=deformable_conv_weight_shape, + shape=self.pre_infer_weight(), init=weight_initializer, allow_deferred_init=True) @@ -1569,21 +1622,12 @@ def __init__(self, channels, kernel_size=(1, 1), strides=(1, 1), padding=(0, 0), else: self.deformable_conv_bias = None - dshape = [0] * (len(kernel_size) + 2) - dshape[layout.find('N')] = 1 - dshape[layout.find('C')] = in_channels - - op = getattr(symbol, 'Convolution') - offset = op(symbol.var('data', shape=dshape), **self._kwargs_offset) - - offsetshapes = offset.infer_shape_partial()[0] - - self.offset_weight = Parameter('offset_weight', shape=offsetshapes[1], + self.offset_weight = Parameter('offset_weight', shape=self.pre_infer_offset_weight(), init=offset_weight_initializer, allow_deferred_init=True) if offset_use_bias: - self.offset_bias = Parameter('offset_bias', shape=offsetshapes[2], + self.offset_bias = Parameter('offset_bias', shape=(offset_channels,), init=offset_bias_initializer, allow_deferred_init=True) else: @@ -1594,43 +1638,72 @@ def __init__(self, channels, kernel_size=(1, 1), strides=(1, 1), padding=(0, 0), else: self.act = None - def hybrid_forward(self, F, x, offset_weight, deformable_conv_weight, offset_bias=None, deformable_conv_bias=None): - if not is_np_array(): - x = x.as_np_ndarray() - offset_weight = offset_weight.as_np_ndarray() - deformable_conv_weight = deformable_conv_weight.as_np_ndarray() - if offset_bias is not None: - offset_bias = offset_bias.as_np_ndarray() - if deformable_conv_bias is not None: - deformable_conv_bias = deformable_conv_bias.as_np_ndarray() - if offset_bias is None: - offset = F.npx.convolution(x, offset_weight, cudnn_off=True, **self._kwargs_offset) + def forward(self, x): + ctx = x.ctx + if self.offset_bias is None: + offset = npx.convolution(x, self.offset_weight.data(ctx), + cudnn_off=True, **self._kwargs_offset) else: - offset = F.npx.convolution(x, offset_weight, offset_bias, cudnn_off=True, **self._kwargs_offset) + offset = npx.convolution(x, self.offset_weight.data(ctx), + self.offset_bias.data(ctx), cudnn_off=True, **self._kwargs_offset) - offset_t = F.npx.slice_axis(offset, axis=1, begin=0, end=self.offset_split_index) - mask = F.npx.slice_axis(offset, axis=1, begin=self.offset_split_index, end=None) - mask = F.npx.sigmoid(mask) * 2 + offset_t = npx.slice_axis(offset, axis=1, begin=0, end=self.offset_split_index) + mask = npx.slice_axis(offset, axis=1, begin=self.offset_split_index, end=None) + mask = npx.sigmoid(mask) * 2 - if deformable_conv_bias is None: - act = 
F.npx.modulated_deformable_convolution(data=x, offset=offset_t, mask=mask, - weight=deformable_conv_weight, - name='fwd', **self._kwargs_deformable_conv) + if self.deformable_conv_bias is None: + act = npx.modulated_deformable_convolution(data=x, offset=offset_t, mask=mask, + weight=self.deformable_conv_weight.data(ctx), + name='fwd', **self._kwargs_deformable_conv) else: - act = F.npx.modulated_deformable_convolution(data=x, offset=offset_t, mask=mask, - weight=deformable_conv_weight, - bias=deformable_conv_bias, name='fwd', - **self._kwargs_deformable_conv) + act = npx.modulated_deformable_convolution(data=x, offset=offset_t, mask=mask, + weight=self.deformable_conv_weight.data(ctx), + bias=self.deformable_conv_bias.data(ctx), name='fwd', + **self._kwargs_deformable_conv) if self.act: - with np_array(True): - act = self.act(act) - return act if is_np_array() else act.as_nd_ndarray() + act = self.act(act) + return act + + def pre_infer_offset_weight(self): + """ + Pre-infer the shape of offsite weight parameter based on kernel size, + group size and offset channels + """ + wshape = [-1]*(len(self._kernel_size) + 2) + wshape[self._layout.find('N')] = self._offset_channels // self._groups + wshape[self._layout.find('H')] = self._kernel_size[0] + wshape[self._layout.find('W')] = self._kernel_size[1] + wshape[0] *= self._groups + return tuple(wshape) + + def pre_infer_weight(self): + """ + Pre-infer the shape of weight parameter based on kernel size, group size and channels + """ + wshape = [-1]*(len(self._kernel_size) + 2) + wshape[self._layout.find('N')] = self._channels // self._groups + wshape[self._layout.find('H')] = self._kernel_size[0] + wshape[self._layout.find('W')] = self._kernel_size[1] + wshape[0] *= self._groups + return tuple(wshape) + + def infer_shape(self, x): + dshape1 = x.shape[self._layout.find('C')] + wshape = self.deformable_conv_weight.shape + wshape_offset = self.offset_weight.shape + wshape_list = list(wshape) + wshape_offset_list = list(wshape_offset) + wshape_list[self._layout.find('C')] = dshape1 // self._groups + wshape_offset_list[self._layout.find('C')] = dshape1 // self._groups + self.deformable_conv_weight.shape = tuple(wshape_list) + self.offset_weight.shape = tuple(wshape_offset_list) def _alias(self): return 'modulated_deformable_conv' +@use_np class PixelShuffle1D(HybridBlock): r"""Pixel-shuffle layer for upsampling in 1 dimension. @@ -1657,7 +1730,7 @@ class PixelShuffle1D(HybridBlock): Examples -------- >>> pxshuf = PixelShuffle1D(2) - >>> x = mx.nd.zeros((1, 8, 3)) + >>> x = mx.np.zeros((1, 8, 3)) >>> pxshuf(x).shape (1, 4, 6) """ @@ -1666,21 +1739,19 @@ def __init__(self, factor): super(PixelShuffle1D, self).__init__() self._factor = int(factor) - def hybrid_forward(self, F, x): + def forward(self, x): """Perform pixel-shuffling on the input.""" - f = self._factor - if not is_np_array(): - x = x.as_np_ndarray() - # (N, C*f, W) - x = F.npx.reshape(x, (-2, -6, -1, f, -2)) # (N, C, f, W) - x = F.np.transpose(x, (0, 1, 3, 2)) # (N, C, W, f) - x = F.npx.reshape(x, (-2, -2, -5)) # (N, C, W*f) - return x if is_np_array() else x.as_nd_ndarray() + f = self._factor # (N, C*f, W) + x = npx.reshape(x, (-2, -6, -1, f, -2)) # (N, C, f, W) + x = np.transpose(x, (0, 1, 3, 2)) # (N, C, W, f) + x = npx.reshape(x, (-2, -2, -5)) # (N, C, W*f) + return x def __repr__(self): return "{}({})".format(self.__class__.__name__, self._factor) +@use_np class PixelShuffle2D(HybridBlock): r"""Pixel-shuffle layer for upsampling in 2 dimensions. 
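The `npx.reshape` codes used in `PixelShuffle1D.forward` above (`-2` keep a dim, `-6` split a dim, `-5` merge two dims) are compact but easy to misread; the same 1-D pixel shuffle can be written with explicit shapes, which makes a handy cross-check. A sketch assuming the MXNet 2.x `np` interface and the `PixelShuffle1D` block from this diff; the helper name is hypothetical:

```python
import mxnet as mx
from mxnet import np, npx
from mxnet.gluon import nn

npx.set_np()

def pixel_shuffle_1d_explicit(x, f):
    # x: (N, C*f, W)  ->  (N, C, W*f), same steps as PixelShuffle1D.forward
    n, cf, w = x.shape
    c = cf // f
    x = x.reshape((n, c, f, w))          # split channels into (C, f)
    x = np.transpose(x, (0, 1, 3, 2))    # (N, C, W, f)
    return x.reshape((n, c, w * f))      # merge (W, f) into W*f

x = np.arange(24, dtype='float32').reshape((1, 8, 3))
ref = nn.PixelShuffle1D(2)(x)
out = pixel_shuffle_1d_explicit(x, 2)
print(ref.shape, float(np.abs(ref - out).max()))   # (1, 4, 6) 0.0
```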
@@ -1716,7 +1787,7 @@ class PixelShuffle2D(HybridBlock): Examples -------- >>> pxshuf = PixelShuffle2D((2, 3)) - >>> x = mx.nd.zeros((1, 12, 3, 5)) + >>> x = mx.np.zeros((1, 12, 3, 5)) >>> pxshuf(x).shape (1, 2, 6, 15) """ @@ -1729,22 +1800,21 @@ def __init__(self, factor): self._factors = tuple(int(fac) for fac in factor) assert len(self._factors) == 2, "wrong length {}".format(len(self._factors)) - def hybrid_forward(self, F, x): + def forward(self, x): """Perform pixel-shuffling on the input.""" f1, f2 = self._factors - if not is_np_array(): - x = x.as_np_ndarray() # (N, f1*f2*C, H, W) - x = F.npx.reshape(x, (-2, -6, -1, f1 * f2, -2, -2)) # (N, C, f1*f2, H, W) - x = F.npx.reshape(x, (-2, -2, -6, f1, f2, -2, -2)) # (N, C, f1, f2, H, W) - x = F.np.transpose(x, (0, 1, 4, 2, 5, 3)) # (N, C, H, f1, W, f2) - x = F.npx.reshape(x, (-2, -2, -5, -5)) # (N, C, H*f1, W*f2) - return x if is_np_array() else x.as_nd_ndarray() + x = npx.reshape(x, (-2, -6, -1, f1 * f2, -2, -2)) # (N, C, f1*f2, H, W) + x = npx.reshape(x, (-2, -2, -6, f1, f2, -2, -2)) # (N, C, f1, f2, H, W) + x = np.transpose(x, (0, 1, 4, 2, 5, 3)) # (N, C, H, f1, W, f2) + x = npx.reshape(x, (-2, -2, -5, -5)) # (N, C, H*f1, W*f2) + return x def __repr__(self): return "{}({})".format(self.__class__.__name__, self._factors) +@use_np class PixelShuffle3D(HybridBlock): r"""Pixel-shuffle layer for upsampling in 3 dimensions. @@ -1780,7 +1850,7 @@ class PixelShuffle3D(HybridBlock): Examples -------- >>> pxshuf = PixelShuffle3D((2, 3, 4)) - >>> x = mx.nd.zeros((1, 48, 3, 5, 7)) + >>> x = mx.np.zeros((1, 48, 3, 5, 7)) >>> pxshuf(x).shape (1, 2, 6, 15, 28) """ @@ -1793,23 +1863,21 @@ def __init__(self, factor): self._factors = tuple(int(fac) for fac in factor) assert len(self._factors) == 3, "wrong length {}".format(len(self._factors)) - def hybrid_forward(self, F, x): + def forward(self, x): """Perform pixel-shuffling on the input.""" # `transpose` doesn't support 8D, need other implementation f1, f2, f3 = self._factors - if not is_np_array(): - x = x.as_np_ndarray() # (N, C*f1*f2*f3, D, H, W) - x = F.npx.reshape(x, (-2, -6, -1, f1 * f2 * f3, -2, -2, -2)) # (N, C, f1*f2*f3, D, H, W) - x = F.np.swapaxes(x, 2, 3) # (N, C, D, f1*f2*f3, H, W) - x = F.npx.reshape(x, (-2, -2, -2, -6, f1, f2*f3, -2, -2)) # (N, C, D, f1, f2*f3, H, W) - x = F.npx.reshape(x, (-2, -2, -5, -2, -2, -2)) # (N, C, D*f1, f2*f3, H, W) - x = F.np.swapaxes(x, 3, 4) # (N, C, D*f1, H, f2*f3, W) - x = F.npx.reshape(x, (-2, -2, -2, -2, -6, f2, f3, -2)) # (N, C, D*f1, H, f2, f3, W) - x = F.npx.reshape(x, (-2, -2, -2, -5, -2, -2)) # (N, C, D*f1, H*f2, f3, W) - x = F.np.swapaxes(x, 4, 5) # (N, C, D*f1, H*f2, W, f3) - x = F.npx.reshape(x, (-2, -2, -2, -2, -5)) # (N, C, D*f1, H*f2, W*f3) - return x if is_np_array() else x.as_nd_ndarray() + x = npx.reshape(x, (-2, -6, -1, f1 * f2 * f3, -2, -2, -2)) # (N, C, f1*f2*f3, D, H, W) + x = np.swapaxes(x, 2, 3) # (N, C, D, f1*f2*f3, H, W) + x = npx.reshape(x, (-2, -2, -2, -6, f1, f2*f3, -2, -2)) # (N, C, D, f1, f2*f3, H, W) + x = npx.reshape(x, (-2, -2, -5, -2, -2, -2)) # (N, C, D*f1, f2*f3, H, W) + x = np.swapaxes(x, 3, 4) # (N, C, D*f1, H, f2*f3, W) + x = npx.reshape(x, (-2, -2, -2, -2, -6, f2, f3, -2)) # (N, C, D*f1, H, f2, f3, W) + x = npx.reshape(x, (-2, -2, -2, -5, -2, -2)) # (N, C, D*f1, H*f2, f3, W) + x = np.swapaxes(x, 4, 5) # (N, C, D*f1, H*f2, W, f3) + x = npx.reshape(x, (-2, -2, -2, -2, -5)) # (N, C, D*f1, H*f2, W*f3) + return x def __repr__(self): return "{}({})".format(self.__class__.__name__, self._factors) diff --git 
a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 26d061a703fe..16e2957c2551 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -52,12 +52,12 @@ class Parameter(object): not ``'null'``, it will also hold a gradient array on each :py:class:`Context`:: ctx = mx.gpu(0) - x = mx.nd.zeros((16, 100), ctx=ctx) + x = mx.np.zeros((16, 100), ctx=ctx) w = mx.gluon.Parameter('fc_weight', shape=(64, 100), init=mx.init.Xavier()) b = mx.gluon.Parameter('fc_bias', shape=(64,), init=mx.init.Zero()) w.initialize(ctx=ctx) b.initialize(ctx=ctx) - out = mx.nd.FullyConnected(x, w.data(ctx), b.data(ctx), num_hidden=64) + out = mx.npx.fully_connected(x, w.data(ctx), b.data(ctx), num_hidden=64) Parameters ---------- diff --git a/python/mxnet/gluon/probability/block/stochastic_block.py b/python/mxnet/gluon/probability/block/stochastic_block.py index 64602145e613..27975276615d 100644 --- a/python/mxnet/gluon/probability/block/stochastic_block.py +++ b/python/mxnet/gluon/probability/block/stochastic_block.py @@ -45,7 +45,7 @@ def add_loss(self, loss): @staticmethod def collectLoss(func): """To accumulate loss during the forward phase, one could first decorate - hybrid_forward with `StochasticBlock.collectLoss, + forward with `StochasticBlock.collectLoss, and then collect the loss tensor `x` by calling self.add_loss(x). For example, in the following forward function, we generate samples from a Gaussian parameterized by `loc` and `scale` and @@ -60,7 +60,7 @@ def forward(self, loc, scale): """ @wraps(func) def inner(self, *args, **kwargs): - # Loss from hybrid_forward + # Loss from forward func_out = func(self, *args, **kwargs) collected_loss = self._losscache self._losscache = [] diff --git a/python/mxnet/gluon/probability/distributions/bernoulli.py b/python/mxnet/gluon/probability/distributions/bernoulli.py index f61189c13bc6..a34d68202d4c 100644 --- a/python/mxnet/gluon/probability/distributions/bernoulli.py +++ b/python/mxnet/gluon/probability/distributions/bernoulli.py @@ -21,8 +21,9 @@ __all__ = ['Bernoulli'] from .exp_family import ExponentialFamily -from .utils import prob2logit, logit2prob, getF, cached_property, sample_n_shape_converter +from .utils import prob2logit, logit2prob, cached_property, sample_n_shape_converter from .constraint import Boolean, Interval, Real +from .... import np, npx class Bernoulli(ExponentialFamily): @@ -34,9 +35,6 @@ class Bernoulli(ExponentialFamily): Probability of sampling `1`. logit : Tensor or scalar, default None The log-odds of sampling `1`. - F : mx.ndarray or mx.symbol.numpy._Symbol or None - Variable recording running mode, will be automatically - inferred from parameters if declared None. """ # pylint: disable=abstract-method @@ -44,9 +42,7 @@ class Bernoulli(ExponentialFamily): arg_constraints = {'prob': Interval(0, 1), 'logit': Real()} - def __init__(self, prob=None, logit=None, F=None, validate_args=None): - _F = F if F is not None else getF(prob, logit) - + def __init__(self, prob=None, logit=None, validate_args=None): if (prob is None) == (logit is None): raise ValueError( "Either `prob` or `logit` must be specified, but not both. " + @@ -58,7 +54,7 @@ def __init__(self, prob=None, logit=None, F=None, validate_args=None): self.logit = logit super(Bernoulli, self).__init__( - F=_F, event_dim=0, validate_args=validate_args) + event_dim=0, validate_args=validate_args) @cached_property def prob(self): @@ -70,7 +66,7 @@ def prob(self): Parameter tensor. 
""" # pylint: disable=method-hidden - return logit2prob(self.logit, True, self.F) + return logit2prob(self.logit, True) @cached_property def logit(self): @@ -82,7 +78,7 @@ def logit(self): Parameter tensor. """ # pylint: disable=method-hidden - return prob2logit(self.prob, True, self.F) + return prob2logit(self.prob, True) @property def mean(self): @@ -96,9 +92,9 @@ def broadcast_to(self, batch_shape): new_instance = self.__new__(type(self)) F = self.F if 'prob' in self.__dict__: - new_instance.prob = F.np.broadcast_to(self.prob, batch_shape) + new_instance.prob = np.broadcast_to(self.prob, batch_shape) else: - new_instance.logit = F.np.broadcast_to(self.logit, batch_shape) + new_instance.logit = np.broadcast_to(self.logit, batch_shape) super(Bernoulli, new_instance).__init__(F=F, event_dim=self.event_dim, validate_args=False) @@ -108,21 +104,20 @@ def broadcast_to(self, batch_shape): def log_prob(self, value): if self._validate_args: self._validate_samples(value) - F = self.F if self.prob is None: logit = self.logit - return logit * (value - 1) - F.np.log(F.np.exp(-logit) + 1) + return logit * (value - 1) - np.log(np.exp(-logit) + 1) else: # Parameterized by probability eps = 1e-12 - return (self.F.np.log(self.prob + eps) * value - + self.F.np.log1p(-self.prob + eps) * (1 - value)) + return (np.log(self.prob + eps) * value + + np.log1p(-self.prob + eps) * (1 - value)) def sample(self, size=None): - return self.F.npx.random.bernoulli(self.prob, self.logit, size) + return npx.random.bernoulli(self.prob, self.logit, size) def sample_n(self, size=None): - return self.F.npx.random.bernoulli(self.prob, self.logit, sample_n_shape_converter(size)) + return npx.random.bernoulli(self.prob, self.logit, sample_n_shape_converter(size)) @property def _natural_params(self): @@ -130,10 +125,9 @@ def _natural_params(self): def _log_normalizer(self, x): # pylint: disable=arguments-differ - return self.F.np.log(1 + self.F.np.exp(x)) + return np.log(1 + np.exp(x)) def entropy(self): - F = self.F logit = self.logit prob = self.prob - return -(logit * (prob - 1) - F.np.log(F.np.exp(-logit) + 1)) + return -(logit * (prob - 1) - np.log(np.exp(-logit) + 1)) diff --git a/python/mxnet/gluon/probability/distributions/beta.py b/python/mxnet/gluon/probability/distributions/beta.py index dea7dc728a8d..af468a46fbf9 100644 --- a/python/mxnet/gluon/probability/distributions/beta.py +++ b/python/mxnet/gluon/probability/distributions/beta.py @@ -22,7 +22,8 @@ from .exp_family import ExponentialFamily from .constraint import UnitInterval, Positive -from .utils import getF, sample_n_shape_converter, gammaln, digamma, _clip_prob +from .utils import sample_n_shape_converter, gammaln, digamma, _clip_prob +from .... 
import np class Beta(ExponentialFamily): @@ -34,8 +35,6 @@ class Beta(ExponentialFamily): The first shape parameter beta : Tensor or scalar The second shape parameter - F : mx.ndarray or mx.symbol.numpy._Symbol or None - Variable recording running mode, will be automatically """ # pylint: disable=abstract-method @@ -44,19 +43,17 @@ class Beta(ExponentialFamily): arg_constraints = {'alpha': Positive(), 'beta': Positive()} - def __init__(self, alpha, beta, F=None, validate_args=None): - _F = F if F is not None else getF(alpha, beta) + def __init__(self, alpha, beta, validate_args=None): self.alpha = alpha self.beta = beta super(Beta, self).__init__( - F=_F, event_dim=0, validate_args=validate_args) + event_dim=0, validate_args=validate_args) def sample(self, size=None): - F = self.F - X = F.np.random.gamma(self.alpha, 1, size=size) - Y = F.np.random.gamma(self.beta, 1, size=size) + X = np.random.gamma(self.alpha, 1, size=size) + Y = np.random.gamma(self.beta, 1, size=size) out = X / (X + Y) - return _clip_prob(out, F) + return _clip_prob(out) def sample_n(self, size=None): return self.sample(sample_n_shape_converter(size)) @@ -77,19 +74,17 @@ def variance(self): def log_prob(self, value): if self._validate_args: self._validate_samples(value) - F = self.F - lgamma = gammaln(F) - log = F.np.log - log1p = F.np.log1p + lgamma = gammaln() + log = np.log + log1p = np.log1p a = self.alpha b = self.beta lgamma_term = lgamma(a + b) - lgamma(a) - lgamma(b) return (a - 1) * log(value) + (b - 1) * log1p(-value) + lgamma_term def entropy(self): - F = self.F - lgamma = gammaln(F) - dgamma = digamma(F) + lgamma = gammaln() + dgamma = digamma() a = self.alpha b = self.beta lgamma_term = lgamma(a + b) - lgamma(a) - lgamma(b) diff --git a/python/mxnet/gluon/probability/distributions/binomial.py b/python/mxnet/gluon/probability/distributions/binomial.py index e99acb5d0bba..e2d14475f38a 100644 --- a/python/mxnet/gluon/probability/distributions/binomial.py +++ b/python/mxnet/gluon/probability/distributions/binomial.py @@ -21,9 +21,10 @@ __all__ = ['Binomial'] from .distribution import Distribution -from .utils import prob2logit, logit2prob, getF, cached_property, sample_n_shape_converter +from .utils import prob2logit, logit2prob, cached_property, sample_n_shape_converter from .utils import gammaln from .constraint import Interval, Real, NonNegativeInteger +from .... import np, npx class Binomial(Distribution): @@ -37,9 +38,6 @@ class Binomial(Distribution): Probability of sampling `1`. logit : Tensor or scalar, default None The log-odds of sampling `1`. - F : mx.ndarray or mx.symbol.numpy._Symbol or None - Variable recording running mode, will be automatically - inferred from parameters if declared None. """ # pylint: disable=abstract-method @@ -47,11 +45,10 @@ class Binomial(Distribution): arg_constraints = {'prob': Interval(0, 1), 'logit': Real()} - def __init__(self, n=1, prob=None, logit=None, F=None, validate_args=None): + def __init__(self, n=1, prob=None, logit=None, validate_args=None): if (n < 0) or (n % 1 != 0): raise ValueError( "Expect `n` to be non-negative integer, received n={}".format(n)) - _F = F if F is not None else getF(n, prob, logit) if (prob is None) == (logit is None): raise ValueError( "Either `prob` or `logit` must be specified, but not both. 
" + @@ -63,7 +60,7 @@ def __init__(self, n=1, prob=None, logit=None, F=None, validate_args=None): self.logit = logit self.n = n super(Binomial, self).__init__( - F=_F, event_dim=0, validate_args=validate_args) + event_dim=0, validate_args=validate_args) @cached_property def prob(self): @@ -75,7 +72,7 @@ def prob(self): Parameter tensor. """ # pylint: disable=method-hidden - return logit2prob(self.logit, True, self.F) + return logit2prob(self.logit, True) @cached_property def logit(self): @@ -87,7 +84,7 @@ def logit(self): Parameter tensor. """ # pylint: disable=method-hidden - return prob2logit(self.prob, True, self.F) + return prob2logit(self.prob, True) @property def mean(self): @@ -100,14 +97,12 @@ def variance(self): def broadcast_to(self, batch_shape): new_instance = self.__new__(type(self)) - F = self.F if 'prob' in self.__dict__: - new_instance.prob = F.np.broadcast_to(self.prob, batch_shape) + new_instance.prob = np.broadcast_to(self.prob, batch_shape) else: - new_instance.logit = F.np.broadcast_to(self.logit, batch_shape) + new_instance.logit = np.broadcast_to(self.logit, batch_shape) new_instance.n = self.n - super(Binomial, new_instance).__init__(F=F, - event_dim=self.event_dim, + super(Binomial, new_instance).__init__(event_dim=self.event_dim, validate_args=False) new_instance._validate_args = self._validate_args return new_instance @@ -115,31 +110,28 @@ def broadcast_to(self, batch_shape): def log_prob(self, value): if self._validate_args: self._validate_samples(value) - F = self.F - lgamma = gammaln(F) + lgamma = gammaln() binomal_coef = lgamma(self.n + 1) - lgamma(1 + value) - lgamma(self.n - value + 1) # log(prob) may have numerical issue. - unnormalized_log_prob = (value * F.np.log(self.prob) + - (self.n - value) * F.np.log1p(-self.prob)) + unnormalized_log_prob = (value * np.log(self.prob) + + (self.n - value) * np.log1p(-self.prob)) return binomal_coef + unnormalized_log_prob def sample(self, size=None): - F = self.F if size is not None: - logit = F.np.broadcast_to(self.logit, size) + logit = np.broadcast_to(self.logit, size) else: logit = self.logit - expanded_logit = F.np.repeat( - F.np.expand_dims(logit, -1), int(self.n), -1) - return F.npx.random.bernoulli(logit=expanded_logit).sum(-1) + expanded_logit = np.repeat( + np.expand_dims(logit, -1), int(self.n), -1) + return npx.random.bernoulli(logit=expanded_logit).sum(-1) def sample_n(self, size=None): - F = self.F logit = self.logit - expanded_logit = F.np.repeat( - F.np.expand_dims(logit, -1), int(self.n), -1) - return F.npx.random.bernoulli( + expanded_logit = np.repeat( + np.expand_dims(logit, -1), int(self.n), -1) + return npx.random.bernoulli( logit=expanded_logit, size=sample_n_shape_converter(size) ).sum(-1) diff --git a/python/mxnet/gluon/probability/distributions/categorical.py b/python/mxnet/gluon/probability/distributions/categorical.py index 8633ba979b32..ace162f28374 100644 --- a/python/mxnet/gluon/probability/distributions/categorical.py +++ b/python/mxnet/gluon/probability/distributions/categorical.py @@ -21,8 +21,9 @@ __all__ = ['Categorical'] from .distribution import Distribution -from .utils import prob2logit, logit2prob, getF, cached_property, sample_n_shape_converter +from .utils import prob2logit, logit2prob, cached_property, sample_n_shape_converter from .constraint import Simplex, Real, IntegerInterval +from .... import np, npx class Categorical(Distribution): @@ -36,9 +37,6 @@ class Categorical(Distribution): Probabilities of each event. 
logit : Tensor The log-odds of each event - F : mx.ndarray or mx.symbol.numpy._Symbol or None - Variable recording running mode, will be automatically - inferred from parameters if declared None. """ # pylint: disable=abstract-method @@ -46,8 +44,7 @@ class Categorical(Distribution): arg_constraints = {'prob': Simplex(), 'logit': Real()} - def __init__(self, num_events, prob=None, logit=None, F=None, validate_args=None): - _F = F if F is not None else getF(prob, logit) + def __init__(self, num_events, prob=None, logit=None, validate_args=None): if (num_events > 0): num_events = int(num_events) self.num_events = num_events @@ -65,7 +62,7 @@ def __init__(self, num_events, prob=None, logit=None, F=None, validate_args=None self.logit = logit super(Categorical, self).__init__( - F=_F, event_dim=0, validate_args=validate_args) + event_dim=0, validate_args=validate_args) @cached_property def prob(self): @@ -77,7 +74,7 @@ def prob(self): Tensor Parameter tensor. """ - return logit2prob(self.logit, False, self.F) + return logit2prob(self.logit, False) @cached_property def logit(self): @@ -89,7 +86,7 @@ def logit(self): Tensor Parameter tensor. """ - return prob2logit(self.prob, False, self.F) + return prob2logit(self.prob, False) @property def support(self): @@ -110,11 +107,10 @@ def log_prob(self, value): """ if self._validate_args: self._validate_samples(value) - F = self.F logit = self.logit - indices = F.np.expand_dims(value, -1).astype('int') - expanded_logit = logit * F.np.ones_like(logit + indices) - return F.npx.pick(expanded_logit, indices).squeeze() + indices = np.expand_dims(value, -1).astype('int') + expanded_logit = logit * np.ones_like(logit + indices) + return npx.pick(expanded_logit, indices).squeeze() def sample(self, size=None): """Sample from categorical distribution. @@ -131,38 +127,33 @@ def sample(self, size=None): out : Tensor Samples from the categorical distribution. 
""" - F = self.F if size is None: size = () logit = self.logit else: if isinstance(size, int): - logit = F.np.broadcast_to(self.logit, (size,) + (-2,)) + logit = np.broadcast_to(self.logit, (size,) + (-2,)) else: - logit = F.np.broadcast_to(self.logit, size + (-2,)) - gumbel_samples = F.np.random.gumbel(logit) - return F.np.argmax(gumbel_samples, axis=-1) + logit = np.broadcast_to(self.logit, size + (-2,)) + gumbel_samples = np.random.gumbel(logit) + return np.argmax(gumbel_samples, axis=-1) def sample_n(self, size=None): - F = self.F size = sample_n_shape_converter(size) - gumbel_samples = F.np.random.gumbel(self.logit, size=size) - return F.np.argmax(gumbel_samples, axis=-1) + gumbel_samples = np.random.gumbel(self.logit, size=size) + return np.argmax(gumbel_samples, axis=-1) def broadcast_to(self, batch_shape): new_instance = self.__new__(type(self)) - F = self.F - new_instance.prob = F.np.broadcast_to(self.prob, batch_shape + (-2,)) - new_instance.logit = F.np.broadcast_to(self.logit, batch_shape + (-2,)) + new_instance.prob = np.broadcast_to(self.prob, batch_shape + (-2,)) + new_instance.logit = np.broadcast_to(self.logit, batch_shape + (-2,)) new_instance.num_events = self.num_events - super(Categorical, new_instance).__init__(F=F, - event_dim=self.event_dim, + super(Categorical, new_instance).__init__(event_dim=self.event_dim, validate_args=False) new_instance._validate_args = self._validate_args return new_instance def enumerate_support(self): num_events = self.num_events - F = self.F - value = F.npx.arange_like(self.logit) % num_events - return F.np.moveaxis(value, -1, 0) + value = npx.arange_like(self.logit) % num_events + return np.moveaxis(value, -1, 0) diff --git a/python/mxnet/gluon/probability/distributions/cauchy.py b/python/mxnet/gluon/probability/distributions/cauchy.py index 90e16b93a8de..bd7804d869b5 100644 --- a/python/mxnet/gluon/probability/distributions/cauchy.py +++ b/python/mxnet/gluon/probability/distributions/cauchy.py @@ -25,7 +25,8 @@ from numpy import nan, pi from .constraint import Real from .distribution import Distribution -from .utils import getF, sample_n_shape_converter +from .utils import sample_n_shape_converter +from .... import np class Cauchy(Distribution): @@ -37,9 +38,6 @@ class Cauchy(Distribution): mode or median of the distribution scale : Tensor or scalar, default 1 half width at half maximum - F : mx.ndarray or mx.symbol.numpy._Symbol or None - Variable recording running mode, will be automatically - inferred from parameters if declared None. """ # pylint: disable=abstract-method @@ -47,12 +45,11 @@ class Cauchy(Distribution): support = Real() arg_constraints = {'loc': Real(), 'scale': Real()} - def __init__(self, loc=0.0, scale=1.0, F=None, validate_args=None): - _F = F if F is not None else getF(loc, scale) + def __init__(self, loc=0.0, scale=1.0, validate_args=None): self.loc = loc self.scale = scale super(Cauchy, self).__init__( - F=_F, event_dim=0, validate_args=validate_args) + event_dim=0, validate_args=validate_args) @property def mean(self): @@ -64,12 +61,11 @@ def variance(self): def sample(self, size=None): # TODO: Implement sampling op in the backend. - F = self.F # `np.zeros_like` does not support scalar at this moment. 
if (isinstance(self.loc, Number), isinstance(self.scale, Number)) == (True, True): - u = F.np.random.uniform(size=size) + u = np.random.uniform(size=size) else: - u = F.np.random.uniform(F.np.zeros_like( + u = np.random.uniform(np.zeros_like( self.loc + self.scale), size=size) return self.icdf(u) @@ -79,18 +75,16 @@ def sample_n(self, size=None): def log_prob(self, value): if self._validate_args: self._validate_samples(value) - log = self.F.np.log - return (-log(pi) - log(self.scale) - - log(1 + ((value - self.loc) / self.scale) ** 2)) + return (-np.log(pi) - np.log(self.scale) - + np.log(1 + ((value - self.loc) / self.scale) ** 2)) def cdf(self, value): if self._validate_args: self._validate_samples(value) - return self.F.np.arctan((value - self.loc) / self.scale) / pi + 0.5 + return np.arctan((value - self.loc) / self.scale) / pi + 0.5 def icdf(self, value): - return self.F.np.tan(pi * (value - 0.5)) * self.scale + self.loc + return np.tan(pi * (value - 0.5)) * self.scale + self.loc def entropy(self): - log = self.F.np.log - return log(4 * pi) + log(self.scale) + return np.log(4 * pi) + np.log(self.scale) diff --git a/python/mxnet/gluon/probability/distributions/chi2.py b/python/mxnet/gluon/probability/distributions/chi2.py index 7b74683cb09c..d4c52e55c746 100644 --- a/python/mxnet/gluon/probability/distributions/chi2.py +++ b/python/mxnet/gluon/probability/distributions/chi2.py @@ -32,16 +32,13 @@ class Chi2(Gamma): ---------- df : Tensor or scalar, default 0 Shape parameter of the distribution. - F : mx.ndarray or mx.symbol.numpy._Symbol or None - Variable recording running mode, will be automatically - inferred from parameters if declared None. """ # pylint: disable=abstract-method arg_constraints = {'df': Positive()} - def __init__(self, df, F=None, validate_args=None): - super(Chi2, self).__init__(df / 2, 2, F, validate_args) + def __init__(self, df, validate_args=None): + super(Chi2, self).__init__(df / 2, 2, validate_args) @property def df(self): diff --git a/python/mxnet/gluon/probability/distributions/constraint.py b/python/mxnet/gluon/probability/distributions/constraint.py index a27850f08e51..5f6d59a3d6d8 100644 --- a/python/mxnet/gluon/probability/distributions/constraint.py +++ b/python/mxnet/gluon/probability/distributions/constraint.py @@ -27,8 +27,8 @@ "Simplex", "LowerTriangular", "LowerCholesky", "PositiveDefinite", "Cat", "Stack"] -from .utils import getF, constraint_check -from .... import ndarray as nd +from .utils import constraint_check +from .... 
import np class Constraint(object): @@ -86,12 +86,11 @@ class Real(Constraint): """ def check(self, value): - F = getF(value) err_msg = "Constraint violated: {} should be a real tensor".format( value) # False when value has NANs condition = (value == value) # pylint: disable=comparison-with-itself - _value = constraint_check(F)(condition, err_msg) * value + _value = constraint_check()(condition, err_msg) * value return _value @@ -101,11 +100,10 @@ class Boolean(Constraint): """ def check(self, value): - F = getF(value) err_msg = "Constraint violated: {} should be either 0 or 1.".format( value) condition = (value == 0) | (value == 1) - _value = constraint_check(F)(condition, err_msg) * value + _value = constraint_check()(condition, err_msg) * value return _value @@ -120,11 +118,10 @@ def __init__(self, lower_bound, upper_bound): self._upper_bound = upper_bound def check(self, value): - F = getF(value) err_msg = "Constraint violated: {} should be >= {} and <= {}.".format( value, self._lower_bound, self._upper_bound) condition = (value >= self._lower_bound) & (value <= self._upper_bound) - _value = constraint_check(F)(condition, err_msg) * value + _value = constraint_check()(condition, err_msg) * value return _value @@ -139,11 +136,10 @@ def __init__(self, lower_bound, upper_bound): self._upper_bound = upper_bound def check(self, value): - F = getF(value) err_msg = "Constraint violated: {} should be > {} and < {}.".format( value, self._lower_bound, self._upper_bound) condition = (value > self._lower_bound) & (value < self._upper_bound) - _value = constraint_check(F)(condition, err_msg) * value + _value = constraint_check()(condition, err_msg) * value return _value @@ -158,11 +154,10 @@ def __init__(self, lower_bound, upper_bound): self._upper_bound = upper_bound def check(self, value): - F = getF(value) err_msg = "Constraint violated: {} should be >= {} and < {}.".format( value, self._lower_bound, self._upper_bound) condition = (value >= self._lower_bound) & (value < self._upper_bound) - _value = constraint_check(F)(condition, err_msg) * value + _value = constraint_check()(condition, err_msg) * value return _value @@ -177,13 +172,12 @@ def __init__(self, lower_bound, upper_bound): self._upper_bound = upper_bound def check(self, value): - F = getF(value) err_msg = "Constraint violated: {} should be integer and be >= {} and <= {}.".format( value, self._lower_bound, self._upper_bound) condition = value % 1 == 0 condition = condition & (value >= self._lower_bound) & ( value <= self._upper_bound) - _value = constraint_check(F)(condition, err_msg) * value + _value = constraint_check()(condition, err_msg) * value return _value @@ -198,13 +192,12 @@ def __init__(self, lower_bound, upper_bound): self._upper_bound = upper_bound def check(self, value): - F = getF(value) err_msg = "Constraint violated: {} should be integer and be > {} and < {}.".format( value, self._lower_bound, self._upper_bound) condition = value % 1 == 0 condition = condition & (value > self._lower_bound) & ( value < self._upper_bound) - _value = constraint_check(F)(condition, err_msg) * value + _value = constraint_check()(condition, err_msg) * value return _value @@ -219,13 +212,12 @@ def __init__(self, lower_bound, upper_bound): self._upper_bound = upper_bound def check(self, value): - F = getF(value) err_msg = "Constraint violated: {} should be integer and be >= {} and < {}.".format( value, self._lower_bound, self._upper_bound) condition = value % 1 == 0 condition = condition & (value >= self._lower_bound) & ( value < 
self._upper_bound) - _value = constraint_check(F)(condition, err_msg) * value + _value = constraint_check()(condition, err_msg) * value return _value @@ -239,11 +231,10 @@ def __init__(self, lower_bound): self._lower_bound = lower_bound def check(self, value): - F = getF(value) err_msg = "Constraint violated: {} should be greater than {}".format( value, self._lower_bound) condition = value > self._lower_bound - _value = constraint_check(F)(condition, err_msg) * value + _value = constraint_check()(condition, err_msg) * value return _value @@ -266,11 +257,10 @@ def __init__(self, lower_bound): self._lower_bound = lower_bound def check(self, value): - F = getF(value) err_msg = "Constraint violated: {} should be greater than or equal to {}".format( value, self._lower_bound) condition = value >= self._lower_bound - _value = constraint_check(F)(condition, err_msg) * value + _value = constraint_check()(condition, err_msg) * value return _value @@ -284,11 +274,10 @@ def __init__(self, upper_bound): self._upper_bound = upper_bound def check(self, value): - F = getF(value) err_msg = "Constraint violated: {} should be less than {}".format( value, self._upper_bound) condition = value < self._upper_bound - _value = constraint_check(F)(condition, err_msg) * value + _value = constraint_check()(condition, err_msg) * value return _value @@ -302,11 +291,10 @@ def __init__(self, upper_bound): self._upper_bound = upper_bound def check(self, value): - F = getF(value) err_msg = "Constraint violated: {} should be less than or equal to {}".format( value, self._upper_bound) condition = value <= self._upper_bound - _value = constraint_check(F)(condition, err_msg) * value + _value = constraint_check()(condition, err_msg) * value return _value @@ -320,12 +308,11 @@ def __init__(self, lower_bound): self._lower_bound = lower_bound def check(self, value): - F = getF(value) err_msg = "Constraint violated: {} should be integer and be greater than {}".format( value, self._lower_bound) condition = value % 1 == 0 - condition = F.np.bitwise_and(condition, value > self._lower_bound) - _value = constraint_check(F)(condition, err_msg) * value + condition = np.bitwise_and(condition, value > self._lower_bound) + _value = constraint_check()(condition, err_msg) * value return _value @@ -339,13 +326,12 @@ def __init__(self, lower_bound): self._lower_bound = lower_bound def check(self, value): - F = getF(value) err_msg = "Constraint violated: {} should be integer and" \ " be greater than or equal to {}".format( value, self._lower_bound) condition = value % 1 == 0 - condition = F.np.bitwise_and(condition, value >= self._lower_bound) - _value = constraint_check(F)(condition, err_msg) * value + condition = np.bitwise_and(condition, value >= self._lower_bound) + _value = constraint_check()(condition, err_msg) * value return _value @@ -359,12 +345,11 @@ def __init__(self, upper_bound): self._upper_bound = upper_bound def check(self, value): - F = getF(value) err_msg = "Constraint violated: {} should be integer and be less than {}".format( value, self._upper_bound) condition = value % 1 == 0 condition = condition & (value < self._upper_bound) - _value = constraint_check(F)(condition, err_msg) * value + _value = constraint_check()(condition, err_msg) * value return _value @@ -378,13 +363,12 @@ def __init__(self, upper_bound): self._upper_bound = upper_bound def check(self, value): - F = getF(value) err_msg = "Constraint violated: {} should be integer and" \ " be less than or equal to {}".format( value, self._upper_bound) condition = value % 
1 == 0 condition = condition & (value <= self._upper_bound) - _value = constraint_check(F)(condition, err_msg) * value + _value = constraint_check()(condition, err_msg) * value return _value @@ -431,12 +415,11 @@ class Simplex(Constraint): """ def check(self, value): - F = getF(value) err_msg = "Constraint violated: {} should be >= 0 and" \ " its rightmost dimension should sum up to 1".format(value) - condition = F.np.all(value >= 0, axis=-1) - condition = condition & (F.np.abs(value.sum(-1) - 1) < 1e-6) - _value = constraint_check(F)(condition, err_msg) * value + condition = np.all(value >= 0, axis=-1) + condition = condition & (np.abs(value.sum(-1) - 1) < 1e-6) + _value = constraint_check()(condition, err_msg) * value return _value @@ -446,11 +429,10 @@ class LowerTriangular(Constraint): """ def check(self, value): - F = getF(value) err_msg = "Constraint violated: {} should be" \ " square lower triangular matrices".format(value) - condition = F.np.tril(value) == value - _value = constraint_check(F)(condition, err_msg) * value + condition = np.tril(value) == value + _value = constraint_check()(condition, err_msg) * value return _value @@ -460,13 +442,12 @@ class LowerCholesky(Constraint): """ def check(self, value): - F = getF(value) err_msg = "Constraint violated: {} should be" \ " square lower triangular matrices" \ " with real and positive diagonal entries".format(value) - condition = F.np.all(F.np.tril(value) == value, axis=-1) - condition = condition & (F.np.diagonal(value, axis1=-2, axis2=-1) > 0) - _value = constraint_check(F)(condition, err_msg) * value + condition = np.all(np.tril(value) == value, axis=-1) + condition = condition & (np.diagonal(value, axis1=-2, axis2=-1) > 0) + _value = constraint_check()(condition, err_msg) * value return _value @@ -476,14 +457,13 @@ class PositiveDefinite(Constraint): """ def check(self, value): - F = getF(value) err_msg = "Constraint violated: {} should be" \ " positive definite matrices".format(value) eps = 1e-5 - condition = F.np.all( - F.np.abs(value - F.np.swapaxes(value, -1, -2)) < eps, axis=-1) - condition = condition & (F.np.linalg.eigvals(value) > 0) - _value = constraint_check(F)(condition, err_msg) * value + condition = np.all( + np.abs(value - np.swapaxes(value, -1, -2)) < eps, axis=-1) + condition = condition & (np.linalg.eigvals(value) > 0) + _value = constraint_check()(condition, err_msg) * value return _value @@ -507,15 +487,14 @@ def __init__(self, constraint_seq, axis=0, lengths=None): self._axis = axis def check(self, value): - F = getF(value) _values = [] start = 0 for length in self._lengths: - v = F.np.take(value, indices=F.np.arange( + v = np.take(value, indices=np.arange( start, start + length), axis=self._axis) _values.append(v) start = start + length - _value = F.np.concatenate(_values, self._axis) + _value = np.concatenate(_values, self._axis) return _value @@ -534,14 +513,11 @@ def __init__(self, constraint_seq, axis=0): self._axis = axis def check(self, value): - F = getF(value) - assert F is nd, "mxnet.probability.distributions.constraint.Stack" \ - " is only supported when hybridization is turned off" size = value.shape[self._axis] - value_array = F.np.split(value, size, axis=self._axis) - value_array = [constraint.check(F.np.squeeze(v)) for v, constraint + value_array = np.split(value, size, axis=self._axis) + value_array = [constraint.check(np.squeeze(v)) for v, constraint in zip(value_array, self._constraint_seq)] - _value = F.np.stack(value_array, self._axis) + _value = np.stack(value_array, self._axis) return 
_value diff --git a/python/mxnet/gluon/probability/distributions/dirichlet.py b/python/mxnet/gluon/probability/distributions/dirichlet.py index 205b5bb9e9e5..d68445efff0a 100644 --- a/python/mxnet/gluon/probability/distributions/dirichlet.py +++ b/python/mxnet/gluon/probability/distributions/dirichlet.py @@ -22,7 +22,8 @@ from .exp_family import ExponentialFamily from .constraint import Positive, Simplex -from .utils import getF, gammaln, digamma, sample_n_shape_converter, _clip_float_eps +from .utils import gammaln, digamma, sample_n_shape_converter, _clip_float_eps +from .... import np class Dirichlet(ExponentialFamily): @@ -32,8 +33,6 @@ class Dirichlet(ExponentialFamily): ---------- alpha : Tensor or scalar Shape parameter of the distribution - F : mx.ndarray or mx.symbol.numpy._Symbol or None - Variable recording running mode, will be automatically """ # pylint: disable=abstract-method @@ -41,44 +40,39 @@ class Dirichlet(ExponentialFamily): support = Simplex() arg_constraints = {'alpha': Positive()} - def __init__(self, alpha, F=None, validate_args=None): - _F = F if F is not None else getF(alpha) + def __init__(self, alpha, validate_args=None): self.alpha = alpha super(Dirichlet, self).__init__( - F=_F, event_dim=1, validate_args=validate_args) + event_dim=1, validate_args=validate_args) def sample(self, size=None): - F = self.F if size is None: size = () alpha = self.alpha else: if isinstance(size, int): - alpha = F.np.broadcast_to(self.alpha, (size,) + (-2,)) + alpha = np.broadcast_to(self.alpha, (size,) + (-2,)) else: - alpha = F.np.broadcast_to(self.alpha, size + (-2,)) - gamma_samples = F.np.random.gamma(alpha, 1) + alpha = np.broadcast_to(self.alpha, size + (-2,)) + gamma_samples = np.random.gamma(alpha, 1) s = gamma_samples.sum(-1, keepdims=True) - return _clip_float_eps(gamma_samples / s, F) + return _clip_float_eps(gamma_samples / s) def sample_n(self, size=None): - F = self.F alpha = self.alpha if size is None: return self.sample() - gamma_samples = F.np.random.gamma( + gamma_samples = np.random.gamma( alpha, 1, sample_n_shape_converter(size)) s = gamma_samples.sum(-1, keepdims=True) - return _clip_float_eps(gamma_samples / s, F) + return _clip_float_eps(gamma_samples / s) def log_prob(self, value): if self._validate_args: self._validate_samples(value) - F = self.F - lgamma = gammaln(F) - log = F.np.log + lgamma = gammaln() alpha = self.alpha - return (log(value) * (alpha - 1.0)).sum(-1) +\ + return (np.log(value) * (alpha - 1.0)).sum(-1) +\ lgamma(alpha.sum(-1)) - lgamma(alpha).sum(-1) @property @@ -93,9 +87,8 @@ def variance(self): return a * (s - a) / ((s + 1) * s ** 2) def entropy(self): - F = self.F - lgamma = gammaln(F) - dgamma = digamma(F) + lgamma = gammaln() + dgamma = digamma() a0 = self.alpha.sum(-1) log_B_alpha = lgamma(self.alpha).sum(-1) - lgamma(a0) return (log_B_alpha + (self.alpha - 1).sum(-1) * dgamma(a0) - diff --git a/python/mxnet/gluon/probability/distributions/distribution.py b/python/mxnet/gluon/probability/distributions/distribution.py index 736f3dcb607d..b173497712ad 100644 --- a/python/mxnet/gluon/probability/distributions/distribution.py +++ b/python/mxnet/gluon/probability/distributions/distribution.py @@ -22,6 +22,7 @@ from numbers import Number from .utils import cached_property +from .... import np class Distribution(object): @@ -29,8 +30,6 @@ class Distribution(object): Parameters ---------- - F : mx.ndarray or mx.symbol.numpy._Symbol - Variable that stores the running mode. 
event_dim : int, default None Variable indicating the dimension of the distribution's support. validate_args : bool, default None @@ -51,8 +50,7 @@ def set_default_validate_args(value): raise ValueError Distribution._validate_args = value - def __init__(self, F=None, event_dim=None, validate_args=None): - self.F = F + def __init__(self, event_dim=None, validate_args=None): self.event_dim = event_dim if validate_args is not None: self._validate_args = validate_args @@ -75,7 +73,7 @@ def pdf(self, value): r""" Returns the probability density/mass function evaluated at `value`. """ - return self.F.np.exp(self.log_prob(value)) + return np.exp(self.log_prob(value)) def cdf(self, value): r""" @@ -172,8 +170,7 @@ def perplexity(self): r""" Returns perplexity of distribution. """ - F = self.F - return F.np.exp(self.entropy()) + return np.exp(self.entropy()) def __repr__(self): mode = self.F diff --git a/python/mxnet/gluon/probability/distributions/divergence.py b/python/mxnet/gluon/probability/distributions/divergence.py index f58c578edd2f..90c0d1f989f7 100644 --- a/python/mxnet/gluon/probability/distributions/divergence.py +++ b/python/mxnet/gluon/probability/distributions/divergence.py @@ -23,6 +23,7 @@ import math import numpy as _np +from .... import np from .utils import gammaln, digamma from .exponential import Exponential from .pareto import Pareto @@ -139,20 +140,17 @@ class _KL_storage(): @staticmethod def _kl_Normal_Normal(p, q): - F = p.F var_ratio = (p.scale / q.scale) ** 2 t1 = ((p.loc - q.loc) / q.scale) ** 2 - return 0.5 * (var_ratio + t1 - 1 - F.np.log(var_ratio)) + return 0.5 * (var_ratio + t1 - 1 - np.log(var_ratio)) @register_kl(Bernoulli, Bernoulli) def _kl_bernoulli_bernoulli(p, q): - F = p.F - log_fn = F.np.log prob_p = p.prob prob_q = q.prob - t1 = prob_p * log_fn(prob_p / prob_q) - t2 = (1 - prob_p) * log_fn((1 - prob_p) / (1 - prob_q)) + t1 = prob_p * np.log(prob_p / prob_q) + t2 = (1 - prob_p) * np.log((1 - prob_p) / (1 - prob_q)) return t1 + t2 @@ -168,87 +166,78 @@ def _kl_onehotcategorical_onehotcategorical(p, q): @register_kl(Uniform, Uniform) def _kl_uniform_uniform(p, q): - F = p.F - result = F.np.log((q.high - q.low) / (p.high - p.low)) - result = F.np.where((q.low > p.low) | (q.high < p.high), _np.inf, result) + result = np.log((q.high - q.low) / (p.high - p.low)) + result = np.where((q.low > p.low) | (q.high < p.high), _np.inf, result) return result @register_kl(Cauchy, Cauchy) def _kl_cauchy_cauchy(p, q): - F = p.F - t1 = F.np.log((p.scale + q.scale) ** 2 + (p.loc - q.loc) ** 2) - t2 = F.np.log(4 * p.scale * q.scale) + t1 = np.log((p.scale + q.scale) ** 2 + (p.loc - q.loc) ** 2) + t2 = np.log(4 * p.scale * q.scale) return t1 - t2 @register_kl(Laplace, Laplace) def _kl_laplace_laplace(p, q): - F = p.F scale_ratio = p.scale / q.scale - loc_abs_diff = F.np.abs(p.loc - q.loc) - t1 = -F.np.log(scale_ratio) + loc_abs_diff = np.abs(p.loc - q.loc) + t1 = -np.log(scale_ratio) t2 = loc_abs_diff / q.scale - t3 = scale_ratio * F.np.exp(-loc_abs_diff / p.scale) + t3 = scale_ratio * np.exp(-loc_abs_diff / p.scale) return t1 + t2 + t3 - 1 @register_kl(Poisson, Poisson) def _kl_poisson_poisson(p, q): - F = p.F - t1 = p.rate * (F.np.log(p.rate) - F.np.log(q.rate)) + t1 = p.rate * (np.log(p.rate) - np.log(q.rate)) t2 = (p.rate - q.rate) return t1 - t2 @register_kl(Geometric, Geometric) def _kl_geometric_geometric(p, q): - F = p.F - return (-p.entropy() - F.np.log1p(-q.prob) / p.prob - q.logit) + return (-p.entropy() - np.log1p(-q.prob) / p.prob - q.logit) 
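The helpers above register closed-form KL divergences that the package-level kl_divergence dispatcher picks up. As a minimal usage sketch of the F-free API (illustrative only, not part of this patch; it assumes an MXNet 2.x build where mxnet.gluon.probability exports Normal and kl_divergence):
from mxnet import np
from mxnet.gluon.probability import Normal, kl_divergence
# _kl_Normal_Normal above computes 0.5 * (r + t - 1 - log(r)) with
# r = (scale_p / scale_q)**2 and t = ((loc_p - loc_q) / scale_q)**2.
p = Normal(loc=np.array([0.0]), scale=np.array([1.0]))
q = Normal(loc=np.array([1.0]), scale=np.array([2.0]))
print(kl_divergence(p, q))  # no F handle is threaded through any more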
@register_kl(Exponential, Exponential) def _kl_exponential_exponential(p, q): - F = p.F scale_ratio = p.scale / q.scale - t1 = -F.np.log(scale_ratio) + t1 = -np.log(scale_ratio) return t1 + scale_ratio - 1 @register_kl(Pareto, Pareto) def _kl_pareto_pareto(p, q): - F = p.F scale_ratio = p.scale / q.scale alpha_ratio = q.alpha / p.alpha - t1 = q.alpha * F.np.log(scale_ratio) - t2 = -F.np.log(alpha_ratio) + t1 = q.alpha * np.log(scale_ratio) + t2 = -np.log(alpha_ratio) result = t1 + t2 + alpha_ratio - 1 - result = F.np.where(p.support._lower_bound < - q.support._lower_bound, _np.nan, result) + result = np.where(p.support._lower_bound < + q.support._lower_bound, _np.nan, result) return result @register_kl(Gumbel, Gumbel) def _kl_gumbel_gumbel(p, q): - F = p.F - lgamma = gammaln(F) + lgamma = gammaln() _euler_gamma = _np.euler_gamma ct1 = p.scale / q.scale ct2 = q.loc / q.scale ct3 = p.loc / q.scale - t1 = -F.np.log(ct1) - ct2 + ct3 + t1 = -np.log(ct1) - ct2 + ct3 t2 = ct1 * _euler_gamma - t3 = F.np.exp(ct2 + lgamma(1 + ct1) - ct3) + t3 = np.exp(ct2 + lgamma(1 + ct1) - ct3) return t1 + t2 + t3 - (1 + _euler_gamma) @register_kl(Gamma, Gamma) def _kl_gamma_gamma(p, q): - F = p.F - lgamma = gammaln(F) - dgamma = digamma(F) + lgamma = gammaln() + dgamma = digamma() return ( - q.shape * F.np.log(q.scale / p.scale) + + q.shape * np.log(q.scale / p.scale) + lgamma(q.shape) - lgamma(p.shape) + (p.shape - q.shape) * dgamma(p.shape) + (p.shape * p.scale) * (1 / q.scale - 1 / p.scale) @@ -257,9 +246,8 @@ def _kl_gamma_gamma(p, q): @register_kl(Beta, Beta) def _kl_beta_beta(p, q): - F = p.F - lgamma = gammaln(F) - dgamma = digamma(F) + lgamma = gammaln() + dgamma = digamma() sum_params_p = p.beta + p.alpha sum_params_q = q.beta + q.alpha t1 = lgamma(q.alpha) + lgamma(q.beta) + lgamma(sum_params_p) @@ -274,66 +262,61 @@ def _kl_beta_beta(p, q): @register_kl(Dirichlet, Dirichlet) def _kl_dirichlet_dirichlet(p, q): - F = p.F - lgamma = gammaln(F) - dgamma = digamma(F) + lgamma = gammaln() + dgamma = digamma() sum_p_concentration = p.alpha.sum(-1) sum_q_concentration = q.alpha.sum(-1) t1 = lgamma(sum_p_concentration) - lgamma(sum_q_concentration) t2 = (lgamma(p.alpha) - lgamma(q.alpha)).sum(-1) t3 = p.alpha - q.alpha - t4 = dgamma(p.alpha) - F.np.expand_dims(dgamma(sum_p_concentration), -1) + t4 = dgamma(p.alpha) - np.expand_dims(dgamma(sum_p_concentration), -1) return t1 - t2 + (t3 * t4).sum(-1) @register_kl(HalfNormal, HalfNormal) def _kl_halfNormal_halfNormal(p, q): - F = p.F var_ratio = (p.scale / q.scale) ** 2 t1 = ((p.loc - q.loc) / q.scale) ** 2 - return 0.5 * (var_ratio + t1 - 1 - F.np.log(var_ratio)) + return 0.5 * (var_ratio + t1 - 1 - np.log(var_ratio)) @register_kl(Binomial, Binomial) def _kl_binomial_binomial(p, q): - F = p.F kl = p.n * (p.prob * (p.logit - q.logit) + - F.np.log1p(-p.prob) - F.np.log1p(-q.prob)) - kl = F.np.where(p.n > q.n, _np.inf, kl) + np.log1p(-p.prob) - np.log1p(-q.prob)) + kl = np.where(p.n > q.n, _np.inf, kl) return kl @register_kl(MultivariateNormal, MultivariateNormal) def _kl_mvn_mvn(p, q): - F = p.F log_det = (lambda mvn: - F.np.log( - F.np.diagonal(mvn.scale_tril, axis1=-2, axis2=-1) + np.log( + np.diagonal(mvn.scale_tril, axis1=-2, axis2=-1) ).sum(-1) ) # log(det(\Sigma_1) / det(\Sigma_2)) term1 = log_det(q) - log_det(p) # tr(inv(\Sigma_2) * \Sigma_1) - term2 = F.np.trace(F.np.matmul(q.precision, p.cov), axis1=-2, axis2=-1) + term2 = np.trace(np.matmul(q.precision, p.cov), axis1=-2, axis2=-1) # (\mu_2 - \mu_1).T * inv(\Sigma_2) * (\mu_2 - \mu_1) diff = q.loc - p.loc 
- term3 = F.np.einsum( + term3 = np.einsum( '...i,...i->...', diff, # Batch matrix vector multiply - F.np.einsum('...jk,...j->...k', q.precision, diff) + np.einsum('...jk,...j->...k', q.precision, diff) ) * -0.5 - n = F.np.ones_like(diff).sum(-1) + n = np.ones_like(diff).sum(-1) return 0.5 * (term1 + term2 + term3 - n) @register_kl(Uniform, Normal) def _kl_uniform_normal(p, q): - F = p.F common_term = p.high - p.low - t1 = F.np.log(math.sqrt(math.pi * 2) * q.scale / common_term) + t1 = np.log(math.sqrt(math.pi * 2) * q.scale / common_term) t2 = (common_term) ** 2 / 12 t3 = ((p.high + p.low - 2 * q.loc) / 2) ** 2 return t1 + 0.5 * (t2 + t3) / (q.scale ** 2) @@ -341,32 +324,29 @@ def _kl_uniform_normal(p, q): @register_kl(Uniform, Gumbel) def _kl_uniform_gumbel(p, q): - F = p.F common_term = q.scale / (p.high - p.low) high_loc_diff = (p.high - q.loc) / q.scale low_loc_diff = (p.low - q.loc) / q.scale - t1 = F.np.log(common_term) + 0.5 * (high_loc_diff + low_loc_diff) - t2 = common_term * (F.np.exp(-high_loc_diff) - F.np.exp(-low_loc_diff)) + t1 = np.log(common_term) + 0.5 * (high_loc_diff + low_loc_diff) + t2 = common_term * (np.exp(-high_loc_diff) - np.exp(-low_loc_diff)) return t1 - t2 @register_kl(Exponential, Gumbel) def _kl_exponential_gumbel(p, q): - F = p.F scale_rate_prod = q.scale / p.scale loc_scale_ratio = q.loc / q.scale - t1 = F.np.log(scale_rate_prod) - 1 - t2 = F.np.exp(loc_scale_ratio) * scale_rate_prod / (scale_rate_prod + 1) + t1 = np.log(scale_rate_prod) - 1 + t2 = np.exp(loc_scale_ratio) * scale_rate_prod / (scale_rate_prod + 1) t3 = scale_rate_prod ** -1 return t1 - loc_scale_ratio + t2 + t3 @register_kl(Exponential, Normal) def _kl_exponential_normal(p, q): - F = p.F var_normal = q.variance rate_sqr = p.scale ** (-2) - t1 = 0.5 * F.np.log(rate_sqr * var_normal * 2 * _np.pi) + t1 = 0.5 * np.log(rate_sqr * var_normal * 2 * _np.pi) t2 = rate_sqr ** -1 t3 = q.loc * p.scale t4 = (q.loc ** 2) * 0.5 @@ -375,8 +355,7 @@ def _kl_exponential_normal(p, q): @register_kl(Exponential, Gamma) def _kl_exponential_gamma(p, q): - F = p.F - lgamma = gammaln(F) + lgamma = gammaln() ratio = p.scale / q.scale - t1 = -q.shape * F.np.log(ratio) + t1 = -q.shape * np.log(ratio) return t1 + ratio + lgamma(q.shape) + q.shape * _np.euler_gamma - (1 + _np.euler_gamma) diff --git a/python/mxnet/gluon/probability/distributions/exponential.py b/python/mxnet/gluon/probability/distributions/exponential.py index 19ddd58ed74b..9d73c5fa990f 100644 --- a/python/mxnet/gluon/probability/distributions/exponential.py +++ b/python/mxnet/gluon/probability/distributions/exponential.py @@ -22,7 +22,8 @@ from .exp_family import ExponentialFamily from .constraint import Positive -from .utils import getF, sample_n_shape_converter, cached_property +from .utils import sample_n_shape_converter, cached_property +from .... import np class Exponential(ExponentialFamily): @@ -32,8 +33,6 @@ class Exponential(ExponentialFamily): ---------- scale : Tensor or scalar Scale of the distribution. 
(scale = 1 /rate) - F : mx.ndarray or mx.symbol.numpy._Symbol or None - Variable recording running mode, will be automatically """ # pylint: disable=abstract-method @@ -41,11 +40,10 @@ class Exponential(ExponentialFamily): support = Positive() arg_constraints = {'scale': Positive()} - def __init__(self, scale=1.0, F=None, validate_args=None): - _F = F if F is not None else getF(scale) + def __init__(self, scale=1.0, validate_args=None): self.scale = scale super(Exponential, self).__init__( - F=_F, event_dim=0, validate_args=validate_args) + event_dim=0, validate_args=validate_args) @cached_property def rate(self): @@ -64,18 +62,16 @@ def stddev(self): return self.scale def sample(self, size=None): - return self.F.np.random.exponential(self.scale, size=size) + return np.random.exponential(self.scale, size=size) def sample_n(self, size=None): - return self.F.np.random.exponential(self.scale, - size=sample_n_shape_converter(size)) + return np.random.exponential(self.scale, + size=sample_n_shape_converter(size)) def broadcast_to(self, batch_shape): new_instance = self.__new__(type(self)) - F = self.F - new_instance.scale = F.np.broadcast_to(self.scale, batch_shape) - super(Exponential, new_instance).__init__(F=F, - event_dim=self.event_dim, + new_instance.scale = np.broadcast_to(self.scale, batch_shape) + super(Exponential, new_instance).__init__(event_dim=self.event_dim, validate_args=False) new_instance._validate_args = self._validate_args return new_instance @@ -83,22 +79,18 @@ def broadcast_to(self, batch_shape): def log_prob(self, value): if self._validate_args: self._validate_samples(value) - F = self.F - return F.np.log(self.rate) - self.rate * value + return np.log(self.rate) - self.rate * value def cdf(self, value): if self._validate_args: self._validate_samples(value) - F = self.F - return 1 - F.np.exp(-self.rate * value) + return 1 - np.exp(-self.rate * value) def icdf(self, value): - F = self.F - return - self.scale * F.np.log(1 - value) + return - self.scale * np.log(1 - value) def entropy(self): - F = self.F - return 1.0 + F.np.log(self.scale) + return 1.0 + np.log(self.scale) @property def _natural_params(self): @@ -106,5 +98,4 @@ def _natural_params(self): def _log_normalizer(self, x): # pylint: disable=arguments-differ - F = self.F - return -F.np.log(-x) + return -np.log(-x) diff --git a/python/mxnet/gluon/probability/distributions/fishersnedecor.py b/python/mxnet/gluon/probability/distributions/fishersnedecor.py index f4d06a2f3e8f..fe481bb502a6 100644 --- a/python/mxnet/gluon/probability/distributions/fishersnedecor.py +++ b/python/mxnet/gluon/probability/distributions/fishersnedecor.py @@ -24,7 +24,9 @@ from .distribution import Distribution from .gamma import Gamma from .constraint import Positive -from .utils import getF, gammaln +from .utils import gammaln +from .... import np + class FisherSnedecor(Distribution): @@ -36,46 +38,40 @@ class FisherSnedecor(Distribution): degree of freedom parameter 1 scale : Tensor or scalar degree of freedom parameter 2 - F : mx.ndarray or mx.symbol.numpy._Symbol or None - Variable recording running mode, will be automatically - inferred from parameters if declared None. 
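For the Exponential migration above, the distribution is now driven directly by the mxnet.np namespace instead of a per-instance F handle. A small sketch of the resulting usage (an illustration under the assumption of an MXNet 2.x numpy-interface build, not code from this patch):
from mxnet import np
from mxnet.gluon.probability import Exponential
d = Exponential(scale=np.array([2.0]))  # rate = 1 / scale; no F argument
x = d.sample((5,))                      # delegates to np.random.exponential
print(d.log_prob(x))                    # log(rate) - rate * x
print(d.entropy())                      # 1 + log(scale)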
""" # pylint: disable=abstract-method support = Positive() arg_constraints = {'df1': Positive(), 'df2': Positive()} - def __init__(self, df1, df2, F=None, validate_args=None): - _F = F if F is not None else getF(df1, df2) + def __init__(self, df1, df2, validate_args=None): self.df1 = df1 self.df2 = df2 self._gamma1 = Gamma(0.5 * self.df1, 1 / self.df1) self._gamma2 = Gamma(0.5 * self.df2, 1 / self.df2) super(FisherSnedecor, self).__init__( - F=_F, event_dim=0, validate_args=validate_args) + event_dim=0, validate_args=validate_args) def broadcast_to(self, batch_shape): new_instance = self.__new__(type(self)) - F = self.F - new_instance.df1 = F.np.broadcast_to(self.df1, batch_shape) - new_instance.df2 = F.np.broadcast_to(self.df2, batch_shape) + new_instance.df1 = np.broadcast_to(self.df1, batch_shape) + new_instance.df2 = np.broadcast_to(self.df2, batch_shape) new_instance._gamma1 = self._gamma1.broadcast_to(batch_shape) new_instance._gamma2 = self._gamma2.broadcast_to(batch_shape) - super(FisherSnedecor, new_instance).__init__(F=F, - event_dim=0, validate_args=False) + super(FisherSnedecor, new_instance).__init__(event_dim=0, validate_args=False) new_instance._validate_args = self._validate_args return new_instance @property def mean(self): # mean is only defined for df2 > 2 - df2 = self.F.np.where(self.df2 <= 2, nan, self.df2) + df2 = np.where(self.df2 <= 2, nan, self.df2) return df2 / (df2 - 2) @property def variance(self): # variance is only define for df2 > 4 - df2 = self.F.np.where(self.df2 <= 4, nan, self.df2) + df2 = np.where(self.df2 <= 4, nan, self.df2) df1 = self.df1 numerator = 2 * df2 ** 2 * (df1 + df2 - 2) denominator = df1 * (df2 - 2) ** 2 * (df2 - 4) @@ -94,9 +90,8 @@ def sample_n(self, size=None): def log_prob(self, value): if self._validate_args: self._validate_samples(value) - F = self.F - lgamma = gammaln(F) - log = F.np.log + lgamma = gammaln() + log = np.log ct1 = self.df1 / 2 ct2 = self.df2 / 2 ct3 = self.df1 / self.df2 diff --git a/python/mxnet/gluon/probability/distributions/gamma.py b/python/mxnet/gluon/probability/distributions/gamma.py index bdb328cf2242..bff60b83ba63 100644 --- a/python/mxnet/gluon/probability/distributions/gamma.py +++ b/python/mxnet/gluon/probability/distributions/gamma.py @@ -22,7 +22,9 @@ from .exp_family import ExponentialFamily from .constraint import Real, Positive -from .utils import getF, sample_n_shape_converter, gammaln, digamma +from .utils import sample_n_shape_converter, gammaln, digamma +from .... import np + class Gamma(ExponentialFamily): @@ -35,9 +37,6 @@ class Gamma(ExponentialFamily): scale : Tensor or scalar, default 1 scale parameter of the distribution, often represented by `\theta`, `\theta` = 1 / `\beta`, where `\beta` stands for the rate parameter. - F : mx.ndarray or mx.symbol.numpy._Symbol or None - Variable recording running mode, will be automatically - inferred from parameters if declared None. 
""" # pylint: disable=abstract-method @@ -46,19 +45,17 @@ class Gamma(ExponentialFamily): support = Real() arg_constraints = {'shape': Positive(), 'scale': Positive()} - def __init__(self, shape, scale=1.0, F=None, validate_args=None): - _F = F if F is not None else getF(shape, scale) + def __init__(self, shape, scale=1.0, validate_args=None): self.shape = shape self.scale = scale super(Gamma, self).__init__( - F=_F, event_dim=0, validate_args=validate_args) + event_dim=0, validate_args=validate_args) def log_prob(self, value): if self._validate_args: self._validate_samples(value) - F = self.F - log_fn = F.np.log - lgamma = gammaln(F) + log_fn = np.log + lgamma = gammaln() # alpha (concentration) a = self.shape # beta (rate) @@ -67,20 +64,18 @@ def log_prob(self, value): def broadcast_to(self, batch_shape): new_instance = self.__new__(type(self)) - F = self.F - new_instance.shape = F.np.broadcast_to(self.shape, batch_shape) - new_instance.scale = F.np.broadcast_to(self.scale, batch_shape) - super(Gamma, new_instance).__init__(F=F, - event_dim=self.event_dim, + new_instance.shape = np.broadcast_to(self.shape, batch_shape) + new_instance.scale = np.broadcast_to(self.scale, batch_shape) + super(Gamma, new_instance).__init__(event_dim=self.event_dim, validate_args=False) new_instance._validate_args = self._validate_args return new_instance def sample(self, size=None): - return self.F.np.random.gamma(self.shape, 1, size) * self.scale + return np.random.gamma(self.shape, 1, size) * self.scale def sample_n(self, size=None): - return self.F.np.random.gamma(self.shape, 1, sample_n_shape_converter(size)) * self.scale + return np.random.gamma(self.shape, 1, sample_n_shape_converter(size)) * self.scale @property def mean(self): @@ -91,10 +86,9 @@ def variance(self): return self.shape * (self.scale ** 2) def entropy(self): - F = self.F - lgamma = gammaln(F) - dgamma = digamma(F) - return (self.shape + F.np.log(self.scale) + lgamma(self.shape) + + lgamma = gammaln() + dgamma = digamma() + return (self.shape + np.log(self.scale) + lgamma(self.shape) + (1 - self.shape) * dgamma(self.shape)) @property diff --git a/python/mxnet/gluon/probability/distributions/geometric.py b/python/mxnet/gluon/probability/distributions/geometric.py index 170edfec9912..b7cbbe1d2be4 100644 --- a/python/mxnet/gluon/probability/distributions/geometric.py +++ b/python/mxnet/gluon/probability/distributions/geometric.py @@ -22,8 +22,9 @@ from numbers import Number from .distribution import Distribution -from .utils import prob2logit, logit2prob, getF, cached_property, sample_n_shape_converter +from .utils import prob2logit, logit2prob, cached_property, sample_n_shape_converter from .constraint import NonNegativeInteger, Interval, Real +from .... import np class Geometric(Distribution): @@ -35,9 +36,6 @@ class Geometric(Distribution): Probability of sampling `1`. logit : Tensor or scalar, default None The log-odds of sampling `1`. - F : mx.ndarray or mx.symbol.numpy._Symbol or None - Variable recording running mode, will be automatically - inferred from parameters if declared None. """ # pylint: disable=abstract-method @@ -45,8 +43,7 @@ class Geometric(Distribution): arg_constraints = {'prob': Interval(0, 1), 'logit': Real()} - def __init__(self, prob=None, logit=None, F=None, validate_args=None): - _F = F if F is not None else getF(prob, logit) + def __init__(self, prob=None, logit=None, validate_args=None): if (prob is None) == (logit is None): raise ValueError( "Either `prob` or `logit` must be specified, but not both. 
" + @@ -57,7 +54,7 @@ def __init__(self, prob=None, logit=None, F=None, validate_args=None): else: self.logit = logit super(Geometric, self).__init__( - F=_F, event_dim=0, validate_args=validate_args) + event_dim=0, validate_args=validate_args) @cached_property def prob(self): @@ -69,7 +66,7 @@ def prob(self): Parameter tensor. """ # pylint: disable=method-hidden - return logit2prob(self.logit, True, self.F) + return logit2prob(self.logit, True) @cached_property def logit(self): @@ -81,7 +78,7 @@ def logit(self): Parameter tensor. """ # pylint: disable=method-hidden - return prob2logit(self.prob, True, self.F) + return prob2logit(self.prob, True) @property def mean(self): @@ -93,13 +90,11 @@ def variance(self): def broadcast_to(self, batch_shape): new_instance = self.__new__(type(self)) - F = self.F if 'prob' in self.__dict__: - new_instance.prob = F.np.broadcast_to(self.prob, batch_shape) + new_instance.prob = np.broadcast_to(self.prob, batch_shape) else: - new_instance.logit = F.np.broadcast_to(self.logit, batch_shape) - super(Geometric, new_instance).__init__(F=F, - event_dim=self.event_dim, + new_instance.logit = np.broadcast_to(self.logit, batch_shape) + super(Geometric, new_instance).__init__(event_dim=self.event_dim, validate_args=False) new_instance._validate_args = self._validate_args return new_instance @@ -107,19 +102,17 @@ def broadcast_to(self, batch_shape): def log_prob(self, value): if self._validate_args: self._validate_samples(value) - F = self.F prob = self.prob - return value * F.np.log1p(-prob) + F.np.log(prob) + return value * np.log1p(-prob) + np.log(prob) def sample(self, size=None): - F = self.F if isinstance(self.prob, Number): - shape_tensor = F.np.zeros(()) + shape_tensor = np.zeros(()) else: - shape_tensor = F.np.zeros_like(self.prob) - u = F.np.random.uniform(shape_tensor, size=size) - samples = F.np.floor( - F.np.log(u) / F.np.log1p(-self.prob) + shape_tensor = np.zeros_like(self.prob) + u = np.random.uniform(shape_tensor, size=size) + samples = np.floor( + np.log(u) / np.log1p(-self.prob) ) return samples @@ -127,7 +120,6 @@ def sample_n(self, size=None): return self.sample(sample_n_shape_converter(size)) def entropy(self): - F = self.F logit = self.logit prob = self.prob - return -(logit * (prob - 1) - F.np.log1p(F.np.exp(-logit))) / prob + return -(logit * (prob - 1) - np.log1p(np.exp(-logit))) / prob diff --git a/python/mxnet/gluon/probability/distributions/gumbel.py b/python/mxnet/gluon/probability/distributions/gumbel.py index 7094a5a0d90a..650359593aef 100644 --- a/python/mxnet/gluon/probability/distributions/gumbel.py +++ b/python/mxnet/gluon/probability/distributions/gumbel.py @@ -24,7 +24,8 @@ from numpy import euler_gamma # Euler-Mascheroni constant from .distribution import Distribution from .constraint import Real, Positive -from .utils import getF, sample_n_shape_converter +from .utils import sample_n_shape_converter +from .... import np class Gumbel(Distribution): @@ -36,9 +37,6 @@ class Gumbel(Distribution): Location parameter of the distribution. scale : Tensor or scalar, default 1 Scale parameter of the distribution - F : mx.ndarray or mx.symbol.numpy._Symbol or None - Variable recording running mode, will be automatically - inferred from parameters if declared None. 
""" # pylint: disable=abstract-method @@ -47,26 +45,24 @@ class Gumbel(Distribution): arg_constraints = {'loc': Real(), 'scale': Positive()} - def __init__(self, loc, scale=1, F=None, validate_args=None): - _F = F if F is not None else getF(loc, scale) + def __init__(self, loc, scale=1, validate_args=None): self.loc = loc self.scale = scale super(Gumbel, self).__init__( - F=_F, event_dim=0, validate_args=validate_args) + event_dim=0, validate_args=validate_args) def log_prob(self, value): if self._validate_args: self._validate_samples(value) - F = self.F # Standardized sample y = (self.loc - value) / self.scale - return (y - F.np.exp(y)) - F.np.log(self.scale) + return (y - np.exp(y)) - np.log(self.scale) def broadcast_to(self, batch_shape): new_instance = self.__new__(type(self)) F = self.F - new_instance.loc = F.np.broadcast_to(self.loc, batch_shape) - new_instance.scale = F.np.broadcast_to(self.scale, batch_shape) + new_instance.loc = np.broadcast_to(self.loc, batch_shape) + new_instance.scale = np.broadcast_to(self.scale, batch_shape) super(Gumbel, new_instance).__init__(F=F, event_dim=self.event_dim, validate_args=False) @@ -76,21 +72,19 @@ def broadcast_to(self, batch_shape): def cdf(self, value): if self._validate_args: self._validate_samples(value) - F = self.F y = (value - self.loc) / self.scale - exp_fn = F.np.exp + exp_fn = np.exp return exp_fn(-exp_fn(-y)) def icdf(self, value): - F = self.F - log_fn = F.np.log + log_fn = np.log return self.loc + self.scale * (-log_fn(-log_fn(value))) def sample(self, size=None): - return self.F.np.random.gumbel(self.loc, self.scale, size) + return np.random.gumbel(self.loc, self.scale, size) def sample_n(self, size=None): - return self.F.np.random.gumbel(self.loc, self.scale, sample_n_shape_converter(size)) + return np.random.gumbel(self.loc, self.scale, sample_n_shape_converter(size)) @property def mean(self): @@ -105,5 +99,4 @@ def variance(self): return self.stddev ** 2 def entropy(self): - F = self.F - return F.np.log(self.scale) + (1 + euler_gamma) + return np.log(self.scale) + (1 + euler_gamma) diff --git a/python/mxnet/gluon/probability/distributions/half_cauchy.py b/python/mxnet/gluon/probability/distributions/half_cauchy.py index a39236b81784..97ebc863ceff 100644 --- a/python/mxnet/gluon/probability/distributions/half_cauchy.py +++ b/python/mxnet/gluon/probability/distributions/half_cauchy.py @@ -26,6 +26,7 @@ from ..transformation import AbsTransform from .cauchy import Cauchy from .constraint import Positive +from .... import np class HalfCauchy(TransformedDistribution): @@ -37,9 +38,6 @@ class HalfCauchy(TransformedDistribution): ---------- scale : Tensor or scalar, default 1 Scale of the full Cauchy distribution. - F : mx.ndarray or mx.symbol.numpy._Symbol or None - Variable recording running mode, will be automatically - inferred from parameters if declared None. 
""" # pylint: disable=abstract-method @@ -47,8 +45,8 @@ class HalfCauchy(TransformedDistribution): support = Positive() arg_constraints = {'scale': Positive()} - def __init__(self, scale=1.0, F=None, validate_args=None): - base_dist = Cauchy(0, scale, F) + def __init__(self, scale=1.0, validate_args=None): + base_dist = Cauchy(0, scale) self.scale = scale super(HalfCauchy, self).__init__( base_dist, AbsTransform(), validate_args=validate_args) @@ -57,7 +55,7 @@ def log_prob(self, value): if self._validate_args: self._validate_samples(value) log_prob = self._base_dist.log_prob(value) + math.log(2) - log_prob = self.F.np.where(value < 0, -inf, log_prob) + log_prob = np.where(value < 0, -inf, log_prob) return log_prob def cdf(self, value): @@ -77,5 +75,4 @@ def mean(self): @property def variance(self): - pow_fn = self.F.np.power - return pow_fn(self.scale, 2) * (1 - 2 / math.pi) + return np.power(self.scale, 2) * (1 - 2 / math.pi) diff --git a/python/mxnet/gluon/probability/distributions/half_normal.py b/python/mxnet/gluon/probability/distributions/half_normal.py index 7e93b7b5837d..e8e1130ac15e 100644 --- a/python/mxnet/gluon/probability/distributions/half_normal.py +++ b/python/mxnet/gluon/probability/distributions/half_normal.py @@ -26,6 +26,7 @@ from ..transformation import AbsTransform from .normal import Normal from .constraint import Positive +from .... import np class HalfNormal(TransformedDistribution): @@ -37,9 +38,6 @@ class HalfNormal(TransformedDistribution): ---------- scale : Tensor or scalar, default 1 Scale of the full Normal distribution. - F : mx.ndarray or mx.symbol.numpy._Symbol or None - Variable recording running mode, will be automatically - inferred from parameters if declared None. """ # pylint: disable=abstract-method @@ -47,8 +45,8 @@ class HalfNormal(TransformedDistribution): support = Positive() arg_constraints = {'scale': Positive()} - def __init__(self, scale=1.0, F=None, validate_args=None): - base_dist = Normal(0, scale, F) + def __init__(self, scale=1.0, validate_args=None): + base_dist = Normal(0, scale) self.scale = scale super(HalfNormal, self).__init__( base_dist, AbsTransform(), validate_args=validate_args) @@ -57,7 +55,7 @@ def log_prob(self, value): if self._validate_args: self._validate_samples(value) log_prob = self._base_dist.log_prob(value) + math.log(2) - log_prob = self.F.np.where(value < 0, -inf, log_prob) + log_prob = np.where(value < 0, -inf, log_prob) return log_prob def cdf(self, value): @@ -78,5 +76,4 @@ def mean(self): @property def variance(self): - pow_fn = self.F.np.power - return pow_fn(self.scale, 2) * (1 - 2 / math.pi) + return np.power(self.scale, 2) * (1 - 2 / math.pi) diff --git a/python/mxnet/gluon/probability/distributions/independent.py b/python/mxnet/gluon/probability/distributions/independent.py index 25c846d656cc..797f0a759108 100644 --- a/python/mxnet/gluon/probability/distributions/independent.py +++ b/python/mxnet/gluon/probability/distributions/independent.py @@ -38,19 +38,17 @@ def __init__(self, base_distribution, reinterpreted_batch_ndims, validate_args=N event_dim = reinterpreted_batch_ndims + base_distribution.event_dim self.base_dist = base_distribution self.reinterpreted_batch_ndims = reinterpreted_batch_ndims - super(Independent, self).__init__(F=base_distribution.F, - event_dim=event_dim, + super(Independent, self).__init__(event_dim=event_dim, validate_args=validate_args) def broadcast_to(self, batch_shape): new_instance = self.__new__(type(self)) - F = self.F # we use -2 to copy the sizes of reinterpreted 
batch dimensions reinterpreted_axes = (-2,) * self.reinterpreted_batch_ndims new_instance.base_dist = self.base_dist.broadcast_to( batch_shape + reinterpreted_axes) new_instance.reinterpreted_batch_ndims = self.reinterpreted_batch_ndims - super(Independent, new_instance).__init__(F=F, event_dim=self.event_dim, + super(Independent, new_instance).__init__(event_dim=self.event_dim, validate_args=False) new_instance._validate_args = self._validate_args return new_instance diff --git a/python/mxnet/gluon/probability/distributions/laplace.py b/python/mxnet/gluon/probability/distributions/laplace.py index 1bc88e94017c..e17c297bc991 100644 --- a/python/mxnet/gluon/probability/distributions/laplace.py +++ b/python/mxnet/gluon/probability/distributions/laplace.py @@ -22,7 +22,8 @@ from .constraint import Real, Positive from .distribution import Distribution -from .utils import getF, sample_n_shape_converter +from .utils import sample_n_shape_converter +from .... import np class Laplace(Distribution): @@ -34,10 +35,6 @@ class Laplace(Distribution): mean of the distribution. scale : Tensor or scalar, default 1 scale of the distribution - F : mx.ndarray or mx.symbol.numpy._Symbol or None - Variable recording running mode, will be automatically - inferred from parameters if declared None. - """ # pylint: disable=abstract-method @@ -45,12 +42,11 @@ class Laplace(Distribution): support = Real() arg_constraints = {'loc': Real(), 'scale': Positive()} - def __init__(self, loc=0.0, scale=1.0, F=None, validate_args=None): - _F = F if F is not None else getF(loc, scale) + def __init__(self, loc=0.0, scale=1.0, validate_args=None): self.loc = loc self.scale = scale super(Laplace, self).__init__( - F=_F, event_dim=0, validate_args=validate_args) + event_dim=0, validate_args=validate_args) def log_prob(self, value): """Compute the log likelihood of `value`. @@ -67,8 +63,7 @@ def log_prob(self, value): """ if self._validate_args: self._validate_samples(value) - F = self.F - return -F.np.log(2 * self.scale) - F.np.abs(value - self.loc) / self.scale + return -np.log(2 * self.scale) - np.abs(value - self.loc) / self.scale def sample(self, size=None): r"""Generate samples of `size` from the normal distribution @@ -85,7 +80,7 @@ def sample(self, size=None): Tensor Samples from Normal distribution. """ - return self.F.np.random.laplace(self.loc, self.scale, size) + return np.random.laplace(self.loc, self.scale, size) def sample_n(self, size=None): r"""Generate samples of (batch_size + broadcast(loc, scale).shape) @@ -101,15 +96,13 @@ def sample_n(self, size=None): Tensor Samples from Normal distribution. 
""" - return self.F.np.random.laplace(self.loc, self.scale, sample_n_shape_converter(size)) + return np.random.laplace(self.loc, self.scale, sample_n_shape_converter(size)) def broadcast_to(self, batch_shape): new_instance = self.__new__(type(self)) - F = self.F - new_instance.loc = F.np.broadcast_to(self.loc, batch_shape) - new_instance.scale = F.np.broadcast_to(self.scale, batch_shape) - super(Laplace, new_instance).__init__(F=F, - event_dim=self.event_dim, + new_instance.loc = np.broadcast_to(self.loc, batch_shape) + new_instance.scale = np.broadcast_to(self.scale, batch_shape) + super(Laplace, new_instance).__init__(event_dim=self.event_dim, validate_args=False) new_instance._validate_args = self._validate_args return new_instance @@ -117,14 +110,12 @@ def broadcast_to(self, batch_shape): def cdf(self, value): if self._validate_args: self._validate_samples(value) - F = self.F value = value - self.loc - return 0.5 - 0.5 * F.np.sign(value) * F.np.expm1(-F.np.abs(value) / self.scale) + return 0.5 - 0.5 * np.sign(value) * np.expm1(-np.abs(value) / self.scale) def icdf(self, value): - F = self.F value = value - 0.5 - return self.loc - self.scale * F.np.sign(value) * F.np.log1p(-2 * F.np.abs(value)) + return self.loc - self.scale * np.sign(value) * np.log1p(-2 * np.abs(value)) @property def mean(self): @@ -139,5 +130,4 @@ def variance(self): return 2 * (self.scale ** 2) def entropy(self): - F = self.F - return 1 + F.np.log(2 * self.scale) + return 1 + np.log(2 * self.scale) diff --git a/python/mxnet/gluon/probability/distributions/multinomial.py b/python/mxnet/gluon/probability/distributions/multinomial.py index 875125eb3e02..419d9836b0d5 100644 --- a/python/mxnet/gluon/probability/distributions/multinomial.py +++ b/python/mxnet/gluon/probability/distributions/multinomial.py @@ -23,7 +23,7 @@ from numbers import Number from .distribution import Distribution from .one_hot_categorical import OneHotCategorical -from .utils import getF, cached_property, logit2prob, prob2logit, gammaln +from .utils import cached_property, logit2prob, prob2logit, gammaln from .constraint import Simplex, Real, IntegerInterval @@ -40,17 +40,13 @@ class Multinomial(Distribution): unnormalized probability of each event. total_count : int number of trials. - F : mx.ndarray or mx.symbol.numpy._Symbol or None - Variable recording running mode, will be automatically - inferred from parameters if declared None. 
""" # pylint: disable=abstract-method arg_constraints = {'prob': Simplex(), 'logit': Real()} def __init__(self, num_events, - prob=None, logit=None, total_count=1, F=None, validate_args=None): - _F = F if F is not None else getF(prob, logit) + prob=None, logit=None, total_count=1, validate_args=None): if not isinstance(total_count, Number): raise ValueError("Expect `total_conut` to be scalar value") self.total_count = total_count @@ -63,9 +59,9 @@ def __init__(self, num_events, else: self.logit = logit self._categorical = OneHotCategorical( - num_events, prob, logit, F, validate_args) + num_events, prob, logit, validate_args) super(Multinomial, self).__init__( - F=_F, event_dim=1, validate_args=validate_args) + event_dim=1, validate_args=validate_args) @property def mean(self): @@ -78,12 +74,12 @@ def variance(self): @cached_property def prob(self): # pylint: disable=method-hidden - return logit2prob(self.logit, False, self.F) + return logit2prob(self.logit, False) @cached_property def logit(self): # pylint: disable=method-hidden - return prob2logit(self.prob, False, self.F) + return prob2logit(self.prob, False) @property def support(self): @@ -105,8 +101,7 @@ def sample_n(self, size=None): def log_prob(self, value): if self._validate_args: self._validate_samples(value) - F = self.F - lgamma = gammaln(F) + lgamma = gammaln() log_factorial_n = lgamma(value.sum(-1) + 1) log_factorial_x = lgamma(value + 1).sum(-1) log_power = (self.logit * value).sum(-1) @@ -114,12 +109,10 @@ def log_prob(self, value): def broadcast_to(self, batch_shape): new_instance = self.__new__(type(self)) - F = self.F new_instance._categorical = self._categorical.broadcast_to(batch_shape) new_instance.num_events = self.num_events new_instance.total_conut = self.total_count - super(Multinomial, new_instance).__init__(F=F, - event_dim=self.event_dim, + super(Multinomial, new_instance).__init__(event_dim=self.event_dim, validate_args=False) new_instance._validate_args = self._validate_args return new_instance diff --git a/python/mxnet/gluon/probability/distributions/multivariate_normal.py b/python/mxnet/gluon/probability/distributions/multivariate_normal.py index 1eaa41449261..40e7c4c248d4 100644 --- a/python/mxnet/gluon/probability/distributions/multivariate_normal.py +++ b/python/mxnet/gluon/probability/distributions/multivariate_normal.py @@ -23,7 +23,8 @@ import math from .distribution import Distribution from .constraint import Real, PositiveDefinite, LowerCholesky -from .utils import getF, cached_property +from .utils import cached_property +from .... import np class MultivariateNormal(Distribution): @@ -39,9 +40,6 @@ class MultivariateNormal(Distribution): precision matrix of the distribution scale_tril : Tensor lower-triangular factor of the covariance - F : mx.ndarray or mx.symbol.numpy._Symbol or None - Variable recording running mode, will be automatically - inferred from parameters if declared None. 
""" # pylint: disable=abstract-method @@ -52,11 +50,10 @@ class MultivariateNormal(Distribution): 'precision': PositiveDefinite(), 'scale_tril': LowerCholesky()} - def __init__(self, loc, cov=None, precision=None, scale_tril=None, F=None, validate_args=None): + def __init__(self, loc, cov=None, precision=None, scale_tril=None, validate_args=None): if (cov is not None) + (precision is not None) + (scale_tril is not None) != 1: raise ValueError("Exactly one onf `cov` or `precision` or " + "`scale_tril` may be specified") - _F = F if F is not None else getF(cov, precision, scale_tril) self.loc = loc if cov is not None: self.cov = cov @@ -65,7 +62,7 @@ def __init__(self, loc, cov=None, precision=None, scale_tril=None, F=None, valid else: self.scale_tril = scale_tril super(MultivariateNormal, self).__init__( - F=_F, event_dim=1, validate_args=validate_args) + event_dim=1, validate_args=validate_args) def _precision_to_scale_tril(self, P): """ @@ -74,38 +71,34 @@ def _precision_to_scale_tril(self, P): flip(inv(L.T)) = Cholesky(flip(P)) L = flip(Cholesky(flip(P))).T """ - F = self.F - L_flip_inv_T = F.np.linalg.cholesky(F.np.flip(P, (-1, -2))) - L = F.np.linalg.inv(F.np.swapaxes( - F.np.flip(L_flip_inv_T, (-1, -2)), -1, -2)) + L_flip_inv_T = np.linalg.cholesky(np.flip(P, (-1, -2))) + L = np.linalg.inv(np.swapaxes( + np.flip(L_flip_inv_T, (-1, -2)), -1, -2)) return L @cached_property def scale_tril(self): # pylint: disable=method-hidden - F = self.F if 'cov' in self.__dict__: - return F.np.linalg.cholesky(self.cov) + return np.linalg.cholesky(self.cov) return self._precision_to_scale_tril(self.precision) @cached_property def cov(self): # pylint: disable=method-hidden - F = self.F if 'scale_tril' in self.__dict__: - scale_triu = F.np.swapaxes(self.scale_tril, -1, -2) - return F.np.matmul(self.scale_tril, scale_triu) - return F.np.linalg.inv(self.precision) + scale_triu = np.swapaxes(self.scale_tril, -1, -2) + return np.matmul(self.scale_tril, scale_triu) + return np.linalg.inv(self.precision) @cached_property def precision(self): # pylint: disable=method-hidden - F = self.F if 'cov' in self.__dict__: - return F.np.linalg.inv(self.cov) - scale_tril_inv = F.np.linalg.inv(self.scale_tril) - scale_triu_inv = F.np.swapaxes(scale_tril_inv, -1, -2) - return F.np.matmul(scale_triu_inv, scale_tril_inv) + return np.linalg.inv(self.cov) + scale_tril_inv = np.linalg.inv(self.scale_tril) + scale_triu_inv = np.swapaxes(scale_tril_inv, -1, -2) + return np.matmul(scale_triu_inv, scale_tril_inv) @property def mean(self): @@ -116,59 +109,55 @@ def variance(self): return (self.scale_tril ** 2).sum(-1) def sample(self, size=None): - F = self.F # symbol does not support `np.broadcast` shape_tensor = self.loc + self.scale_tril.sum(-1) if size is not None: if isinstance(size, int): size = (size,) - shape_tensor = F.np.broadcast_to(shape_tensor, size + (-2,)) - noise = F.np.random.normal(F.np.zeros_like( - shape_tensor), F.np.ones_like(shape_tensor)) + shape_tensor = np.broadcast_to(shape_tensor, size + (-2,)) + noise = np.random.normal(np.zeros_like( + shape_tensor), np.ones_like(shape_tensor)) samples = self.loc + \ - F.np.einsum('...jk,...j->...k', self.scale_tril, noise) + np.einsum('...jk,...j->...k', self.scale_tril, noise) return samples def sample_n(self, size=None): if size is None: return self.sample() - F = self.F # symbol does not support `np.broadcast` shape_tensor = self.loc + self.scale_tril[..., 0] if isinstance(size, int): size = (size,) - noise = F.np.random.normal(F.np.zeros_like(shape_tensor), 
F.np.ones_like(shape_tensor), - (-2,) + size) + noise = np.random.normal(np.zeros_like(shape_tensor), np.ones_like(shape_tensor), + (-2,) + size) samples = self.loc + \ - F.np.einsum('...jk,...j->...k', self.scale_tril, noise) + np.einsum('...jk,...j->...k', self.scale_tril, noise) return samples def log_prob(self, value): if self._validate_args: self._validate_samples(value) - F = self.F diff = value - self.loc # diff.T * inv(\Sigma) * diff - M = F.np.einsum( + M = np.einsum( '...i,...i->...', diff, - F.np.einsum('...jk,...j->...k', self.precision, - diff) # Batch matrix vector multiply + np.einsum('...jk,...j->...k', self.precision, + diff) # Batch matrix vector multiply ) * -0.5 # (2 * \pi)^{-k/2} * det(\Sigma)^{-1/2} # = det(2 * \pi * L * L.T)^{-1/2} # = det(\sqrt(2 * \pi) * L)^{-1} - half_log_det = F.np.log( - F.np.diagonal(F.np.sqrt(2 * math.pi) * - self.scale_tril, axis1=-2, axis2=-1) + half_log_det = np.log( + np.diagonal(np.sqrt(2 * math.pi) * + self.scale_tril, axis1=-2, axis2=-1) ).sum(-1) return M - half_log_det def entropy(self): - F = self.F # det(2 * \pi * e * \Sigma) # = det(\sqrt(2 * \pi * e) * L)^2 - return F.np.log(F.np.diagonal( - F.np.sqrt(2 * math.pi * math.e) * self.scale_tril, + return np.log(np.diagonal( + np.sqrt(2 * math.pi * math.e) * self.scale_tril, axis1=-2, axis2=-1 )).sum(-1) diff --git a/python/mxnet/gluon/probability/distributions/negative_binomial.py b/python/mxnet/gluon/probability/distributions/negative_binomial.py index d360d48f4d61..634908bf3fc9 100644 --- a/python/mxnet/gluon/probability/distributions/negative_binomial.py +++ b/python/mxnet/gluon/probability/distributions/negative_binomial.py @@ -23,9 +23,10 @@ from .distribution import Distribution from .poisson import Poisson from .gamma import Gamma -from .utils import prob2logit, logit2prob, getF, cached_property +from .utils import prob2logit, logit2prob, cached_property from .utils import gammaln from .constraint import GreaterThanEq, Interval, Real, NonNegativeInteger +from .... import np class NegativeBinomial(Distribution): @@ -39,9 +40,6 @@ class NegativeBinomial(Distribution): Probability of sampling `1`. logit : Tensor or scalar, default None The log-odds of sampling `1`. - F : mx.ndarray or mx.symbol.numpy._Symbol or None - Variable recording running mode, will be automatically - inferred from parameters if declared None. """ # pylint: disable=abstract-method @@ -50,8 +48,7 @@ class NegativeBinomial(Distribution): 'prob': Interval(0, 1), 'logit': Real()} - def __init__(self, n, prob=None, logit=None, F=None, validate_args=None): - _F = F if F is not None else getF(n, prob, logit) + def __init__(self, n, prob=None, logit=None, validate_args=None): if (prob is None) == (logit is None): raise ValueError( "Either `prob` or `logit` must be specified, but not both. " + @@ -63,7 +60,7 @@ def __init__(self, n, prob=None, logit=None, F=None, validate_args=None): self.logit = logit self.n = n super(NegativeBinomial, self).__init__( - F=_F, event_dim=0, validate_args=validate_args) + event_dim=0, validate_args=validate_args) @cached_property def prob(self): @@ -75,7 +72,7 @@ def prob(self): Parameter tensor. """ # pylint: disable=method-hidden - return logit2prob(self.logit, True, self.F) + return logit2prob(self.logit, True) @cached_property def logit(self): @@ -87,12 +84,11 @@ def logit(self): Parameter tensor. 
""" # pylint: disable=method-hidden - return prob2logit(self.prob, True, self.F) + return prob2logit(self.prob, True) @property def mean(self): - F = self.F - return self.n * F.np.exp(self.logit) + return self.n * np.exp(self.logit) @property def variance(self): @@ -101,14 +97,12 @@ def variance(self): def broadcast_to(self, batch_shape): new_instance = self.__new__(type(self)) - F = self.F if 'prob' in self.__dict__: - new_instance.prob = F.np.broadcast_to(self.prob, batch_shape) + new_instance.prob = np.broadcast_to(self.prob, batch_shape) else: - new_instance.logit = F.np.broadcast_to(self.logit, batch_shape) - new_instance.n = F.np.broadcast_to(self.n, batch_shape) - super(NegativeBinomial, new_instance).__init__(F=F, - event_dim=self.event_dim, + new_instance.logit = np.broadcast_to(self.logit, batch_shape) + new_instance.n = np.broadcast_to(self.n, batch_shape) + super(NegativeBinomial, new_instance).__init__(event_dim=self.event_dim, validate_args=False) new_instance._validate_args = self._validate_args return new_instance @@ -116,25 +110,22 @@ def broadcast_to(self, batch_shape): def log_prob(self, value): if self._validate_args: self._validate_samples(value) - F = self.F - lgamma = gammaln(F) + lgamma = gammaln() binomal_coef = lgamma(value + self.n) - \ lgamma(1 + value) - lgamma(self.n) # log(prob) may have numerical issue. unnormalized_log_prob = self.n * \ - F.np.log(self.prob) + value * F.np.log1p(-self.prob) + np.log(self.prob) + value * np.log1p(-self.prob) return binomal_coef + unnormalized_log_prob def sample(self, size=None): - F = self.F # Sample via Poisson-Gamma mixture - rate = Gamma(shape=self.n, scale=F.np.exp( - self.logit), F=F).sample(size) - return Poisson(rate, F=F).sample() + rate = Gamma(shape=self.n, scale=np.exp( + self.logit)).sample(size) + return Poisson(rate).sample() def sample_n(self, size=None): - F = self.F # Sample via Poisson-Gamma mixture - rate = Gamma(shape=self.n, scale=F.np.exp( - self.logit), F=F).sample_n(size) - return Poisson(rate, F=F).sample() + rate = Gamma(shape=self.n, scale=np.exp( + self.logit)).sample_n(size) + return Poisson(rate).sample() diff --git a/python/mxnet/gluon/probability/distributions/normal.py b/python/mxnet/gluon/probability/distributions/normal.py index d0f1b1fbb8b0..9bc3615e1db8 100644 --- a/python/mxnet/gluon/probability/distributions/normal.py +++ b/python/mxnet/gluon/probability/distributions/normal.py @@ -23,7 +23,8 @@ import math from .constraint import Real, Positive from .exp_family import ExponentialFamily -from .utils import getF, erf, erfinv +from .utils import erf, erfinv +from .... import np, npx class Normal(ExponentialFamily): @@ -35,9 +36,6 @@ class Normal(ExponentialFamily): mean of the distribution. scale : Tensor or scalar, default 1 standard deviation of the distribution - F : mx.ndarray or mx.symbol.numpy._Symbol or None - Variable recording running mode, will be automatically - inferred from parameters if declared None. """ # pylint: disable=abstract-method @@ -45,12 +43,11 @@ class Normal(ExponentialFamily): support = Real() arg_constraints = {'loc': Real(), 'scale': Positive()} - def __init__(self, loc=0.0, scale=1.0, F=None, validate_args=None): - _F = F if F is not None else getF(loc, scale) + def __init__(self, loc=0.0, scale=1.0, validate_args=None): self.loc = loc self.scale = scale super(Normal, self).__init__( - F=_F, event_dim=0, validate_args=validate_args) + event_dim=0, validate_args=validate_args) def log_prob(self, value): """Compute the log likelihood of `value`. 
@@ -67,11 +64,10 @@ def log_prob(self, value): """ if self._validate_args: self._validate_samples(value) - F = self.F - log_scale = F.np.log(self.scale) + log_scale = np.log(self.scale) log_prob = -((value - self.loc) ** 2) / (2 * self.variance) log_prob = log_prob - log_scale - log_prob = log_prob - F.np.log(F.np.sqrt(2 * math.pi)) + log_prob = log_prob - np.log(np.sqrt(2 * math.pi)) return log_prob def sample(self, size=None): @@ -89,7 +85,7 @@ def sample(self, size=None): Tensor Samples from Normal distribution. """ - return self.F.np.random.normal(self.loc, self.scale, size) + return np.random.normal(self.loc, self.scale, size) def sample_n(self, size=None): r"""Generate samples of (batch_size + broadcast(loc, scale).shape) @@ -105,15 +101,13 @@ def sample_n(self, size=None): Tensor Samples from Normal distribution. """ - return self.F.npx.random.normal_n(self.loc, self.scale, size) + return npx.random.normal_n(self.loc, self.scale, size) def broadcast_to(self, batch_shape): new_instance = self.__new__(type(self)) - F = self.F - new_instance.loc = F.np.broadcast_to(self.loc, batch_shape) - new_instance.scale = F.np.broadcast_to(self.scale, batch_shape) - super(Normal, new_instance).__init__(F=F, - event_dim=self.event_dim, + new_instance.loc = np.broadcast_to(self.loc, batch_shape) + new_instance.scale = np.broadcast_to(self.scale, batch_shape) + super(Normal, new_instance).__init__(event_dim=self.event_dim, validate_args=False) new_instance._validate_args = self._validate_args return new_instance @@ -121,14 +115,14 @@ def broadcast_to(self, batch_shape): def cdf(self, value): if self._validate_args: self._validate_samples(value) - erf_func = erf(self.F) + erf_func = erf() standarized_samples = ((value - self.loc) / (math.sqrt(2) * self.scale)) erf_term = erf_func(standarized_samples) return 0.5 * (1 + erf_term) def icdf(self, value): - erfinv_func = erfinv(self.F) + erfinv_func = erfinv() return self.loc + self.scale * erfinv_func(2 * value - 1) * math.sqrt(2) @property @@ -144,8 +138,7 @@ def variance(self): return self.scale ** 2 def entropy(self): - F = self.F - return 0.5 + 0.5 * math.log(2 * math.pi) + F.np.log(self.scale) + return 0.5 + 0.5 * math.log(2 * math.pi) + np.log(self.scale) @property def _natural_params(self): @@ -158,9 +151,8 @@ def _natural_params(self): Natural parameters of normal distribution. """ return (self.loc / (self.scale ** 2), - -0.5 * self.F.np.reciprocal(self.scale ** 2)) + -0.5 * np.reciprocal(self.scale ** 2)) def _log_normalizer(self, x, y): # pylint: disable=arguments-differ - F = self.F - return -0.25 * F.np.pow(x, 2) / y + 0.5 * F.np.log(-math.pi / y) + return -0.25 * np.pow(x, 2) / y + 0.5 * np.log(-math.pi / y) diff --git a/python/mxnet/gluon/probability/distributions/one_hot_categorical.py b/python/mxnet/gluon/probability/distributions/one_hot_categorical.py index 8729cd81b3a1..828fd2f2b917 100644 --- a/python/mxnet/gluon/probability/distributions/one_hot_categorical.py +++ b/python/mxnet/gluon/probability/distributions/one_hot_categorical.py @@ -22,8 +22,9 @@ from .distribution import Distribution from .categorical import Categorical -from .utils import getF, cached_property +from .utils import cached_property from .constraint import Simplex, Real +from .... import npx class OneHotCategorical(Distribution): @@ -37,16 +38,12 @@ class OneHotCategorical(Distribution): Probabilities of each event. 
logit : Tensor The log-odds of each event - F : mx.ndarray or mx.symbol.numpy._Symbol or None - Variable recording running mode, will be automatically - inferred from parameters if declared None. """ # pylint: disable=abstract-method arg_constraints = {'prob': Simplex(), 'logit': Real()} - def __init__(self, num_events, prob=None, logit=None, F=None, validate_args=None): - _F = F if F is not None else getF(prob, logit) + def __init__(self, num_events, prob=None, logit=None, validate_args=None): if (num_events > 0): num_events = int(num_events) self.num_events = num_events @@ -54,9 +51,9 @@ def __init__(self, num_events, prob=None, logit=None, F=None, validate_args=None raise ValueError("`num_events` should be greater than zero. " + "Received num_events={}".format(num_events)) self._categorical = Categorical( - num_events, prob, logit, _F, validate_args) + num_events, prob, logit, validate_args) super(OneHotCategorical, self).__init__( - _F, event_dim=1, validate_args=validate_args) + event_dim=1, validate_args=validate_args) @cached_property def prob(self): @@ -77,11 +74,11 @@ def variance(self): def sample(self, size=None): indices = self._categorical.sample(size) - return self.F.npx.one_hot(indices, self.num_events) + return npx.one_hot(indices, self.num_events) def sample_n(self, size=None): indices = self._categorical.sample_n(size) - return self.F.npx.one_hot(indices, self.num_events) + return npx.one_hot(indices, self.num_events) def log_prob(self, value): if self._validate_args: @@ -91,15 +88,13 @@ def log_prob(self, value): def broadcast_to(self, batch_shape): new_instance = self.__new__(type(self)) - F = self.F new_instance._categorical = self._categorical.broadcast_to(batch_shape) new_instance.num_events = self.num_events - super(OneHotCategorical, new_instance).__init__(F=F, - event_dim=self.event_dim, + super(OneHotCategorical, new_instance).__init__(event_dim=self.event_dim, validate_args=False) new_instance._validate_args = self._validate_args return new_instance def enumerate_support(self): value = self._categorical.enumerate_support() - return self.F.npx.one_hot(value, self.num_events) + return npx.one_hot(value, self.num_events) diff --git a/python/mxnet/gluon/probability/distributions/pareto.py b/python/mxnet/gluon/probability/distributions/pareto.py index 309d49dce2ed..4369e4bf7e4c 100644 --- a/python/mxnet/gluon/probability/distributions/pareto.py +++ b/python/mxnet/gluon/probability/distributions/pareto.py @@ -24,7 +24,8 @@ from .exponential import Exponential from .constraint import Positive, dependent_property, GreaterThan from ..transformation import ExpTransform, AffineTransform -from .utils import getF, sample_n_shape_converter +from .utils import sample_n_shape_converter +from .... import np class Pareto(TransformedDistribution): @@ -36,9 +37,6 @@ class Pareto(TransformedDistribution): shape parameter of the distribution. scale : Tensor or scalar, default 1 scale parameter of the distribution. - F : mx.ndarray or mx.symbol.numpy._Symbol or None - Variable recording running mode, will be automatically - inferred from parameters if declared None. 
""" # pylint: disable=abstract-method @@ -46,8 +44,7 @@ class Pareto(TransformedDistribution): arg_constraints = {'scale': Positive(), 'alpha': Positive()} - def __init__(self, alpha, scale=1.0, F=None, validate_args=None): - _F = F if F is not None else getF(alpha, scale) + def __init__(self, alpha, scale=1.0, validate_args=None): self.alpha = alpha self.scale = scale base_dist = Exponential(1 / self.alpha) @@ -55,12 +52,10 @@ def __init__(self, alpha, scale=1.0, F=None, validate_args=None): ExpTransform(), AffineTransform(0, self.scale)]) def sample(self, size=None): - F = self.F - return self.scale * (F.np.random.pareto(self.alpha, size) + 1) + return self.scale * (np.random.pareto(self.alpha, size) + 1) def sample_n(self, size=None): - F = self.F - return self.scale * (F.np.random.pareto(self.alpha, sample_n_shape_converter(size)) + 1) + return self.scale * (np.random.pareto(self.alpha, sample_n_shape_converter(size)) + 1) @dependent_property def support(self): @@ -68,16 +63,13 @@ def support(self): @property def mean(self): - F = self.F - a = F.np.clip(self.alpha, min=1) + a = np.clip(self.alpha, 1, None) return a * self.scale / (a - 1) @property def variance(self): - F = self.F - a = F.np.clip(self.alpha, min=2) + a = np.clip(self.alpha, 2, None) return (self.scale ** 2) * a / ((a - 1) ** 2 * (a - 2)) def entropy(self): - F = self.F - return F.np.log(self.scale / self.alpha) + 1 / self.alpha + 1 + return np.log(self.scale / self.alpha) + 1 / self.alpha + 1 diff --git a/python/mxnet/gluon/probability/distributions/poisson.py b/python/mxnet/gluon/probability/distributions/poisson.py index ff32379424eb..04fb2e163ba0 100644 --- a/python/mxnet/gluon/probability/distributions/poisson.py +++ b/python/mxnet/gluon/probability/distributions/poisson.py @@ -23,7 +23,8 @@ from numbers import Number from .exp_family import ExponentialFamily from .constraint import Positive, NonNegativeInteger -from .utils import getF, gammaln +from .utils import gammaln +from .... import np, npx class Poisson(ExponentialFamily): @@ -33,20 +34,16 @@ class Poisson(ExponentialFamily): ---------- rate : Tensor or scalar, default 1 rate parameter of the distribution. - F : mx.ndarray or mx.symbol.numpy._Symbol or None - Variable recording running mode, will be automatically - inferred from parameters if declared None. 
""" # pylint: disable=abstract-method arg_constraints = {'rate': Positive()} support = NonNegativeInteger() - def __init__(self, rate=1.0, F=None, validate_args=None): - _F = F if F is not None else getF(rate) + def __init__(self, rate=1.0, validate_args=None): self.rate = rate super(Poisson, self).__init__( - F=_F, event_dim=0, validate_args=validate_args) + event_dim=0, validate_args=validate_args) @property def mean(self): @@ -58,53 +55,46 @@ def variance(self): def broadcast_to(self, batch_shape): new_instance = self.__new__(type(self)) - F = self.F - new_instance.rate = F.np.broadcast_to(self.rate, batch_shape) - super(Poisson, new_instance).__init__(F=F, - event_dim=self.event_dim, + new_instance.rate = np.broadcast_to(self.rate, batch_shape) + super(Poisson, new_instance).__init__(event_dim=self.event_dim, validate_args=False) new_instance._validate_args = self._validate_args return new_instance def sample(self, size=None): - F = self.F lam = self.rate if size is None: size = () if isinstance(lam, Number): # Scalar case - return F.npx.scalar_poisson(lam, size) + return npx.scalar_poisson(lam, size) else: # Tensor case - shape_tensor = F.np.ones(size) + shape_tensor = np.ones(size) # shape = () currently not supported - return F.npx.tensor_poisson(lam * shape_tensor) + return npx.tensor_poisson(lam * shape_tensor) def sample_n(self, size=None): - F = self.F lam = self.rate if isinstance(lam, Number): # Scalar case if size is None: size = () - return F.npx.scalar_poisson(lam, size) + return npx.scalar_poisson(lam, size) else: - return F.np.moveaxis(F.npx.tensor_poisson(lam, size), -1, 0) + return np.moveaxis(npx.tensor_poisson(lam, size), -1, 0) def log_prob(self, value): if self._validate_args: self._validate_samples(value) - F = self.F - lgamma = gammaln(F) + lgamma = gammaln() rate = self.rate - return value * F.np.log(rate) - rate - lgamma(value + 1) + return value * np.log(rate) - rate - lgamma(value + 1) @property def _natural_params(self): - F = self.F - return (F.np.log(self.rate),) + return (np.log(self.rate),) def _log_normalizer(self, x): # pylint: disable=arguments-differ - F = self.F - return F.np.exp(x) + return np.exp(x) diff --git a/python/mxnet/gluon/probability/distributions/relaxed_bernoulli.py b/python/mxnet/gluon/probability/distributions/relaxed_bernoulli.py index faae9aed0cd4..2733278be0a8 100644 --- a/python/mxnet/gluon/probability/distributions/relaxed_bernoulli.py +++ b/python/mxnet/gluon/probability/distributions/relaxed_bernoulli.py @@ -23,8 +23,9 @@ from .distribution import Distribution from .transformed_distribution import TransformedDistribution from ..transformation import SigmoidTransform -from .utils import prob2logit, logit2prob, getF, cached_property +from .utils import prob2logit, logit2prob, cached_property from .constraint import OpenInterval, Real, Interval +from .... import np class _LogitRelaxedBernoulli(Distribution): @@ -38,9 +39,6 @@ class _LogitRelaxedBernoulli(Distribution): Probability of sampling `1`. logit : Tensor or scalar, default None The log-odds of sampling `1`. - F : mx.ndarray or mx.symbol.numpy._Symbol or None - Variable recording running mode, will be automatically - inferred from parameters if declared None. 
""" # pylint: disable=abstract-method @@ -49,8 +47,7 @@ class _LogitRelaxedBernoulli(Distribution): arg_constraints = {'prob': Interval(0, 1), 'logit': Real()} - def __init__(self, T, prob=None, logit=None, F=None, validate_args=None): - _F = F if F is not None else getF(prob, logit) + def __init__(self, T, prob=None, logit=None, validate_args=None): self.T = T if (prob is None) == (logit is None): raise ValueError( @@ -61,29 +58,27 @@ def __init__(self, T, prob=None, logit=None, F=None, validate_args=None): else: self.logit = logit super(_LogitRelaxedBernoulli, self).__init__( - F=_F, event_dim=0, validate_args=validate_args + event_dim=0, validate_args=validate_args ) @cached_property def prob(self): # pylint: disable=method-hidden - return logit2prob(self.logit, True, self.F) + return logit2prob(self.logit, True) @cached_property def logit(self): # pylint: disable=method-hidden - return prob2logit(self.prob, True, self.F) + return prob2logit(self.prob, True) def sample(self, size=None): - F = self.F logit = self.logit - return F.np.random.logistic(loc=logit, scale=1, size=size) / self.T + return np.random.logistic(loc=logit, scale=1, size=size) / self.T def log_prob(self, value): - F = self.F # log-likelihood of `value` from (Logistic(logit, 1) / T) diff = self.logit - self.T * value - return F.np.log(self.T) + diff - 2 * F.np.log1p(F.np.exp(diff)) + return np.log(self.T) + diff - 2 * np.log1p(np.exp(diff)) class RelaxedBernoulli(TransformedDistribution): @@ -97,9 +92,6 @@ class RelaxedBernoulli(TransformedDistribution): Probability of sampling `1`. logit : Tensor or scalar, default None The log-odds of sampling `1`. - F : mx.ndarray or mx.symbol.numpy._Symbol or None - Variable recording running mode, will be automatically - inferred from parameters if declared None. 
""" # pylint: disable=abstract-method @@ -108,8 +100,8 @@ class RelaxedBernoulli(TransformedDistribution): arg_constraints = {'prob': Interval(0, 1), 'logit': Real()} - def __init__(self, T, prob=None, logit=None, F=None, validate_args=None): - base_dist = _LogitRelaxedBernoulli(T, prob, logit, F, validate_args) + def __init__(self, T, prob=None, logit=None, validate_args=None): + base_dist = _LogitRelaxedBernoulli(T, prob, logit, validate_args) super(RelaxedBernoulli, self).__init__(base_dist, SigmoidTransform()) @property @@ -126,13 +118,11 @@ def logit(self): def broadcast_to(self, batch_shape): new_instance = self.__new__(type(self)) - F = self.F if 'prob' in self.__dict__: - new_instance.prob = F.np.broadcast_to(self.prob, batch_shape) + new_instance.prob = np.broadcast_to(self.prob, batch_shape) else: - new_instance.logit = F.np.broadcast_to(self.logit, batch_shape) - super(RelaxedBernoulli, new_instance).__init__(F=F, - event_dim=self.event_dim, + new_instance.logit = np.broadcast_to(self.logit, batch_shape) + super(RelaxedBernoulli, new_instance).__init__(event_dim=self.event_dim, validate_args=False) new_instance._validate_args = self._validate_args return new_instance diff --git a/python/mxnet/gluon/probability/distributions/relaxed_one_hot_categorical.py b/python/mxnet/gluon/probability/distributions/relaxed_one_hot_categorical.py index 9d5f172cc865..e86cb3f2b4c0 100644 --- a/python/mxnet/gluon/probability/distributions/relaxed_one_hot_categorical.py +++ b/python/mxnet/gluon/probability/distributions/relaxed_one_hot_categorical.py @@ -24,8 +24,9 @@ from .distribution import Distribution from .transformed_distribution import TransformedDistribution from ..transformation import ExpTransform -from .utils import prob2logit, logit2prob, getF, cached_property +from .utils import prob2logit, logit2prob, cached_property from .constraint import Real, Simplex +from .... import np, npx class _LogRelaxedOneHotCategorical(Distribution): @@ -42,9 +43,6 @@ class _LogRelaxedOneHotCategorical(Distribution): Probabilities of each event. logit : Tensor The log-odds of each event - F : mx.ndarray or mx.symbol.numpy._Symbol or None - Variable recording running mode, will be automatically - inferred from parameters if declared None. """ # pylint: disable=abstract-method @@ -52,9 +50,8 @@ class _LogRelaxedOneHotCategorical(Distribution): arg_constraints = {'prob': Simplex(), 'logit': Real()} - def __init__(self, T, num_events, prob=None, logit=None, F=None, validate_args=None): + def __init__(self, T, num_events, prob=None, logit=None, validate_args=None): self.T = T - _F = F if F is not None else getF(prob, logit) if (num_events > 0): num_events = int(num_events) self.num_events = num_events @@ -72,7 +69,7 @@ def __init__(self, T, num_events, prob=None, logit=None, F=None, validate_args=N self.logit = logit super(_LogRelaxedOneHotCategorical, self).__init__( - _F, event_dim=1, validate_args=validate_args) + event_dim=1, validate_args=validate_args) @cached_property def prob(self): @@ -84,7 +81,7 @@ def prob(self): Parameter tensor. """ # pylint: disable=method-hidden - return logit2prob(self.logit, False, self.F) + return logit2prob(self.logit, False) @cached_property def logit(self): @@ -96,7 +93,7 @@ def logit(self): Parameter tensor. 
""" # pylint: disable=method-hidden - return prob2logit(self.prob, False, self.F) + return prob2logit(self.prob, False) def log_prob(self, value): """Compute the log-likelihood of `value` @@ -111,28 +108,24 @@ def log_prob(self, value): Tensor log-likelihood of `value` """ - F = self.F K = self.num_events # Python scalar - log = F.np.log - exp = F.np.exp logit = self.logit y = logit - value * self.T - log_sum_exp = log(exp(y).sum(-1, keepdims=True) + 1e-20) - log_scale = lgamma(K) - log(self.T) * (-(K - 1)) + log_sum_exp = np.log(np.exp(y).sum(-1, keepdims=True) + 1e-20) + log_scale = lgamma(K) - np.log(self.T) * (-(K - 1)) return (y - log_sum_exp).sum(-1) + log_scale def sample(self, size=None): - F = self.F if size is None: size = () logit = self.logit else: if isinstance(size, int): - logit = F.np.broadcast_to(self.logit, (size) + (-2,)) + logit = np.broadcast_to(self.logit, (size) + (-2,)) else: - logit = F.np.broadcast_to(self.logit, size + (-2,)) - scores = F.np.random.gumbel(logit) / self.T - return F.np.log(F.npx.softmax(scores, axis=-1) + 1e-20) + logit = np.broadcast_to(self.logit, size + (-2,)) + scores = np.random.gumbel(logit) / self.T + return np.log(npx.softmax(scores, axis=-1) + 1e-20) class RelaxedOneHotCategorical(TransformedDistribution): @@ -148,9 +141,6 @@ class RelaxedOneHotCategorical(TransformedDistribution): Probabilities of each event. logit : Tensor The log-odds of each event - F : mx.ndarray or mx.symbol.numpy._Symbol or None - Variable recording running mode, will be automatically - inferred from parameters if declared None. """ # pylint: disable=abstract-method @@ -158,9 +148,9 @@ class RelaxedOneHotCategorical(TransformedDistribution): arg_constraints = {'prob': Simplex(), 'logit': Real()} - def __init__(self, T, num_events, prob=None, logit=None, F=None, validate_args=None): + def __init__(self, T, num_events, prob=None, logit=None, validate_args=None): base_dist = _LogRelaxedOneHotCategorical( - T, num_events, prob, logit, F, validate_args) + T, num_events, prob, logit, validate_args) super(RelaxedOneHotCategorical, self).__init__( base_dist, ExpTransform()) diff --git a/python/mxnet/gluon/probability/distributions/studentT.py b/python/mxnet/gluon/probability/distributions/studentT.py index 45a4e1c4d385..06765e3e050b 100644 --- a/python/mxnet/gluon/probability/distributions/studentT.py +++ b/python/mxnet/gluon/probability/distributions/studentT.py @@ -24,7 +24,8 @@ from .distribution import Distribution from .constraint import Real, Positive from .chi2 import Chi2 -from .utils import getF, gammaln, digamma, sample_n_shape_converter +from .utils import gammaln, digamma, sample_n_shape_converter +from .... import np class StudentT(Distribution): @@ -38,56 +39,49 @@ class StudentT(Distribution): mean of the distribution. scale : Tensor or scalar, default 1 scale of the distribution - F : mx.ndarray or mx.symbol.numpy._Symbol or None - Variable recording running mode, will be automatically - inferred from parameters if declared None. 
""" # pylint: disable=abstract-method support = Real() arg_constraints = {'df': Positive(), 'loc': Real(), 'scale': Real()} - def __init__(self, df, loc=0.0, scale=1.0, F=None, validate_args=None): - _F = F if F is not None else getF(df, loc, scale) + def __init__(self, df, loc=0.0, scale=1.0, validate_args=None): self.df = df self.loc = loc self.scale = scale self._chi2 = Chi2(self.df) super(StudentT, self).__init__( - F=_F, event_dim=0, validate_args=validate_args) + event_dim=0, validate_args=validate_args) def broadcast_to(self, batch_shape): new_instance = self.__new__(type(self)) - F = self.F - new_instance.loc = F.np.broadcast_to(self.loc, batch_shape) - new_instance.scale = F.np.broadcast_to(self.scale, batch_shape) - new_instance.df = F.np.broadcast_to(self.df, batch_shape) + new_instance.loc = np.broadcast_to(self.loc, batch_shape) + new_instance.scale = np.broadcast_to(self.scale, batch_shape) + new_instance.df = np.broadcast_to(self.df, batch_shape) new_instance._chi2 = self._chi2.broadcast_to(batch_shape) super(StudentT, new_instance).__init__( - F=F, event_dim=0, validate_args=False) + event_dim=0, validate_args=False) new_instance._validate_args = self._validate_args return new_instance @property def mean(self): # mean is only defined for df > 1 - m = self.F.np.where(self.df <= 1, nan, self.loc) + m = np.where(self.df <= 1, nan, self.loc) return m @property def variance(self): - F = self.F df = self.df v = self.scale ** 2 * self.df / (self.df - 2) - v = F.np.where(df <= 2, inf, v) - v = F.np.where(df <= 1, nan, v) + v = np.where(df <= 2, inf, v) + v = np.where(df <= 1, nan, v) return v def sample(self, size=None): - F = self.F - X = F.np.random.normal(size=size) + X = np.random.normal(size=size) Z = self._chi2.sample(size) - Y = X * F.np.sqrt(self.df / Z) + Y = X * np.sqrt(self.df / Z) return self.loc + Y * self.scale def sample_n(self, size=None): @@ -96,21 +90,19 @@ def sample_n(self, size=None): def log_prob(self, value): if self._validate_args: self._validate_samples(value) - F = self.F - lgamma = gammaln(F) + lgamma = gammaln() df = self.df value = (value - self.loc) / self.scale return ( lgamma((df + 1) / 2) - lgamma(df / 2) - - F.np.log(self.scale) - 0.5 * F.np.log(df * pi) - - 0.5 * (df + 1) * F.np.log1p(value ** 2 / df) + np.log(self.scale) - 0.5 * np.log(df * pi) + - 0.5 * (df + 1) * np.log1p(value ** 2 / df) ) def entropy(self): - F = self.F - lgamma = gammaln(F) - dgamma = digamma(F) - log_fn = F.np.log + lgamma = gammaln() + dgamma = digamma() + log_fn = np.log lbeta = lgamma(0.5 * self.df) + lgamma(0.5) - \ lgamma(0.5 * (self.df + 1)) return (log_fn(self.scale) + diff --git a/python/mxnet/gluon/probability/distributions/transformed_distribution.py b/python/mxnet/gluon/probability/distributions/transformed_distribution.py index c5cf3625e348..a879ba682c1f 100644 --- a/python/mxnet/gluon/probability/distributions/transformed_distribution.py +++ b/python/mxnet/gluon/probability/distributions/transformed_distribution.py @@ -22,6 +22,7 @@ from ..transformation import Transformation from .distribution import Distribution from .utils import sum_right_most +from .... 
import np class TransformedDistribution(Distribution): @@ -42,14 +43,10 @@ def __init__(self, base_dist, transforms, validate_args=None): if isinstance(transforms, Transformation): transforms = [transforms, ] self._transforms = transforms - _F = base_dist.F - # Overwrite the F in transform - for t in self._transforms: - t.F = _F event_dim = max([self._base_dist.event_dim] + [t.event_dim for t in self._transforms]) super(TransformedDistribution, self).__init__( - _F, event_dim=event_dim, validate_args=validate_args) + event_dim=event_dim, validate_args=validate_args) def sample(self, size=None): x = self._base_dist.sample(size) @@ -87,7 +84,7 @@ def cdf(self, value): """ Compute the cumulative distribution function(CDF) p(Y < `value`) """ - sign = self.F.np.ones_like(value) + sign = np.ones_like(value) for t in reversed(self._transforms): value = t.inv(value) sign = sign * t.sign @@ -95,7 +92,7 @@ def cdf(self, value): return sign * (value - 0.5) + 0.5 def icdf(self, value): - sign = self.F.np.ones_like(value) + sign = np.ones_like(value) for t in self._transforms: sign = sign * t.sign value = sign * (value - 0.5) + 0.5 # value or (1 - value) diff --git a/python/mxnet/gluon/probability/distributions/uniform.py b/python/mxnet/gluon/probability/distributions/uniform.py index e2d237418c18..059a906fe80c 100644 --- a/python/mxnet/gluon/probability/distributions/uniform.py +++ b/python/mxnet/gluon/probability/distributions/uniform.py @@ -22,7 +22,8 @@ from .distribution import Distribution from .constraint import Real, Interval -from .utils import getF, sample_n_shape_converter +from .utils import sample_n_shape_converter +from .... import np class Uniform(Distribution): @@ -34,9 +35,6 @@ class Uniform(Distribution): lower range of the distribution. high : Tensor or scalar, default 1 upper range of the distribution. - F : mx.ndarray or mx.symbol.numpy._Symbol or None - Variable recording running mode, will be automatically - inferred from parameters if declared None. """ # pylint: disable=abstract-method @@ -45,33 +43,29 @@ class Uniform(Distribution): has_grad = False arg_constraints = {'low': Real(), 'high': Real()} - def __init__(self, low=0.0, high=1.0, F=None, validate_args=None): - _F = F if F is not None else getF(low, high) + def __init__(self, low=0.0, high=1.0, validate_args=None): self.low = low self.high = high super(Uniform, self).__init__( - F=_F, event_dim=0, validate_args=validate_args) + event_dim=0, validate_args=validate_args) def log_prob(self, value): if self._validate_args: self._validate_samples(value) - F = self.F def type_converter(x): return float(x) if isinstance(x, bool) else x.astype('float') lower_bound = type_converter(self.low < value) upper_bound = type_converter(self.high > value) # 0 if value \in [low, high], -inf otherwise. 
- out_of_support_value = F.np.log(lower_bound * upper_bound) - return out_of_support_value - F.np.log(self.high - self.low) + out_of_support_value = np.log(lower_bound * upper_bound) + return out_of_support_value - np.log(self.high - self.low) def sample(self, size=None): - F = self.F - return F.np.random.uniform(self.low, self.high, size=size) + return np.random.uniform(self.low, self.high, size=size) def sample_n(self, size=None): - F = self.F - return F.np.random.uniform(self.low, self.high, - size=sample_n_shape_converter(size)) + return np.random.uniform(self.low, self.high, + size=sample_n_shape_converter(size)) @property def support(self): @@ -79,11 +73,9 @@ def support(self): def broadcast_to(self, batch_shape): new_instance = self.__new__(type(self)) - F = self.F - new_instance.low = F.np.broadcast_to(self.low, batch_shape) - new_instance.high = F.np.broadcast_to(self.high, batch_shape) - super(Uniform, new_instance).__init__(F=F, - event_dim=self.event_dim, + new_instance.low = np.broadcast_to(self.low, batch_shape) + new_instance.high = np.broadcast_to(self.high, batch_shape) + super(Uniform, new_instance).__init__(event_dim=self.event_dim, validate_args=False) new_instance._validate_args = self._validate_args return new_instance @@ -98,4 +90,4 @@ def icdf(self, value): return value * (self.high - self.low) + self.low def entropy(self): - return self.F.np.log(self.high - self.low) + return np.log(self.high - self.low) diff --git a/python/mxnet/gluon/probability/distributions/utils.py b/python/mxnet/gluon/probability/distributions/utils.py index 066a8ea3752e..041a9424a6bd 100644 --- a/python/mxnet/gluon/probability/distributions/utils.py +++ b/python/mxnet/gluon/probability/distributions/utils.py @@ -18,7 +18,7 @@ # coding: utf-8 # pylint: disable=wildcard-import """Distribution utilities""" -__all__ = ['getF', 'prob2logit', 'logit2prob', 'cached_property', 'sample_n_shape_converter', +__all__ = ['prob2logit', 'logit2prob', 'cached_property', 'sample_n_shape_converter', 'constraint_check', 'digamma', 'gammaln', 'erfinv', 'erf'] from functools import update_wrapper @@ -28,11 +28,10 @@ import scipy.special as sc except ImportError: sc = None -from .... import symbol as sym -from .... import ndarray as nd +from .... 
import np, npx -def constraint_check(F): +def constraint_check(): """Unified check_constraint interface for both scalar and tensor """ def _check(condition, err_msg): @@ -40,11 +39,11 @@ def _check(condition, err_msg): if not condition: raise ValueError(err_msg) return 1.0 - return F.npx.constraint_check(condition, err_msg) + return npx.constraint_check(condition, err_msg) return _check -def digamma(F): +def digamma(): """Unified digamma interface for both scalar and tensor """ def compute(value): @@ -55,11 +54,11 @@ def compute(value): return sc.digamma(value, dtype='float32') else: raise ValueError('Numbers are not supported as input if scipy is not installed') - return F.npx.digamma(value) + return npx.digamma(value) return compute -def gammaln(F): +def gammaln(): """Unified gammaln interface for both scalar and tensor """ def compute(value): @@ -70,11 +69,11 @@ def compute(value): return sc.gammaln(value, dtype='float32') else: raise ValueError('Numbers are not supported as input if scipy is not installed') - return F.npx.gammaln(value) + return npx.gammaln(value) return compute -def erf(F): +def erf(): """Unified erf interface for both scalar and tensor """ def compute(value): @@ -83,11 +82,11 @@ def compute(value): return sc.erf(value) else: raise ValueError('Numbers are not supported as input if scipy is not installed') - return F.npx.erf(value) + return npx.erf(value) return compute -def erfinv(F): +def erfinv(): """Unified erfinv interface for both scalar and tensor """ def compute(value): @@ -96,7 +95,7 @@ def compute(value): return sc.erfinv(value) else: raise ValueError('Numbers are not supported as input if scipy is not installed') - return F.npx.erfinv(value) + return npx.erfinv(value) return compute @@ -114,33 +113,6 @@ def sample_n_shape_converter(size): return size -def getF(*params): - """Get running mode from parameters, - return mx.ndarray if inputs are python scalar. - - Returns - ------- - ndarray or _Symbol - the running mode inferred from `*params` - """ - mode_flag = 0 - for param in params: - if isinstance(param, nd.NDArray): - if mode_flag < 0: - raise TypeError("Expect parameters to have consistent running mode," + - " got {}".format([type(p) for p in params])) - mode_flag = 1 - elif isinstance(param, sym.Symbol): - if mode_flag > 0: - raise TypeError("Expect parameters to have consistent running mode," + - " got {}".format([type(p) for p in params])) - mode_flag = -1 - # In case of scalar params, we choose to use the imperative mode. - if mode_flag < 0: - return sym - return nd - - def sum_right_most(x, ndim): """Sum along the right most `ndim` dimensions of `x`, @@ -161,42 +133,38 @@ def sum_right_most(x, ndim): return x.sum(axes) -def _clip_prob(prob, F): +def _clip_prob(prob): eps = onp.finfo('float32').eps - return F.np.clip(prob, eps, 1 - eps) + return np.clip(prob, eps, 1 - eps) -def _clip_float_eps(value, F): +def _clip_float_eps(value): eps = onp.finfo('float32').eps - return F.np.maximum(value, eps) + return np.maximum(value, eps) -def prob2logit(prob, binary=True, F=None): +def prob2logit(prob, binary=True): r"""Convert probability to logit form. For the binary case, the logit stands for log(p / (1 - p)). Whereas for the multinomial case, the logit denotes log(p). 
""" - if F is None: - F = getF(prob) - _clipped_prob = _clip_prob(prob, F) + _clipped_prob = _clip_prob(prob) if binary: - return F.np.log(_clipped_prob) - F.np.log1p(-_clipped_prob) + return np.log(_clipped_prob) - np.log1p(-_clipped_prob) # The clipped prob would cause numerical error in the categorical case, # no idea about the reason behind. - return F.np.log(_clipped_prob) + return np.log(_clipped_prob) -def logit2prob(logit, binary=True, F=None): +def logit2prob(logit, binary=True): r"""Convert logit into probability form. For the binary case, `sigmoid()` is applied on the logit tensor. Whereas for the multinomial case, `softmax` is applied along the last dimension of the logit tensor. """ - if F is None: - F = getF(logit) if binary: - return F.npx.sigmoid(logit) - return F.npx.softmax(logit) + return npx.sigmoid(logit) + return npx.softmax(logit) class _CachedProperty(object): diff --git a/python/mxnet/gluon/probability/distributions/weibull.py b/python/mxnet/gluon/probability/distributions/weibull.py index 358765b815e0..895679d55979 100644 --- a/python/mxnet/gluon/probability/distributions/weibull.py +++ b/python/mxnet/gluon/probability/distributions/weibull.py @@ -26,7 +26,8 @@ from .exponential import Exponential from .constraint import Positive from ..transformation import PowerTransform, AffineTransform -from .utils import getF, sample_n_shape_converter, gammaln +from .utils import sample_n_shape_converter, gammaln +from .... import np, npx class Weibull(TransformedDistribution): @@ -38,9 +39,6 @@ class Weibull(TransformedDistribution): Concentration/shape parameter of the distribution. scale : Tensor or scalar, default 1 scale parameter of the distribution. - F : mx.ndarray or mx.symbol.numpy._Symbol or None - Variable recording running mode, will be automatically - inferred from parameters if declared None. 
""" # pylint: disable=abstract-method has_grad = True @@ -48,38 +46,32 @@ class Weibull(TransformedDistribution): arg_constraints = {'scale': Positive(), 'concentration': Positive()} - def __init__(self, concentration, scale=1.0, F=None, validate_args=None): - _F = F if F is not None else getF(scale, concentration) + def __init__(self, concentration, scale=1.0, validate_args=None): self.concentration = concentration self.scale = scale - base_dist = Exponential(F=_F) + base_dist = Exponential() super(Weibull, self).__init__(base_dist, [PowerTransform(1 / self.concentration), AffineTransform(0, self.scale)]) def sample(self, size=None): - F = self.F - return self.scale * F.np.random.weibull(self.concentration, size) + return self.scale * np.random.weibull(self.concentration, size) def sample_n(self, size=None): - F = self.F - return self.scale * F.np.random.weibull(self.concentration, - sample_n_shape_converter(size)) + return self.scale * np.random.weibull(self.concentration, + sample_n_shape_converter(size)) @property def mean(self): - F = self.F - return self.scale * F.np.exp(F.npx.gammaln(1 + 1 / self.concentration)) + return self.scale * np.exp(npx.gammaln(1 + 1 / self.concentration)) @property def variance(self): - F = self.F - exp = F.np.exp - lgamma = gammaln(F) + exp = np.exp + lgamma = gammaln() term1 = exp(lgamma(1 + 2 / self.concentration)) term2 = exp(2 * lgamma(1 + 1 / self.concentration)) return (self.scale ** 2) * (term1 - term2) def entropy(self): - F = self.F return (euler_gamma * (1 - 1 / self.concentration) + - F.np.log(self.scale / self.concentration) + 1) + np.log(self.scale / self.concentration) + 1) diff --git a/python/mxnet/gluon/probability/transformation/transformation.py b/python/mxnet/gluon/probability/transformation/transformation.py index 4599a483d5dc..a1a28294b025 100644 --- a/python/mxnet/gluon/probability/transformation/transformation.py +++ b/python/mxnet/gluon/probability/transformation/transformation.py @@ -26,7 +26,7 @@ import weakref from ..distributions.utils import _clip_prob, cached_property, sum_right_most from ...block import HybridBlock -from .... import ndarray as nd +from .... import np, npx class Transformation(object): @@ -41,19 +41,10 @@ class Transformation(object): bijective = False event_dim = 0 - def __init__(self, F=nd): + def __init__(self): self._inv = None - self._F = F super(Transformation, self).__init__() - @property - def F(self): - return self._F - - @F.setter - def F(self, value): - self._F = value - @property def sign(self): """ @@ -142,15 +133,6 @@ def _forward_compute(self, x): x = t(x) return x - @property - def F(self): - return self._parts[0].F - - @F.setter - def F(self, value): - for t in self._parts: - t.F = value - # @cached_property is, in essence, @property with lazy evaluation. 
# pylint: disable=invalid-overridden-method @cached_property @@ -177,7 +159,7 @@ def inv(self): def log_det_jacobian(self, x, y): if not self._parts: - return self.F.np.zeros_like(x) + return np.zeros_like(x) result = 0 x_prime = None for t in self._parts[:-1]: @@ -200,10 +182,10 @@ class ExpTransform(Transformation): sign = 1 def _forward_compute(self, x): - return self.F.np.exp(x) + return np.exp(x) def _inverse_compute(self, y): - return self.F.np.log(y) + return np.log(y) def log_det_jacobian(self, x, y): return x @@ -228,16 +210,13 @@ def _inverse_compute(self, y): return (y - self._loc) / self._scale def log_det_jacobian(self, x, y): - abs_fn = self.F.np.abs - log_fn = self.F.np.log - ones_fn = self.F.np.ones_like # element-wise abs(log(dy/dx)) - value = ones_fn(x) * log_fn(abs_fn(self._scale)) + value = np.ones_like(x) * np.log(np.abs(self._scale)) return sum_right_most(value, self.event_dim) @property def sign(self): - return self.F.np.sign(self._scale) + return np.sign(self._scale) class PowerTransform(Transformation): @@ -252,14 +231,14 @@ def __init__(self, exponent): self._exponent = exponent def _forward_compute(self, x): - return self.F.np.power(x, self._exponent) + return np.power(x, self._exponent) def _inverse_compute(self, y): - return self.F.np.power(y, 1 / self._exponent) + return np.power(y, 1 / self._exponent) def log_det_jacobian(self, x, y): - log_fn = self.F.np.log - abs_fn = self.F.np.abs + log_fn = np.log + abs_fn = np.abs return log_fn(abs_fn(self._exponent * y / x)) @@ -271,19 +250,14 @@ class SigmoidTransform(Transformation): sign = 1 def _forward_compute(self, x): - F = self.F - return _clip_prob(F.npx.sigmoid(x), F) + return _clip_prob(npx.sigmoid(x)) def _inverse_compute(self, y): - F = self.F - clipped_prob = _clip_prob(y, F) - return F.np.log(clipped_prob) - F.np.log1p(-clipped_prob) + clipped_prob = _clip_prob(y) + return np.log(clipped_prob) - np.log1p(-clipped_prob) def log_det_jacobian(self, x, y): - F = self.F - log = F.np.log - exp = F.np.exp - softplus_fn = lambda x: log(1 + exp(x)) + softplus_fn = lambda x: np.log(1 + np.exp(x)) return -softplus_fn(-x) - softplus_fn(x) @@ -291,15 +265,15 @@ class SoftmaxTransform(Transformation): event_dim = 1 def _forward_compute(self, x): - return self.F.npx.softmax(x, -1) + return npx.softmax(x, -1) def _inverse_compute(self, y): - return self.F.log(y) + return np.log(y) class AbsTransform(Transformation): def _forward_compute(self, x): - return self.F.np.abs(x) + return np.abs(x) def _inverse_compute(self, y): return y diff --git a/python/mxnet/gluon/rnn/conv_rnn_cell.py b/python/mxnet/gluon/rnn/conv_rnn_cell.py index d6cb0f42e050..d951002bd41a 100644 --- a/python/mxnet/gluon/rnn/conv_rnn_cell.py +++ b/python/mxnet/gluon/rnn/conv_rnn_cell.py @@ -28,6 +28,8 @@ from ...base import numeric_types from .rnn_cell import HybridRecurrentCell from ..parameter import Parameter +from ... 
import np, npx +from ...util import use_np def _get_conv_out_size(dimensions, kernels, paddings, dilations): @@ -35,6 +37,7 @@ def _get_conv_out_size(dimensions, kernels, paddings, dilations): zip(dimensions, kernels, paddings, dilations)) +@use_np class _BaseConvRNNCell(HybridRecurrentCell): """Abstract base class for convolutional RNNs""" def __init__(self, input_shape, hidden_channels, @@ -142,38 +145,45 @@ def __repr__(self): def _num_gates(self): return len(self._gate_names) - def _conv_forward(self, F, inputs, states, - i2h_weight, h2h_weight, i2h_bias, h2h_bias, - prefix): - i2h = F.Convolution(data=inputs, - num_filter=self._hidden_channels*self._num_gates, - kernel=self._i2h_kernel, - stride=self._stride, - pad=self._i2h_pad, - dilate=self._i2h_dilate, - weight=i2h_weight, - bias=i2h_bias, - layout=self._conv_layout, - name=prefix+'i2h') - h2h = F.Convolution(data=states[0], - num_filter=self._hidden_channels*self._num_gates, - kernel=self._h2h_kernel, - dilate=self._h2h_dilate, - pad=self._h2h_pad, - stride=self._stride, - weight=h2h_weight, - bias=h2h_bias, - layout=self._conv_layout, - name=prefix+'h2h') + def _conv_forward(self, inputs, states): + ctx = inputs.ctx + i2h = npx.convolution(data=inputs, + num_filter=self._hidden_channels*self._num_gates, + kernel=self._i2h_kernel, + stride=self._stride, + pad=self._i2h_pad, + dilate=self._i2h_dilate, + weight=self.i2h_weight.data(ctx), + bias=self.i2h_bias.data(ctx), + layout=self._conv_layout) + h2h = npx.convolution(data=states[0].as_in_ctx(ctx), + num_filter=self._hidden_channels*self._num_gates, + kernel=self._h2h_kernel, + dilate=self._h2h_dilate, + pad=self._h2h_pad, + stride=self._stride, + weight=self.h2h_weight.data(ctx), + bias=self.h2h_bias.data(ctx), + layout=self._conv_layout) return i2h, h2h def state_info(self, batch_size=0): raise NotImplementedError("_BaseConvRNNCell is abstract class for convolutional RNN") - def hybrid_forward(self, F, inputs, states): + def forward(self, inputs, states): raise NotImplementedError("_BaseConvRNNCell is abstract class for convolutional RNN") + # pylint: disable=unused-argument + def infer_shape(self, i, x, is_bidirect): + channel_axis = self._conv_layout.find('C') + shape_c = x.shape[-len(self._i2h_kernel)-1:][channel_axis-1] + wshape = self.i2h_weight.shape + wshape_list = list(wshape) + wshape_list[self._conv_layout.find('C')] = shape_c + self.i2h_weight.shape = tuple(wshape_list) + +@use_np class _ConvRNNCell(_BaseConvRNNCell): def __init__(self, input_shape, hidden_channels, i2h_kernel, h2h_kernel, i2h_pad, i2h_dilate, h2h_dilate, @@ -203,14 +213,9 @@ def _alias(self): def _gate_names(self): return ('',) - def hybrid_forward(self, F, inputs, states, i2h_weight, - h2h_weight, i2h_bias, h2h_bias): - prefix = 't%d_'%self._counter - i2h, h2h = self._conv_forward(F, inputs, states, - i2h_weight, h2h_weight, i2h_bias, h2h_bias, - prefix) - output = self._get_activation(F, i2h + h2h, self._activation, - name=prefix+'out') + def forward(self, inputs, states): + i2h, h2h = self._conv_forward(inputs, states) + output = self._get_activation(i2h + h2h, self._activation) return output, [output] @@ -398,6 +403,7 @@ def __init__(self, input_shape, hidden_channels, activation=activation) +@use_np class _ConvLSTMCell(_BaseConvRNNCell): def __init__(self, input_shape, hidden_channels, i2h_kernel, h2h_kernel, @@ -429,23 +435,16 @@ def _alias(self): def _gate_names(self): return ['_i', '_f', '_c', '_o'] - def hybrid_forward(self, F, inputs, states, i2h_weight, - h2h_weight, i2h_bias, h2h_bias): 
- prefix = 't%d_'%self._counter - i2h, h2h = self._conv_forward(F, inputs, states, - i2h_weight, h2h_weight, i2h_bias, h2h_bias, - prefix) + def forward(self, inputs, states): + i2h, h2h = self._conv_forward(inputs, states) gates = i2h + h2h - slice_gates = F.SliceChannel(gates, num_outputs=4, name=prefix+'slice', - axis=self._channel_axis) - in_gate = F.Activation(slice_gates[0], act_type="sigmoid", name=prefix+'i') - forget_gate = F.Activation(slice_gates[1], act_type="sigmoid", name=prefix+'f') - in_transform = self._get_activation(F, slice_gates[2], self._activation, name=prefix+'c') - out_gate = F.Activation(slice_gates[3], act_type="sigmoid", name=prefix+'o') - next_c = F.elemwise_add(forget_gate * states[1], in_gate * in_transform, - name=prefix+'state') - next_h = F.elemwise_mul(out_gate, self._get_activation(F, next_c, self._activation), - name=prefix+'out') + slice_gates = npx.slice_channel(gates, num_outputs=4, axis=self._channel_axis) + in_gate = npx.activation(slice_gates[0], act_type="sigmoid") + forget_gate = npx.activation(slice_gates[1], act_type="sigmoid") + in_transform = self._get_activation(slice_gates[2], self._activation) + out_gate = npx.activation(slice_gates[3], act_type="sigmoid") + next_c = forget_gate * states[1].as_in_ctx(inputs.ctx) + in_gate * in_transform + next_h = np.multiply(out_gate, self._get_activation(next_c, self._activation)) return next_h, [next_h, next_c] @@ -663,6 +662,7 @@ def __init__(self, input_shape, hidden_channels, activation=activation) +@use_np class _ConvGRUCell(_BaseConvRNNCell): def __init__(self, input_shape, hidden_channels, i2h_kernel, h2h_kernel, i2h_pad, i2h_dilate, h2h_dilate, @@ -692,30 +692,21 @@ def _alias(self): def _gate_names(self): return ['_r', '_z', '_o'] - def hybrid_forward(self, F, inputs, states, i2h_weight, - h2h_weight, i2h_bias, h2h_bias): - prefix = 't%d_'%self._counter - i2h, h2h = self._conv_forward(F, inputs, states, - i2h_weight, h2h_weight, i2h_bias, h2h_bias, - prefix) - - i2h_r, i2h_z, i2h = F.SliceChannel(i2h, num_outputs=3, - name=prefix+'i2h_slice', - axis=self._channel_axis) - h2h_r, h2h_z, h2h = F.SliceChannel(h2h, num_outputs=3, - name=prefix+'h2h_slice', - axis=self._channel_axis) - - reset_gate = F.Activation(i2h_r + h2h_r, act_type="sigmoid", - name=prefix+'r_act') - update_gate = F.Activation(i2h_z + h2h_z, act_type="sigmoid", - name=prefix+'z_act') - - next_h_tmp = self._get_activation(F, i2h + reset_gate * h2h, self._activation, - name=prefix+'h_act') - - next_h = F.elemwise_add((1. - update_gate) * next_h_tmp, update_gate * states[0], - name=prefix+'out') + def forward(self, inputs, states): + i2h, h2h = self._conv_forward(inputs, states) + + i2h_r, i2h_z, i2h = npx.slice_channel(i2h, num_outputs=3, + axis=self._channel_axis) + h2h_r, h2h_z, h2h = npx.slice_channel(h2h, num_outputs=3, + axis=self._channel_axis) + + reset_gate = npx.activation(i2h_r + h2h_r, act_type="sigmoid") + update_gate = npx.activation(i2h_z + h2h_z, act_type="sigmoid") + + next_h_tmp = self._get_activation(i2h + reset_gate * h2h, self._activation) + + next_h = (1. - update_gate) * next_h_tmp + update_gate * \ + states[0].as_in_ctx(inputs.ctx) return next_h, [next_h] diff --git a/python/mxnet/gluon/rnn/rnn_cell.py b/python/mxnet/gluon/rnn/rnn_cell.py index bb285338c5c3..0f1c31a875e9 100644 --- a/python/mxnet/gluon/rnn/rnn_cell.py +++ b/python/mxnet/gluon/rnn/rnn_cell.py @@ -26,7 +26,8 @@ 'ModifierCell', 'ZoneoutCell', 'ResidualCell', 'BidirectionalCell', 'VariationalDropoutCell', 'LSTMPCell'] -from ... 
import symbol, ndarray +from ... import np, npx, context +from ...util import use_np from ...base import string_types, numeric_types, _as_list from ..block import Block, HybridBlock from ..parameter import Parameter @@ -41,14 +42,11 @@ def _cells_state_info(cells, batch_size): def _cells_begin_state(cells, **kwargs): return sum([c().begin_state(**kwargs) for c in cells], []) -def _get_begin_state(cell, F, begin_state, inputs, batch_size): +def _get_begin_state(cell, begin_state, inputs, batch_size): if begin_state is None: - if F is ndarray: - ctx = inputs.context if isinstance(inputs, tensor_types) else inputs[0].context - with ctx: - begin_state = cell.begin_state(func=F.zeros, batch_size=batch_size) - else: - begin_state = cell.begin_state(func=F.zeros, batch_size=batch_size) + ctx = inputs.context if isinstance(inputs, tensor_types) else inputs[0].context + with ctx: + begin_state = cell.begin_state(func=np.zeros, batch_size=batch_size) return begin_state def _format_sequence(length, inputs, layout, merge, in_layout=None): @@ -60,69 +58,55 @@ def _format_sequence(length, inputs, layout, merge, in_layout=None): batch_axis = layout.find('N') batch_size = 0 in_axis = in_layout.find('T') if in_layout is not None else axis - if isinstance(inputs, symbol.Symbol): - F = symbol - if merge is False: - assert len(inputs.list_outputs()) == 1, \ - "unroll doesn't allow grouped symbol as input. Please convert " \ - "to list with list(inputs) first or let unroll handle splitting." - inputs = list(symbol.split(inputs, axis=in_axis, num_outputs=length, - squeeze_axis=1)) - elif isinstance(inputs, ndarray.NDArray): - F = ndarray + if isinstance(inputs, np.ndarray): batch_size = inputs.shape[batch_axis] if merge is False: assert length is None or length == inputs.shape[in_axis] - inputs = _as_list(ndarray.split(inputs, axis=in_axis, - num_outputs=inputs.shape[in_axis], - squeeze_axis=1)) + inputs = _as_list(npx.slice_channel(inputs, axis=in_axis, + num_outputs=inputs.shape[in_axis], + squeeze_axis=1)) else: + assert isinstance(inputs, (list, tuple)), \ + "Only support MXNet numpy ndarray or list of MXNet numpy ndarrays as inputs" assert length is None or len(inputs) == length - if isinstance(inputs[0], symbol.Symbol): - F = symbol - else: - F = ndarray - batch_size = inputs[0].shape[0] + batch_size = inputs[0].shape[0] if merge is True: - inputs = F.stack(*inputs, axis=axis) + inputs = np.stack(inputs, axis=axis) in_axis = axis - if isinstance(inputs, tensor_types) and axis != in_axis: - inputs = F.swapaxes(inputs, dim1=axis, dim2=in_axis) + if isinstance(inputs, np.ndarray) and axis != in_axis: + inputs = np.swapaxes(inputs, axis, in_axis) - return inputs, axis, F, batch_size + return inputs, axis, batch_size -def _mask_sequence_variable_length(F, data, length, valid_length, time_axis, merge): +def _mask_sequence_variable_length(data, length, valid_length, time_axis, merge): assert valid_length is not None if not isinstance(data, tensor_types): - data = F.stack(*data, axis=time_axis) - outputs = F.SequenceMask(data, sequence_length=valid_length, use_sequence_length=True, - axis=time_axis) + data = np.stack(data, axis=time_axis) + outputs = npx.sequence_mask(data, sequence_length=valid_length, use_sequence_length=True, + axis=time_axis) if not merge: - outputs = _as_list(F.split(outputs, num_outputs=length, axis=time_axis, - squeeze_axis=True)) + outputs = _as_list(npx.slice_channel(outputs, num_outputs=length, axis=time_axis, + squeeze_axis=True)) return outputs def _reverse_sequences(sequences, 
unroll_step, valid_length=None): - if isinstance(sequences[0], symbol.Symbol): - F = symbol - else: - F = ndarray - if valid_length is None: reversed_sequences = list(reversed(sequences)) else: - reversed_sequences = F.SequenceReverse(F.stack(*sequences, axis=0), - sequence_length=valid_length, - use_sequence_length=True) - if unroll_step > 1 or F is symbol: - reversed_sequences = F.split(reversed_sequences, axis=0, num_outputs=unroll_step, squeeze_axis=True) + reversed_sequences = npx.sequence_reverse(np.stack(sequences, axis=0), + sequence_length=valid_length, + use_sequence_length=True) + if unroll_step > 1: + reversed_sequences = npx.slice_channel(reversed_sequences, axis=0, + num_outputs=unroll_step, squeeze_axis=True) else: reversed_sequences = [reversed_sequences[0]] return reversed_sequences +@use_np class RecurrentCell(Block): """Abstract base class for RNN cells @@ -143,7 +127,7 @@ def state_info(self, batch_size=0): """shape and layout information of states""" raise NotImplementedError() - def begin_state(self, batch_size=0, func=ndarray.zeros, **kwargs): + def begin_state(self, batch_size=0, func=np.zeros, **kwargs): """Initial state for this cell. Parameters @@ -174,13 +158,13 @@ def begin_state(self, batch_size=0, func=ndarray.zeros, **kwargs): "cell cannot be called directly. Call the modifier cell instead." states = [] for info in self.state_info(batch_size): - self._init_counter += 1 if info is not None: info.update(kwargs) else: info = kwargs - state = func(name='begin_state_%d'%(self._init_counter), - **info) + state = func(shape=info.pop("shape", ()), + ctx=info.pop("ctx", context.cpu()), + dtype=info.pop("dtype", "float32")) states.append(state) return states @@ -239,8 +223,8 @@ def unroll(self, length, inputs, begin_state=None, layout='NTC', merge_outputs=N # pylint: disable=too-many-locals self.reset() - inputs, axis, F, batch_size = _format_sequence(length, inputs, layout, False) - begin_state = _get_begin_state(self, F, begin_state, inputs, batch_size) + inputs, axis, batch_size = _format_sequence(length, inputs, layout, False) + begin_state = _get_begin_state(self, begin_state, inputs, batch_size) states = begin_state outputs = [] @@ -251,29 +235,29 @@ def unroll(self, length, inputs, begin_state=None, layout='NTC', merge_outputs=N if valid_length is not None: all_states.append(states) if valid_length is not None: - states = [F.SequenceLast(F.stack(*ele_list, axis=0), - sequence_length=valid_length, - use_sequence_length=True, - axis=0) + states = [npx.sequence_last(np.stack(ele_list, axis=0), + sequence_length=valid_length, + use_sequence_length=True, + axis=0) for ele_list in zip(*all_states)] - outputs = _mask_sequence_variable_length(F, outputs, length, valid_length, axis, True) - outputs, _, _, _ = _format_sequence(length, outputs, layout, merge_outputs) + outputs = _mask_sequence_variable_length(outputs, length, valid_length, axis, True) + outputs, _, _ = _format_sequence(length, outputs, layout, merge_outputs) return outputs, states #pylint: disable=no-self-use - def _get_activation(self, F, inputs, activation, **kwargs): + def _get_activation(self, inputs, activation, **kwargs): """Get activation function. 
Convert if is string""" - func = {'tanh': F.tanh, - 'relu': F.relu, - 'sigmoid': F.sigmoid, - 'softsign': F.softsign}.get(activation) + func = {'tanh': np.tanh, + 'relu': npx.relu, + 'sigmoid': npx.sigmoid, + 'softsign': npx.softsign}.get(activation) if func: return func(inputs, **kwargs) elif isinstance(activation, string_types): - return F.Activation(inputs, act_type=activation, **kwargs) + return npx.activation(inputs, act_type=activation, **kwargs) elif isinstance(activation, LeakyReLU): - return F.LeakyReLU(inputs, act_type='leaky', slope=activation._alpha, **kwargs) + return npx.leaky_relu(inputs, act_type='leaky', slope=activation._alpha, **kwargs) return activation(inputs, **kwargs) def forward(self, inputs, states): @@ -306,16 +290,17 @@ def forward(self, inputs, states): self._counter += 1 return super(RecurrentCell, self).forward(inputs, states) - +@use_np class HybridRecurrentCell(RecurrentCell, HybridBlock): """HybridRecurrentCell supports hybridize.""" def __init__(self): super(HybridRecurrentCell, self).__init__() - def hybrid_forward(self, F, x, *args, **kwargs): + def forward(self, x, *args, **kwargs): raise NotImplementedError +@use_np class RNNCell(HybridRecurrentCell): r"""Elman RNN recurrent neural network cell. @@ -398,22 +383,33 @@ def __repr__(self): mapping=mapping, **self.__dict__) - def hybrid_forward(self, F, inputs, states, i2h_weight, - h2h_weight, i2h_bias, h2h_bias): - prefix = 't%d_'%self._counter - i2h = F.FullyConnected(data=inputs, weight=i2h_weight, bias=i2h_bias, - num_hidden=self._hidden_size, - name=prefix+'i2h') - h2h = F.FullyConnected(data=states[0], weight=h2h_weight, bias=h2h_bias, - num_hidden=self._hidden_size, - name=prefix+'h2h') - i2h_plus_h2h = F.elemwise_add(i2h, h2h, name=prefix+'plus0') - output = self._get_activation(F, i2h_plus_h2h, self._activation, - name=prefix+'out') + def forward(self, inputs, states): + ctx = inputs.ctx + i2h = npx.fully_connected(inputs, weight=self.i2h_weight.data(ctx), + bias=self.i2h_bias.data(ctx), + num_hidden=self._hidden_size, + no_bias=False) + h2h = npx.fully_connected(states[0].as_in_ctx(ctx), + weight=self.h2h_weight.data(ctx), + bias=self.h2h_bias.data(ctx), + num_hidden=self._hidden_size, + no_bias=False) + i2h_plus_h2h = i2h + h2h + output = self._get_activation(i2h_plus_h2h, self._activation) return output, [output] + def infer_shape(self, i, x, is_bidirect): + if i == 0: + self.i2h_weight.shape = (self._hidden_size, x.shape[x.ndim-1]) + else: + nh = self._hidden_size + if is_bidirect: + nh *= 2 + self.i2h_weight.shape = (self._hidden_size, nh) + +@use_np class LSTMCell(HybridRecurrentCell): r"""Long-Short Term Memory (LSTM) network cell. 
@@ -509,33 +505,38 @@ def __repr__(self): mapping=mapping, **self.__dict__) - def hybrid_forward(self, F, inputs, states, i2h_weight, - h2h_weight, i2h_bias, h2h_bias): + def forward(self, inputs, states): # pylint: disable=too-many-locals - prefix = 't%d_'%self._counter - i2h = F.FullyConnected(data=inputs, weight=i2h_weight, bias=i2h_bias, - num_hidden=self._hidden_size*4, name=prefix+'i2h') - h2h = F.FullyConnected(data=states[0], weight=h2h_weight, bias=h2h_bias, - num_hidden=self._hidden_size*4, name=prefix+'h2h') - gates = F.elemwise_add(i2h, h2h, name=prefix+'plus0') - slice_gates = F.SliceChannel(gates, num_outputs=4, name=prefix+'slice') - in_gate = self._get_activation( - F, slice_gates[0], self._recurrent_activation, name=prefix+'i') - forget_gate = self._get_activation( - F, slice_gates[1], self._recurrent_activation, name=prefix+'f') - in_transform = self._get_activation( - F, slice_gates[2], self._activation, name=prefix+'c') - out_gate = self._get_activation( - F, slice_gates[3], self._recurrent_activation, name=prefix+'o') - next_c = F.elemwise_add(F.elemwise_mul(forget_gate, states[1], name=prefix+'mul0'), - F.elemwise_mul(in_gate, in_transform, name=prefix+'mul1'), - name=prefix+'state') - next_h = F.elemwise_mul(out_gate, F.Activation(next_c, act_type=self._activation, name=prefix+'activation0'), - name=prefix+'out') + ctx = inputs.ctx + i2h = npx.fully_connected(inputs, weight=self.i2h_weight.data(ctx), + bias=self.i2h_bias.data(ctx), + num_hidden=self._hidden_size*4, no_bias=False) + h2h = npx.fully_connected(states[0].as_in_ctx(ctx), + weight=self.h2h_weight.data(ctx), + bias=self.h2h_bias.data(ctx), + num_hidden=self._hidden_size*4, no_bias=False) + gates = i2h + h2h + slice_gates = npx.slice_channel(gates, num_outputs=4) + in_gate = self._get_activation(slice_gates[0], self._recurrent_activation) + forget_gate = self._get_activation(slice_gates[1], self._recurrent_activation) + in_transform = self._get_activation(slice_gates[2], self._activation) + out_gate = self._get_activation(slice_gates[3], self._recurrent_activation) + next_c = np.multiply(forget_gate, states[1].as_in_ctx(ctx)) + \ + np.multiply(in_gate, in_transform) + next_h = np.multiply(out_gate, npx.activation(next_c, act_type=self._activation)) return next_h, [next_h, next_c] + def infer_shape(self, i, x, is_bidirect): + if i == 0: + self.i2h_weight.shape = (4*self._hidden_size, x.shape[x.ndim-1]) + else: + nh = self._hidden_size + if is_bidirect: + nh *= 2 + self.i2h_weight.shape = (4*self._hidden_size, nh) +@use_np class GRUCell(HybridRecurrentCell): r"""Gated Rectified Unit (GRU) network cell. 
Note: this is an implementation of the cuDNN version of GRUs @@ -627,51 +628,45 @@ def __repr__(self): mapping=mapping, **self.__dict__) - def hybrid_forward(self, F, inputs, states, i2h_weight, - h2h_weight, i2h_bias, h2h_bias): + def forward(self, inputs, states): # pylint: disable=too-many-locals - prefix = 't%d_'%self._counter - prev_state_h = states[0] - i2h = F.FullyConnected(data=inputs, - weight=i2h_weight, - bias=i2h_bias, - num_hidden=self._hidden_size * 3, - name=prefix+'i2h') - h2h = F.FullyConnected(data=prev_state_h, - weight=h2h_weight, - bias=h2h_bias, - num_hidden=self._hidden_size * 3, - name=prefix+'h2h') - - i2h_r, i2h_z, i2h = F.SliceChannel(i2h, num_outputs=3, - name=prefix+'i2h_slice') - h2h_r, h2h_z, h2h = F.SliceChannel(h2h, num_outputs=3, - name=prefix+'h2h_slice') - - reset_gate = self._get_activation(F, - F.elemwise_add(i2h_r, h2h_r, name=prefix+'plus0'), - self._recurrent_activation, - name=prefix+'r_act') - update_gate = self._get_activation(F, - F.elemwise_add(i2h_z, h2h_z, name=prefix+'plus1'), - self._recurrent_activation, - name=prefix+'z_act') - next_h_tmp = self._get_activation(F, - F.elemwise_add(i2h, - F.elemwise_mul(reset_gate, h2h, name=prefix+'mul0'), - name=prefix+'plus2'), - self._activation, - name=prefix+'h_act') - ones = F.ones_like(update_gate, name=prefix+"ones_like0") - next_h = F.elemwise_add(F.elemwise_mul(F.elemwise_sub(ones, update_gate, name=prefix+'minus0'), - next_h_tmp, - name=prefix+'mul1'), - F.elemwise_mul(update_gate, prev_state_h, name=prefix+'mul20'), - name=prefix+'out') + ctx = inputs.ctx + prev_state_h = states[0].as_in_ctx(ctx) + i2h = npx.fully_connected(inputs, + weight=self.i2h_weight.data(ctx), + bias=self.i2h_bias.data(ctx), + num_hidden=self._hidden_size * 3, + no_bias=False) + h2h = npx.fully_connected(prev_state_h, + weight=self.h2h_weight.data(ctx), + bias=self.h2h_bias.data(ctx), + num_hidden=self._hidden_size * 3, + no_bias=False) + + i2h_r, i2h_z, i2h = npx.slice_channel(i2h, num_outputs=3) + h2h_r, h2h_z, h2h = npx.slice_channel(h2h, num_outputs=3) + + reset_gate = self._get_activation(i2h_r + h2h_r, + self._recurrent_activation) + update_gate = self._get_activation(i2h_z + h2h_z, + self._recurrent_activation) + next_h_tmp = self._get_activation(i2h + np.multiply(reset_gate, h2h), + self._activation) + ones = np.ones(update_gate.shape) + next_h = np.multiply((ones - update_gate), next_h_tmp) + np.multiply(update_gate, prev_state_h) return next_h, [next_h] + def infer_shape(self, i, x, is_bidirect): + if i == 0: + self.i2h_weight.shape = (3*self._hidden_size, x.shape[x.ndim-1]) + else: + nh = self._hidden_size + if is_bidirect: + nh *= 2 + self.i2h_weight.shape = (3*self._hidden_size, nh) +@use_np class SequentialRNNCell(RecurrentCell): """Sequentially stacking multiple RNN cells.""" def __init__(self): @@ -723,9 +718,9 @@ def unroll(self, length, inputs, begin_state=None, layout='NTC', merge_outputs=N # pylint: disable=too-many-locals self.reset() - inputs, _, F, batch_size = _format_sequence(length, inputs, layout, None) + inputs, _, batch_size = _format_sequence(length, inputs, layout, None) num_cells = len(self._children) - begin_state = _get_begin_state(self, F, begin_state, inputs, batch_size) + begin_state = _get_begin_state(self, begin_state, inputs, batch_size) p = 0 next_states = [] @@ -747,11 +742,16 @@ def __getitem__(self, i): def __len__(self): return len(self._children) - def hybrid_forward(self, *args, **kwargs): + def forward(self, *args, **kwargs): # pylint: disable=missing-docstring raise 
NotImplementedError + def infer_shape(self, _, x, is_bidirect): + for i, child in enumerate(self._layers): + child.infer_shape(i, x, is_bidirect) + +@use_np class HybridSequentialRNNCell(HybridRecurrentCell): """Sequentially stacking multiple HybridRNN cells.""" def __init__(self): @@ -801,9 +801,9 @@ def unroll(self, length, inputs, begin_state=None, layout='NTC', merge_outputs=N valid_length=None): self.reset() - inputs, _, F, batch_size = _format_sequence(length, inputs, layout, None) + inputs, _, batch_size = _format_sequence(length, inputs, layout, None) num_cells = len(self._children) - begin_state = _get_begin_state(self, F, begin_state, inputs, batch_size) + begin_state = _get_begin_state(self, begin_state, inputs, batch_size) p = 0 next_states = [] @@ -825,10 +825,16 @@ def __getitem__(self, i): def __len__(self): return len(self._children) - def hybrid_forward(self, F, inputs, states): + def forward(self, inputs, states): return self.__call__(inputs, states) + # pylint: disable=unused-argument + def infer_shape(self, _, x, is_bidirect): + for i, child in enumerate(self._layers): + child.infer_shape(i, x, False) + +@use_np class DropoutCell(HybridRecurrentCell): """Applies dropout on input. @@ -866,24 +872,24 @@ def state_info(self, batch_size=0): def _alias(self): return 'dropout' - def hybrid_forward(self, F, inputs, states): + def forward(self, inputs, states): if self._rate > 0: - inputs = F.Dropout(data=inputs, p=self._rate, axes=self._axes, - name='t%d_fwd'%self._counter) + inputs = npx.dropout(data=inputs, p=self._rate, axes=self._axes) return inputs, states def unroll(self, length, inputs, begin_state=None, layout='NTC', merge_outputs=None, valid_length=None): self.reset() - inputs, _, F, _ = _format_sequence(length, inputs, layout, merge_outputs) + inputs, _, _ = _format_sequence(length, inputs, layout, merge_outputs) if isinstance(inputs, tensor_types): - return self.hybrid_forward(F, inputs, begin_state if begin_state else []) + return self.forward(inputs, begin_state if begin_state else []) return super(DropoutCell, self).unroll( length, inputs, begin_state=begin_state, layout=layout, merge_outputs=merge_outputs, valid_length=None) +@use_np class ModifierCell(HybridRecurrentCell): """Base class for modifier cells. A modifier cell takes a base cell, apply modifications @@ -907,7 +913,7 @@ def params(self): def state_info(self, batch_size=0): return self.base_cell.state_info(batch_size) - def begin_state(self, func=symbol.zeros, **kwargs): + def begin_state(self, func=np.zeros, **kwargs): assert not self._modified, \ "After applying modifier cells (e.g. DropoutCell) the base " \ "cell cannot be called directly. Call the modifier cell instead." 
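(Illustrative aside, not part of the patch: a minimal sketch of driving one of the rewritten cells imperatively through the np interface. The cell type, batch size, and hidden size below are assumptions; parameter shapes are resolved lazily through the new infer_shape hook on the first call.)

import mxnet as mx
from mxnet import np, npx
npx.set_np()                                  # enable NumPy semantics

cell = mx.gluon.rnn.LSTMCell(hidden_size=20)  # hidden size chosen arbitrarily
cell.initialize()
x = np.random.uniform(size=(8, 10))           # (batch, input_size)
states = cell.begin_state(batch_size=8)       # [h, c], zeros by default
out, new_states = cell(x, states)             # one step through forward()
print(out.shape)                              # expected: (8, 20)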
@@ -916,7 +922,7 @@ def begin_state(self, func=symbol.zeros, **kwargs): self.base_cell._modified = True return begin - def hybrid_forward(self, F, inputs, states): + def forward(self, inputs, states): raise NotImplementedError def __repr__(self): @@ -925,6 +931,7 @@ def __repr__(self): **self.__dict__) +@use_np class ZoneoutCell(ModifierCell): """Applies Zoneout on base cell.""" def __init__(self, base_cell, zoneout_outputs=0., zoneout_states=0.): @@ -951,25 +958,29 @@ def reset(self): super(ZoneoutCell, self).reset() self._prev_output = None - def hybrid_forward(self, F, inputs, states): + def forward(self, inputs, states): + ctx = inputs.ctx cell, p_outputs, p_states = self.base_cell, self.zoneout_outputs, self.zoneout_states next_output, next_states = cell(inputs, states) - mask = (lambda p, like: F.Dropout(F.ones_like(like), p=p)) + mask = (lambda p, like: npx.dropout(np.ones(like.shape), p=p)) prev_output = self._prev_output if prev_output is None: - prev_output = F.zeros_like(next_output) + prev_output = np.zeros(next_output.shape) - output = (F.where(mask(p_outputs, next_output), next_output, prev_output) + output = (np.where(mask(p_outputs, next_output), next_output, prev_output) if p_outputs != 0. else next_output) - states = ([F.where(mask(p_states, new_s), new_s, old_s) for new_s, old_s in + states = ([np.where(mask(p_states, new_s), new_s, old_s.as_in_ctx(ctx)) for new_s, old_s in zip(next_states, states)] if p_states != 0. else next_states) self._prev_output = output return output, states + def infer_shape(self, i, x, is_bidirect): + self.base_cell.infer_shape(i, x, is_bidirect) +@use_np class ResidualCell(ModifierCell): """ Adds residual connection as described in Wu et al, 2016 @@ -981,9 +992,9 @@ def __init__(self, base_cell): # pylint: disable=useless-super-delegation super(ResidualCell, self).__init__(base_cell) - def hybrid_forward(self, F, inputs, states): + def forward(self, inputs, states): output, states = self.base_cell(inputs, states) - output = F.elemwise_add(output, inputs, name='t%d_fwd'%self._counter) + output = output + inputs return output, states def unroll(self, length, inputs, begin_state=None, layout='NTC', merge_outputs=None, @@ -998,19 +1009,23 @@ def unroll(self, length, inputs, begin_state=None, layout='NTC', merge_outputs=N merge_outputs = isinstance(outputs, tensor_types) if merge_outputs is None else \ merge_outputs - inputs, axis, F, _ = _format_sequence(length, inputs, layout, merge_outputs) + inputs, axis, _ = _format_sequence(length, inputs, layout, merge_outputs) if valid_length is not None: # mask the padded inputs to zero - inputs = _mask_sequence_variable_length(F, inputs, length, valid_length, axis, + inputs = _mask_sequence_variable_length(inputs, length, valid_length, axis, merge_outputs) if merge_outputs: - outputs = F.elemwise_add(outputs, inputs) + outputs = outputs + inputs else: - outputs = [F.elemwise_add(i, j) for i, j in zip(outputs, inputs)] + outputs = [i + j for i, j in zip(outputs, inputs)] return outputs, states + def infer_shape(self, i, x, is_bidirect): + self.base_cell.infer_shape(i, x, is_bidirect) + +@use_np class BidirectionalCell(HybridRecurrentCell): """Bidirectional RNN cell. 
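(Illustrative aside on the ZoneoutCell change above, not part of the patch: the zoneout masking idiom written out standalone with the np API; names and shapes are assumptions. Dropout is only stochastic in training mode, e.g. under autograd.record().)

from mxnet import np, npx

def zoneout(prev, new, p):
    # where the mask survives dropout, take the new value; where it was zeroed, keep the old one
    mask = npx.dropout(np.ones(new.shape), p=p)
    return np.where(mask, new, prev)

h_prev = np.zeros((4, 8))
h_new = np.random.uniform(size=(4, 8))
h_next = zoneout(h_prev, h_new, p=0.3)   # element-wise mix of old and new hidden states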
@@ -1049,9 +1064,9 @@ def unroll(self, length, inputs, begin_state=None, layout='NTC', merge_outputs=N # pylint: disable=too-many-locals self.reset() - inputs, axis, F, batch_size = _format_sequence(length, inputs, layout, False) + inputs, axis, batch_size = _format_sequence(length, inputs, layout, False) reversed_inputs = list(_reverse_sequences(inputs, length, valid_length)) - begin_state = _get_begin_state(self, F, begin_state, inputs, batch_size) + begin_state = _get_begin_state(self, begin_state, inputs, batch_size) states = begin_state l_cell, r_cell = [c() for c in self._children.values()] @@ -1068,25 +1083,30 @@ def unroll(self, length, inputs, begin_state=None, layout='NTC', merge_outputs=N if merge_outputs is None: merge_outputs = isinstance(l_outputs, tensor_types) - l_outputs, _, _, _ = _format_sequence(None, l_outputs, layout, merge_outputs) - reversed_r_outputs, _, _, _ = _format_sequence(None, reversed_r_outputs, layout, - merge_outputs) + l_outputs, _, _ = _format_sequence(None, l_outputs, layout, merge_outputs) + reversed_r_outputs, _, _ = _format_sequence(None, reversed_r_outputs, layout, + merge_outputs) if merge_outputs: - reversed_r_outputs = F.stack(*reversed_r_outputs, axis=axis) - outputs = F.concat(l_outputs, reversed_r_outputs, dim=2, - name='out') + reversed_r_outputs = np.stack(reversed_r_outputs, axis=axis) + outputs = np.concatenate([l_outputs, reversed_r_outputs], axis=2) else: - outputs = [F.concat(l_o, r_o, dim=1, name='t%d'%(i)) + outputs = [np.concatenate([l_o, r_o], axis=1) for i, (l_o, r_o) in enumerate(zip(l_outputs, reversed_r_outputs))] if valid_length is not None: - outputs = _mask_sequence_variable_length(F, outputs, length, valid_length, axis, + outputs = _mask_sequence_variable_length(outputs, length, valid_length, axis, merge_outputs) states = l_states + r_states return outputs, states + #pylint: disable=W0613 + def infer_shape(self, i, x, is_bidirect): + l_cell, r_cell = [c() for c in self._children.values()] + l_cell.infer_shape(i, x, True) + r_cell.infer_shape(i, x, True) +@use_np class VariationalDropoutCell(ModifierCell): """ Applies Variational Dropout on base cell. @@ -1136,36 +1156,37 @@ def reset(self): self.drop_states_mask = None self.drop_outputs_mask = None - def _initialize_input_masks(self, F, inputs, states): + def _initialize_input_masks(self, inputs, states): if self.drop_states and self.drop_states_mask is None: - self.drop_states_mask = F.Dropout(F.ones_like(states[0]), - p=self.drop_states) + self.drop_states_mask = npx.dropout(np.ones(states[0].shape), + p=self.drop_states) if self.drop_inputs and self.drop_inputs_mask is None: - self.drop_inputs_mask = F.Dropout(F.ones_like(inputs), - p=self.drop_inputs) + self.drop_inputs_mask = npx.dropout(np.ones(inputs.shape), + p=self.drop_inputs) - def _initialize_output_mask(self, F, output): + def _initialize_output_mask(self, output): if self.drop_outputs and self.drop_outputs_mask is None: - self.drop_outputs_mask = F.Dropout(F.ones_like(output), - p=self.drop_outputs) + self.drop_outputs_mask = npx.dropout(np.ones(output.shape), + p=self.drop_outputs) - def hybrid_forward(self, F, inputs, states): + def forward(self, inputs, states): + ctx = inputs.ctx cell = self.base_cell - self._initialize_input_masks(F, inputs, states) + self._initialize_input_masks(inputs, states) if self.drop_states: states = list(states) # state dropout only needs to be applied on h, which is always the first state. 
- states[0] = states[0] * self.drop_states_mask + states[0] = states[0].as_in_ctx(ctx) * self.drop_states_mask if self.drop_inputs: inputs = inputs * self.drop_inputs_mask next_output, next_states = cell(inputs, states) - self._initialize_output_mask(F, next_output) + self._initialize_output_mask(next_output) if self.drop_outputs: next_output = next_output * self.drop_outputs_mask @@ -1238,25 +1259,28 @@ def unroll(self, length, inputs, begin_state=None, layout='NTC', merge_outputs=N self.reset() - inputs, axis, F, batch_size = _format_sequence(length, inputs, layout, True) - states = _get_begin_state(self, F, begin_state, inputs, batch_size) + inputs, axis, batch_size = _format_sequence(length, inputs, layout, True) + states = _get_begin_state(self, begin_state, inputs, batch_size) if self.drop_inputs: - inputs = F.Dropout(inputs, p=self.drop_inputs, axes=(axis,)) + inputs = npx.dropout(inputs, p=self.drop_inputs, axes=(axis,)) outputs, states = self.base_cell.unroll(length, inputs, states, layout, merge_outputs=True, valid_length=valid_length) if self.drop_outputs: - outputs = F.Dropout(outputs, p=self.drop_outputs, axes=(axis,)) + outputs = npx.dropout(outputs, p=self.drop_outputs, axes=(axis,)) merge_outputs = isinstance(outputs, tensor_types) if merge_outputs is None else \ merge_outputs - outputs, _, _, _ = _format_sequence(length, outputs, layout, merge_outputs) + outputs, _, _ = _format_sequence(length, outputs, layout, merge_outputs) if valid_length is not None: - outputs = _mask_sequence_variable_length(F, outputs, length, valid_length, axis, + outputs = _mask_sequence_variable_length(outputs, length, valid_length, axis, merge_outputs) return outputs, states + def infer_shape(self, i, x, is_bidirect): + self.base_cell.infer_shape(i, x, is_bidirect) +@use_np class LSTMPCell(HybridRecurrentCell): r"""Long-Short Term Memory Projected (LSTMP) network cell. 
(https://arxiv.org/abs/1402.1128) @@ -1355,28 +1379,36 @@ def __repr__(self): **self.__dict__) # pylint: disable= arguments-differ - def hybrid_forward(self, F, inputs, states, i2h_weight, - h2h_weight, h2r_weight, i2h_bias, h2h_bias): - prefix = 't%d_'%self._counter - i2h = F.FullyConnected(data=inputs, weight=i2h_weight, bias=i2h_bias, - num_hidden=self._hidden_size*4, name=prefix+'i2h') - h2h = F.FullyConnected(data=states[0], weight=h2h_weight, bias=h2h_bias, - num_hidden=self._hidden_size*4, name=prefix+'h2h') + def forward(self, inputs, states): + ctx = inputs.ctx + i2h = npx.fully_connected(inputs, weight=self.i2h_weight.data(ctx), + bias=self.i2h_bias.data(ctx), + num_hidden=self._hidden_size*4, no_bias=False) + h2h = npx.fully_connected(states[0].as_in_ctx(ctx), + weight=self.h2h_weight.data(ctx), + bias=self.h2h_bias.data(ctx), + num_hidden=self._hidden_size*4, no_bias=False) gates = i2h + h2h - slice_gates = F.SliceChannel(gates, num_outputs=4, name=prefix+'slice') - in_gate = F.Activation(slice_gates[0], act_type="sigmoid", name=prefix+'i') - forget_gate = F.Activation(slice_gates[1], act_type="sigmoid", name=prefix+'f') - in_transform = F.Activation(slice_gates[2], act_type="tanh", name=prefix+'c') - out_gate = F.Activation(slice_gates[3], act_type="sigmoid", name=prefix+'o') - next_c = F.elemwise_add(forget_gate * states[1], in_gate * in_transform, - name=prefix+'state') - hidden = F.elemwise_mul(out_gate, F.Activation(next_c, act_type="tanh"), - name=prefix+'hidden') - next_r = F.FullyConnected(data=hidden, num_hidden=self._projection_size, - weight=h2r_weight, no_bias=True, name=prefix+'out') + slice_gates = npx.slice_channel(gates, num_outputs=4) + in_gate = npx.activation(slice_gates[0], act_type="sigmoid") + forget_gate = npx.activation(slice_gates[1], act_type="sigmoid") + in_transform = npx.activation(slice_gates[2], act_type="tanh") + out_gate = npx.activation(slice_gates[3], act_type="sigmoid") + next_c = forget_gate * states[1].as_in_ctx(ctx) + in_gate * in_transform + hidden = np.multiply(out_gate, npx.activation(next_c, act_type="tanh")) + next_r = npx.fully_connected(hidden, num_hidden=self._projection_size, + weight=self.h2r_weight.data(ctx), no_bias=True) return next_r, [next_r, next_c] - # pylint: enable= arguments-differ + + def infer_shape(self, i, x, is_bidirect): + if i == 0: + self.i2h_weight.shape = (4*self._hidden_size, x.shape[x.ndim-1]) + else: + nh = self._projection_size + if is_bidirect: + nh *= 2 + self.i2h_weight.shape = (4*self._hidden_size, nh) def dynamic_unroll(cell, inputs, begin_state, drop_inputs=0, drop_outputs=0, @@ -1428,10 +1460,10 @@ def dynamic_unroll(cell, inputs, begin_state, drop_inputs=0, drop_outputs=0, >>> input_size = 5 >>> cell = mx.gluon.rnn.LSTMCell(input_size) >>> cell.initialize(ctx=mx.cpu()) - >>> rnn_data = mx.nd.normal(loc=0, scale=1, shape=(seq_len, batch_size, input_size)) + >>> rnn_data = mx.np.normal(loc=0, scale=1, shape=(seq_len, batch_size, input_size)) >>> state_shape = (batch_size, input_size) - >>> states = [mx.nd.normal(loc=0, scale=1, shape=state_shape) for i in range(2)] - >>> valid_length = mx.nd.array([2, 3]) + >>> states = [mx.np.normal(loc=0, scale=1, shape=state_shape) for i in range(2)] + >>> valid_length = mx.np.array([2, 3]) >>> output, states = mx.gluon.rnn.rnn_cell.dynamic_unroll(cell, rnn_data, states, ... valid_length=valid_length, ... 
layout='TNC') @@ -1446,48 +1478,57 @@ def dynamic_unroll(cell, inputs, begin_state, drop_inputs=0, drop_outputs=0, """ # Merge is always True, so we don't need length. - inputs, axis, F, _ = _format_sequence(0, inputs, layout, True) + inputs, axis, _ = _format_sequence(0, inputs, layout, True) if axis != 0: axes = list(range(len(layout))) tmp = axes[0] axes[0] = axes[axis] axes[axis] = tmp - inputs = F.transpose(inputs, axes=axes) + inputs = np.transpose(inputs, axes=axes) states = begin_state if drop_inputs: - inputs = F.Dropout(inputs, p=drop_inputs, axes=(axis,)) + inputs = npx.dropout(inputs, p=drop_inputs, axes=(axis,)) if valid_length is None: - def loop_body(inputs, states): - return cell(inputs, states) + outputs, states = npx.foreach(cell, inputs, states + [valid_length]) else: zeros = [] for s in states: - zeros.append(F.zeros_like(s)) + zeros.append(np.zeros(s.shape)) states = list(_as_list(states)) - states.append(F.zeros((1))) - def loop_body(inputs, states): - cell_states = states[:-1] - iter_no = states[-1] - out, new_states = cell(inputs, cell_states) - for i, state in enumerate(cell_states): - new_states[i] = F.where(F.broadcast_greater(valid_length, iter_no), - new_states[i], state) - new_states.append(iter_no + 1) - return out, new_states - - outputs, states = F.contrib.foreach(loop_body, inputs, states) + states.append(np.zeros((1))) + class loop_body(HybridBlock): + """Loop body for foreach operator""" + def __init__(self, cell): + super(loop_body, self).__init__() + self.cell = cell + + def forward(self, inputs, states): + valid_len = states.pop() + cell_states = states[:-1] + iter_no = states[-1] + out, new_states = self.cell(inputs, cell_states) + for i, state in enumerate(cell_states): + cond = npx.broadcast_greater(valid_len, iter_no) + cond_broad = np.broadcast_to(cond, new_states[i].T.shape).T + new_states[i] = np.where(cond_broad, new_states[i], state) + new_states.append(iter_no + 1) + new_states.append(valid_len) + return out, new_states + body = loop_body(cell) + outputs, states = npx.foreach(body, inputs, states + [valid_length]) + states.pop() if drop_outputs: - outputs = F.Dropout(outputs, p=drop_outputs, axes=(axis,)) + outputs = npx.dropout(outputs, p=drop_outputs, axes=(axis,)) if valid_length is not None: if axis != 0: - outputs = F.transpose(outputs, axes) - outputs = F.SequenceMask(outputs, sequence_length=valid_length, - use_sequence_length=True, axis=axis) + outputs = np.transpose(outputs, axes) + outputs = npx.sequence_mask(outputs, sequence_length=valid_length, + use_sequence_length=True, axis=axis) # the last state is the iteration number. We don't need it. return outputs, states[:-1] else: if axis != 0: - outputs = F.transpose(outputs, axes) + outputs = np.transpose(outputs, axes) return outputs, states diff --git a/python/mxnet/gluon/rnn/rnn_layer.py b/python/mxnet/gluon/rnn/rnn_layer.py index c6d23e5edabf..59056de6ce7b 100644 --- a/python/mxnet/gluon/rnn/rnn_layer.py +++ b/python/mxnet/gluon/rnn/rnn_layer.py @@ -23,12 +23,13 @@ __all__ = ['RNN', 'LSTM', 'GRU'] -from ... import ndarray, symbol +from ... import np, npx, context from .. 
import HybridBlock, tensor_types from ..parameter import Parameter -from ...util import is_np_array +from ...util import use_np +@use_np class _RNNLayer(HybridBlock): """Implementation of recurrent layers.""" def __init__(self, hidden_size, num_layers, layout, @@ -81,14 +82,14 @@ def __init__(self, hidden_size, num_layers, layout, init=h2h_bias_initializer, dtype=dtype) ni = nh * self._dir else: - np = self._projection_size + ps = self._projection_size for i in range(num_layers): for j in ['l', 'r'][:self._dir]: self._register_param('{}{}_i2h_weight'.format(j, i), shape=(ng*nh, ni), init=i2h_weight_initializer, dtype=dtype) self._register_param('{}{}_h2h_weight'.format(j, i), - shape=(ng*nh, np), + shape=(ng*nh, ps), init=h2h_weight_initializer, dtype=dtype) self._register_param('{}{}_i2h_bias'.format(j, i), shape=(ng*nh,), @@ -97,9 +98,9 @@ def __init__(self, hidden_size, num_layers, layout, shape=(ng*nh,), init=h2h_bias_initializer, dtype=dtype) self._register_param('{}{}_h2r_weight'.format(j, i), - shape=(np, nh), + shape=(ps, nh), init=h2r_weight_initializer, dtype=dtype) - ni = np * self._dir + ni = ps * self._dir def _register_param(self, name, shape, init, dtype): p = Parameter(name, shape=shape, init=init, allow_deferred_init=True, dtype=dtype) @@ -128,7 +129,7 @@ def cast(self, dtype): super(_RNNLayer, self).cast(dtype) self._dtype = dtype - def begin_state(self, batch_size=0, func=ndarray.zeros, **kwargs): + def begin_state(self, batch_size=0, func=np.zeros, **kwargs): """Initial state for this cell. Parameters @@ -155,25 +156,22 @@ def begin_state(self, batch_size=0, func=ndarray.zeros, **kwargs): Starting states for the first RNN step. """ states = [] - for i, info in enumerate(self.state_info(batch_size)): + for info in self.state_info(batch_size): if info is not None: info.update(kwargs) else: info = kwargs - state = func(name='h0_%d' % (i), **info) - if is_np_array(): - state = state.as_np_ndarray() + state = func(shape=info.pop("shape", ()), + ctx=info.pop("ctx", context.cpu()), + dtype=info.pop("dtype", "float32")) states.append(state) return states def __call__(self, inputs, states=None, sequence_length=None, **kwargs): self.skip_states = states is None if states is None: - if isinstance(inputs, ndarray.NDArray): - batch_size = inputs.shape[self._layout.find('N')] - states = self.begin_state(batch_size, ctx=inputs.context, dtype=inputs.dtype) - else: - states = self.begin_state(0, func=symbol.zeros) + batch_size = inputs.shape[self._layout.find('N')] + states = self.begin_state(batch_size, ctx=inputs.context, dtype=inputs.dtype) if isinstance(states, tensor_types): states = [states] @@ -182,57 +180,71 @@ def __call__(self, inputs, states=None, sequence_length=None, **kwargs): else: return super(_RNNLayer, self).__call__(inputs, states, **kwargs) - def hybrid_forward(self, F, inputs, states, sequence_length=None, **kwargs): - if F is ndarray: - batch_size = inputs.shape[self._layout.find('N')] + def forward(self, inputs, states, sequence_length=None): + batch_size = inputs.shape[self._layout.find('N')] - if F is ndarray: - for state, info in zip(states, self.state_info(batch_size)): - if state.shape != info['shape']: - raise ValueError( - "Invalid recurrent state shape. Expecting %s, got %s."%( - str(info['shape']), str(state.shape))) - out = self._forward_kernel(F, inputs, states, sequence_length, **kwargs) + for state, info in zip(states, self.state_info(batch_size)): + if state.shape != info['shape']: + raise ValueError( + "Invalid recurrent state shape. 
Expecting %s, got %s."%( + str(info['shape']), str(state.shape))) + out = self._forward_kernel(inputs, states, sequence_length) # out is (output, state) return out[0] if self.skip_states else out - def _forward_kernel(self, F, inputs, states, sequence_length, **kwargs): + def infer_shape(self, inputs, *args): + assert inputs.ndim == 3, \ + "Input data should be rank-3 tensor of dim [sequence length, batch size, input size]" + if not self._projection_size: + step = self._hidden_size + else: + step = self._projection_size + ni = inputs.shape[2] + for i in range(self._num_layers): + for j in ['l', 'r'][:self._dir]: + name = '{}{}_i2h_weight'.format(j, i) + getattr(self, name).shape = (self._gates*self._hidden_size, ni) + ni = step * self._dir + + def _forward_kernel(self, inputs, states, sequence_length): """ forward using CUDNN or CPU kenrel""" - swapaxes = F.np.swapaxes if is_np_array() else F.swapaxes + ctx = inputs.ctx if self._layout == 'NTC': - inputs = swapaxes(inputs, 0, 1) + inputs = np.swapaxes(inputs, 0, 1) if self._projection_size is None: - params = (kwargs['{}{}_{}_{}'.format(d, l, g, t)].reshape(-1) + params = (getattr(self, '{}{}_{}_{}'.format(d, l, g, t)).data(ctx).reshape(-1) for t in ['weight', 'bias'] for l in range(self._num_layers) for d in ['l', 'r'][:self._dir] for g in ['i2h', 'h2h']) else: - params = (kwargs['{}{}_{}_{}'.format(d, l, g, t)].reshape(-1) + params = (getattr(self, '{}{}_{}_{}'.format(d, l, g, t)).data(ctx).reshape(-1) for t in ['weight', 'bias'] for l in range(self._num_layers) for d in ['l', 'r'][:self._dir] for g in ['i2h', 'h2h', 'h2r'] if g != 'h2r' or t != 'bias') - rnn_param_concat = F.np._internal.rnn_param_concat if is_np_array()\ - else F._internal._rnn_param_concat - params = rnn_param_concat(*params, dim=0) + params = np.concatenate(params, axis=0) if self._use_sequence_length: rnn_args = states + [sequence_length] else: rnn_args = states - rnn_fn = F.npx.rnn if is_np_array() else F.RNN - rnn = rnn_fn(inputs, params, *rnn_args, use_sequence_length=self._use_sequence_length, - state_size=self._hidden_size, projection_size=self._projection_size, - num_layers=self._num_layers, bidirectional=self._dir == 2, - p=self._dropout, state_outputs=True, mode=self._mode, - lstm_state_clip_min=self._lstm_state_clip_min, - lstm_state_clip_max=self._lstm_state_clip_max, - lstm_state_clip_nan=self._lstm_state_clip_nan) + rnn_args_ctx = [] + for args in rnn_args: + new_args = args.as_in_ctx(ctx) + rnn_args_ctx.append(new_args) + + rnn = npx.rnn(inputs, params, *rnn_args_ctx, use_sequence_length=self._use_sequence_length, + state_size=self._hidden_size, projection_size=self._projection_size, + num_layers=self._num_layers, bidirectional=self._dir == 2, + p=self._dropout, state_outputs=True, mode=self._mode, + lstm_state_clip_min=self._lstm_state_clip_min, + lstm_state_clip_max=self._lstm_state_clip_max, + lstm_state_clip_nan=self._lstm_state_clip_nan) if self._mode == 'lstm': outputs, states = rnn[0], [rnn[1], rnn[2]] @@ -240,7 +252,7 @@ def _forward_kernel(self, F, inputs, states, sequence_length, **kwargs): outputs, states = rnn[0], [rnn[1]] if self._layout == 'NTC': - outputs = swapaxes(outputs, 0, 1) + outputs = np.swapaxes(outputs, 0, 1) return outputs, states @@ -314,11 +326,11 @@ class RNN(_RNNLayer): -------- >>> layer = mx.gluon.rnn.RNN(100, 3) >>> layer.initialize() - >>> input = mx.nd.random.uniform(shape=(5, 3, 10)) + >>> input = mx.np.random.uniform(size=(5, 3, 10)) >>> # by default zeros are used as begin state >>> output = layer(input) >>> # 
manually specify begin state. - >>> h0 = mx.nd.random.uniform(shape=(3, 3, 100)) + >>> h0 = mx.np.random.uniform(size=(3, 3, 100)) >>> output, hn = layer(input, h0) """ def __init__(self, hidden_size, num_layers=1, activation='relu', @@ -429,12 +441,12 @@ class LSTM(_RNNLayer): -------- >>> layer = mx.gluon.rnn.LSTM(100, 3) >>> layer.initialize() - >>> input = mx.nd.random.uniform(shape=(5, 3, 10)) + >>> input = mx.np.random.uniform(size=(5, 3, 10)) >>> # by default zeros are used as begin state >>> output = layer(input) >>> # manually specify begin state. - >>> h0 = mx.nd.random.uniform(shape=(3, 3, 100)) - >>> c0 = mx.nd.random.uniform(shape=(3, 3, 100)) + >>> h0 = mx.np.random.uniform(size=(3, 3, 100)) + >>> c0 = mx.np.random.uniform(size=(3, 3, 100)) >>> output, hn = layer(input, [h0, c0]) """ def __init__(self, hidden_size, num_layers=1, layout='TNC', @@ -539,11 +551,11 @@ class GRU(_RNNLayer): -------- >>> layer = mx.gluon.rnn.GRU(100, 3) >>> layer.initialize() - >>> input = mx.nd.random.uniform(shape=(5, 3, 10)) + >>> input = mx.np.random.uniform(size=(5, 3, 10)) >>> # by default zeros are used as begin state >>> output = layer(input) >>> # manually specify begin state. - >>> h0 = mx.nd.random.uniform(shape=(3, 3, 100)) + >>> h0 = mx.np.random.uniform(size=(3, 3, 100)) >>> output, hn = layer(input, h0) """ def __init__(self, hidden_size, num_layers=1, layout='TNC', diff --git a/python/mxnet/gluon/utils.py b/python/mxnet/gluon/utils.py index 8e7a5207c522..df641cf1ace5 100644 --- a/python/mxnet/gluon/utils.py +++ b/python/mxnet/gluon/utils.py @@ -136,30 +136,33 @@ def clip_global_norm(arrays, max_norm, check_isfinite=True): def group_by_ctx(arr_list): groups = collections.defaultdict(list) for arr in arr_list: - ctx = arr.context + ctx = arr.ctx groups[ctx].append(arr) return groups + def multi_sum_sq(*args, ctx=None): + sum = _mx_np.array([0], ctx=ctx) + for arg in args: + sum += _mx_np.square(arg).sum().item() + return sum arrays_groups = group_by_ctx(arrays) - all_ctx_sum = [] - ctx = arrays[0].context + all_ctx_sum = _mx_np.array([0]) + ctx = arrays[0].ctx for group in arrays_groups: - sum_sq = ndarray.multi_sum_sq(*arrays_groups[group], - num_arrays=len(arrays_groups[group])) - sum_sq = ndarray.add_n(*sum_sq) - all_ctx_sum.append(sum_sq.as_in_context(ctx)) + sum_sq = multi_sum_sq(*arrays_groups[group], ctx=ctx) + all_ctx_sum += sum_sq # global reduce - total_norm = ndarray.add_n(*all_ctx_sum).sqrt() + total_norm = _mx_np.sqrt(all_ctx_sum) if check_isfinite: - if not np.isfinite(total_norm.asscalar()): + if not np.isfinite(total_norm.item()): warnings.warn( UserWarning('nan or inf is detected. 
' 'Clipping results will be undefined.'), stacklevel=2) scale = max_norm / (total_norm + 1e-8) - scale = ndarray.min(ndarray.concat(scale, ndarray.ones(1, ctx=ctx), dim=0)) + scale = _mx_np.min(_mx_np.concatenate([scale, _mx_np.ones(1, ctx=ctx)], axis=0)) for arr in arrays: - arr *= scale.as_in_context(arr.context) + arr *= scale.item() if check_isfinite: - return total_norm.asscalar() + return total_norm.item() else: return total_norm @@ -484,3 +487,20 @@ def _check_all_np_ndarrays(out): for i in out: _check_all_np_ndarrays(i) # pylint: enable=no-else-raise + + +def _check_block_input_np_ndarrays(inputs): + """Check if block's inputs are numpy ndarrays.""" + from ..numpy import ndarray as np_ndarray + from ..symbol import Symbol as nd_symbol + from ..ndarray import NDArray as nd_ndarray + + # pylint: disable=no-else-raise + if isinstance(inputs, (nd_ndarray, nd_symbol)) and not isinstance(inputs, (np_ndarray)): + raise TypeError("Block's inputs must be of type `mxnet.numpy.ndarray`, " + "while got output type {}" + .format(str(type(inputs)))) + elif isinstance(inputs, (list, tuple)): + for i in inputs: + _check_block_input_np_ndarrays(i) + # pylint: enable=no-else-raise diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index ff0dbdeb3703..fa255f0ccac4 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -5561,7 +5561,11 @@ def average(a, axis=None, weights=None, returned=False, out=None): >>> np.average(data, axis=1, weights=weights) array([0.75, 2.75, 4.75]) """ - return _api_internal.average(a, weights, axis, returned, weights is not None, out) + out = _api_internal.average(a, weights, axis, returned, weights is not None, out) + if isinstance(out, NDArray): + return out + else: + return list(out) @set_module('mxnet.ndarray.numpy') diff --git a/python/mxnet/ndarray/numpy_extension/__init__.py b/python/mxnet/ndarray/numpy_extension/__init__.py index afa81edc3820..f50998334ba4 100644 --- a/python/mxnet/ndarray/numpy_extension/__init__.py +++ b/python/mxnet/ndarray/numpy_extension/__init__.py @@ -18,9 +18,11 @@ """Module for the ops not belonging to the official numpy package.""" from . import _op +from . import control_flow from . import image from . import random from . import _register from ._op import * # pylint: disable=wildcard-import +from .control_flow import * # pylint: disable=wildcard-import -__all__ = _op.__all__ +__all__ = _op.__all__ + control_flow.__all__ diff --git a/python/mxnet/ndarray/numpy_extension/_op.py b/python/mxnet/ndarray/numpy_extension/_op.py index f3d2db254bda..20b6c91de70d 100644 --- a/python/mxnet/ndarray/numpy_extension/_op.py +++ b/python/mxnet/ndarray/numpy_extension/_op.py @@ -28,7 +28,7 @@ 'activation', 'batch_norm', 'fully_connected', 'pick', 'convolution', 'deconvolution', 'pooling', 'dropout', 'one_hot', 'rnn', 'embedding', 'topk', 'layer_norm', 'leaky_relu', 'batch_dot', 'broadcast_like', - 'arange_like'] + 'arange_like', 'group_norm'] # pylint: disable=too-many-arguments @@ -1126,8 +1126,8 @@ def embedding(data, weight, input_dim=None, output_dim=None, dtype="float32", sp [[ 0., 1., 2., 3., 4.], [10., 11., 12., 13., 14.]]]) """ - assert input_dim > 1, "Vocabulary size of the input indices should be greater than 1." - assert output_dim > 1, "Dimension of the embedding vectors should greater than 1." + assert input_dim > 0, "Vocabulary size of the input indices should be greater than 0." + assert output_dim > 0, "Dimension of the embedding vectors should greater than 0." 
return _api_internal.embedding(data, weight, input_dim, output_dim, dtype, sparse_grad) @@ -1443,3 +1443,49 @@ def arange_like(data, start=0.0, step=1.0, repeat=1, ctx=None, axis=None): array([0., 1., 2., 3.]) """ return _api_internal.arange_like(data, start, step, repeat, ctx, axis) + + +# pylint: disable=too-many-arguments +@set_module('mxnet.ndarray.numpy_extension') +def group_norm(data, gamma, beta, num_groups=1, eps=1e-3, output_mean_var=False): + r"""Group normalization. + + The input channels are separated into ``num_groups`` groups, + each containing ``num_channels / num_groups`` channels. + The mean and standard-deviation are calculated separately over the each group. + + .. math:: + + data = data.reshape((N, num_groups, C // num_groups, ...)) + out = \frac{data - mean(data, axis)}{\sqrt{var(data, axis) + \epsilon}} * gamma + beta + + Both ``gamma`` and ``beta`` are learnable parameters. + + + + Defined in ../src/operator/nn/group_norm.cc:L78 + + Parameters + ---------- + data : NDArray + Input data + gamma : NDArray + gamma array + beta : NDArray + beta array + num_groups : int, optional, default='1' + Total number of groups. + eps : float, optional, default=9.99999975e-06 + An `epsilon` parameter to prevent division by 0. + output_mean_var : boolean, optional, default=0 + Output the mean and std calculated along the given axis. + + Returns + ------- + out : NDArray or list of NDArrays + The output of this function. + """ + out = _api_internal.group_norm(data, gamma, beta, num_groups, eps, output_mean_var) + if isinstance(out, NDArrayBase): + return out + return list(out) diff --git a/python/mxnet/ndarray/numpy_extension/control_flow.py b/python/mxnet/ndarray/numpy_extension/control_flow.py new file mode 100644 index 000000000000..e3b66e03d801 --- /dev/null +++ b/python/mxnet/ndarray/numpy_extension/control_flow.py @@ -0,0 +1,714 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for registering control flow ops for imperative programming.""" + +from . import _api_internal +from .._internal import NDArrayBase +from ...util import set_module +from ...numpy import ndarray as np_ndarray +from ...symbol import Symbol +from ...base import _as_list +from ... import symbol, _deferred_compute as dc, autograd as ag +from ...attribute import AttrScope, current as current_attribute + + +__all__ = ["foreach", "while_loop", "cond"] + + +def _flatten(args, inout_str): + """Parse the arguments into a flattened list + an additional format array. + The format array stores the structure of the original arguments to help reconstruct the inputs. + + Parameters + ---------- + args : NDArray, Symbol, or (nested) list of Symbol or NDArray + We allow None inside the args. 
+ inout_str : str + The name of the HybridBlock + + Returns + ------- + flat : list of Symbol or NDArray + The flatten version of the input args. + fmts : (nested) list of ints + Stores the format information of the original structured args. + """ + if isinstance(args, np_ndarray): + return [args], int(0) + if isinstance(args, Symbol): + length = len(args.list_outputs()) + length = length if length > 1 else 0 + return [args], int(length) + if args is None: + return [None], int(-1) + + if not isinstance(args, (list, tuple)): + raise ValueError("When hybridized, the input of HybridBlock {}" + " must be (nested) list of Symbol" + " or NDArray, " + "but got {} of type {}".format(inout_str, str(args), str(type(args)))) + flat = [] + fmts = [] + for i in args: + arg, fmt = _flatten(i, inout_str) + flat.extend(arg) + fmts.append(fmt) + return flat, fmts + + +def _regroup(args, fmt): + """Reconstruct the structured arguments based on the flattened version. + + Parameters + ---------- + args : NDArray, Symbol, or (nested) list of Symbol or NDArray + We allow None inside the args. + fmt : (nested) list of ints + Stores the format information of the original structured args. + + Returns + ------- + ret : NDArray, Symbol, or (nested) list of Symbol or NDArray + + """ + def _merger(args, fmt): + """Recursive call to merge the arguments""" + if isinstance(fmt, int): + if fmt < -1: + raise ValueError("Unsupported encoded format {}.".format(fmt)) + if fmt == 0: + return args[0], args[1:] + if fmt == -1: + if args[0] is not None: + raise ValueError('We do not support passing types that are not None' + ' when the initial HybridBlock has received NoneType and' + ' has been hybridized.' + ' Received arg = {}, fmt = {}.'.format(args[0], fmt)) + return None, args[1:] + else: + return args[:fmt], args[fmt:] + + if not isinstance(args, (list, tuple)): + raise ValueError("When hybridized, the output of HybridBlock must be (nested)" + " list of Symbol or NDArray, " + "but got {} of type {}".format(args, type(args))) + ret = [] + for i in fmt: + res, args = _merger(args, i) + ret.append(res) + return ret, args + return _merger(args, fmt)[0] + +def _get_unique_subgraph_name(subgraph_name): + attrs = current_attribute()._attr + if attrs.get("__subgraph_name__", "") != "": + subgraph_name = "".join([attrs["__subgraph_name__"], "$", subgraph_name]) + AttrScope._subgraph_names[subgraph_name] += 1 + subgraph_name = subgraph_name + str(AttrScope._subgraph_names[subgraph_name] - 1) + return subgraph_name + +# This construct a subgraph for given output nodes. +# If an output node is one of the input nodes, we call identity to make sure +# that outputs nodes are different from input nodes. +def _construct_subgraph(sym_out, sym_states): + sym_out = _as_list(sym_out) + sym_states = _as_list(sym_states) + all_outputs = [] + all_outputs.extend(sym_out) + all_outputs.extend(sym_states) + g = symbol.Group(all_outputs) + + flat_out = [] + all_input_names = g.list_inputs() + output_names = {o.name for o in sym_out} + for o in sym_out: + if o.name in all_input_names: + flat_out.append(symbol.op.identity(o)) + else: + flat_out.append(o) + + for s in sym_states: + if s.name in all_input_names or s.name in output_names: + flat_out.append(symbol.op.identity(s)) + else: + flat_out.append(s) + return symbol.Group(flat_out) + +@set_module('mxnet.ndarray.numpy_extension') +def foreach(body, data, init_states, name="foreach"): + """Run a for loop with user-defined computation over NDArrays on dimension 0. 
+ + This operator simulates a for loop and body has the computation for an iteration + of the for loop. It runs the computation in body on each slice from the input + NDArrays. + + body takes two arguments as input and outputs a tuple of two elements, + as illustrated below:: + + out, states = body(data1, states) + + data1 can be either an NDArray or a list of NDArrays. If data is an NDArray, + data1 is an NDArray. Otherwise, data1 is a list of NDArrays and has the same + size as data. states is a list of NDArrays and have the same size as init_states. + Similarly, out can be either an NDArray or a list of NDArrays, which are concatenated + as the first output of foreach; states from the last execution of body + are the second output of foreach. + + The computation done by this operator is equivalent to the pseudo code below + when the input data is NDArray:: + + states = init_states + outs = [] + for i in data.shape[0]: + s = data[i] + out, states = body(s, states) + outs.append(out) + outs = stack(*outs) + + + Parameters + ---------- + body : HybridBlock. + Define computation in an iteration. + data: an NDArray or a list of NDArrays. + The input data. + init_states: an NDArray or nested lists of NDArrays. + The initial values of the loop states. + + Returns + ------- + outputs: an NDArray or nested lists of NDArrays. + The output data concatenated from the output of all iterations. + states: an NDArray or nested lists of NDArrays. + The loop states in the last iteration. + + Examples + -------- + >>> step = lambda data, states: (data + states[0], [states[0] * 2]) + >>> data = mx.np.random.uniform(size=(2, 10)) + >>> states = [mx.np.random.uniform(size=(10))] + >>> outs, states = npx.control_flow.foreach(step, data, states) + """ + + def check_input(inputs, in_type, msg): + is_NDArray_or_list = True + if isinstance(inputs, list): + for i in inputs: + if not isinstance(i, in_type): + is_NDArray_or_list = False + break + else: + is_NDArray_or_list = isinstance(inputs, in_type) + assert is_NDArray_or_list, msg + + flatten_data, data_fmt = _flatten(data, "foreach input") + check_input(flatten_data, np_ndarray, + "data should be an mxnet.numpy.ndarray or a nested list of mxnet.numpy.ndarray") + flatten_state, state_fmt = _flatten(init_states, "foreach states") + check_input(flatten_state, np_ndarray, + "init_states should be an mxnet.numpy.ndarray or a nested list of mxnet.numpy.ndarray") + + real_data = [ele[0].copy().detach() if ele is not None else None for ele in flatten_data] + real_state = [ele.copy().detach() if ele is not None else None for ele in flatten_state] + + # If the input python function references to the symbols outside + # the python function, we need to prune the computation graph constructed from + # the function. One way of doing it is to mark the nodes in the computation graph + # with AttrScope and prune the nodes without the special attribute. 
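+    # Tracing note: fresh symbol.var() placeholders are bound to detached copies of the
+    # inputs below, body() then runs imperatively under deferred compute so the backend
+    # records the operations, and the recorded graph becomes the foreach subgraph.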
+ name = _get_unique_subgraph_name(name) + with AttrScope(__subgraph_name__=name): + data_names = ['data_subgraph{}'.format(i) for i, ele in enumerate(real_data)] + state_names = ['state_subgraph{}'.format(i) for i, ele in enumerate(real_state)] + symbol_data = [ + symbol.var(name).as_np_ndarray() + for arg, name in zip(real_data, data_names) + ] + symbol_state = [ + symbol.var(name).as_np_ndarray() + for arg, name in zip(real_state, state_names) + ] + dc.set_variable(real_data, symbol_data) + dc.set_variable(real_state, symbol_state) + in_eles = _regroup(real_data, data_fmt) + in_states = _regroup(real_state, state_fmt) + if dc.is_deferred_compute(): + out, states = body(in_eles, in_states) + else: + with ag.pause(), dc.context(): + out, states = body(in_eles, in_states) + + flatten_out, out_fmt = _flatten(out, "foreach output") + flatten_out_state, state_fmt = _flatten(states, "foreach loop_vars") + + num_out_data = len(flatten_out) + num_states = len(flatten_out_state) + num_outputs = num_out_data + num_states + sym_out = [dc.get_symbol(out_data) for out_data in flatten_out] + sym_states = [dc.get_symbol(out_state) for out_state in flatten_out_state] + dc.clear(flatten_out) + dc.clear(flatten_out_state) + g = _construct_subgraph(sym_out, sym_states) + + params_names = [] + params_data = [] + if hasattr(body, "collect_params"): + for p in body.collect_params().values(): + params_names.append(p.var().name) + params_data.append(p.data()) + + subg_input_names = g.list_inputs() + + in_data, in_states, params = [], [], [] + in_data_locs, in_state_locs, remain_locs, in_state_index = [], [], [], [] + for i, sub_name in enumerate(subg_input_names): + if sub_name in data_names: + in_data_locs.append(i) + idx = data_names.index(sub_name) + in_data.append(flatten_data[idx]) + elif sub_name in state_names: + in_state_locs.append(i) + idx = state_names.index(sub_name) + in_states.append(flatten_state[idx]) + in_state_index.append(idx) + elif sub_name in params_names: + remain_locs.append(i) + idx = params_names.index(sub_name) + params.append(params_data[idx]) + else: + raise AssertionError("the data arrays have to be used in the loop body") + + ordered_ins = in_data + in_states + params + + ndoutput = _api_internal.foreach(g.handle, *ordered_ins, num_outputs, num_out_data, in_state_locs, + in_data_locs, remain_locs, in_state_index) + if isinstance(ndoutput, NDArrayBase): + ret = ndoutput + else: + ret = list(ndoutput) + outs = [] + for i in range(num_outputs - num_states): + outs.append(ret[i]) + outs = _regroup(outs, out_fmt) + states = [] + for i in range(num_states): + states.append(ret[num_outputs - num_states + i]) + states = _regroup(states, state_fmt) + + return (outs, states) + + +#pylint: disable=W0621 +@set_module('mxnet.ndarray.numpy_extension') +def while_loop(cond, func, loop_vars, max_iterations=None, name="while_loop"): + """Run a while loop with user-defined computation and loop condition. + + This operator simulates a while loop which iterately does customized computation + as long as the condition is satisfied. + + `loop_vars` is a list of NDArrays on which the computation uses. + + `cond` is a user-defined function, used as the loop condition. + It consumes `loop_vars`, and produces a scalar MXNet NDArray, + indicating the termination of the loop. + The loop ends when `cond` returns false (zero). + The `cond` is variadic, and its signature should be + `cond(*loop_vars) => NDArray`. + + `func` is a user-defined function, used as the loop body. 
+ It also consumes `loop_vars`, and produces `step_output` and `new_loop_vars` at each step. + In each step, `step_output` should contain the same number elements. + Through all steps, the i-th element of `step_output` should have the same shape and dtype. + Also, `new_loop_vars` should contain the same number of elements as `loop_vars`, + and the corresponding element should have the same shape and dtype. + The `func` is variadic, and its signature should be + `func(*loop_vars) => + (NDArray or nested List[NDArray] step_output, NDArray or nested List[NDArray] new_loop_vars)`. + + `max_iterations` is a scalar that defines the maximum number of iterations allowed. + + This function returns two lists. + The first list has the length of `|step_output|`, + in which the i-th element are all i-th elements of + `step_output` from all steps, stacked along axis 0. + The second list has the length of `|loop_vars|`, + which represents final states of loop variables. + + .. warning:: + + For now, the axis 0 of all NDArrays in the first list are `max_iterations`, + due to lack of dynamic shape inference. + + .. warning:: + + When `cond` is never satisfied, we assume `step_output` is empty, + because it cannot be inferred. This is different from the symbolic version. + + Parameters + ---------- + cond: a Python function. + The loop condition. + func: a Python function. + The loop body. + loop_vars: an NDArray or nested lists of NDArrays. + The initial values of the loop variables. + max_iterations: a python int. + Maximum number of iterations. + + Returns + ------ + outputs: an NDArray or nested lists of NDArrays + stacked output from each step + states: an NDArray or nested lists of NDArrays + final state + + Examples + -------- + >>> cond = lambda i, s: i <= 5 + >>> func = lambda i, s: ([i + s], [i + 1, s + i]) + >>> loop_vars = (mx.np.array([0], dtype="int64"), mx.np.array([1], dtype="int64")) + >>> outputs, states = mx.npx.while_loop(cond, func, loop_vars, max_iterations=10) + >>> outputs + [array([[ 1], + [ 2], + [ 4], + [ 7], + [11], + [16], + [ 0], + [ 0], + [ 0], + [ 0]], dtype=int64)] + >>> states + [array([6], dtype=int64), array([16], dtype=int64)] + """ + def _to_python_scalar(inputs, type_, name): + """Converts "inputs", possibly typed mxnet NDArray, a numpy ndarray, other python types, + to the given type + """ + if isinstance(inputs, np_ndarray): + inputs = inputs.item() + try: + inputs = type_(inputs) + except: + raise ValueError("Cannot convert %s to python %s" % (name, type_.__name__)) + return inputs + + def _cond_wrapper(loop_vars): + if dc.is_deferred_compute(): + result = cond(*loop_vars).astype("int") + else: + with ag.pause(), dc.context(): + result = cond(*loop_vars).astype("int") + flatten_out, _ = _flatten(result, "while_loop output") + out = dc.get_symbol(flatten_out) + dc.clear(flatten_out) + return [], [out], [], [] + + def _func_wrapper(loop_vars): + """This wrapper unifies + "func: loop_vars -> new_loop_vars" + and "func: loop_vars -> (step_output, new_loop_vars)" + into "func: loop_vars -> (None or tuple of step_outputs, tuple of new_loop_vars) + """ + if dc.is_deferred_compute(): + step_output, new_loop_vars = func(*loop_vars) + else: + with ag.pause(), dc.context(): + step_output, new_loop_vars = func(*loop_vars) + if step_output is None: + step_output = [] + if new_loop_vars is None: + new_loop_vars = [] + if isinstance(step_output, tuple): + step_output = list(step_output) + if isinstance(new_loop_vars, tuple): + new_loop_vars = list(new_loop_vars) + new_loop_vars = 
_as_list(new_loop_vars) + if len(loop_vars) != len(new_loop_vars): + raise ValueError("The length of loop_vars should be consistent during the loop") + step_output_flatten, out_fmt = _flatten(step_output, "while output") + new_loop_vars_flatten, var_fmt = _flatten(new_loop_vars, "while loop_vars") + if isinstance(step_output, list): + if len(step_output) == 0: + step_out = [] + else: + step_out = [dc.get_symbol(out) for out in step_output_flatten] + else: + step_output_flatten, out_fmt = _flatten(step_output, "while output") + step_out = [dc.get_symbol(step_output_flatten)] + if len(new_loop_vars) == 0: + new_var = [] + else: + new_var = [dc.get_symbol(var) for var in new_loop_vars_flatten] + return step_out, new_var, out_fmt, var_fmt + + def _create_subgraph(graph_vars, graph_func, subgraph_name): + subgraph_name = _get_unique_subgraph_name(subgraph_name) + with AttrScope(__subgraph_name__=subgraph_name): + # create new variables with the same name, + # them feed them to the given func + flatten_data, data_fmt = _flatten(graph_vars, "foreach input") + real_data = [ele.copy().detach() if ele is not None else None for ele in flatten_data] + data_names = ['data_subgraph{}'.format(i) for i, ele in enumerate(real_data)] + symbol_data = [ + symbol.var(name).as_np_ndarray() + for arg, name in zip(real_data, data_names) + ] + dc.set_variable(real_data, symbol_data) + new_graph_vars = _regroup(real_data, data_fmt) + outputs, final_state, out_fmt, var_fmt = graph_func(new_graph_vars) + # first `num_out_data` elements belong to `outputs` + # other elements belong to `final_state` + num_out_data = len(outputs) + num_outputs = len(outputs) + len(final_state) + # group all outputs of graph_func + graph = _construct_subgraph(outputs, final_state) + return graph, num_out_data, num_outputs, out_fmt, var_fmt + + flatten_loop_vars, init_loop_var_fmt = _flatten(loop_vars, "while loop_vars") + + def _union_inputs(*graphs): + # Given a list of graphs, each whose inputs are either from loop_vars or other variables. + # 1) calculate a list `inputs`, the union of their inputs. 
+ # 2) for each graph, determine in which indices their inputs reside in `inputs` + # 3) for each variable in the input of `graph`, find which index it is + inputs = [] # List[Symbol], result of 1) + locs = [] # List[Tuple(List[Int], List[Int])], a list of tuples, + # where tuples are results of 2) and 3) + input_id_to_loc = {} # Dict[int, int], given id(sym), input_id_to_loc maps it + # to a `loc`, where inputs[loc] = sym + for graph in graphs: + # some loop_vars are inputs to `graph`, some are not + name_to_loop_vars = {'data_subgraph{}'.format(i): ele for i, ele in enumerate(flatten_loop_vars)} + # also we collect the mapping from var's name to var's loc in loop_vars + name_to_var_locs = {'data_subgraph{}'.format(i): i for i, ele in enumerate(flatten_loop_vars)} + # collect arguments for each subgraph + input_locs = [] # results from the second step + var_locs = [-1] * len(flatten_loop_vars) # results from the third step + subg_input_names = graph.list_inputs() + for name in subg_input_names: + assert name in name_to_loop_vars # it should obviously hold + array = name_to_loop_vars[name] + # do 2), and 1) is implicitly done + if id(array) in input_id_to_loc: + loc = input_id_to_loc[id(array)] + else: + loc = len(input_id_to_loc) + inputs.append(array) + input_id_to_loc[id(array)] = loc + input_locs.append(loc) + # do 3) + if name in name_to_var_locs: + var_locs[name_to_var_locs[name]] = len(input_locs) - 1 + name_to_var_locs.pop(name, None) + locs.append((input_locs, var_locs)) + return inputs, locs + if max_iterations is None: + raise ValueError("max_iterations should be specified") + max_iterations = _to_python_scalar(max_iterations, int, "max_iteration") + # It should be work as fine if loop_vars are empty I guess, + # but it is semantically unnecessary to include this case. 
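+    # Normalize loop_vars to a non-empty list of mxnet.numpy.ndarray before the
+    # cond and func subgraphs are traced below.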
+ if isinstance(loop_vars, (list, tuple)): + if len(loop_vars) == 0: + raise ValueError("loop_vars should contain at least one element") + else: + assert isinstance(loop_vars, np_ndarray), ("loop_vars should be either mxnet.numpy.ndarray" \ + " or list/tuple of mxnet.numpy.ndarray") + loop_vars = [loop_vars] + # create graph for `cond' + cond_g, num_out_data, num_outputs, _, _ = \ + _create_subgraph(loop_vars, _cond_wrapper, name + "_cond") + assert num_out_data == 0 + assert num_outputs == 1 + # create graph for `func` + func_g, num_out_data, num_outputs, out_fmt, _ = \ + _create_subgraph(loop_vars, _func_wrapper, name + "_func") + # find symbols used in either cond_g or func_g + input_vars, ((cond_input_locs, _), (func_input_locs, func_var_locs)) = \ + _union_inputs(cond_g, func_g) + for i_th, loc in enumerate(func_var_locs, 1): + if loc == -1: + raise ValueError("The %d-th loop_var doesn't involve into the computation" % i_th) + result = _api_internal.while_loop( + cond_g.handle, + func_g.handle, + *input_vars, + max_iterations, + cond_input_locs, + func_input_locs, + func_var_locs, + num_out_data, + num_outputs + ) + if isinstance(result, np_ndarray): + ret = [result] + else: + ret = list(result) + outputs = [ret[i] for i in range(num_out_data)] + outputs = _regroup(outputs, out_fmt) + final_loop_vars = [ret[i] for i in range(num_out_data, num_outputs)] + final_loop_vars = _regroup(final_loop_vars, init_loop_var_fmt) + return outputs, final_loop_vars + + +@set_module('mxnet.ndarray.numpy_extension') +def cond(pred, then_func, else_func, inputs, name="cond"): + """Run an if-then-else using user-defined condition and computation + + This operator simulates a if-like branch which chooses to do one of + the two customized computations according to the specified condition. + + `pred` is a scalar MXNet NDArray, + indicating which branch of computation should be used. + + `then_func` is a user-defined function, used as computation of the then branch. + It produces `outputs`, which is a list of NDArrays. + The signature of `then_func` should be + `then_func() => NDArray or nested List[NDArray]`. + + `else_func` is a user-defined function, used as computation of the else branch. + It produces `outputs`, which is a list of NDArrays. + The signature of `else_func` should be + `else_func() => NDArray or nested List[NDArray]`. + + The `outputs` produces by `then_func` and `else_func` should have the same number + of elements, all of which should be in the same shape, of the same dtype and stype. + + This function returns a list of symbols, representing the computation result. + + Parameters + ---------- + pred: a Python function. + The branch condition. + then_func: a Python function. + The computation to be executed if `pred` is true. + else_func: a Python function. + The computation to be executed if `pred` is false. + + Returns + ------- + outputs: an NDArray or nested lists of NDArrays, representing the result of computation. 
+ + Examples + -------- + >>> a, b = mx.np.array([1]), mx.np.array([2]) + >>> pred = a * b < 5 + >>> then_func = lambda: (a + 5) * (b + 5) + >>> else_func = lambda: (a - 5) * (b - 5) + >>> outputs = mx.npx.cond(pred, then_func, else_func) + >>> outputs[0] + 42.0 + """ + + def _create_subgraph(graph_vars, graph_func, subgraph_name): + subgraph_name = _get_unique_subgraph_name(subgraph_name) + with AttrScope(__subgraph_name__=subgraph_name): + # create new variables with the same name, + # them feed them to the given func + flatten_data, data_fmt = _flatten(graph_vars, "cond input") + real_data = [ele.copy().detach() if ele is not None else None for ele in flatten_data] + data_names = ['data_subgraph{}'.format(i) for i, ele in enumerate(real_data)] + symbol_data = [ + symbol.var(name).as_np_ndarray() + for arg, name in zip(real_data, data_names) + ] + dc.set_variable(real_data, symbol_data) + new_graph_vars = _regroup(real_data, data_fmt) + if dc.is_deferred_compute(): + outputs = graph_func(*new_graph_vars) + if "pred" in subgraph_name: + outputs = outputs.astype("int") + else: + with ag.pause(), dc.context(): + outputs = graph_func(*new_graph_vars) + if "pred" in subgraph_name: + outputs = outputs.astype("int") + outputs, out_fmt = _flatten(outputs, "cond outputs") + num_outputs = len(outputs) + sym_out = [dc.get_symbol(out_data) for out_data in outputs] + dc.clear(outputs) + graph = _construct_subgraph(sym_out, []) + return graph, num_outputs, out_fmt + + flatten_inputs, _ = _flatten(inputs, "while loop_vars") + + def _union_inputs(*graphs): + # Given a list of graphs, each whose inputs are either from input_vars or other variables. + # 1) calculate a list `inputs`, the union of their inputs. + # 2) for each graph, determine in which indices their inputs reside in `inputs` + # 3) for each variable in the input of `graph`, find which index it is + inputs = [] # List[Symbol], result of 1) + locs = [] # List[Tuple(List[Int], List[Int])], a list of tuples, + # where tuples are results of 2) and 3) + input_id_to_loc = {} # Dict[int, int], given id(sym), input_id_to_loc maps it + # to a `loc`, where inputs[loc] = sym + for graph in graphs: + # some input_vars are inputs to `graph`, some are not + name_to_input_syms = {'data_subgraph{}'.format(i): ele for i, ele in enumerate(flatten_inputs)} + # collect arguments for each subgraph + input_locs = [] # results from the second step + for name in graph.list_inputs(): + assert name in name_to_input_syms # it should obviously hold + array = name_to_input_syms[name] + # do 2), and 1) is implicitly done + if id(array) in input_id_to_loc: + loc = input_id_to_loc[id(array)] + else: + loc = len(input_id_to_loc) + inputs.append(array) + input_id_to_loc[id(array)] = loc + input_locs.append(loc) + locs.append(input_locs) + return inputs, locs + if isinstance(inputs, (list, tuple)): + if len(inputs) == 0: + raise ValueError("inputs should contain at least one element") + else: + assert isinstance(inputs, np_ndarray), ("inputs should be either mxnet.numpy.ndarray" \ + " or list/tuple of mxnet.numpy.ndarray") + inputs = [inputs] + # create graph for `cond_func' + cond_g, cond_num_outputs, _ = _create_subgraph(inputs, pred, name + "_pred") + if cond_num_outputs != 1: + raise ValueError("pred should always be a single output") + # create graph for `then` + then_g, then_num_outputs, then_fmt = _create_subgraph(inputs, then_func, name + "_then") + # create graph for `else` + else_g, else_num_outputs, _ = _create_subgraph(inputs, else_func, name + "_else") + if 
then_num_outputs != else_num_outputs: + raise ValueError("Number of outputs differs between then-branch and else-branch") + # find symbols used in either cond_g or func_g + union_inputs, (cond_input_locs, then_input_locs, else_input_locs) = \ + _union_inputs(cond_g, then_g, else_g) + result = _api_internal.cond( + cond_g.handle, + then_g.handle, + else_g.handle, + *union_inputs, + cond_input_locs, + then_input_locs, + else_input_locs, + then_num_outputs + ) + if isinstance(result, np_ndarray): + ret = [result] + else: + ret = list(result) + outputs = [ret[i] for i in range(then_num_outputs)] + outputs = _regroup(outputs, then_fmt) + return outputs diff --git a/python/mxnet/numpy_extension/__init__.py b/python/mxnet/numpy_extension/__init__.py index a3666c48bd9d..ad325ff9e160 100644 --- a/python/mxnet/numpy_extension/__init__.py +++ b/python/mxnet/numpy_extension/__init__.py @@ -23,7 +23,9 @@ from . import image from . import random # pylint: disable=wildcard-import from . import _register +from . import control_flow from ._op import * # pylint: disable=wildcard-import +from .control_flow import * # pylint: disable=wildcard-import from ..context import * # pylint: disable=wildcard-import from ..util import is_np_shape, is_np_array, set_np, reset_np, get_cuda_compute_capability,\ is_np_default_dtype, set_np_default_dtype diff --git a/python/mxnet/numpy_extension/_op.py b/python/mxnet/numpy_extension/_op.py index a84404e6a535..44638e2c59ed 100644 --- a/python/mxnet/numpy_extension/_op.py +++ b/python/mxnet/numpy_extension/_op.py @@ -25,7 +25,7 @@ 'activation', 'batch_norm', 'fully_connected', 'pick', 'convolution', 'deconvolution', 'pooling', 'dropout', 'one_hot', 'rnn', 'embedding', 'topk', 'layer_norm', 'leaky_relu', 'batch_dot', 'broadcast_like', - 'arange_like'] + 'arange_like', 'group_norm'] # pylint: disable=too-many-arguments @@ -1362,3 +1362,47 @@ def arange_like(data, start=0.0, step=1.0, repeat=1, ctx=None, axis=None): """ return _mx_nd_npx.arange_like(data=data, start=start, step=step, repeat=repeat, ctx=ctx, axis=axis) + + +# pylint: disable=too-many-arguments +@set_module('mxnet.numpy_extension') +def group_norm(data, gamma, beta, num_groups=1, eps=1e-3, output_mean_var=False): + r"""Group normalization. + + The input channels are separated into ``num_groups`` groups, + each containing ``num_channels / num_groups`` channels. + The mean and standard-deviation are calculated separately over the each group. + + .. math:: + + data = data.reshape((N, num_groups, C // num_groups, ...)) + out = \frac{data - mean(data, axis)}{\sqrt{var(data, axis) + \epsilon}} * gamma + beta + + Both ``gamma`` and ``beta`` are learnable parameters. + + + + Defined in ../src/operator/nn/group_norm.cc:L78 + + Parameters + ---------- + data : NDArray + Input data + gamma : NDArray + gamma array + beta : NDArray + beta array + num_groups : int, optional, default='1' + Total number of groups. + eps : float, optional, default=9.99999975e-06 + An `epsilon` parameter to prevent division by 0. + output_mean_var : boolean, optional, default=0 + Output the mean and std calculated along the given axis. + + Returns + ------- + out : NDArray or list of NDArrays + The output of this function. 
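+
+    Notes
+    -----
+    The normalization itself can be sketched in plain NumPy. The scalar
+    ``gamma`` and ``beta`` below are a simplification chosen purely for
+    illustration (the operator takes them as arrays)::
+
+        N, C = data.shape[0], data.shape[1]
+        x = data.reshape((N, num_groups, C // num_groups, -1))
+        mean = x.mean(axis=(2, 3), keepdims=True)
+        var = x.var(axis=(2, 3), keepdims=True)
+        out = ((x - mean) / np.sqrt(var + eps)).reshape(data.shape) * gamma + beta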
+ """ + return _mx_nd_npx.group_norm(data=data, gamma=gamma, beta=beta, num_groups=num_groups, + eps=eps, output_mean_var=output_mean_var) diff --git a/python/mxnet/numpy_extension/control_flow.py b/python/mxnet/numpy_extension/control_flow.py new file mode 100644 index 000000000000..a3e2e728616d --- /dev/null +++ b/python/mxnet/numpy_extension/control_flow.py @@ -0,0 +1,220 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for registering control flow ops for imperative programming.""" + +from ..ndarray import numpy_extension as _mx_nd_npx +from ..util import set_module + + +__all__ = ["foreach", "while_loop", "cond"] + + +@set_module('mxnet.numpy_extension') +def foreach(body, data, init_states): + """Run a for loop with user-defined computation over NDArrays on dimension 0. + + This operator simulates a for loop and body has the computation for an iteration + of the for loop. It runs the computation in body on each slice from the input + NDArrays. + + body takes two arguments as input and outputs a tuple of two elements, + as illustrated below:: + + out, states = body(data1, states) + + data1 can be either an NDArray or a list of NDArrays. If data is an NDArray, + data1 is an NDArray. Otherwise, data1 is a list of NDArrays and has the same + size as data. states is a list of NDArrays and have the same size as init_states. + Similarly, out can be either an NDArray or a list of NDArrays, which are concatenated + as the first output of foreach; states from the last execution of body + are the second output of foreach. + + The computation done by this operator is equivalent to the pseudo code below + when the input data is NDArray:: + + states = init_states + outs = [] + for i in data.shape[0]: + s = data[i] + out, states = body(s, states) + outs.append(out) + outs = stack(*outs) + + + Parameters + ---------- + body : HybridBlock. + Define computation in an iteration. + data: an NDArray or a list of NDArrays. + The input data. + init_states: an NDArray or nested lists of NDArrays. + The initial values of the loop states. + + Returns + ------- + outputs: an NDArray or nested lists of NDArrays. + The output data concatenated from the output of all iterations. + states: an NDArray or nested lists of NDArrays. + The loop states in the last iteration. 
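+
+    Notes
+    -----
+    A minimal sketch of the unrolled semantics, with toy values chosen purely
+    for illustration::
+
+        step = lambda data, states: (data + states[0], [states[0] * 2])
+        data = mx.np.array([[1.0], [2.0], [3.0]])
+        states = [mx.np.array([10.0])]
+        outs, states = mx.npx.foreach(step, data, states)
+        # outs   == [[11.], [22.], [43.]]  (per-step outputs stacked along axis 0)
+        # states == [array([80.])]         (loop state after the last iteration)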
+
+    Examples
+    --------
+    >>> step = lambda data, states: (data + states[0], [states[0] * 2])
+    >>> data = mx.np.random.uniform(size=(2, 10))
+    >>> states = [mx.np.random.uniform(size=(10))]
+    >>> outs, states = mx.npx.foreach(step, data, states)
+    """
+    return _mx_nd_npx.foreach(body, data, init_states)
+
+
+#pylint: disable=W0621
+@set_module('mxnet.numpy_extension')
+def while_loop(cond, func, loop_vars, max_iterations=None):
+    """Run a while loop with user-defined computation and loop condition.
+
+    This operator simulates a while loop which iteratively performs customized
+    computation as long as the condition is satisfied.
+
+    `loop_vars` is a list of NDArrays that the computation uses.
+
+    `cond` is a user-defined function, used as the loop condition.
+    It consumes `loop_vars`, and produces a scalar MXNet NDArray,
+    indicating the termination of the loop.
+    The loop ends when `cond` returns false (zero).
+    The `cond` is variadic, and its signature should be
+    `cond(*loop_vars) => NDArray`.
+
+    `func` is a user-defined function, used as the loop body.
+    It also consumes `loop_vars`, and produces `step_output` and `new_loop_vars` at each step.
+    In each step, `step_output` should contain the same number of elements.
+    Through all steps, the i-th element of `step_output` should have the same shape and dtype.
+    Also, `new_loop_vars` should contain the same number of elements as `loop_vars`,
+    and the corresponding element should have the same shape and dtype.
+    The `func` is variadic, and its signature should be
+    `func(*loop_vars) =>
+    (NDArray or nested List[NDArray] step_output, NDArray or nested List[NDArray] new_loop_vars)`.
+
+    `max_iterations` is a scalar that defines the maximum number of iterations allowed.
+
+    This function returns two lists.
+    The first list has the length of `|step_output|`,
+    in which the i-th element is the stack (along axis 0) of the i-th elements of
+    `step_output` from all steps.
+    The second list has the length of `|loop_vars|`,
+    which represents the final states of the loop variables.
+
+    .. warning::
+
+       For now, axis 0 of all NDArrays in the first list has size `max_iterations`,
+       due to the lack of dynamic shape inference.
+
+    .. warning::
+
+       When `cond` is never satisfied, we assume `step_output` is empty,
+       because it cannot be inferred. This is different from the symbolic version.
+
+    Parameters
+    ----------
+    cond: a Python function.
+        The loop condition.
+    func: a Python function.
+        The loop body.
+    loop_vars: an NDArray or nested lists of NDArrays.
+        The initial values of the loop variables.
+    max_iterations: a python int.
+        Maximum number of iterations.
+
+    Returns
+    -------
+    outputs: an NDArray or nested lists of NDArrays
+        stacked output from each step
+    states: an NDArray or nested lists of NDArrays
+        final state
+
+    Examples
+    --------
+    >>> cond = lambda i, s: i <= 5
+    >>> func = lambda i, s: ([i + s], [i + 1, s + i])
+    >>> loop_vars = (mx.np.array([0], dtype="int64"), mx.np.array([1], dtype="int64"))
+    >>> outputs, states = mx.npx.while_loop(cond, func, loop_vars, max_iterations=10)
+    >>> outputs
+    [array([[ 1],
+            [ 2],
+            [ 4],
+            [ 7],
+            [11],
+            [16],
+            [ 0],
+            [ 0],
+            [ 0],
+            [ 0]], dtype=int64)]
+    >>> states
+    [array([6], dtype=int64), array([16], dtype=int64)]
+    """
+    return _mx_nd_npx.while_loop(cond, func, loop_vars, max_iterations=max_iterations)
+
+
+@set_module('mxnet.numpy_extension')
+def cond(pred, then_func, else_func, inputs, name="cond"):
+    """Run an if-then-else using user-defined condition and computation
+
+    This operator simulates an if-like branch which chooses to do one of
+    the two customized computations according to the specified condition.
+
+    `pred` is a scalar MXNet NDArray,
+    indicating which branch of computation should be used.
+
+    `then_func` is a user-defined function, used as computation of the then branch.
+    It produces `outputs`, which is a list of NDArrays.
+    The signature of `then_func` should be
+    `then_func() => NDArray or nested List[NDArray]`.
+
+    `else_func` is a user-defined function, used as computation of the else branch.
+    It produces `outputs`, which is a list of NDArrays.
+    The signature of `else_func` should be
+    `else_func() => NDArray or nested List[NDArray]`.
+
+    The `outputs` produced by `then_func` and `else_func` should have the same number
+    of elements, all of which should have the same shape, dtype and stype.
+
+    This function returns a list of NDArrays, representing the computation result.
+
+    Parameters
+    ----------
+    pred: a Python function.
+        The branch condition.
+    then_func: a Python function.
+        The computation to be executed if `pred` is true.
+    else_func: a Python function.
+        The computation to be executed if `pred` is false.
+    inputs: an NDArray or nested lists of NDArrays.
+        The input arrays that the condition and the two branches operate on.
+
+    Returns
+    -------
+    outputs: an NDArray or nested lists of NDArrays, representing the result of computation.
+
+    Examples
+    --------
+    >>> a, b = mx.np.array([1]), mx.np.array([2])
+    >>> pred = a * b < 5
+    >>> then_func = lambda: (a + 5) * (b + 5)
+    >>> else_func = lambda: (a - 5) * (b - 5)
+    >>> outputs = mx.npx.cond(pred, then_func, else_func)
+    >>> outputs[0]
+    42.0
+    """
+    return _mx_nd_npx.cond(pred, then_func, else_func, inputs, name=name)
diff --git a/python/mxnet/util.py b/python/mxnet/util.py
index 159576ef392e..a3785354a640 100644
--- a/python/mxnet/util.py
+++ b/python/mxnet/util.py
@@ -415,15 +415,16 @@ def use_np_array(func):
    ..
code-block:: python import mxnet as mx - from mxnet import gluon, np + from mxnet import gluon, nd, np + from mxnet.gluon import Parameter class TestHybridBlock1(gluon.HybridBlock): def __init__(self): super(TestHybridBlock1, self).__init__() - self.w = self.params.get('w', shape=(2, 2)) + self.w = Parameter('w', shape=(2, 2)) - def hybrid_forward(self, F, x, w): - return F.dot(x, w) + def forward(self, x): + return nd.dot(x, self.w.data()) x = mx.nd.ones((2, 2)) net1 = TestHybridBlock1() @@ -433,14 +434,14 @@ def hybrid_forward(self, F, x, w): assert type(v.data()) is mx.nd.NDArray assert type(out) is mx.nd.NDArray - @np.use_np_array + @mx.util.use_np_array class TestHybridBlock2(gluon.HybridBlock): def __init__(self): super(TestHybridBlock2, self).__init__() - self.w = self.params.get('w', shape=(2, 2)) + self.w = Parameter('w', shape=(2, 2)) - def hybrid_forward(self, F, x, w): - return F.np.dot(x, w) + def forward(self, x): + return np.dot(x, self.w.data()) x = np.ones((2, 2)) net2 = TestHybridBlock2() @@ -495,15 +496,16 @@ def use_np(func): .. code-block:: python import mxnet as mx - from mxnet import gluon, np + from mxnet import gluon, nd, np + from mxnet.gluon import Parameter class TestHybridBlock1(gluon.HybridBlock): def __init__(self): super(TestHybridBlock1, self).__init__() - self.w = self.params.get('w', shape=(2, 2)) + self.w = Parameter('w', shape=(2, 2)) - def hybrid_forward(self, F, x, w): - return F.dot(x, w) + F.ones((1,)) + def forward(self, x): + return nd.dot(x, self.w.data()) + nd.ones((1,)) x = mx.nd.ones((2, 2)) net1 = TestHybridBlock1() @@ -513,14 +515,14 @@ def hybrid_forward(self, F, x, w): assert type(v.data()) is mx.nd.NDArray assert type(out) is mx.nd.NDArray - @np.use_np + @mx.util.use_np class TestHybridBlock2(gluon.HybridBlock): def __init__(self): super(TestHybridBlock2, self).__init__() - self.w = self.params.get('w', shape=(2, 2)) + self.w = Parameter('w', shape=(2, 2)) - def hybrid_forward(self, F, x, w): - return F.np.dot(x, w) + F.np.ones(()) + def forward(self, x): + return np.dot(x, self.w.data()) + np.ones(()) x = np.ones((2, 2)) net2 = TestHybridBlock2() diff --git a/src/api/operator/numpy_extension/npx_control_flow_op.cc b/src/api/operator/numpy_extension/npx_control_flow_op.cc new file mode 100644 index 000000000000..52001d8f7bd1 --- /dev/null +++ b/src/api/operator/numpy_extension/npx_control_flow_op.cc @@ -0,0 +1,206 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file npx_control_flow_op.cc + * \brief Implementation of the API of functions in src/operator/numpy_extension/npx_control_flow_op.cc + */ +#include +#include +#include +#include "../utils.h" +#include "../../../operator/npx_control_flow.h" + +namespace mxnet { + +MXNET_REGISTER_API("_npx.foreach") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + nnvm::NodeAttrs attrs; + const nnvm::Op* op = Op::Get("_npx_foreach"); + op::NPXForeachParam param; + int args_size = args.size(); + int num_inputs = args_size - 7; + // inputs + nnvm::Symbol* sym = static_cast(args[0].value().v_handle); + std::vector > subgraphs; + subgraphs.push_back(std::make_shared(*sym)); + std::vector inputs; + inputs.reserve(num_inputs); + for (int i = 1; i < num_inputs + 1; ++i) { + inputs.push_back(static_cast(args[i])); + } + + param.num_args = num_inputs; + param.num_outputs = args[1+num_inputs].operator int(); + param.num_out_data = args[2+num_inputs].operator int(); + if (args[3+num_inputs].type_code() == kDLInt) { + param.in_state_locs = mxnet::Tuple(1, args[3+num_inputs].operator int64_t()); + } else { + param.in_state_locs = mxnet::Tuple(args[3+num_inputs].operator ObjectRef()); + } + if (args[4+num_inputs].type_code() == kDLInt) { + param.in_data_locs = mxnet::Tuple(1, args[4+num_inputs].operator int64_t()); + } else { + param.in_data_locs = mxnet::Tuple(args[4+num_inputs].operator ObjectRef()); + } + if (args[5+num_inputs].type_code() == kDLInt) { + param.remain_locs = mxnet::Tuple(1, args[5+num_inputs].operator int64_t()); + } else { + param.remain_locs = mxnet::Tuple(args[5+num_inputs].operator ObjectRef()); + } + if (args[6+num_inputs].type_code() == kDLInt) { + param.in_state_index = mxnet::Tuple(1, args[6+num_inputs].operator int64_t()); + } else { + param.in_state_index = mxnet::Tuple(args[6+num_inputs].operator ObjectRef()); + } + attrs.parsed = param; + attrs.op = op; + attrs.subgraphs = subgraphs; + SetAttrDict(&attrs); + int num_outputs = 0; + auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs.data(), &num_outputs, nullptr); + if (num_outputs == 1) { + *ret = ndoutputs[0]; + } else { + std::vector ndarray_handles; + ndarray_handles.reserve(num_outputs); + for (int i = 0; i < num_outputs; ++i) { + ndarray_handles.emplace_back(ndoutputs[i]); + } + *ret = ADT(0, ndarray_handles.begin(), ndarray_handles.end()); + } +}); + + +MXNET_REGISTER_API("_npx.while_loop") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + nnvm::NodeAttrs attrs; + const nnvm::Op* op = Op::Get("_npx_while_loop"); + op::NPXWhileLoopParam param; + int args_size = args.size(); + int num_inputs = args_size - 8; + // inputs + std::vector > subgraphs; + subgraphs.reserve(2); + for (int i = 0; i < 2; i++) { + nnvm::Symbol* sym = static_cast(args[i].value().v_handle); + subgraphs.push_back(std::make_shared(*sym)); + } + std::vector inputs; + inputs.reserve(num_inputs); + for (int i = 2; i < num_inputs + 2; ++i) { + inputs.push_back(static_cast(args[i])); + } + + param.num_args = num_inputs; + param.max_iterations = args[2+num_inputs].operator int(); + if (args[3+num_inputs].type_code() == kDLInt) { + param.cond_input_locs = mxnet::Tuple(1, args[3+num_inputs].operator int64_t()); + } else { + param.cond_input_locs = mxnet::Tuple(args[3+num_inputs].operator ObjectRef()); + } + if (args[4+num_inputs].type_code() == kDLInt) { + param.func_input_locs = mxnet::Tuple(1, args[4+num_inputs].operator int64_t()); + } else { + param.func_input_locs = 
mxnet::Tuple(args[4+num_inputs].operator ObjectRef()); + } + if (args[5+num_inputs].type_code() == kDLInt) { + param.func_var_locs = mxnet::Tuple(1, args[5+num_inputs].operator int64_t()); + } else { + param.func_var_locs = mxnet::Tuple(args[5+num_inputs].operator ObjectRef()); + } + param.num_out_data = args[6+num_inputs].operator int(); + param.num_outputs = args[7+num_inputs].operator int(); + attrs.parsed = param; + attrs.op = op; + attrs.subgraphs = subgraphs; + SetAttrDict(&attrs); + int num_outputs = 0; + auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs.data(), &num_outputs, nullptr); + if (num_outputs == 1) { + *ret = ndoutputs[0]; + } else { + std::vector ndarray_handles; + ndarray_handles.reserve(num_outputs); + for (int i = 0; i < num_outputs; ++i) { + ndarray_handles.emplace_back(ndoutputs[i]); + } + *ret = ADT(0, ndarray_handles.begin(), ndarray_handles.end()); + } +}); + +MXNET_REGISTER_API("_npx.cond") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + nnvm::NodeAttrs attrs; + const nnvm::Op* op = Op::Get("_npx_cond"); + op::NPXCondParam param; + int args_size = args.size(); + int num_inputs = args_size - 7; + // inputs + std::vector > subgraphs; + subgraphs.reserve(3); + for (int i = 0; i < 3; i++) { + nnvm::Symbol* sym = static_cast(args[i].value().v_handle); + subgraphs.push_back(std::make_shared(*sym)); + } + std::vector inputs; + inputs.reserve(num_inputs); + for (int i = 3; i < num_inputs + 3; ++i) { + inputs.push_back(static_cast(args[i])); + } + + param.num_args = num_inputs; + if (args[3+num_inputs].type_code() == kDLInt) { + param.cond_input_locs = mxnet::Tuple(1, args[3+num_inputs].operator int64_t()); + } else { + param.cond_input_locs = mxnet::Tuple(args[3+num_inputs].operator ObjectRef()); + } + if (args[4+num_inputs].type_code() == kDLInt) { + param.then_input_locs = mxnet::Tuple(1, args[4+num_inputs].operator int64_t()); + } else { + param.then_input_locs = mxnet::Tuple(args[4+num_inputs].operator ObjectRef()); + } + if (args[5+num_inputs].type_code() == kDLInt) { + param.else_input_locs = mxnet::Tuple(1, args[5+num_inputs].operator int64_t()); + } else { + param.else_input_locs = mxnet::Tuple(args[5+num_inputs].operator ObjectRef()); + } + param.num_outputs = args[6+num_inputs].operator int(); + attrs.parsed = param; + attrs.op = op; + attrs.subgraphs = subgraphs; + SetAttrDict(&attrs); + int num_outputs = 0; + auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs.data(), &num_outputs, nullptr); + if (num_outputs == 1) { + *ret = ndoutputs[0]; + } else { + std::vector ndarray_handles; + ndarray_handles.reserve(num_outputs); + for (int i = 0; i < num_outputs; ++i) { + ndarray_handles.emplace_back(ndoutputs[i]); + } + *ret = ADT(0, ndarray_handles.begin(), ndarray_handles.end()); + } +}); + +} // namespace mxnet diff --git a/src/api/operator/numpy_extension/npx_group_norm_op.cc b/src/api/operator/numpy_extension/npx_group_norm_op.cc new file mode 100644 index 000000000000..aff66c999b72 --- /dev/null +++ b/src/api/operator/numpy_extension/npx_group_norm_op.cc @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file npx_group_norm_op.cc + * \brief Implementation of the API of functions in src/operator/numpy_extension/npx_group_norm_op.cc + */ +#include +#include +#include "../utils.h" +#include "../../../operator/nn/group_norm-inl.h" + +namespace mxnet { + +MXNET_REGISTER_API("_npx.group_norm") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + nnvm::NodeAttrs attrs; + const nnvm::Op* op = Op::Get("_npx_group_norm"); + op::GroupNormParam param; + // num_groups + param.num_groups = args[3]; + // eps + param.eps = args[4].operator double(); + // output_mean_var + param.output_mean_var = args[5].operator bool(); + attrs.parsed = param; + attrs.op = op; + SetAttrDict(&attrs); + // inputs + int num_inputs = 3; + std::vector inputs; + inputs.reserve(num_inputs); + for (int i = 0; i < num_inputs; ++i) { + inputs.push_back(args[i].operator mxnet::NDArray*()); + } + int num_outputs = 0; + auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs.data(), &num_outputs, nullptr); + if (num_outputs == 1) { + *ret = ndoutputs[0]; + } else { + std::vector ndarray_handles; + ndarray_handles.reserve(num_outputs); + for (int i = 0; i < num_outputs; ++i) { + ndarray_handles.emplace_back(ndoutputs[i]); + } + *ret = ADT(0, ndarray_handles.begin(), ndarray_handles.end()); + } +}); + +} // namespace mxnet diff --git a/src/api/operator/numpy_extension/npx_pooling_op.cc b/src/api/operator/numpy_extension/npx_pooling_op.cc index 5e8ab8c3435b..923e116f2a0f 100644 --- a/src/api/operator/numpy_extension/npx_pooling_op.cc +++ b/src/api/operator/numpy_extension/npx_pooling_op.cc @@ -28,7 +28,7 @@ namespace mxnet { -inline int String2Layout(const std::string& s) { +inline int String2PoolingLayout(const std::string& s) { using namespace op; if (s == "NCW") { return mshadow::kNCW; @@ -97,7 +97,8 @@ MXNET_REGISTER_API("_npx.pooling") } else { param.kernel = TShape(args[1].operator ObjectRef()); } - + // global pool + param.global_pool = args[6].operator bool(); // stride if (args[2].type_code() == kNull) { if (param.kernel.ndim() == 1) { @@ -105,6 +106,10 @@ MXNET_REGISTER_API("_npx.pooling") } else if (param.kernel.ndim() == 2) { param.stride = mshadow::Shape2(1, 1); } else { + if (param.global_pool == false) { + CHECK_EQ(param.kernel.ndim(), 3U) << param.kernel.ndim() + << "D pooling not supported. 
Only 1D, 2D, and 3D pooling are supported."; + } param.stride = mshadow::Shape3(1, 1, 1); } } else if (args[2].type_code() == kDLInt) { @@ -130,8 +135,6 @@ MXNET_REGISTER_API("_npx.pooling") param.pool_type = String2PoolType(args[4].operator std::string()); // pooling convention param.pooling_convention = String2Convention(args[5].operator std::string()); - // global pool - param.global_pool = args[6].operator bool(); // cudnn_off if (args[7].type_code() == kNull) { param.cudnn_off = false; @@ -154,12 +157,7 @@ MXNET_REGISTER_API("_npx.pooling") if (args[10].type_code() == kNull) { param.layout = dmlc::nullopt; } else { - param.layout = String2Layout(args[num_inputs + 10]); - } - - if (param.global_pool == false) { - CHECK_EQ(param.kernel.ndim(), 3U) << param.kernel.ndim() - << "D pooling not supported"; + param.layout = String2PoolingLayout(args[10]); } attrs.parsed = param; diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc index f2e859248b47..a03868ad594a 100644 --- a/src/c_api/c_api_ndarray.cc +++ b/src/c_api/c_api_ndarray.cc @@ -439,6 +439,12 @@ int MXNDArraySetDeferredComputeVariable(NDArrayHandle *arrays, SymbolHandle *var API_END(); } +int MXNDArrayClearDeferredCompute(NDArrayHandle *arrays, int num) { + API_BEGIN(); + Imperative::Get()->DeferredComputeClear(arrays, num); + API_END(); +} + int MXNDArrayGetDeferredComputeSymbol(NDArrayHandle *output_handles, int num_outputs, SymbolHandle *out) { nnvm::Symbol *s = new nnvm::Symbol(); diff --git a/src/imperative/imperative.cc b/src/imperative/imperative.cc index d07385876d6a..a42a60b919fe 100644 --- a/src/imperative/imperative.cc +++ b/src/imperative/imperative.cc @@ -384,6 +384,24 @@ void Imperative::SetDeferredComputeVariable(NDArrayHandle *arrays, } } +void Imperative::DeferredComputeClear(NDArrayHandle *arrays, const int num) { + std::vector outputs; + outputs.reserve(num); + for (int i = 0; i < num; i++) { + NDArray *nd = reinterpret_cast(arrays[i]); + outputs.emplace_back(nd->deferredcompute_entry_); + } + nnvm::DFSVisit(outputs, [&](const nnvm::ObjectPtr& n) { + if (n != nullptr && !n->info.empty()) { + Imperative::DCInfo info = Imperative::DCInfo::Get(n); + info.inputs_.clear(); + info.input_handles_.clear(); + info.outputs_.clear(); + info.Clear(n); + } + }); +} + std::vector Imperative::Backward( const std::vector& outputs, const std::vector& ograds, diff --git a/src/operator/contrib/batch_norm_relu.cc b/src/operator/contrib/batch_norm_relu.cc index 52671a094c22..c35f6c9c9ad8 100644 --- a/src/operator/contrib/batch_norm_relu.cc +++ b/src/operator/contrib/batch_norm_relu.cc @@ -246,6 +246,7 @@ std::vector BatchNormWithReLUGrad(const nnvm::ObjectPtr& n, } NNVM_REGISTER_OP(_contrib_BatchNormWithReLU) +.add_alias("_npx_batch_norm_with_relu") .describe(R"code(Batch normalization with ReLU fusion. An extented operator of Batch normalization which can fuse ReLU activation. diff --git a/src/operator/contrib/stes_op.cc b/src/operator/contrib/stes_op.cc index c334d4d1b59c..ab65302ef3b0 100644 --- a/src/operator/contrib/stes_op.cc +++ b/src/operator/contrib/stes_op.cc @@ -32,6 +32,7 @@ namespace op { // Round STE MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(_contrib_round_ste, cpu, mshadow_op::round) +.add_alias("_npx_round_ste") .describe(R"code(Straight-through-estimator of `round()`. In forward pass, returns element-wise rounded value to the nearest integer of the input (same as `round()`). 
@@ -57,6 +58,7 @@ The storage type of ``round_ste`` output depends upon the input storage type: // sign MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(_contrib_sign_ste, cpu, mshadow_op::sign) +.add_alias("_npx_sign_ste") .describe(R"code(Straight-through-estimator of `sign()`. In forward pass, returns element-wise sign of the input (same as `sign()`). diff --git a/src/operator/contrib/sync_batch_norm.cc b/src/operator/contrib/sync_batch_norm.cc index f737ef528be8..e532e939f150 100644 --- a/src/operator/contrib/sync_batch_norm.cc +++ b/src/operator/contrib/sync_batch_norm.cc @@ -103,6 +103,7 @@ then set ``gamma`` to 1 and its gradient to 0. .add_arguments(SyncBatchNormParam::__FIELDS__()); NNVM_REGISTER_OP(_contrib_SyncBatchNorm) +.add_alias("_npx_sync_batch_norm") .set_attr("FSetInputVarAttrOnCompose", [](const nnvm::NodeAttrs& attrs, nnvm::ObjectPtr var, const int index) { if (var->attrs.dict.find("__init__") != var->attrs.dict.end()) return; diff --git a/src/operator/nn/group_norm-inl.h b/src/operator/nn/group_norm-inl.h index 0df0db28fc65..37810acb06ac 100644 --- a/src/operator/nn/group_norm-inl.h +++ b/src/operator/nn/group_norm-inl.h @@ -62,6 +62,15 @@ struct GroupNormParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(output_mean_var).set_default(false) .describe("Output the mean and std calculated along the given axis."); } + void SetAttrDict(std::unordered_map* dict) { + std::ostringstream num_groups_s, eps_s, output_mean_var_s; + num_groups_s << num_groups; + eps_s << eps; + output_mean_var_s << output_mean_var; + (*dict)["num_groups"] = num_groups_s.str(); + (*dict)["eps"] = eps_s.str(); + (*dict)["output_mean_var"] = output_mean_var_s.str(); + } }; diff --git a/src/operator/nn/group_norm.cc b/src/operator/nn/group_norm.cc index 53fab3161426..0b794c4d4f4a 100644 --- a/src/operator/nn/group_norm.cc +++ b/src/operator/nn/group_norm.cc @@ -62,6 +62,7 @@ static bool GroupNormShape(const nnvm::NodeAttrs& attrs, } NNVM_REGISTER_OP(GroupNorm) +.add_alias("_npx_group_norm") .describe(R"code(Group normalization. The input channels are separated into ``num_groups`` groups, each containing ``num_channels / num_groups`` channels. diff --git a/src/operator/nn/pooling-inl.h b/src/operator/nn/pooling-inl.h index d8193ec95e60..be35b7372ed2 100644 --- a/src/operator/nn/pooling-inl.h +++ b/src/operator/nn/pooling-inl.h @@ -169,7 +169,7 @@ struct PoolingParam : public dmlc::Parameter { LOG(FATAL) << "should not reach here "; return ""; } - std::string Layout2String(int layout) { + std::string PoolingLayout2String(int layout) { switch (layout) { case mshadow::kNCW: return "NCW"; @@ -213,7 +213,7 @@ struct PoolingParam : public dmlc::Parameter { (*dict)["p_value"] = p_value_s.str(); (*dict)["count_include_pad"] = count_include_pad_s.str(); if (layout.has_value()) { - (*dict)["layout"] = Layout2String(layout.value()); + (*dict)["layout"] = PoolingLayout2String(layout.value()); } else { (*dict)["layout"] = layout_s.str(); } diff --git a/src/operator/npx_control_flow.cc b/src/operator/npx_control_flow.cc new file mode 100644 index 000000000000..d57041fb1eb3 --- /dev/null +++ b/src/operator/npx_control_flow.cc @@ -0,0 +1,1348 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "./operator_common.h" +#include "./elemwise_op_common.h" +#include "../imperative/imperative_utils.h" +#include "./subgraph_op_common.h" +#include "./npx_control_flow.h" + +namespace mxnet { +namespace op { + +DMLC_REGISTER_PARAMETER(NPXForeachParam); + +class ForeachState: public LoopState { + public: + NPXForeachParam params; + int num_iterations; + + ForeachState(const nnvm::Symbol &g, const NPXForeachParam ¶ms) : LoopState(g, false) { + this->params = params; + } +}; + +static void ForeachComputeExCPU(const OpStatePtr& state_ptr, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + ForeachState &state = state_ptr.get_state(); + const NPXForeachParam& params = state.params; + const size_t iter_dim = 0; + CHECK_EQ(outputs.size(), (size_t) params.num_outputs); + CHECK_GT(params.in_data_locs.ndim(), 0); + size_t len = inputs[0].shape()[iter_dim]; + state.num_iterations = len; + for (int i = 1; i < params.in_data_locs.ndim(); i++) + CHECK_EQ(inputs[i].shape()[iter_dim], len); + for (size_t i = 0; i < (size_t) params.num_out_data; i++) + CHECK_EQ(len, outputs[i].shape()[iter_dim]); + for (const auto &arr : outputs) + CHECK_EQ(arr.storage_type(), kDefaultStorage) + << "The for operator doesn't support the sparse format"; + + // Initialize the outputs of the subgraph is a little trickier. + // The states from the previous iteration are used as the inputs of the next + // iteration, so I have to maintain two arrays, so the inputs and outputs + // of the subgraph share the same memory. + std::vector subg_outputs1(outputs.size()); + std::vector subg_outputs2(outputs.size()); + std::vector *subg_outputs[2]{&subg_outputs1, &subg_outputs2}; + // If the length is an odd number, the last iteration will use the first set + // of outputs. In this way, we don't need to copy the results from the + // subgraph to the final outputs of the loop. + if (len % 2 == 1) { + for (size_t i = params.num_out_data; i < subg_outputs1.size(); i++) { + subg_outputs1[i] = outputs[i]; + subg_outputs2[i] = NDArray(outputs[i].shape(), outputs[i].ctx(), true, + outputs[i].dtype()); + } + } else { + // Otherwise, we'll use the second set of outputs. + for (size_t i = params.num_out_data; i < subg_outputs1.size(); i++) { + subg_outputs1[i] = NDArray(outputs[i].shape(), outputs[i].ctx(), true, + outputs[i].dtype()); + subg_outputs2[i] = outputs[i]; + } + } + + // Initialize the inputs for the subgraph. + // In each iteration, we need to update the subgraph inputs for input data + // and the loop states. + std::vector subg_inputs(inputs.size()); + // The remaining arrays (other than input data and states) only need to be set once. 
+ for (int j = 0; j < params.remain_locs.ndim(); j++) { + CHECK_LT(params.remain_locs[j], subg_inputs.size()); + subg_inputs[params.remain_locs[j]] = inputs[j + params.in_data_locs.ndim() + + params.in_state_locs.ndim()]; + } + + // Here we iterate over the first dimension of the first input array. + for (size_t i = 0; i < len; i++) { + // Initialize outputs for the subgraph. + std::vector *subg_out_curr = subg_outputs[i % 2]; + std::vector *subg_out_prev = subg_outputs[(i + 1) % 2]; + for (int j = 0; j < params.num_out_data; j++) + (*subg_out_curr)[j] = outputs[j].At(i); + // When recording for backward computation, we should make sure + // that output arrays are actually different in each iteration. + if (ctx.need_grad && i < len - 1) { + for (size_t j = params.num_out_data; j < subg_out_curr->size(); j++) + (*subg_out_curr)[j] = NDArray(outputs[j].shape(), outputs[j].ctx(), + true, outputs[j].dtype()); + } else if (ctx.need_grad && i == len - 1) { + // For the last iteration, we need to write data to the output array + // directly. + for (size_t j = params.num_out_data; j < subg_out_curr->size(); j++) + (*subg_out_curr)[j] = outputs[j]; + } + + // Initialize inputs for the subgraph. + // Get a slice from the input data arrays. + for (int j = 0; j < params.in_data_locs.ndim(); j++) { + size_t loc = params.in_data_locs[j]; + subg_inputs[loc] = inputs[j].At(i); + } + // For the rest of the iterations, the states are the outputs + // from the previous iteration. + if (i > 0) { + for (size_t j = 0; j < params.in_state_locs.ndim(); ++j) { + CHECK_LT(params.in_state_locs[j], subg_inputs.size()); + for (size_t k = params.num_out_data; k < subg_out_prev->size(); ++k) { + if (params.in_state_index[j] == k-params.num_out_data) { + subg_inputs[params.in_state_locs[j]] = (*subg_out_prev)[k]; + break; + } + } + } + } else { + for (int j = 0; j < params.in_state_locs.ndim(); j++) { + CHECK_LT(params.in_state_locs[j], subg_inputs.size()); + subg_inputs[params.in_state_locs[j]] = inputs[j + params.in_data_locs.ndim()]; + } + } + + state.Forward(i, subg_inputs, req, *subg_out_curr, ctx.need_grad); + } +} + +static void ForeachGradComputeExCPU(const OpStatePtr& state_ptr, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + ForeachState &state = state_ptr.get_state(); + const NPXForeachParam& params = state.params; + CHECK_EQ(outputs.size(), (size_t) params.num_args); + CHECK_GT(params.in_data_locs.ndim(), 0); + for (const auto &arr : outputs) + CHECK_EQ(arr.storage_type(), kDefaultStorage) + << "The for operator doesn't support the sparse format"; + int len = state.num_iterations; + size_t num_output_data = params.num_out_data; + + // In backward computation, we need to run iterations from backwards. + std::vector subg_ograds(params.num_outputs); + std::vector subg_igrads(outputs.size()); + for (size_t i = num_output_data; i < subg_ograds.size(); i++) + subg_ograds[i] = inputs[i]; + std::vector subg_req(req.size()); + for (auto r : req) + CHECK_NE(r, kWriteInplace); + + // There are three types of arrays in igrads. + // * data gradients. + // * loop variable gradients. + // * remaining variable gradients. 
+ // They are in the following order: + // [data vars], [loop vars], [remaining vars] + + // [remaining vars] + for (int i = 0; i < params.remain_locs.ndim(); i++) { + size_t loc = params.remain_locs[i]; + size_t orig_loc = i + params.in_data_locs.ndim() + params.in_state_locs.ndim(); + subg_igrads[loc] = outputs[orig_loc]; + subg_req[loc] = req[orig_loc]; + } + + for (int iter_num = len - 1; iter_num >= 0; iter_num--) { + for (int i = 0; i < params.num_out_data; i++) + subg_ograds[i] = inputs[i].At(iter_num); + if (iter_num < len - 1) { + // For the rest of the iterations, we should add graidents to the + // remaining vars. + for (int i = 0; i < params.remain_locs.ndim(); i++) { + size_t loc = params.remain_locs[i]; + subg_req[loc] = kAddTo; + } + } + + // [data vars] + for (int i = 0; i < params.in_data_locs.ndim(); i++) { + size_t loc = params.in_data_locs[i]; + subg_igrads[loc] = outputs[i].At(iter_num); + subg_req[loc] = req[i]; + } + // [loop vars] + for (int i = 0; i < params.in_state_locs.ndim(); i++) { + size_t loc = params.in_state_locs[i]; + const NDArray &output = outputs[i + params.in_data_locs.ndim()]; + if (iter_num != 0) { + // For state gradients, we need to allocate new NDArrays + // because intermediate state gradients won't be returned to the users. + subg_igrads[loc] = NDArray(output.shape(), output.ctx(), true, output.dtype()); + } else { + subg_igrads[loc] = output; + } + // For the first iteration, we need to use the request provided by + // the user to write state gradients to the outputs. + subg_req[loc] = iter_num != 0 ? kWriteTo : req[i + params.in_data_locs.ndim()]; + } + state.Backward(iter_num, subg_ograds, subg_req, subg_igrads); + + size_t num_states = subg_ograds.size() - num_output_data; + for (size_t i = 0; i < num_states; i++) { + size_t loc = -1; + for (size_t j = 0; j < params.in_state_index.ndim(); ++j) { + if (params.in_state_index[j] == i) { + loc = params.in_state_locs[j]; + break; + } + } + CHECK_GE(loc, 0); + CHECK_LT(loc, subg_igrads.size()); + subg_ograds[i + num_output_data] = subg_igrads[loc]; + } + } + state.Cleanup(); +} + +template +static void remap(const std::vector &op_in, size_t start, + const mxnet::Tuple &locs, std::vector *subg_in) { + auto op_in_it = op_in.begin() + start; + for (int i = 0; i < locs.ndim(); i++) { + dim_t loc = locs[i]; + subg_in->at(loc) = *(op_in_it + i); + } +} + +static inline mxnet::TShape SliceFirstDim(const mxnet::TShape &s) { + if (s.ndim() > 1) { + return mxnet::TShape(s.begin() + 1, s.end()); + } else { + return mxnet::TShape(mshadow::Shape1(1)); + } +} + +static bool ForeachShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector *in_shape, + mxnet::ShapeVector *out_shape) { + const NPXForeachParam& params = nnvm::get(attrs.parsed); + CHECK_EQ(out_shape->size(), (size_t) params.num_outputs); + CHECK_EQ(attrs.subgraphs.size(), 1U); + + mxnet::ShapeVector subg_in_shape(in_shape->size()); + // data shape + std::vector data_1d(params.in_data_locs.ndim(), false); + for (int i = 0; i < params.in_data_locs.ndim(); i++) { + size_t loc = params.in_data_locs[i]; + if (in_shape->at(i).ndim() == 1) + data_1d[i] = true; + subg_in_shape[loc] = SliceFirstDim(in_shape->at(i)); + } + // state shape + remap(*in_shape, params.in_data_locs.ndim(), params.in_state_locs, + &subg_in_shape); + // remaining shape + remap(*in_shape, params.in_data_locs.ndim() + params.in_state_locs.ndim(), + params.remain_locs, &subg_in_shape); + + mxnet::ShapeVector subg_out_shape = *out_shape; + for (int i = 0; i < params.num_out_data; i++) { 
+ mxnet::TShape shape = subg_out_shape[i]; + // If we don't have shape info, we don't need to do anything. + if (!mxnet::ndim_is_known(shape)) + continue; + subg_out_shape[i] = SliceFirstDim(shape); + } + + bool infer_success = InferSubgraphShape(*attrs.subgraphs[0], + &subg_in_shape, &subg_out_shape); + + // After inference, we need to move inferred information back to in_shape and + // out_shape. + + // For the shape of output data. + size_t len = in_shape->at(0)[0]; + for (int i = 0; i < params.num_out_data; i++) { + // If the output shape isn't inferred, we don't need to propogate the info. + const auto& g_out_shape = subg_out_shape[i]; + if (!mxnet::ndim_is_known(g_out_shape)) + continue; + + auto out = mxnet::TShape(g_out_shape.ndim() + 1, -1); + out[0] = len; + for (int i = 1; i < out.ndim(); i++) + out[i] = g_out_shape[i - 1]; + SHAPE_ASSIGN_CHECK(*out_shape, i, out); + } + // For the shape of output states. + for (size_t i = params.num_out_data; i < subg_out_shape.size(); i++) + SHAPE_ASSIGN_CHECK(*out_shape, i, subg_out_shape[i]); + + // For the shape of input data. + for (int i = 0; i < params.in_data_locs.ndim(); i++) { + size_t loc = params.in_data_locs[i]; + const auto &shape = subg_in_shape[loc]; + // If the input data shape isn't inferred, we don't need to propogate the + // info. + if (!mxnet::ndim_is_known(shape)) + continue; + + if (data_1d[i]) { + mxnet::TShape s(1, -1); + s[0] = len; + SHAPE_ASSIGN_CHECK(*in_shape, i, s); + } else { + auto in = mxnet::TShape(shape.ndim() + 1, -1); + in[0] = len; + for (int i = 1; i < in.ndim(); i++) + in[i] = shape[i - 1]; + SHAPE_ASSIGN_CHECK(*in_shape, i, in); + } + } + // For the shape of state. + for (int i = 0; i < params.in_state_locs.ndim(); i++) { + size_t loc = params.in_state_locs[i]; + SHAPE_ASSIGN_CHECK(*in_shape, i + params.in_data_locs.ndim(), + subg_in_shape[loc]); + } + // For the shape of remaining data. + for (int i = 0; i < params.remain_locs.ndim(); i++) { + size_t loc = params.remain_locs[i]; + SHAPE_ASSIGN_CHECK(*in_shape, + i + params.in_data_locs.ndim() + params.in_state_locs.ndim(), + subg_in_shape[loc]); + } + + if (infer_success) { + size_t num_states = out_shape->size() - params.num_out_data; + for (size_t i = 0; i < num_states; i++) { + mxnet::TShape shape_check = (*in_shape)[i + params.in_data_locs.ndim()]; + for (size_t j = 0; j < params.in_state_index.ndim(); ++j) { + if (params.in_state_index[j] == i) { + shape_check = (*in_shape)[j + params.in_data_locs.ndim()]; + break; + } + } + CHECK_EQ((*out_shape)[i + params.num_out_data], shape_check); + } + } + // Check if we have inferred the shapes correctly. 
+ return infer_success; +} + +static bool ForeachType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, std::vector *out_type) { + const NPXForeachParam& params = nnvm::get(attrs.parsed); + CHECK_EQ(out_type->size(), (size_t) params.num_outputs); + CHECK_EQ(attrs.subgraphs.size(), 1U); + std::vector subg_in_type(in_type->size(), 0); + remap(*in_type, 0, params.in_data_locs, &subg_in_type); + remap(*in_type, params.in_data_locs.ndim(), params.in_state_locs, &subg_in_type); + remap(*in_type, params.in_data_locs.ndim() + params.in_state_locs.ndim(), + params.remain_locs, &subg_in_type); + bool success = InferSubgraphDataType(*attrs.subgraphs[0], &subg_in_type, out_type); + for (int i = 0; i < params.in_data_locs.ndim(); i++) { + size_t loc = params.in_data_locs[i]; + TYPE_ASSIGN_CHECK(*in_type, i, subg_in_type[loc]); + } + for (int i = 0; i < params.in_state_locs.ndim(); i++) { + size_t loc = params.in_state_locs[i]; + TYPE_ASSIGN_CHECK(*in_type, i + params.in_data_locs.ndim(), subg_in_type[loc]); + } + for (int i = 0; i < params.remain_locs.ndim(); i++) { + size_t loc = params.remain_locs[i]; + TYPE_ASSIGN_CHECK(*in_type, i + params.in_data_locs.ndim() + params.in_state_locs.ndim(), + subg_in_type[loc]); + } + return success; +} + +static bool ForeachStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + const NPXForeachParam& params = nnvm::get(attrs.parsed); + CHECK_EQ(out_attrs->size(), (size_t) params.num_outputs); + CHECK_EQ(attrs.subgraphs.size(), 1U); + std::vector subg_in_attrs(in_attrs->size(), kUndefinedStorage); + remap(*in_attrs, 0, params.in_data_locs, &subg_in_attrs); + remap(*in_attrs, params.in_data_locs.ndim(), params.in_state_locs, &subg_in_attrs); + remap(*in_attrs, params.in_data_locs.ndim() + params.in_state_locs.ndim(), + params.remain_locs, &subg_in_attrs); + bool success = InferSubgraphStorage(*attrs.subgraphs[0], dev_mask, + dispatch_mode, &subg_in_attrs, out_attrs); + for (int i = 0; i < params.in_data_locs.ndim(); i++) { + size_t loc = params.in_data_locs[i]; + STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, i, subg_in_attrs[loc]); + } + for (int i = 0; i < params.in_state_locs.ndim(); i++) { + size_t loc = params.in_state_locs[i]; + STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, i + params.in_data_locs.ndim(), + subg_in_attrs[loc]); + } + for (int i = 0; i < params.remain_locs.ndim(); i++) { + size_t loc = params.remain_locs[i]; + STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, + i + params.in_data_locs.ndim() + params.in_state_locs.ndim(), + subg_in_attrs[loc]); + } + return success; +} + +static bool BackwardForeachStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + const NPXForeachParam& params = nnvm::get(attrs.parsed); + CHECK_EQ(out_attrs->size(), (size_t) params.num_args); + CHECK_EQ(in_attrs->size(), (size_t) params.num_args + params.num_outputs * 2); + CHECK_EQ(attrs.subgraphs.size(), 1U); + CachedOp op(*attrs.subgraphs[0], + std::vector >()); + // map the operator inputs to the subgraph inputs. 
+ std::vector subg_forward_ins(params.num_args, kUndefinedStorage); + remap(*in_attrs, params.num_outputs, params.in_data_locs, &subg_forward_ins); + remap(*in_attrs, params.num_outputs + params.in_data_locs.ndim(), + params.in_state_locs, &subg_forward_ins); + remap(*in_attrs, params.num_outputs + params.in_data_locs.ndim() + params.in_state_locs.ndim(), + params.remain_locs, &subg_forward_ins); + + // Copy backward input storage to backward subgraph input storage. + std::vector subg_in_attrs = *in_attrs; + for (size_t i = 0; i < subg_forward_ins.size(); i++) + subg_in_attrs[i + params.num_outputs] = subg_forward_ins[i]; + return op.BackwardStorageType(attrs, dev_mask, dispatch_mode, + &subg_in_attrs, out_attrs); +} + +static OpStatePtr CreateForeachState(const NodeAttrs& attrs, + Context ctx, + const mxnet::ShapeVector& ishape, + const std::vector& itype) { + const NPXForeachParam& params = nnvm::get(attrs.parsed); + return OpStatePtr::Create(*attrs.subgraphs[0], params); +} + +static std::vector +ForeachGradient(const nnvm::ObjectPtr& n, const std::vector& ograds) { + std::vector heads(ograds.begin(), ograds.end()); + for (auto& h : n->inputs) { + heads.push_back(h); + } + uint32_t n_out = n->num_outputs(); + for (uint32_t i = 0; i < n_out; ++i) { + heads.emplace_back(n, i, 0); + } + + auto p = nnvm::Node::Create(); + p->attrs.op = nnvm::Op::Get("_backward_npx_foreach"); + p->attrs.name = n->attrs.name; + p->attrs.dict = n->attrs.dict; + p->inputs = heads; + p->control_deps.emplace_back(n); + p->attrs.parsed = n->attrs.parsed; + CHECK_EQ(p->num_inputs(), p->inputs.size()) + << "Number of inputs to operator " << "_backward_npx_foreach" << " (" << p->num_inputs() + << ") does not match the actual number of inputs provided to operator " + << n->attrs.name << " (" << p->inputs.size() << ")."; + + std::vector entries; + for (uint32_t i = 0; i < p->num_outputs(); ++i) + entries.emplace_back(nnvm::NodeEntry{p, i, 0}); + + entries[0].node->attrs.subgraphs = n->attrs.subgraphs; + return entries; +} + +DMLC_REGISTER_PARAMETER(NPXWhileLoopParam); + +class WhileLoopState: public LoopState { + public: + NPXWhileLoopParam params; + size_t n_iterations; // the actual number of steps taken in this while loop, <= max_iterations + CachedOpPtr cond_op; + // abbrev for output_input_mapping + // indicates to which index the output of `func' will be copied to the input of `cond' + std::vector oi_map; + + WhileLoopState(const NPXWhileLoopParam ¶ms, + const nnvm::Symbol &cond, + const nnvm::Symbol &func) : + LoopState(func), + params(params), + n_iterations(0U), + cond_op(LoopState::MakeSharedOp(cond)), + oi_map(params.func_var_locs.ndim(), -1) { + const mxnet::Tuple &func_input_locs = params.func_input_locs; + const mxnet::Tuple &func_var_locs = params.func_var_locs; + const mxnet::Tuple &cond_input_locs = params.cond_input_locs; + for (int i = 0; i < func_var_locs.ndim(); ++i) { + dim_t pos_i = func_input_locs[func_var_locs[i]]; + for (int j = 0; j < cond_input_locs.ndim(); ++j) { + dim_t pos_j = cond_input_locs[j]; + if (pos_i == pos_j) { + this->oi_map[i] = j; + } + } + } + } +}; + +static void WhileLoopComputeExCPU(const OpStatePtr& state_ptr, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + // The argument `inputs' are loop_vars and other inputs + // loop_vars are stored in stored in `loop_vars_locs' + // The argument `outputs' are output and new_loop_vars + // [0: num_out_data) are outputs at each step. 
+ // [num_out_data: ) are new_loop_vars + // TODO(Junru): avoid dynamic NDArray allocation + WhileLoopState &state = state_ptr.get_state(); + const NPXWhileLoopParam& params = state.params; + // a helper function, converting std::vector to std::vector + const auto to_ptr_vec = [](std::vector &in, std::vector *out) { + out->clear(); + out->reserve(in.size()); + std::transform(std::begin(in), + std::end(in), + std::back_inserter(*out), + [](NDArray &a) {return &a;}); + }; + // sanity checks + CHECK_EQ(inputs.size(), (size_t) params.num_args); + CHECK_EQ(outputs.size(), (size_t) params.num_outputs); + CHECK_EQ(outputs.size(), req.size()); + // construct inputs and outputs for cond + std::vector cond_inputs, cond_outputs = {NDArray()}; + extract_by_loc(inputs, params.cond_input_locs, &cond_inputs); + std::vector cond_input_ptr, cond_output_ptr; + to_ptr_vec(cond_inputs, &cond_input_ptr); + to_ptr_vec(cond_outputs, &cond_output_ptr); + // construct inputs and outputs for func + std::vector func_inputs, func_outputs(outputs.size()); + extract_by_loc(inputs, params.func_input_locs, &func_inputs); + for (size_t &step = state.n_iterations = 0; step < (size_t) params.max_iterations; ++step) { + CHECK(inputs.size() > 0) << "while loop forward requires at least 1 input"; + Context default_ctx = inputs[0].ctx(); + state.cond_op->Forward(nullptr, cond_input_ptr, cond_output_ptr, default_ctx); + if (!as_bool_scalar(*cond_output_ptr[0])) { + break; + } + // we create func_outputs for the current step: + for (size_t i = 0; i < outputs.size(); ++i) { + func_outputs[i] = NDArray(outputs[i].ctx(), outputs[i].dtype()); + } + state.Forward(step, func_inputs, req, func_outputs, ctx.need_grad); + if (step == 0) { + for (int i = 0; i < params.num_out_data; ++i) { + func_outputs[i].WaitToRead(); + if (!shape_is_known(func_outputs[i].shape())) { + func_outputs[i].SetShapeFromChunk(); + } + mxnet::TShape step_shape = func_outputs[i].shape(); + mxnet::TShape shape(step_shape.ndim() + 1, 0); + shape[0] = params.max_iterations; + for (int j = 0; j < step_shape.ndim(); ++j) { + shape[j + 1] = step_shape[j]; + } + const_cast(outputs[i]).Init(shape); + } + } + for (int i = 0; i < params.num_out_data; ++i) { + NDArray first_slot = outputs[i].At(step); + mxnet::CopyFromTo(func_outputs[i], &first_slot); + } + // func_inputs on the next step: + // the output (new_loop_vars) will become the new inputs (loop_vars) + for (size_t i = params.num_out_data; i < outputs.size(); ++i) { + int j = params.func_var_locs[i - params.num_out_data]; + int loc = -1; + for (size_t m = 0; m < params.func_input_locs.ndim(); ++m) { + if (m == j) { + loc = params.func_input_locs[m]; + func_inputs[m] = func_outputs[i]; + } else { + if (params.func_input_locs[m] == loc) { + func_inputs[m] = func_outputs[i]; + } + } + } + int k = state.oi_map[i - params.num_out_data]; + if (k != -1) { + // I actually don't need to update cond_inputs + cond_inputs[k] = func_outputs[i]; + cond_input_ptr[k] = &func_outputs[i]; + } + } + } + // copy output data to `outputs' + // case 1: at least one step is executed, + // the final_loop_vars must be stored in func_inputs + // case 2: no step is executed + // the final_loop_vars is the same as loop_vars, which are also stored in func_inputs + // therefore, we copy func_inputs[:] to outputs[num_out_data: ] + for (size_t i = params.num_out_data; i < outputs.size(); ++i) { + size_t j = params.func_var_locs[i - params.num_out_data]; + if (!shape_is_known(outputs[i].shape())) { + 
const_cast(outputs[i]).Init(func_inputs[j].shape()); + } + mxnet::CopyFromTo(func_inputs[j], &outputs[i]); + } + for (int i = 0; i < params.num_out_data; ++i) { + const_cast(outputs[i]).SetShapeFromChunk(); + } + if (state.n_iterations == 0) { + for (const auto & output : outputs) { + if (!shape_is_known(output.shape())) { + const_cast(output).ReshapeAndAlloc({1}); + } + } + } +} + +static void WhileLoopGradComputeExCPU(const OpStatePtr& state_ptr, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& _req, + const std::vector& _outputs) { + // inputs are dl / df(x) + // outputs are dl / dx + // where f is the current function, + // x is the input to the current function, + // TODO(Junru): avoid dynamic NDArray allocation + WhileLoopState &state = state_ptr.get_state(); + const NPXWhileLoopParam& params = state.params; + // sanity checks + CHECK_EQ(_outputs.size(), (size_t) params.num_args); + CHECK_EQ(_outputs.size(), _req.size()); + for (auto x : _req) { + CHECK_NE(x, kWriteInplace); + } + std::vector outputs; + std::vector req; + extract_by_loc(_outputs, params.func_input_locs, &outputs); + extract_by_loc(_req, params.func_input_locs, &req); + if (state.n_iterations == 0) { + for (int i = params.num_out_data; i < params.num_outputs; ++i) { + int j = params.func_var_locs[i - params.num_out_data]; + mxnet::CopyFromTo(inputs[i], &outputs[j]); + } + state.Cleanup(); + return; + } + // collect var_locs and out_locs, positions other than var_locs are out_locs, i.e. + // [0, var_locs[0]) + // (var_locs[1], var_locs[2]), + // (var_locs[2], var_locs[3]), + // ... + // (var_locs[-2], var_locs[-1] = params.num_args) + std::vector var_locs(params.func_var_locs.begin(), params.func_var_locs.end()); + var_locs.push_back((dim_t) params.num_args); + sort(var_locs.begin(), var_locs.end()); + // vectors for the backward loop + std::vector ograds(params.num_outputs); + std::vector igrads(outputs.size()); + std::vector iter_req(req.size()); + for (int i = params.num_out_data; i < params.num_outputs; ++i) + ograds[i] = inputs[i]; + const int n_iter = state.n_iterations; + for (int step = n_iter - 1; step >= 0; --step) { + // ograds[ : num_out_data] = inputs[ : num_out_data][step] + // ograds[num_out_data: ] is maintained in the end of each loop + std::transform(std::begin(inputs), + std::begin(inputs) + params.num_out_data, + std::begin(ograds), + [step] (const NDArray &a) { return a.At(step); } ); + // igrads[i] = + // outputs[i] (step == 0) + // outputs[i] (step != 0 && i not in loop_var_locs) + // ArrayLike(outputs[i]) (step != 0 && i in loop_var_locs) + // iter_req = + // kWriteTo (step != 0 && i in loop_var_locs) + // req[i] (step == 0 && i in loop_var_locs) + // kAddTo (step != n_iters - 1 && i not in loop_var_locs) + // req[i] (step == n_iters - 1 && i not in loop_var_locs) + { + size_t i = 0; + for (size_t loc : var_locs) { + for ( ; i < loc; ++i) { + // locs other that var_locs + igrads[i] = outputs[i]; + iter_req[i] = (step + 1 == n_iter || req[i] == kNullOp) + ? req[i] + : kAddTo; + } + if (i < (size_t) params.num_args) { + // a var + igrads[i] = (step == 0) + ? outputs[i] + : NDArray(outputs[i].shape(), outputs[i].ctx(), true, outputs[i].dtype()); + iter_req[i] = (step == 0 || req[i] == kNullOp) + ? 
req[i] + : kWriteTo; + ++i; + } else { + break; + } + } + } + state.Backward(step, ograds, iter_req, igrads); + for (int i = params.num_out_data; i < params.num_outputs; ++i) { + size_t j = params.func_var_locs[i - params.num_out_data]; + int loc = -1; + for (size_t m = 0; m < params.func_input_locs.ndim(); ++m) { + if (m == j) { + loc = params.func_input_locs[m]; + ograds[i] = igrads[m]; + } else { + if (params.func_input_locs[m] == loc) { + ograds[i] = igrads[m]; + } + } + } + } + } + state.Cleanup(); +} + +static bool WhileLoopShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector *in_shape, + mxnet::ShapeVector *out_shape) { + const NPXWhileLoopParam& params = nnvm::get(attrs.parsed); + CHECK_EQ(out_shape->size(), (size_t) params.num_outputs); + CHECK_EQ(attrs.subgraphs.size(), 2U); + + mxnet::ShapeVector subg_in_shape(params.func_input_locs.ndim()); + for (int i = 0; i < params.func_input_locs.ndim(); ++i) { + size_t loc = params.func_input_locs[i]; + subg_in_shape[i] = in_shape->at(loc); + } + mxnet::ShapeVector subg_out_shape = *out_shape; + int num_out_data = params.num_out_data; + int num_outputs = params.num_outputs; + for (size_t i = num_out_data; i < num_outputs; i++) + SHAPE_ASSIGN_CHECK(subg_out_shape, i, (*in_shape)[i-num_out_data]); + bool infer_success = InferSubgraphShape(*attrs.subgraphs[1], + &subg_in_shape, &subg_out_shape); + for (int i = 0; i < params.num_out_data; i++) { + // If the output shape isn't inferred, we don't need to propogate the info. + const auto& g_out_shape = subg_out_shape[i]; + if (!mxnet::ndim_is_known(g_out_shape)) + continue; + + mxnet::TShape shape(g_out_shape.ndim() + 1, 0); + shape[0] = params.max_iterations; + for (int j = 0; j < g_out_shape.ndim(); ++j) { + shape[j + 1] = g_out_shape[j]; + } + SHAPE_ASSIGN_CHECK(*out_shape, i, shape); + } + for (size_t i = num_out_data; i < num_outputs; i++) + SHAPE_ASSIGN_CHECK(*out_shape, i, subg_out_shape[i]); + if (infer_success) { + for (size_t i = num_out_data; i < num_outputs; i++) + CHECK_EQ((*out_shape)[i], (*in_shape)[i-num_out_data]); + } + // Check if we have inferred the shapes correctly. 
+  return infer_success;
+}
+
+static bool WhileLoopType(const nnvm::NodeAttrs& attrs,
+                          std::vector<int> *in_type, std::vector<int> *out_type) {
+  const NPXWhileLoopParam& params = nnvm::get<NPXWhileLoopParam>(attrs.parsed);
+  static const std::function<bool(const int &)> is_udf = is_type_udf;
+  CHECK_EQ(in_type->size(), (size_t) params.num_args);
+  CHECK_EQ(out_type->size(), (size_t) params.num_outputs);
+  CHECK_EQ(attrs.subgraphs.size(), 2U);
+  CHECK_EQ(attrs.subgraphs[0]->outputs.size(), 1U);
+  std::vector<int> cond_in_type;
+  std::vector<int> func_in_type;
+  extract_by_loc(*in_type, params.cond_input_locs, &cond_in_type);
+  extract_by_loc(*in_type, params.func_input_locs, &func_in_type);
+  std::vector<int> cond_out_type = {-1};
+  CHECK(params.sync_in_out(in_type, out_type, is_udf));
+  bool succ_0 = InferSubgraphDataType(*attrs.subgraphs[0], &cond_in_type, &cond_out_type);
+  CHECK(params.sync_in_out(in_type, out_type, is_udf));
+  CHECK(sync_in_in(params.cond_input_locs, in_type, &cond_in_type, is_udf));
+  bool succ_1 = InferSubgraphDataType(*attrs.subgraphs[1], &func_in_type, out_type);
+  CHECK(params.sync_in_out(in_type, out_type, is_udf));
+  CHECK(sync_in_in(params.func_input_locs, in_type, &func_in_type, is_udf));
+  return succ_0 && succ_1;
+}
+
+static bool WhileLoopStorageType(const nnvm::NodeAttrs& attrs,
+                                 const int dev_mask,
+                                 DispatchMode* dispatch_mode,
+                                 std::vector<int> *in_attrs,
+                                 std::vector<int> *out_attrs) {
+  const NPXWhileLoopParam& params = nnvm::get<NPXWhileLoopParam>(attrs.parsed);
+  static const std::function<bool(const int &)> is_udf = is_stype_udf;
+  CHECK_EQ(in_attrs->size(), (size_t) params.num_args);
+  CHECK_EQ(out_attrs->size(), (size_t) params.num_outputs);
+  CHECK_EQ(attrs.subgraphs.size(), 2U);
+  CHECK_EQ(attrs.subgraphs[0]->outputs.size(), 1U);
+  std::vector<int> cond_in_attrs;
+  std::vector<int> func_in_attrs;
+  extract_by_loc(*in_attrs, params.cond_input_locs, &cond_in_attrs);
+  extract_by_loc(*in_attrs, params.func_input_locs, &func_in_attrs);
+  std::vector<int> cond_out_attrs = {kDefaultStorage};
+  DispatchMode cond_mode = DispatchMode::kUndefined;
+  DispatchMode func_mode = DispatchMode::kUndefined;
+  *dispatch_mode = DispatchMode::kFComputeEx;
+  CHECK(params.sync_in_out(in_attrs, out_attrs, is_udf));
+  bool succ_0 = InferSubgraphStorage(*attrs.subgraphs[0], dev_mask, \
+                                     &cond_mode, &cond_in_attrs, &cond_out_attrs);
+  CHECK(params.sync_in_out(in_attrs, out_attrs, is_udf));
+  CHECK(sync_in_in(params.cond_input_locs, in_attrs, &cond_in_attrs, is_udf));
+  bool succ_1 = InferSubgraphStorage(*attrs.subgraphs[1], dev_mask, \
+                                     &func_mode, &func_in_attrs, out_attrs);
+  CHECK(params.sync_in_out(in_attrs, out_attrs, is_udf));
+  CHECK(sync_in_in(params.func_input_locs, in_attrs, &func_in_attrs, is_udf));
+  return succ_0 && succ_1;
+}
+
+static bool BackwardWhileLoopStorageType(const nnvm::NodeAttrs& attrs,
+                                         const int dev_mask,
+                                         DispatchMode* dispatch_mode,
+                                         std::vector<int> *in_attrs,
+                                         std::vector<int> *out_attrs) {
+  // `cond' is not backwarded, don't check
+  const NPXWhileLoopParam& params = nnvm::get<NPXWhileLoopParam>(attrs.parsed);
+  CHECK_EQ(out_attrs->size(), (size_t) params.num_args);
+  CHECK_EQ(attrs.subgraphs.size(), 2U);
+  CachedOp op(*attrs.subgraphs[1], {});
+  return op.BackwardStorageType(attrs, dev_mask, dispatch_mode,
+                                in_attrs, out_attrs);
+}
+
+static OpStatePtr CreateWhileLoopState(const NodeAttrs& attrs,
+                                       Context ctx,
+                                       const mxnet::ShapeVector& ishape,
+                                       const std::vector<int>& itype) {
+  const NPXWhileLoopParam& params = nnvm::get<NPXWhileLoopParam>(attrs.parsed);
+  return OpStatePtr::Create<WhileLoopState>(params, *attrs.subgraphs[0], *attrs.subgraphs[1]);
+}
+
+static std::vector<nnvm::NodeEntry>
+WhileLoopGradient(const
nnvm::ObjectPtr& n, const std::vector& ograds) { + std::vector heads(ograds.begin(), ograds.end()); + for (auto& h : n->inputs) { + heads.push_back(h); + } + uint32_t n_out = n->num_outputs(); + for (uint32_t i = 0; i < n_out; ++i) { + heads.emplace_back(n, i, 0); + } + + auto p = nnvm::Node::Create(); + p->attrs.op = nnvm::Op::Get("_backward_npx_while_loop"); + p->attrs.name = n->attrs.name; + p->attrs.dict = n->attrs.dict; + p->inputs = heads; + p->control_deps.emplace_back(n); + p->attrs.parsed = n->attrs.parsed; + CHECK_EQ(p->num_inputs(), p->inputs.size()) + << "Number of inputs to operator " << "_backward_npx_while_loop" << " (" << p->num_inputs() + << ") does not match the actual number of inputs provided to operator " + << n->attrs.name << " (" << p->inputs.size() << ")."; + + std::vector entries; + for (uint32_t i = 0; i < p->num_outputs(); ++i) + entries.emplace_back(nnvm::NodeEntry{p, i, 0}); + + entries[0].node->attrs.subgraphs = n->attrs.subgraphs; + return entries; +} + +DMLC_REGISTER_PARAMETER(NPXCondParam); + +class CondState { + public: + NPXCondParam params; + CachedOpPtr cond_op; + LoopState then_branch; + LoopState else_branch; + int branch_selection; // 1 if then branch; 0 if else branch; -1 if undefined + + CondState(NPXCondParam params, + const nnvm::Symbol &cond, + const nnvm::Symbol &then_sym, + const nnvm::Symbol &else_sym): + params(std::move(params)), + cond_op(LoopState::MakeSharedOp(cond)), + then_branch(then_sym), + else_branch(else_sym), + branch_selection(-1) { + } +}; + +static void CondComputeExCPU(const OpStatePtr& state_ptr, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + // The argument `inputs' are loop_vars and other inputs + // loop_vars are stored in stored in `loop_vars_locs' + // The argument `outputs' are output and new_loop_vars + // [0: num_out_data) are outputs at each step. + // [num_out_data: ) are new_loop_vars + CondState &state = state_ptr.get_state(); + const NPXCondParam& params = state.params; + // a helper function, converting std::vector to std::vector + const auto to_ptr_vec = [](std::vector &in, std::vector *out) { + out->clear(); + out->reserve(in.size()); + std::transform(std::begin(in), + std::end(in), + std::back_inserter(*out), + [](NDArray &a) {return &a;}); + }; + // sanity checks + CHECK_EQ(inputs.size(), (size_t) params.num_args); + CHECK_EQ(outputs.size(), (size_t) params.num_outputs); + CHECK_EQ(outputs.size(), req.size()); + // construct inputs and outputs for cond + std::vector cond_inputs; + std::vector cond_outputs = {NDArray()}; + std::vector cond_input_ptr; + std::vector cond_output_ptr; + extract_by_loc(inputs, params.cond_input_locs, &cond_inputs); + to_ptr_vec(cond_inputs, &cond_input_ptr); + to_ptr_vec(cond_outputs, &cond_output_ptr); + int &branch_selection = state.branch_selection; + // run cond + CHECK(cond_input_ptr.size() > 0) << "condition requires at least 1 input"; + Context default_ctx = cond_inputs[0].ctx(); + state.cond_op->Forward(nullptr, cond_input_ptr, cond_output_ptr, default_ctx); + branch_selection = as_bool_scalar(*cond_output_ptr[0]); + // select the right branch + const mxnet::Tuple &func_input_locs = branch_selection + ? params.then_input_locs + : params.else_input_locs; + LoopState &loop_state = branch_selection + ? 
state.then_branch + : state.else_branch; + // extract inputs for the branch + std::vector func_inputs; + extract_by_loc(inputs, func_input_locs, &func_inputs); + loop_state.Forward(0, func_inputs, req, outputs, ctx.need_grad); +} + +static void CondGradComputeExCPU(const OpStatePtr& state_ptr, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& _req, + const std::vector& outputs) { + CondState &state = state_ptr.get_state(); + const NPXCondParam& params = state.params; + // sanity checks + CHECK_EQ(outputs.size(), (size_t) params.num_args); + CHECK_EQ(outputs.size(), _req.size()); + // select the right branch + int branch_selection = state.branch_selection; + CHECK_NE(branch_selection, -1); + const mxnet::Tuple &func_input_locs = branch_selection + ? params.then_input_locs + : params.else_input_locs; + LoopState &loop_state = branch_selection + ? state.then_branch + : state.else_branch; + // construct parameters + std::vector ograds(inputs.begin(), inputs.begin() + params.num_outputs); + std::vector req; + extract_by_loc(_req, func_input_locs, &req); + std::vector igrads; + extract_by_loc(outputs, func_input_locs, &igrads); + loop_state.Backward(0, ograds, req, igrads); + loop_state.Cleanup(); +} + +static bool CondType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, + std::vector *out_type) { + const NPXCondParam& params = nnvm::get(attrs.parsed); + static const std::function is_udf = is_type_udf; + CHECK_EQ(in_type->size(), (size_t) params.num_args); + CHECK_EQ(out_type->size(), (size_t) params.num_outputs); + CHECK_EQ(attrs.subgraphs.size(), 3U); + CHECK_EQ(attrs.subgraphs[0]->outputs.size(), 1U); + CHECK_EQ(attrs.subgraphs[1]->outputs.size(), attrs.subgraphs[2]->outputs.size()); + std::vector cond_in_type; + std::vector then_in_type; + std::vector else_in_type; + extract_by_loc(*in_type, params.cond_input_locs, &cond_in_type); + extract_by_loc(*in_type, params.then_input_locs, &then_in_type); + extract_by_loc(*in_type, params.else_input_locs, &else_in_type); + std::vector cond_out_type = {-1}; + bool succ_0 = InferSubgraphDataType(*attrs.subgraphs[0], &cond_in_type, &cond_out_type); + CHECK(sync_in_in(params.cond_input_locs, in_type, &cond_in_type, is_udf)); + bool succ_1 = InferSubgraphDataType(*attrs.subgraphs[1], &then_in_type, out_type); + CHECK(sync_in_in(params.then_input_locs, in_type, &then_in_type, is_udf)); + bool succ_2 = InferSubgraphDataType(*attrs.subgraphs[2], &else_in_type, out_type); + CHECK(sync_in_in(params.else_input_locs, in_type, &else_in_type, is_udf)); + return succ_0 && succ_1 && succ_2; +} + +static bool CondStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + const NPXCondParam& params = nnvm::get(attrs.parsed); + static const std::function is_udf = is_stype_udf; + CHECK_EQ(in_attrs->size(), (size_t) params.num_args); + CHECK_EQ(out_attrs->size(), (size_t) params.num_outputs); + CHECK_EQ(attrs.subgraphs.size(), 3U); + CHECK_EQ(attrs.subgraphs[0]->outputs.size(), 1U); + CHECK_EQ(attrs.subgraphs[1]->outputs.size(), attrs.subgraphs[2]->outputs.size()); + std::vector cond_in_attrs; + std::vector then_in_attrs; + std::vector else_in_attrs; + extract_by_loc(*in_attrs, params.cond_input_locs, &cond_in_attrs); + extract_by_loc(*in_attrs, params.then_input_locs, &then_in_attrs); + extract_by_loc(*in_attrs, params.else_input_locs, &else_in_attrs); + std::vector cond_out_attrs = {kDefaultStorage}; + DispatchMode cond_mode = 
DispatchMode::kUndefined; + DispatchMode then_mode = DispatchMode::kUndefined; + DispatchMode else_mode = DispatchMode::kUndefined; + *dispatch_mode = DispatchMode::kFComputeEx; + bool succ_0 = InferSubgraphStorage(*attrs.subgraphs[0], dev_mask, \ + &cond_mode, &cond_in_attrs, &cond_out_attrs); + CHECK(sync_in_in(params.cond_input_locs, in_attrs, &cond_in_attrs, is_udf)); + bool succ_1 = InferSubgraphStorage(*attrs.subgraphs[1], dev_mask, \ + &then_mode, &then_in_attrs, out_attrs); + CHECK(sync_in_in(params.then_input_locs, in_attrs, &then_in_attrs, is_udf)); + bool succ_2 = InferSubgraphStorage(*attrs.subgraphs[2], dev_mask, \ + &else_mode, &else_in_attrs, out_attrs); + CHECK(sync_in_in(params.else_input_locs, in_attrs, &else_in_attrs, is_udf)); + return succ_0 && succ_1 && succ_2; +} + +static bool BackwardCondStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + const NPXCondParam& params = nnvm::get(attrs.parsed); + CHECK_EQ(out_attrs->size(), (size_t) params.num_args); + CHECK_EQ(attrs.subgraphs.size(), 3U); + static const std::function is_udf = is_stype_udf; + auto sub_pass = [&](const std::shared_ptr &subg, + const mxnet::Tuple &input_locs) { + // A. first construct subg_in_attrs + // need subg_in_attrs as subg_bwd_out (copy), subg_fwd_in (extract), subg_fwd_out (copy) + std::vector subg_in_attrs; + size_t num_elts = params.num_outputs * 2 + input_locs.ndim(); + subg_in_attrs.reserve(num_elts); + // part 1. subg_bwd_out (copy) + subg_in_attrs.insert(subg_in_attrs.end(), + in_attrs->begin(), + in_attrs->begin() + params.num_outputs); + // part 2. subg_fwd_in (extract) + std::vector fwd_in(in_attrs->begin() + params.num_outputs, + in_attrs->begin() + params.num_outputs + params.num_args); + std::vector subg_fwd_in; + extract_by_loc(fwd_in, input_locs, &subg_fwd_in); + subg_in_attrs.insert(subg_in_attrs.end(), + subg_fwd_in.begin(), + subg_fwd_in.end()); + // part 3. subg_fwd_out (copy) + subg_in_attrs.insert(subg_in_attrs.end(), + in_attrs->begin() + params.num_outputs + params.num_args, + in_attrs->end()); + // check correctness of the number of elements + CHECK_EQ(subg_in_attrs.size(), num_elts); + // B. 
then we construct subg_out_attrs by extracting from out_attrs + std::vector subg_out_attrs; + extract_by_loc(*out_attrs, input_locs, &subg_out_attrs); + // then we construct the subgraph and do inference + CachedOp op(*subg, {}); + bool ret = op.BackwardStorageType(attrs, dev_mask, dispatch_mode, \ + &subg_in_attrs, &subg_out_attrs); + CHECK(sync_in_in(input_locs, out_attrs, &subg_out_attrs, is_udf)); + return ret; + }; + for (const dim_t &cond_in : params.cond_input_locs) { + (*out_attrs)[cond_in] = kDefaultStorage; + } + bool succ_0 = sub_pass(attrs.subgraphs[1], params.then_input_locs); + bool succ_1 = sub_pass(attrs.subgraphs[2], params.else_input_locs); + return succ_0 && succ_1; +} + +static OpStatePtr CreateCondState(const NodeAttrs& attrs, + Context ctx, + const mxnet::ShapeVector& ishape, + const std::vector& itype) { + const NPXCondParam& params = nnvm::get(attrs.parsed); + return OpStatePtr::Create( + params, + *attrs.subgraphs[0], + *attrs.subgraphs[1], + *attrs.subgraphs[2]); +} + +static std::vector +CondGradient(const nnvm::ObjectPtr& n, const std::vector& ograds) { + std::vector heads(ograds.begin(), ograds.end()); + for (auto& h : n->inputs) { + heads.push_back(h); + } + uint32_t n_out = n->num_outputs(); + for (uint32_t i = 0; i < n_out; ++i) { + heads.emplace_back(n, i, 0); + } + + auto p = nnvm::Node::Create(); + p->attrs.op = nnvm::Op::Get("_backward_npx_cond"); + p->attrs.name = n->attrs.name; + p->attrs.dict = n->attrs.dict; + p->inputs = heads; + p->control_deps.emplace_back(n); + p->attrs.parsed = n->attrs.parsed; + CHECK_EQ(p->num_inputs(), p->inputs.size()) + << "Number of inputs to operator " << "_backward_npx_cond" << " (" << p->num_inputs() + << ") does not match the actual number of inputs provided to operator " + << n->attrs.name << " (" << p->inputs.size() << ")."; + + std::vector entries; + for (uint32_t i = 0; i < p->num_outputs(); ++i) + entries.emplace_back(nnvm::NodeEntry{p, i, 0}); + + entries[0].node->attrs.subgraphs = n->attrs.subgraphs; + return entries; +} + +NNVM_REGISTER_OP(_npx_foreach) +.add_alias("_npx_foreach") +.MXNET_DESCRIBE("Run a for loop over an NDArray with user-defined computation") +.set_attr_parser(ParamParser) +.set_attr("FInferStorageType", ForeachStorageType) +.set_num_inputs([](const NodeAttrs& attrs) { + const NPXForeachParam& params = nnvm::get(attrs.parsed); + return params.num_args; +}) +.set_num_outputs([](const NodeAttrs& attrs) { + const NPXForeachParam& params = nnvm::get(attrs.parsed); + return params.num_outputs; +}) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + const NPXForeachParam& params = nnvm::get(attrs.parsed); + std::vector names; + names.emplace_back("fn"); + for (int i = 0; i < params.num_args - 1; i++) + names.push_back("data" + std::to_string(i)); + return names; +}) +.set_attr("FInputGraph", + [](const NodeAttrs& attrs) { + return std::vector{0}; +}) +.set_attr("FGradient", ForeachGradient) +.set_attr("FCreateOpState", CreateForeachState) +.set_attr("FInferShape", ForeachShape) +.set_attr("FInferType", ForeachType) +.set_attr("FStatefulComputeEx", ForeachComputeExCPU) +// Foreach operator works like an executor. Its code will always run on CPU. +// So the same code can be registered for both CPU and GPU. 
+.set_attr("FStatefulComputeEx", ForeachComputeExCPU) +.set_attr("FExecType", [](const NodeAttrs& attrs) { + return ExecType::kSubgraphExec; +}) +.set_attr("key_var_num_args", "num_args") +.add_argument("fn", "Symbol", "Input graph.") +.add_argument("data", "NDArray-or-Symbol[]", + "The input arrays that include data arrays and states.") +.add_arguments(NPXForeachParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_npx_foreach) +.set_num_inputs([](const NodeAttrs& attrs){ + const NPXForeachParam& params = nnvm::get(attrs.parsed); + return params.num_outputs * 2 + params.num_args; +}) +.set_num_outputs([](const NodeAttrs& attrs){ + const NPXForeachParam& params = nnvm::get(attrs.parsed); + return params.num_args; +}) +.set_attr("FExecType", [](const NodeAttrs& attrs) { + return ExecType::kSubgraphExec; +}) +.set_attr("FInferStorageType", BackwardForeachStorageType) +.set_attr_parser(ParamParser) +.set_attr("TIsLayerOpBackward", true) +.set_attr("TIsBackward", true) +.set_attr("FStatefulComputeEx", ForeachGradComputeExCPU) +.set_attr("FStatefulComputeEx", ForeachGradComputeExCPU); + +NNVM_REGISTER_OP(_npx_while_loop) +.add_alias("_npx_while_loop") +.MXNET_DESCRIBE("Run a while loop over with user-defined condition and computation") +.set_attr_parser(ParamParser) +.set_attr("FInferStorageType", WhileLoopStorageType) +.set_num_inputs([](const NodeAttrs& attrs) { + const NPXWhileLoopParam& params = nnvm::get(attrs.parsed); + return params.num_args; +}) +.set_num_outputs([](const NodeAttrs& attrs) { + const NPXWhileLoopParam& params = nnvm::get(attrs.parsed); + return params.num_outputs; +}) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + const NPXWhileLoopParam& params = nnvm::get(attrs.parsed); + std::vector names; + names.reserve(params.num_args); + names.emplace_back("cond"); + names.emplace_back("func"); + for (int i = 2; i < params.num_args; i++) + names.push_back("data" + std::to_string(i - 2)); + return names; +}) +.set_attr("FInputGraph", + [](const NodeAttrs& attrs) { + return std::vector{0, 1}; +}) +.set_attr("FGradient", WhileLoopGradient) +.set_attr("FCreateOpState", CreateWhileLoopState) +.set_attr("FInferShape", WhileLoopShape) +.set_attr("FInferType", WhileLoopType) +.set_attr("FStatefulComputeEx", WhileLoopComputeExCPU) +.set_attr("FExecType", [](const NodeAttrs& attrs) { + return ExecType::kSubgraphExec; +}) +.set_attr("FStatefulComputeEx", WhileLoopComputeExCPU) +.set_attr("key_var_num_args", "num_args") +.add_argument("cond", "Symbol", "Input graph for the loop condition.") +.add_argument("func", "Symbol", "Input graph for the loop body.") +.add_argument("data", "NDArray-or-Symbol[]", + "The input arrays that include data arrays and states.") +.add_arguments(NPXWhileLoopParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_npx_while_loop) +.set_num_inputs([](const NodeAttrs& attrs){ + const NPXWhileLoopParam& params = nnvm::get(attrs.parsed); + return params.num_outputs * 2 + params.num_args; +}) +.set_num_outputs([](const NodeAttrs& attrs){ + const NPXWhileLoopParam& params = nnvm::get(attrs.parsed); + return params.num_args; +}) +.set_attr("FExecType", [](const NodeAttrs& attrs) { + return ExecType::kSubgraphExec; +}) +.set_attr("FInferStorageType", BackwardWhileLoopStorageType) +.set_attr_parser(ParamParser) +.set_attr("TIsLayerOpBackward", true) +.set_attr("TIsBackward", true) +.set_attr("FStatefulComputeEx", WhileLoopGradComputeExCPU) +.set_attr("FStatefulComputeEx", WhileLoopGradComputeExCPU); + +NNVM_REGISTER_OP(_npx_cond) +.add_alias("_npx_cond") 
+.MXNET_DESCRIBE("Run a if-then-else using user-defined condition and computation") +.set_attr_parser(ParamParser) +.set_attr("FInferStorageType", CondStorageType) +.set_num_inputs([](const NodeAttrs& attrs) { + const NPXCondParam& params = nnvm::get(attrs.parsed); + return params.num_args; +}) +.set_num_outputs([](const NodeAttrs& attrs) { + const NPXCondParam& params = nnvm::get(attrs.parsed); + return params.num_outputs; +}) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + const NPXCondParam& params = nnvm::get(attrs.parsed); + std::vector names; + names.reserve(params.num_args); + names.emplace_back("cond"); + names.emplace_back("then_branch"); + names.emplace_back("else_branch"); + for (int i = 3; i < params.num_args; ++i) + names.push_back("data" + std::to_string(i - 3)); + return names; +}) +.set_attr("FInputGraph", + [](const NodeAttrs& attrs) { + return std::vector{0, 1, 2}; +}) +.set_attr("FGradient", CondGradient) +.set_attr("FCreateOpState", CreateCondState) +.set_attr("FInferType", CondType) +.set_attr("FStatefulComputeEx", CondComputeExCPU) +.set_attr("FExecType", [](const NodeAttrs& attrs) { + return ExecType::kSubgraphExec; +}) +.set_attr("FStatefulComputeEx", CondComputeExCPU) +.set_attr("key_var_num_args", "num_args") +.add_argument("cond", "Symbol", "Input graph for the condition.") +.add_argument("then_branch", "Symbol", "Input graph for the then branch.") +.add_argument("else_branch", "Symbol", "Input graph for the else branch.") +.add_argument("data", "NDArray-or-Symbol[]", + "The input arrays that include data arrays and states.") +.add_arguments(NPXCondParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_npx_cond) +.set_num_inputs([](const NodeAttrs& attrs){ + const NPXCondParam& params = nnvm::get(attrs.parsed); + return params.num_outputs * 2 + params.num_args; +}) +.set_num_outputs([](const NodeAttrs& attrs){ + const NPXCondParam& params = nnvm::get(attrs.parsed); + return params.num_args; +}) +.set_attr("FExecType", [](const NodeAttrs& attrs) { + return ExecType::kSubgraphExec; +}) +.set_attr("FInferStorageType", BackwardCondStorageType) +.set_attr_parser(ParamParser) +.set_attr("TIsLayerOpBackward", true) +.set_attr("TIsBackward", true) +.set_attr("FStatefulComputeEx", CondGradComputeExCPU) +.set_attr("FStatefulComputeEx", CondGradComputeExCPU); +} // namespace op +} // namespace mxnet diff --git a/src/operator/npx_control_flow.h b/src/operator/npx_control_flow.h new file mode 100644 index 000000000000..8adc08a810bf --- /dev/null +++ b/src/operator/npx_control_flow.h @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+#ifndef MXNET_OPERATOR_NPX_CONTROL_FLOW_H_
+#define MXNET_OPERATOR_NPX_CONTROL_FLOW_H_
+
+#include <dmlc/parameter.h>
+#include <mxnet/tuple.h>
+
+#include <functional>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace mxnet {
+namespace op {
+
+struct NPXForeachParam : public dmlc::Parameter<NPXForeachParam> {
+  int num_args;
+  int num_outputs;
+  int num_out_data;
+  // The location of states in the subgraph inputs.
+  mxnet::Tuple<dim_t> in_state_locs;
+  // The location of data arrays in the subgraph inputs.
+  mxnet::Tuple<dim_t> in_data_locs;
+  // The location of remaining arrays in the subgraph inputs.
+  mxnet::Tuple<dim_t> remain_locs;
+  // The index mapping from out_states to in_states.
+  mxnet::Tuple<dim_t> in_state_index;
+  DMLC_DECLARE_PARAMETER(NPXForeachParam) {
+    DMLC_DECLARE_FIELD(num_args).set_lower_bound(1)
+    .describe("Number of inputs.");
+    DMLC_DECLARE_FIELD(num_outputs)
+    .describe("The number of outputs of the subgraph.");
+    DMLC_DECLARE_FIELD(num_out_data)
+    .describe("The number of output data of the subgraph.");
+    DMLC_DECLARE_FIELD(in_state_locs)
+    .describe("The locations of loop states among the inputs.");
+    DMLC_DECLARE_FIELD(in_data_locs)
+    .describe("The locations of input data among the inputs.");
+    DMLC_DECLARE_FIELD(remain_locs)
+    .describe("The locations of remaining data among the inputs.");
+    DMLC_DECLARE_FIELD(in_state_index)
+    .describe("The index mapping from out_states to in_states.");
+  }
+  void SetAttrDict(std::unordered_map<std::string, std::string>* dict) {
+    std::ostringstream num_args_s, num_outputs_s, num_out_data_s, in_state_locs_s,
+                       in_data_locs_s, remain_locs_s, in_state_index_s;
+    num_args_s << num_args;
+    num_outputs_s << num_outputs;
+    num_out_data_s << num_out_data;
+    in_state_locs_s << in_state_locs;
+    in_data_locs_s << in_data_locs;
+    remain_locs_s << remain_locs;
+    in_state_index_s << in_state_index;
+  }
+};  // struct NPXForeachParam
+
+struct NPXWhileLoopParam : public dmlc::Parameter<NPXWhileLoopParam> {
+  int num_args;
+  int num_outputs;
+  int num_out_data;
+  int max_iterations;
+  // `cond' and `func' each takes a subset of while_loop's inputs as that to their subgraphs
+  // `cond_input_locs' contains indices of inputs fed to `cond', and
+  // `func_input_locs' contains indices of inputs fed to `func'.
+  // `func_var_locs' are indices in which input "variables" are stored in func's inputs.
+  mxnet::Tuple<dim_t> cond_input_locs;
+  mxnet::Tuple<dim_t> func_input_locs;
+  mxnet::Tuple<dim_t> func_var_locs;
+  DMLC_DECLARE_PARAMETER(NPXWhileLoopParam) {
+    DMLC_DECLARE_FIELD(num_args).set_lower_bound(2)
+    .describe("Number of input arguments, including cond and func as two symbol inputs.");
+    DMLC_DECLARE_FIELD(num_outputs).set_lower_bound(1)
+    .describe("The number of outputs of the subgraph.");
+    DMLC_DECLARE_FIELD(num_out_data).set_lower_bound(0)
+    .describe("The number of outputs from the function body.");
+    DMLC_DECLARE_FIELD(max_iterations).set_lower_bound(1)
+    .describe("Maximum number of iterations.");
+    DMLC_DECLARE_FIELD(cond_input_locs)
+    .describe("The locations of cond's inputs in the given inputs.");
+    DMLC_DECLARE_FIELD(func_input_locs)
+    .describe("The locations of func's inputs in the given inputs.");
+    DMLC_DECLARE_FIELD(func_var_locs)
+    .describe("The locations of loop_vars among func's inputs.");
+  }
+  void SetAttrDict(std::unordered_map<std::string, std::string>* dict) {
+    std::ostringstream num_args_s, num_outputs_s, num_out_data_s, max_iterations_s,
+                       cond_input_locs_s, func_input_locs_s, func_var_locs_s;
+    num_args_s << num_args;
+    num_outputs_s << num_outputs;
+    num_out_data_s << num_out_data;
+    max_iterations_s << max_iterations;
+    cond_input_locs_s << cond_input_locs;
+    func_input_locs_s << func_input_locs;
+    func_var_locs_s << func_var_locs;
+  }
+  template <typename T>
+  bool sync_in_out(std::vector<T> *in,
+                   std::vector<T> *out,
+                   std::function<bool(const T &)> is_empty) const {
+    for (int i = this->num_out_data; i < this->num_outputs; ++i) {
+      // each out->at(i) is a params, loop_var
+      T &x = in->at(this->func_input_locs[this->func_var_locs[i - this->num_out_data]]);
+      T &y = out->at(i);
+      fill_value(&x, &y, is_empty(x), is_empty(y));
+    }
+    return true;
+  }
+};  // struct NPXWhileLoopParam
+
+struct NPXCondParam : public dmlc::Parameter<NPXCondParam> {
+  int num_args;
+  int num_outputs;
+  mxnet::Tuple<dim_t> cond_input_locs;
+  mxnet::Tuple<dim_t> then_input_locs;
+  mxnet::Tuple<dim_t> else_input_locs;
+  DMLC_DECLARE_PARAMETER(NPXCondParam) {
+    DMLC_DECLARE_FIELD(num_args).set_lower_bound(3)
+    .describe("Number of input arguments, including cond, then and else as three symbol inputs.");
+    DMLC_DECLARE_FIELD(num_outputs).set_lower_bound(1)
+    .describe("The number of outputs of the subgraph.");
+    DMLC_DECLARE_FIELD(cond_input_locs)
+    .describe("The locations of cond's inputs in the given inputs.");
+    DMLC_DECLARE_FIELD(then_input_locs)
+    .describe("The locations of then's inputs in the given inputs.");
+    DMLC_DECLARE_FIELD(else_input_locs)
+    .describe("The locations of else's inputs in the given inputs.");
+  }
+  void SetAttrDict(std::unordered_map<std::string, std::string>* dict) {
+    std::ostringstream num_args_s, num_outputs_s,
+                       cond_input_locs_s, then_input_locs_s, else_input_locs_s;
+    num_args_s << num_args;
+    num_outputs_s << num_outputs;
+    cond_input_locs_s << cond_input_locs;
+    then_input_locs_s << then_input_locs;
+    else_input_locs_s << else_input_locs;
+  }
+};  // struct NPXCondParam
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_NPX_CONTROL_FLOW_H_
diff --git a/src/operator/pad.cc b/src/operator/pad.cc
index b6dc0a7536be..da59ae70541a 100644
--- a/src/operator/pad.cc
+++ b/src/operator/pad.cc
@@ -767,7 +767,9 @@ Example::
 .add_argument("data", "NDArray-or-Symbol", "An n-dimensional input array.")
 .add_arguments(PadParam::__FIELDS__());
 
-NNVM_REGISTER_OP(Pad).add_alias("pad");
+NNVM_REGISTER_OP(Pad)
+.add_alias("pad")
+.add_alias("_npx_pad");
 
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/quantization/quantized_activation.cc
b/src/operator/quantization/quantized_activation.cc index e2c302ba0c59..51b1065d7163 100644 --- a/src/operator/quantization/quantized_activation.cc +++ b/src/operator/quantization/quantized_activation.cc @@ -82,6 +82,7 @@ inline static bool QuantizedActivationStorageType(const nnvm::NodeAttrs &attrs, } NNVM_REGISTER_OP(_contrib_quantized_act) +.add_alias("_npx_quantized_act") .describe(R"code(Activation operator for input and output data type of int8. The input and output data comes with min and max thresholds for quantizing the float32 data into int8. diff --git a/src/operator/quantization/quantized_conv.cc b/src/operator/quantization/quantized_conv.cc index 4d7178b2c620..08b855bdb496 100644 --- a/src/operator/quantization/quantized_conv.cc +++ b/src/operator/quantization/quantized_conv.cc @@ -178,6 +178,7 @@ bool QuantizedConvStorageType(const nnvm::NodeAttrs& attrs, } NNVM_REGISTER_OP(_contrib_quantized_conv) +.add_alias("_npx_quantized_conv") .describe(R"code(Convolution operator for input, weight and bias data type of int8, and accumulates in type int32 for the output. For each argument, two more arguments of type float32 must be provided representing the thresholds of quantizing argument from data diff --git a/src/operator/quantization/quantized_elemwise_add.cc b/src/operator/quantization/quantized_elemwise_add.cc index f335a078bf3d..92483af60dcc 100644 --- a/src/operator/quantization/quantized_elemwise_add.cc +++ b/src/operator/quantization/quantized_elemwise_add.cc @@ -91,6 +91,7 @@ void QuantizedElemwiseAddForward(const nnvm::NodeAttrs& attrs, } NNVM_REGISTER_OP(_contrib_quantized_elemwise_add) +.add_alias("_npx_quantized_elemwise_add") .describe(R"code(elemwise_add operator for input dataA and input dataB data type of int8, and accumulates in type int32 for the output. For each argument, two more arguments of type float32 must be provided representing the thresholds of quantizing argument from data diff --git a/src/operator/quantization/quantized_elemwise_mul.cc b/src/operator/quantization/quantized_elemwise_mul.cc index 0988a8bcceaf..6d112af34418 100644 --- a/src/operator/quantization/quantized_elemwise_mul.cc +++ b/src/operator/quantization/quantized_elemwise_mul.cc @@ -217,6 +217,7 @@ void QuantizedElemwiseMulOpForward(const nnvm::NodeAttrs &attrs, } NNVM_REGISTER_OP(_contrib_quantized_elemwise_mul) +.add_alias("_npx_quantized_elemwise_mul") .describe(R"code(Multiplies arguments int8 element-wise. 
)code" ADD_FILELINE) .set_num_inputs(6) diff --git a/src/operator/quantization/quantized_flatten.cc b/src/operator/quantization/quantized_flatten.cc index c8bf3e2ca762..ead46f2c52fb 100644 --- a/src/operator/quantization/quantized_flatten.cc +++ b/src/operator/quantization/quantized_flatten.cc @@ -29,6 +29,7 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_contrib_quantized_flatten) +.add_alias("_npx_quantized_flatten") .set_num_inputs(3) .set_num_outputs(3) .set_attr("FInferShape", QuantizedFlattenShape) diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc index 0158ebda0229..644b55a6923d 100644 --- a/src/operator/quantization/quantized_fully_connected.cc +++ b/src/operator/quantization/quantized_fully_connected.cc @@ -303,6 +303,7 @@ void QuantizedFullyConnectedForwardExCPU(const nnvm::NodeAttrs &attrs, #endif NNVM_REGISTER_OP(_contrib_quantized_fully_connected) +.add_alias("_npx_quantized_fully_connected") .describe(R"code(Fully Connected operator for input, weight and bias data type of int8, and accumulates in type int32 for the output. For each argument, two more arguments of type float32 must be provided representing the thresholds of quantizing argument from data diff --git a/src/operator/quantization/quantized_indexing_op.cc b/src/operator/quantization/quantized_indexing_op.cc index 7f5af2e6a36b..7f45c9955172 100644 --- a/src/operator/quantization/quantized_indexing_op.cc +++ b/src/operator/quantization/quantized_indexing_op.cc @@ -130,6 +130,7 @@ void QuantizedEmbeddingOpForward(const nnvm::NodeAttrs& attrs, } NNVM_REGISTER_OP(_contrib_quantized_embedding) +.add_alias("_npx_quantized_embedding") .describe(R"code(Maps integer indices to int8 vector representations (embeddings). )code" ADD_FILELINE) .set_num_inputs(4) diff --git a/src/operator/quantization/quantized_pooling.cc b/src/operator/quantization/quantized_pooling.cc index f23ecb3a987d..1b5da8784eea 100644 --- a/src/operator/quantization/quantized_pooling.cc +++ b/src/operator/quantization/quantized_pooling.cc @@ -178,6 +178,7 @@ inline static bool QuantizedPoolingStorageType(const nnvm::NodeAttrs &attrs, } NNVM_REGISTER_OP(_contrib_quantized_pooling) +.add_alias("_npx_quantized_pooling") .describe(R"code(Pooling operator for input and output data type of int8. The input and output data comes with min and max thresholds for quantizing the float32 data into int8. diff --git a/src/operator/quantization/requantize.cc b/src/operator/quantization/requantize.cc index e4124e0ccbe3..ff6c50e16624 100644 --- a/src/operator/quantization/requantize.cc +++ b/src/operator/quantization/requantize.cc @@ -50,6 +50,7 @@ bool RequantizeStorageType(const nnvm::NodeAttrs& attrs, } NNVM_REGISTER_OP(_contrib_requantize) +.add_alias("_npx_requantize") .describe(R"code(Given data that is quantized in int32 and the corresponding thresholds, requantize the data into int8 using min and max thresholds either calculated at runtime or from calibration. 
It's highly recommended to pre-calucate the min and max thresholds diff --git a/src/operator/sequence_last.cc b/src/operator/sequence_last.cc index 3a6cdbad6149..a5c40055921a 100644 --- a/src/operator/sequence_last.cc +++ b/src/operator/sequence_last.cc @@ -111,5 +111,8 @@ Example:: "vector of sequence lengths of the form [batch_size]") .add_arguments(SequenceLastParam::__FIELDS__()); +NNVM_REGISTER_OP(SequenceLast) +.add_alias("_npx_sequence_last"); + } // namespace op } // namespace mxnet diff --git a/src/operator/sequence_reverse.cc b/src/operator/sequence_reverse.cc index 0621d3470d80..591c9bae049b 100644 --- a/src/operator/sequence_reverse.cc +++ b/src/operator/sequence_reverse.cc @@ -127,5 +127,8 @@ Example:: "vector of sequence lengths of the form [batch_size]") .add_arguments(SequenceReverseParam::__FIELDS__()); +NNVM_REGISTER_OP(SequenceReverse) +.add_alias("_npx_sequence_reverse"); + } // namespace op } // namespace mxnet diff --git a/src/operator/slice_channel.cc b/src/operator/slice_channel.cc index b051b9b90309..a8e7a36b9eba 100644 --- a/src/operator/slice_channel.cc +++ b/src/operator/slice_channel.cc @@ -109,7 +109,9 @@ Example:: .add_argument("data", "NDArray-or-Symbol", "The input") .add_arguments(SliceChannelParam::__FIELDS__()); -NNVM_REGISTER_OP(SliceChannel).add_alias("split"); +NNVM_REGISTER_OP(SliceChannel) +.add_alias("split") +.add_alias("_npx_slice_channel"); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/elemwise_binary_broadcast_op_logic.cc b/src/operator/tensor/elemwise_binary_broadcast_op_logic.cc index cd433e00a770..d7e51418e4e4 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op_logic.cc +++ b/src/operator/tensor/elemwise_binary_broadcast_op_logic.cc @@ -83,6 +83,9 @@ Example:: .set_attr("FCompute", BinaryBroadcastCompute) .set_attr("FGradient", MakeZeroGradNodes); +NNVM_REGISTER_OP(broadcast_greater) +.add_alias("_npx_broadcast_greater"); + MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(broadcast_greater_equal) .describe(R"code(Returns the result of element-wise **greater than or equal to** (>=) comparison operation with broadcasting. 
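The hunks above only attach `_npx_*` aliases to existing legacy kernels (Pad, the quantized operators, SequenceLast, SequenceReverse, SliceChannel, broadcast_greater) so they stay reachable from the numpy-extension front end; no kernel code changes. As an illustrative sketch only, assuming MXNet's prefix-based operator registration exposes `_npx_sequence_last` as `mx.npx.sequence_last` (the generated Python signature itself is not part of this patch):

    import mxnet as mx

    # (seq_len, batch, feature) input and per-sample valid lengths
    data = mx.np.random.uniform(size=(4, 3, 2))
    lengths = mx.np.array([4, 2, 3], dtype='float32')

    # The `_npx_sequence_last` alias added above is expected to surface the
    # same SequenceLast kernel under the npx namespace.
    last = mx.npx.sequence_last(data, lengths, use_sequence_length=True)
    assert last.shape == (3, 2)  # last valid step of each sequence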
diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc index a739db305172..064c828210c7 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cc +++ b/src/operator/tensor/elemwise_unary_op_basic.cc @@ -226,6 +226,9 @@ The storage type of ``softsign`` output is always dense .set_attr("FCompute", UnaryOp::Compute) .set_attr("FGradient", ElemwiseGradUseIn{"_backward_softsign"}); +NNVM_REGISTER_OP(softsign) +.add_alias("_npx_softsign"); + MXNET_OPERATOR_REGISTER_BINARY(_backward_softsign) .set_attr("FCompute", ElemwiseBinaryOp::Compute >); diff --git a/tests/nightly/model_backwards_compatibility_check/common.py b/tests/nightly/model_backwards_compatibility_check/common.py index 69f9ffd468ca..ae3590e3b153 100644 --- a/tests/nightly/model_backwards_compatibility_check/common.py +++ b/tests/nightly/model_backwards_compatibility_check/common.py @@ -156,6 +156,7 @@ def forward(self, x): return x +@mx.util.use_np class HybridNet(gluon.HybridBlock): def __init__(self, **kwargs): super(HybridNet, self).__init__(**kwargs) @@ -166,14 +167,14 @@ def __init__(self, **kwargs): self.fc1 = nn.Dense(500) self.fc2 = nn.Dense(2) - def hybrid_forward(self, F, x): - x = self.pool1(F.tanh(self.conv1(x))) - x = self.pool2(F.tanh(self.conv2(x))) + def forward(self, x): + x = self.pool1(mx.np.tanh(self.conv1(x))) + x = self.pool2(mx.np.tanh(self.conv2(x))) # 0 means copy over size from corresponding dimension. # -1 means infer size from the rest of dimensions. x = x.reshape((0, -1)) - x = F.tanh(self.fc1(x)) - x = F.tanh(self.fc2(x)) + x = mx.np.tanh(self.fc1(x)) + x = mx.np.tanh(self.fc2(x)) return x diff --git a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py index 3091edf5b2a6..a292621479e5 100644 --- a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py +++ b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py @@ -54,12 +54,12 @@ def test_lenet_gluon_hybrid_imports_api(): logging.warn('No training files found for %s for MXNet version : %s' % (model_name, folder)) continue # Load the model and perform inference - data = mx.nd.load(''.join([model_name, '-data'])) + data = mx.np.load(''.join([model_name, '-data'])) test_data = data['data'] loaded_model = HybridNet() loaded_model = gluon.SymbolBlock.imports(model_name + '-symbol.json', ['data'], model_name + '-0000.params') output = loaded_model(test_data) - old_inference_results = mx.nd.load(model_name + '-inference')['inference'] + old_inference_results = mx.np.load(model_name + '-inference')['inference'] assert_almost_equal(old_inference_results.asnumpy(), output.asnumpy(), rtol=rtol_default, atol=atol_default) clean_model_files(model_files, model_name) logging.info('=================================') diff --git a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py index 573119a8f731..9d5e98159976 100644 --- a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py +++ b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py @@ -38,6 +38,7 @@ def train_lenet_gluon_save_params_api(): net.save_params(os.path.join(get_model_path(model_name), ''.join([model_name, '-params']))) +@mx.util.use_np def train_lenet_gluon_hybrid_export_api(): model_name = 
'lenet_gluon_hybrid_export_api' logging.info('Saving files for model %s' % model_name) @@ -47,14 +48,14 @@ def train_lenet_gluon_hybrid_export_api(): net.initialize(weights, ctx=[mx.cpu(0)]) net.hybridize() # Prepare data - test_data = mx.nd.array(np.random.uniform(-1, 1, size=(20, 1, 30, 30))) + test_data = mx.np.array(np.random.uniform(-1, 1, size=(20, 1, 30, 30))) output = net(test_data) # print (y) # Save the test data as well. # Save the inference output ys # Save the model params - mx.nd.save(os.path.join(get_model_path(model_name), ''.join([model_name, '-data'])), {'data': test_data}) + mx.npx.savez(os.path.join(get_model_path(model_name), ''.join([model_name, '-data'])), {'data': test_data}) save_inference_results(output, model_name) if compare_versions(str(mxnet_version) , '1.1.0') < 0: # v1.0.0 does not have the epoch param in the .exports API. Hence adding this safety net diff --git a/tests/python/gpu/test_fusion.py b/tests/python/gpu/test_fusion.py index 4bb3a862144a..a18be14460d0 100644 --- a/tests/python/gpu/test_fusion.py +++ b/tests/python/gpu/test_fusion.py @@ -281,10 +281,10 @@ class Foo(HybridBlock): def __init__(self): super(Foo, self).__init__() - def hybrid_forward(self, F, valid_length): + def forward(self, valid_length): mask = valid_length.astype(np.float32) mask2 = valid_length.astype(np.float32) - mask = mask * F.np.expand_dims(mask2, axis=-1) + mask = mask * mx.np.expand_dims(mask2, axis=-1) return mask foo = Foo() @@ -292,6 +292,7 @@ def hybrid_forward(self, F, valid_length): out = foo(mx.np.ones((10,), ctx=mx.gpu(), dtype=np.bool)) mx.npx.waitall() +@use_np def test_fusion_different_dimensions(): from mxnet.gluon import HybridBlock @@ -299,36 +300,37 @@ class Foo(HybridBlock): def __init__(self): super(Foo, self).__init__() - def hybrid_forward(self, F, x): + def forward(self, x): mask2 = x.astype(np.float32) - mask = F.expand_dims(mask2, axis=-1) + mask = mx.np.expand_dims(mask2, axis=-1) return mask foo = Foo() foo.hybridize(static_alloc=True) # Pass 1-D data - out = foo(mx.nd.ones((10,), ctx=mx.gpu())) + out = foo(mx.np.ones((10,), ctx=mx.gpu())) assert np.all(out.asnumpy() == np.ones((10,1))) assert out.shape == (10,1) # Pass 2-D data - out = foo(mx.nd.ones((10,10), ctx=mx.gpu())) + out = foo(mx.np.ones((10,10), ctx=mx.gpu())) assert np.all(out.asnumpy() == np.ones((10,10))) assert out.shape == (10,10,1) +@use_np def test_input_reorder(): class Block(gluon.HybridBlock): def __init__(self, **kwargs): super(Block, self).__init__(**kwargs) - def hybrid_forward(self, F, x, y, z): + def forward(self, x, y, z): s = x * 2 s2 = s + z - s = F.broadcast_add(s, y * y) - return F.dot(s, s2) + s = mx.np.add(s, y * y) + return mx.np.dot(s, s2) for static_alloc in (False, True): arg_shapes = [(10, 10), (10, 1), (10, 10)] - arg_data = [mx.random.uniform(shape=s) for s in arg_shapes] + arg_data = [mx.np.random.uniform(size=s) for s in arg_shapes] arrays = {} for use_fusion in ('0', '1'): @@ -348,21 +350,22 @@ def hybrid_forward(self, F, x, y, z): for key in ['result'] + list(range(len(arg_data))): assert_allclose(arrays['0'][key].asnumpy(), arrays['1'][key].asnumpy()) +@use_np def test_fusion_cycle(): class Test(gluon.nn.HybridBlock): def __init__(self, **kwargs): super(Test, self).__init__(**kwargs) - def hybrid_forward(self, F, x, y): - x = F.relu(x) - y = F.relu(y) - z1 = F.expand_dims(F.sum_axis(x, axis=1), axis=1) - z2 = F.expand_dims(F.sum_axis(y, axis=1), axis=1) + def forward(self, x, y): + x = mx.npx.relu(x) + y = mx.npx.relu(y) + z1 = 
mx.np.expand_dims(mx.np.sum(x, axis=1), axis=1) + z2 = mx.np.expand_dims(mx.np.sum(y, axis=1), axis=1) return x + z2, y + z1 t = Test() - a = mx.nd.zeros(shape=(10,1), ctx=mx.gpu()) - b = mx.nd.zeros(shape=(10,1), ctx=mx.gpu()) + a = mx.np.zeros(shape=(10,1), ctx=mx.gpu()) + b = mx.np.zeros(shape=(10,1), ctx=mx.gpu()) t.hybridize(static_alloc=True, static_shape=True) out = t(a, b) - mx.nd.waitall() + mx.npx.waitall() diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py index 310643bc19f7..9b42e7452516 100644 --- a/tests/python/gpu/test_gluon_gpu.py +++ b/tests/python/gpu/test_gluon_gpu.py @@ -21,8 +21,7 @@ import mxnet as mx import multiprocessing as mp from mxnet.test_utils import check_consistency, set_default_context, assert_almost_equal, rand_ndarray, environment -import mxnet.ndarray as nd -import numpy as np +import numpy as _np import math from mxnet import autograd import pytest @@ -41,12 +40,12 @@ def check_rnn_layer(layer): layer.initialize(ctx=[mx.cpu(0), mx.gpu(0)]) with mx.gpu(0): - x = mx.nd.ones((10, 16, 30)) + x = mx.np.ones((10, 16, 30)) states = layer.begin_state(16) go, gs = layer(x, states) with mx.cpu(0): - x = mx.nd.ones((10, 16, 30)) + x = mx.np.ones((10, 16, 30)) states = layer.begin_state(16) co, cs = layer(x, states) @@ -57,7 +56,7 @@ def check_rnn_layer(layer): def check_rnn_layer_w_rand_inputs(layer): layer.initialize(ctx=[mx.cpu(0), mx.gpu(0)]) - x = mx.nd.uniform(shape=(10, 16, 30)) + x = mx.np.random.uniform(size=(10, 16, 30)) with mx.gpu(0): x = x.copyto(mx.gpu(0)) states = layer.begin_state(16) @@ -73,6 +72,7 @@ def check_rnn_layer_w_rand_inputs(layer): assert_almost_equal(g, c) +@mx.util.use_np @assert_raises_cudnn_not_satisfied(min_version='7.2.1') def test_lstmp(): hidden_size, projection_size = 3, 2 @@ -80,14 +80,14 @@ def test_lstmp(): batch_size, seq_len = 7, 11 input_size = 5 ctx = mx.gpu(0) - lstm_input = mx.nd.uniform( - shape=(seq_len, batch_size, input_size), ctx=ctx) + lstm_input = mx.np.random.uniform( + size=(seq_len, batch_size, input_size), ctx=ctx) shapes = {'i2h_weight': (hidden_size * 4, input_size), 'h2h_weight': (hidden_size * 4, projection_size), 'i2h_bias': (hidden_size * 4,), 'h2h_bias': (hidden_size * 4,), 'h2r_weight': (projection_size, hidden_size)} - weights = {k: rand_ndarray(v) for k, v in shapes.items()} + weights = {k: rand_ndarray(v).as_np_ndarray() for k, v in shapes.items()} lstm_layer = gluon.rnn.LSTM(hidden_size, projection_size=projection_size, input_size=input_size) lstm_cell = gluon.rnn.LSTMPCell(hidden_size=hidden_size, @@ -114,14 +114,14 @@ def test_lstmp(): print('checking gradient for {}'.format('lstm0_l0_' + k)) assert_almost_equal(layer_grad, cell_grad, rtol=rtol, atol=atol) check_rnn_layer_forward(gluon.rnn.LSTM( - 10, 2, projection_size=5), mx.nd.ones((8, 3, 20)), ctx=ctx) - check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, projection_size=5, bidirectional=True), mx.nd.ones( - (8, 3, 20)), [mx.nd.ones((4, 3, 5)), mx.nd.ones((4, 3, 10))], ctx=ctx) - check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, dropout=0.5, projection_size=5), mx.nd.ones((8, 3, 20)), + 10, 2, projection_size=5), mx.np.ones((8, 3, 20)), ctx=ctx) + check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, projection_size=5, bidirectional=True), mx.np.ones( + (8, 3, 20)), [mx.np.ones((4, 3, 5)), mx.np.ones((4, 3, 10))], ctx=ctx) + check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, dropout=0.5, projection_size=5), mx.np.ones((8, 3, 20)), run_only=True, ctx=ctx) check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, bidirectional=True, 
dropout=0.5, projection_size=5), - mx.nd.ones((8, 3, 20)), - [mx.nd.ones((4, 3, 5)), mx.nd.ones((4, 3, 10))], run_only=True, ctx=ctx) + mx.np.ones((8, 3, 20)), + [mx.np.ones((4, 3, 5)), mx.np.ones((4, 3, 10))], run_only=True, ctx=ctx) lstm_layer.save_parameters('gpu_tmp.params') lstm_layer.load_parameters('gpu_tmp.params') @@ -133,10 +133,10 @@ def test_lstm_clip(): batch_size, seq_len = 32, 80 input_size = 50 clip_min, clip_max, clip_nan = -5, 5, True - lstm_input = mx.nd.uniform( - shape=(seq_len, batch_size, input_size), ctx=mx.gpu(0)) - lstm_states = [mx.nd.uniform(shape=(2, batch_size, projection_size), ctx=mx.gpu(0)), - mx.nd.uniform(shape=(2, batch_size, hidden_size), ctx=mx.gpu(0))] + lstm_input = mx.np.random.uniform( + size=(seq_len, batch_size, input_size), ctx=mx.gpu(0)) + lstm_states = [mx.np.random.uniform(size=(2, batch_size, projection_size), ctx=mx.gpu(0)), + mx.np.random.uniform(size=(2, batch_size, hidden_size), ctx=mx.gpu(0))] lstm_layer = gluon.rnn.LSTM(hidden_size, projection_size=projection_size, input_size=input_size, bidirectional=True, @@ -146,9 +146,9 @@ def test_lstm_clip(): lstm_layer.initialize(ctx=mx.gpu(0)) with autograd.record(): _, layer_output_states = lstm_layer(lstm_input, lstm_states) - cell_states = layer_output_states[0].asnumpy() + cell_states = layer_output_states[0] assert (cell_states >= clip_min).all() and (cell_states <= clip_max).all() - assert not np.isnan(cell_states).any() + assert not _np.isnan(cell_states).any() @assert_raises_cudnn_not_satisfied(min_version='5.1.10') @@ -163,6 +163,7 @@ def test_rnn_layer(): 100, num_layers=3, bidirectional=True)) +@mx.util.use_np def check_layer_bidirectional(size, in_size, proj_size): class RefBiLSTM(gluon.Block): def __init__(self, size, proj_size, **kwargs): @@ -174,26 +175,26 @@ def __init__(self, size, proj_size, **kwargs): def forward(self, inpt): fwd = self._lstm_fwd(inpt) - bwd_inpt = nd.flip(inpt, 0) + bwd_inpt = mx.np.flip(inpt, 0) bwd = self._lstm_bwd(bwd_inpt) - bwd = nd.flip(bwd, 0) - return nd.concat(fwd, bwd, dim=2) + bwd = mx.np.flip(bwd, 0) + return mx.np.concatenate([fwd, bwd], axis=2) weights = {} for d in ['l', 'r']: - weights['{}0_i2h_weight'.format(d)] = mx.random.uniform( - shape=(size * 4, in_size)) + weights['{}0_i2h_weight'.format(d)] = mx.np.random.uniform( + size=(size * 4, in_size)) if proj_size: - weights['{}0_h2h_weight'.format(d)] = mx.random.uniform( - shape=(size * 4, proj_size)) - weights['{}0_h2r_weight'.format(d)] = mx.random.uniform( - shape=(proj_size, size)) + weights['{}0_h2h_weight'.format(d)] = mx.np.random.uniform( + size=(size * 4, proj_size)) + weights['{}0_h2r_weight'.format(d)] = mx.np.random.uniform( + size=(proj_size, size)) else: weights['{}0_h2h_weight'.format( - d)] = mx.random.uniform(shape=(size * 4, size)) + d)] = mx.np.random.uniform(size=(size * 4, size)) weights['{}0_i2h_bias'.format( - d)] = mx.random.uniform(shape=(size * 4,)) + d)] = mx.np.random.uniform(size=(size * 4,)) weights['{}0_h2h_bias'.format( - d)] = mx.random.uniform(shape=(size * 4,)) + d)] = mx.np.random.uniform(size=(size * 4,)) net = gluon.rnn.LSTM(size, projection_size=proj_size, bidirectional=True) @@ -207,7 +208,7 @@ def forward(self, inpt): ref_net_params[k.replace('l0', '_lstm_fwd.l0').replace( 'r0', '_lstm_bwd.l0')].set_data(weights[k]) - data = mx.random.uniform(shape=(11, 10, in_size)) + data = mx.np.random.uniform(size=(11, 10, in_size)) mx.test_utils.assert_allclose(net(data), ref_net(data), rtol=1e-6) @@ -215,10 +216,10 @@ def forward(self, inpt): def 
check_layer_bidirectional_varseqlen(size, in_size): weights = {} for d in ['l', 'r']: - weights['{}0_i2h_weight'.format(d)] = mx.random.uniform(shape=(size*4, in_size)) - weights['{}0_h2h_weight'.format(d)] = mx.random.uniform(shape=(size*4, size)) - weights['{}0_i2h_bias'.format(d)] = mx.random.uniform(shape=(size*4,)) - weights['{}0_h2h_bias'.format(d)] = mx.random.uniform(shape=(size*4,)) + weights['{}0_i2h_weight'.format(d)] = mx.np.random.uniform(size=(size*4, in_size)) + weights['{}0_h2h_weight'.format(d)] = mx.np.random.uniform(size=(size*4, size)) + weights['{}0_i2h_bias'.format(d)] = mx.np.random.uniform(size=(size*4,)) + weights['{}0_h2h_bias'.format(d)] = mx.np.random.uniform(size=(size*4,)) net = gluon.rnn.LSTM(size, bidirectional=True, use_sequence_length=True) ref_net = gluon.rnn.LSTM(size, bidirectional=True, use_sequence_length=False) @@ -232,10 +233,10 @@ def check_layer_bidirectional_varseqlen(size, in_size): batch_size = 10 num_timesteps = 11 - data = mx.random.uniform(shape=(num_timesteps, batch_size, in_size)) + data = mx.np.random.uniform(size=(num_timesteps, batch_size, in_size)) data_np = data.asnumpy() - sequence_length = nd.random.randint(1, num_timesteps+1, shape=(batch_size)).astype("int32") + sequence_length = mx.np.random.randint(1, num_timesteps+1, size=(batch_size)).astype("int32") sequence_length_np = sequence_length.asnumpy().astype("int32") # Reference net is processing batch elements one at a time, so that it is "perfectly sized" @@ -248,7 +249,7 @@ def check_layer_bidirectional_varseqlen(size, in_size): net_output = net(data.copy(), sequence_length=sequence_length.copy()) for b in range(batch_size): - data_slice = mx.nd.array(data_np[:sequence_length_np[b], b, :]).reshape(sequence_length_np[b], 1, in_size) + data_slice = mx.np.array(data_np[:sequence_length_np[b], b, :]).reshape(sequence_length_np[b], 1, in_size) ref_output_slice = ref_net(data_slice) ref_net_output.append(ref_output_slice) @@ -291,7 +292,7 @@ def test_layer_bidirectional_varseqlength(): @assert_raises_cudnn_not_satisfied(min_version='5.1.10') def test_rnn_layer_begin_state_type(): - fake_data = nd.random.uniform(shape=(3, 5, 7), dtype='float16') + fake_data = mx.np.random.uniform(size=(3, 5, 7), dtype='float16') modeling_layer = gluon.rnn.LSTM( hidden_size=11, num_layers=2, dropout=0.2, bidirectional=True) modeling_layer.cast('float16') @@ -301,10 +302,9 @@ def test_rnn_layer_begin_state_type(): def test_gluon_ctc_consistency(): loss = mx.gluon.loss.CTCLoss() - data = mx.nd.arange(0, 4, repeat=40, ctx=mx.gpu(0) - ).reshape((2, 20, 4)).flip(axis=0) - cpu_label = mx.nd.array([[2, 1, -1, -1], [3, 2, 2, -1]], ctx=mx.cpu(0)) - gpu_label = mx.nd.array([[2, 1, -1, -1], [3, 2, 2, -1]], ctx=mx.gpu(0)) + data = mx.np.flip(mx.np.repeat(mx.np.arange(0, 4, ctx=mx.gpu(0)), 40).reshape((2, 20, 4)), axis=0) + cpu_label = mx.np.array([[2, 1, -1, -1], [3, 2, 2, -1]], ctx=mx.cpu(0)) + gpu_label = mx.np.array([[2, 1, -1, -1], [3, 2, 2, -1]], ctx=mx.gpu(0)) cpu_data = data.copy().as_in_context(mx.cpu(0)) cpu_data.attach_grad() @@ -323,20 +323,20 @@ def test_gluon_ctc_consistency(): def test_global_norm_clip_multi_device(): for check_isfinite in [True, False]: - x1 = mx.nd.ones((3, 3), ctx=mx.gpu(0)) - x2 = mx.nd.ones((4, 4), ctx=mx.cpu(0)) - x3 = mx.nd.ones((7, 4), ctx=mx.gpu(0)) - x4 = mx.nd.ones((7, 4), ctx=mx.cpu(0)) + x1 = mx.np.ones((3, 3), ctx=mx.gpu(0)) + x2 = mx.np.ones((4, 4), ctx=mx.cpu(0)) + x3 = mx.np.ones((7, 4), ctx=mx.gpu(0)) + x4 = mx.np.ones((7, 4), ctx=mx.cpu(0)) norm = 
gluon.utils.clip_global_norm( [x1, x2, x3, x4], 1.0, check_isfinite=check_isfinite) if check_isfinite: assert norm == 9.0 else: - assert norm.asscalar() == 9.0 - assert_almost_equal(x1, np.ones((3, 3)) / 9) - assert_almost_equal(x2, np.ones((4, 4)) / 9) - assert_almost_equal(x3, np.ones((7, 4)) / 9) - assert_almost_equal(x4, np.ones((7, 4)) / 9) + assert norm.item() == 9.0 + assert_almost_equal(x1, _np.ones((3, 3)) / 9) + assert_almost_equal(x2, _np.ones((4, 4)) / 9) + assert_almost_equal(x3, _np.ones((7, 4)) / 9) + assert_almost_equal(x4, _np.ones((7, 4)) / 9) def _check_batchnorm_result(input, num_devices=1, cuda=False): @@ -388,7 +388,7 @@ def _syncParameters(bn1, bn2, ctx): mx.autograd.backward(loss1) mx.autograd.backward(loss2) - output2 = mx.nd.concat(*[output.as_in_context(input.context) for output in output2], dim=0) + output2 = mx.np.concatenate([output.as_in_context(input.context) for output in output2], axis=0) # assert forwarding assert_almost_equal(input1, input2, atol=1e-3, rtol=1e-3) assert_almost_equal(output1, output2, atol=1e-3, rtol=1e-3) @@ -398,14 +398,15 @@ def _syncParameters(bn1, bn2, ctx): assert_almost_equal(_find_bn(bn1).running_var.data(ctx_list[0]), _find_bn(bn2).running_var.data(ctx_list[0]), atol=1e-3, rtol=1e-3) - input2grad = mx.nd.concat(*[output.grad.as_in_context(input.context) for output in inputs2], dim=0) + input2grad = mx.np.concatenate([output.grad.as_in_context(input.context) for output in inputs2], axis=0) assert_almost_equal(input1.grad, input2grad, atol=1e-3, rtol=1e-3) +@mx.util.use_np def test_sync_batchnorm(): def get_num_devices(): for i in range(100): try: - mx.nd.zeros((1,), ctx=mx.gpu(i)) + mx.np.zeros((1,), ctx=mx.gpu(i)) except: return i # no need to use SyncBN with 1 gpu @@ -414,7 +415,7 @@ def get_num_devices(): ndev = 2 # check with unsync version for i in range(10): - _check_batchnorm_result(mx.nd.random.uniform(shape=(4, 1, 4, 4)), + _check_batchnorm_result(mx.np.random.uniform(size=(4, 1, 4, 4)), num_devices=ndev, cuda=True) def test_symbol_block_fp16(tmpdir): @@ -430,8 +431,8 @@ def test_symbol_block_fp16(tmpdir): pretrained=True, ctx=ctx, root=tmp) net_fp32.cast('float16') net_fp32.hybridize() - data = mx.nd.zeros((1, 3, 224, 224), dtype='float16', ctx=ctx) - net_fp32.forward(data) + data = mx.np.zeros((1, 3, 224, 224), dtype='float16', ctx=ctx) + net_fp32(data) symbol_file, param_file = net_fp32.export(tmpfile, 0) # 2. Load the saved model and verify if all the params are loaded correctly. @@ -447,7 +448,7 @@ def test_symbol_block_fp16(tmpdir): if 'conv' in param_name and 'weight' in param_name: name = param_name break - assert np.dtype(net_fp16.params[name].dtype) == np.dtype(np.float16) + assert _np.dtype(net_fp16.params[name].dtype) == _np.dtype(_np.float16) @pytest.mark.serial @@ -490,22 +491,23 @@ def tensor_size(big_tensor_bytes): (height, width) = (sz, sz) sys.stderr.write(" {}x{} ".format(height, width)) sys.stderr.flush() - data_in = nd.random_uniform(low=0, high=255, shape=(1, 3, height, width), - ctx=ctx, dtype="float32") + data_in = mx.np.random.uniform(low=0, high=255, size=(1, 3, height, width), + ctx=ctx, dtype="float32") # Evaluate model net(data_in).asnumpy() # isolated execution bulking test function to be invoked with different env var settings +@mx.util.use_np def _test_bulking_in_process(seed, time_per_iteration): # Use flip since it's a simple function with same-sized I/O unlikely to ever be fused. 
     class Flip(gluon.HybridBlock):
         def __init__(self, **kwargs):
             super(Flip, self).__init__(**kwargs)
-        def hybrid_forward(self, F, x):
-            return F.flip(x, axis=0)
+        def forward(self, x):
+            return mx.np.flip(x, axis=0)
     def get_net(num_ops):
         net = nn.HybridSequential()
@@ -518,9 +520,9 @@ def get_net(num_ops):
     num_iterations = 20
     # build model
-    x = mx.ndarray.zeros(data_shape)
+    x = mx.np.zeros(data_shape)
     x.attach_grad()
-    dy = mx.ndarray.ones(data_shape)
+    dy = mx.np.ones(data_shape)
     net = get_net(num_ops)
     net.hybridize(static_alloc=True, static_shape=True)
@@ -579,9 +581,10 @@ def test_bulking_gluon_gpu():
     _test_bulking(_test_bulking_in_process)
+@mx.util.use_np
 def test_hybridblock_mix_ctx_raise():
     class FooHybrid(gluon.HybridBlock):
-        def hybrid_forward(self, F, a, b):
+        def forward(self, a, b):
             if isinstance(a, (list, tuple)):
                 a = sum(a)
             if isinstance(b, (list, tuple)):
@@ -589,31 +592,15 @@ def hybrid_forward(self, F, a, b):
             return a + b
     foo_hybrid = FooHybrid()
     foo_hybrid.hybridize()
-    pytest.raises(ValueError, lambda: foo_hybrid(mx.nd.ones((10,), ctx=mx.gpu()),
-                                                 mx.nd.ones((10,), ctx=mx.cpu())))
+    pytest.raises(ValueError, lambda: foo_hybrid(mx.np.ones((10,), ctx=mx.gpu()),
+                                                 mx.np.ones((10,), ctx=mx.cpu())))
-def test_symbol_block_symbolic_bn_fp16_cast():
-    with mx.gpu(0):
-        net = mx.gluon.nn.HybridSequential()
-        sym = mx.sym.var('data')
-        conv = mx.sym.Convolution(sym, kernel=(3, 3), num_filter=16)
-        bn = mx.sym.BatchNorm(conv, name='bn_test')
-        internals = bn.get_internals()
-        net.add(mx.gluon.nn.SymbolBlock([internals['bn_test_output']], [mx.sym.var('data')]))
-        net.add(mx.gluon.nn.Conv2D(10, kernel_size=1))
-        net.initialize()
-        x = mx.nd.zeros((1, 3, 32, 32), dtype='float32')
-        y = net(x)
-        assert np.dtype(y.dtype).name == 'float32'
-        net.cast('float16')
-        x = x.astype('float16')
-        y1 = net(x)
-        assert np.dtype(y1.dtype).name == 'float16'
+@mx.util.use_np
 def test_gemms_true_fp16():
     ctx = mx.gpu(0)
-    input = mx.nd.random.uniform(shape=(1, 512), dtype='float16', ctx=ctx)
-    weights = mx.nd.random.uniform(shape=(128, 512), ctx=ctx)
+    input = mx.np.random.uniform(size=(1, 512), dtype='float16', ctx=ctx)
+    weights = mx.np.random.uniform(size=(128, 512), ctx=ctx)
     net = nn.Dense(128, in_units=512, use_bias=False)
     net.cast('float16')
@@ -631,22 +618,23 @@ def test_gemms_true_fp16():
     assert_almost_equal(ref_results.asnumpy(), results_trueFP16.asnumpy(),
                         atol=atol, rtol=rtol)
+@mx.util.use_np
 def test_cudnn_dropout_reproducibility():
     d = nn.Dropout(0.5)
     d.initialize()
-    a = mx.random.uniform(shape=(100,100))
+    a = mx.np.random.uniform(size=(100,100))
     b = a.copy()
     a.attach_grad()
     b.attach_grad()
-    seed = np.random.randint(0, 100000)
+    seed = mx.np.random.randint(0, 100000).item()
     N = 10
-    mx.random.seed(seed)
+    mx.np.random.seed(seed)
     out1 = []
     for _ in range(N):
         with autograd.record():
             out1.append(d(a))
     out1[0].backward()
-    mx.random.seed(seed)
+    mx.np.random.seed(seed)
     out2 = []
     for _ in range(N):
         with autograd.record():
diff --git a/tests/python/gpu/test_gluon_model_zoo_gpu.py b/tests/python/gpu/test_gluon_model_zoo_gpu.py
index 36026894ea03..c4cde1d550ce 100644
--- a/tests/python/gpu/test_gluon_model_zoo_gpu.py
+++ b/tests/python/gpu/test_gluon_model_zoo_gpu.py
@@ -37,6 +37,7 @@ def download_data():
     return mx.test_utils.download(
         'https://repo.mxnet.io/gluon/dataset/test/val-5k-256-9e70d85e0.rec', VAL_DATA)
+@mx.util.use_np
 @pytest.mark.serial
 @pytest.mark.parametrize('model_name', ['resnet50_v1', 'vgg19_bn', 'alexnet', 'densenet201', 'squeezenet1.0', 'mobilenet0.25'])
 def
test_inference(model_name): @@ -64,10 +65,10 @@ def test_inference(model_name): # all parameters. cpu_model = get_model(model_name) cpu_model.initialize(ctx=mx.cpu()) - cpu_model(mx.nd.array(data, ctx=mx.cpu())) + cpu_model(mx.np.array(data, ctx=mx.cpu())) gpu_model = get_model(model_name) gpu_model.initialize(ctx=mx.gpu()) - gpu_model(mx.nd.array(data, ctx=mx.gpu())) + gpu_model(mx.np.array(data, ctx=mx.gpu())) # Force the two models have the same parameters. cpu_params = cpu_model.collect_params() @@ -77,7 +78,7 @@ def test_inference(model_name): gpu_param = gpu_params.get(k) gpu_param.set_data(cpu_param.data().as_in_context(mx.gpu())) - cpu_data = mx.nd.array(data, ctx=mx.cpu()) + cpu_data = mx.np.array(data, ctx=mx.cpu()) for i in range(5): # Run inference. with autograd.record(train_mode=False): @@ -98,6 +99,7 @@ def get_nn_model(name): # Seed 1521019752 produced a failure on the Py2 MKLDNN-GPU CI runner # on 2/16/2018 that was not reproducible. Problem could be timing related or # based on non-deterministic algo selection. +@mx.util.use_np @pytest.mark.serial def test_training(): # We use network models without dropout for testing. @@ -105,7 +107,7 @@ def test_training(): all_models = ['resnet18_v1', 'densenet121'] batch_size = 10 - label = mx.nd.random.uniform(low=0, high=10, shape=(batch_size)).astype('int32') + label = mx.np.random.uniform(low=0, high=10, size=(batch_size)).astype('int32') download_data() dataIter = mx.io.ImageRecordIter( @@ -132,10 +134,10 @@ def test_training(): # all parameters. cpu_model = get_nn_model(model_name) cpu_model.initialize(ctx=mx.cpu()) - cpu_model(mx.nd.array(data, ctx=mx.cpu())) + cpu_model(mx.np.array(data, ctx=mx.cpu())) gpu_model = get_nn_model(model_name) gpu_model.initialize(ctx=mx.gpu()) - gpu_model(mx.nd.array(data, ctx=mx.gpu())) + gpu_model(mx.np.array(data, ctx=mx.gpu())) # Force the two models have the same parameters. cpu_params = cpu_model.collect_params() @@ -150,7 +152,7 @@ def test_training(): # Run forward and backward once. 
with autograd.record(): - cpu_out = cpu_model(mx.nd.array(data, ctx=mx.cpu())) + cpu_out = cpu_model(mx.np.array(data, ctx=mx.cpu())) gpu_out = gpu_model(gpu_data) cpu_loss = softmax_cross_entropy(cpu_out, label) gpu_loss = softmax_cross_entropy(gpu_out, gpu_label) diff --git a/tests/python/gpu/test_gluon_transforms.py b/tests/python/gpu/test_gluon_transforms.py index c005fbb48b25..b51a8fab0b45 100644 --- a/tests/python/gpu/test_gluon_transforms.py +++ b/tests/python/gpu/test_gluon_transforms.py @@ -28,7 +28,7 @@ curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.insert(0, os.path.join(curr_path, '../unittest')) from common import assertRaises -from test_gluon_data_vision import test_to_tensor, test_normalize, test_crop_resize +from test_numpy_gluon_data_vision import test_to_tensor, test_normalize, test_crop_resize set_default_context(mx.gpu(0)) @@ -40,23 +40,24 @@ def test_to_tensor_gpu(): test_to_tensor() +@mx.util.use_np def test_resize_gpu(): # Test with normal case 3D input float type - data_in_3d = nd.random.uniform(0, 255, (300, 300, 3)) + data_in_3d = mx.np.random.uniform(0, 255, (300, 300, 3)) out_nd_3d = transforms.Resize((100, 100))(data_in_3d) - data_in_4d_nchw = nd.moveaxis(nd.expand_dims(data_in_3d, axis=0), 3, 1) - data_expected_3d = (nd.moveaxis(nd.contrib.BilinearResize2D(data_in_4d_nchw, height=100, width=100, align_corners=False), 1, 3))[0] + data_in_4d_nchw = mx.np.moveaxis(mx.np.expand_dims(data_in_3d, axis=0), 3, 1) + data_expected_3d = (mx.np.moveaxis(nd.contrib.BilinearResize2D(data_in_4d_nchw.as_nd_ndarray(), height=100, width=100, align_corners=False), 1, 3))[0] assert_almost_equal(out_nd_3d.asnumpy(), data_expected_3d.asnumpy()) # Test with normal case 4D input float type - data_in_4d = nd.random.uniform(0, 255, (2, 300, 300, 3)) + data_in_4d = mx.np.random.uniform(0, 255, (2, 300, 300, 3)) out_nd_4d = transforms.Resize((100, 100))(data_in_4d) - data_in_4d_nchw = nd.moveaxis(data_in_4d, 3, 1) - data_expected_4d = nd.moveaxis(nd.contrib.BilinearResize2D(data_in_4d_nchw, height=100, width=100, align_corners=False), 1, 3) + data_in_4d_nchw = mx.np.moveaxis(data_in_4d, 3, 1) + data_expected_4d = mx.np.moveaxis(nd.contrib.BilinearResize2D(data_in_4d_nchw.as_nd_ndarray(), height=100, width=100, align_corners=False), 1, 3) assert_almost_equal(out_nd_4d.asnumpy(), data_expected_4d.asnumpy()) # Test invalid interp - data_in_3d = nd.random.uniform(0, 255, (300, 300, 3)) + data_in_3d = mx.np.random.uniform(0, 255, (300, 300, 3)) invalid_transform = transforms.Resize(-150, keep_ratio=False, interpolation=2) assertRaises(MXNetError, invalid_transform, data_in_3d) diff --git a/tests/python/gpu/test_numpy_op.py b/tests/python/gpu/test_numpy_op.py index 07c7558443f0..dbabde50324c 100644 --- a/tests/python/gpu/test_numpy_op.py +++ b/tests/python/gpu/test_numpy_op.py @@ -36,8 +36,8 @@ def __init__(self, subscripts, optimize): self.subscripts = subscripts self.optimize = optimize - def hybrid_forward(self, F, *operands): - return F.np.einsum(self.subscripts, *operands, optimize=self.optimize) + def forward(self, *operands): + return mx.np.einsum(self.subscripts, *operands, optimize=self.optimize) def dbg(name, data): print('type of {} = {}'.format(name, type(data))) diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index b1db89e0ef74..73690059aa5c 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -40,7 +40,6 @@ from test_numpy_ndarray import * from 
test_numpy_op import * from test_numpy_interoperability import * -from test_gluon_probability_v1 import * from test_gluon_probability_v2 import * from test_optimizer import * from test_random import * diff --git a/tests/python/gpu/test_profiler_gpu.py b/tests/python/gpu/test_profiler_gpu.py index f5f1b37a8f23..5e7d2d72f010 100644 --- a/tests/python/gpu/test_profiler_gpu.py +++ b/tests/python/gpu/test_profiler_gpu.py @@ -123,12 +123,10 @@ def test_gpu_memory_profiler_gluon(): model.initialize(ctx=mx.gpu()) model.hybridize() - inputs = mx.sym.var('data') - with mx.autograd.record(): - out = model(mx.nd.zeros((16, 10), ctx=mx.gpu())) + out = model(mx.np.zeros((16, 10), ctx=mx.gpu())) out.backward() - mx.nd.waitall() + mx.npx.waitall() profiler.set_state('stop') profiler.dump(True) diff --git a/tests/python/mkl/subgraphs/subgraph_common.py b/tests/python/mkl/subgraphs/subgraph_common.py index df07822cb298..9f518414ac91 100644 --- a/tests/python/mkl/subgraphs/subgraph_common.py +++ b/tests/python/mkl/subgraphs/subgraph_common.py @@ -51,8 +51,8 @@ class RELU6(nn.HybridBlock): def __init__(self, **kwargs): super(RELU6, self).__init__(**kwargs) - def hybrid_forward(self, F, x): - return F.clip(x, 0, 6, name="relu6") + def forward(self, x): + return mx.np.clip(x, 0, 6) class TailNegBlock(nn.HybridBlock): def __init__(self, **kwargs): @@ -60,11 +60,11 @@ def __init__(self, **kwargs): self.fc1 = nn.Dense(10, flatten=True) self.fc2 = nn.Dense(10, flatten=True) - def hybrid_forward(self, F, x1, x2): + def forward(self, x1, x2): out_fc1 = self.fc1(x1) out_fc2 = self.fc2(x2) - out = F.concat(out_fc1, out_fc2) - out = F.softmax(out) + out = mx.np.concatenate([out_fc1, out_fc2]) + out = mx.npx.softmax(out) return out class CustomNormalInit(mx.init.Initializer): @@ -77,7 +77,7 @@ def __init__(self, mean=0, sigma=0.01): self.sigma = sigma def _init_weight(self, _, arr): - mx.random.normal(self.mean, self.sigma, arr.shape, dtype=arr.dtype, out=arr) + mx.np.random.normal(self.mean, self.sigma, arr.shape, dtype=arr.dtype, out=arr) def check_qsym_calibrated(qsym, out_type, name='conv'): @@ -119,7 +119,7 @@ def check_quantize(net_original, data_shape, out_type, name='conv', net_original.initialize(init=mx.init.Normal(0.5), force_reinit=True) min_value = -1 if out_type != 'uint8' else 0 - data = mx.random.uniform(min_value, 1.0, shape=data_shape, dtype='float32', ctx=mx.current_context()) + data = mx.np.random.uniform(min_value, 1.0, size=data_shape, dtype='float32', ctx=mx.current_context()) outputs = net_original(data) for output in outputs: @@ -146,8 +146,8 @@ def check_quantize(net_original, data_shape, out_type, name='conv', quantized_out = qnet(data) for i in range(len(ref_out)): - min_range = mx.nd.min(ref_out[i]).asscalar() - max_range = mx.nd.max(ref_out[i]).asscalar() + min_range = mx.np.min(ref_out[i]).item() + max_range = mx.np.max(ref_out[i]).item() atol = 0.1 * max(abs(min_range), abs(max_range)) assert_almost_equal_with_err(quantized_out.asnumpy(), ref_out.asnumpy(), rtol=0.1, atol=atol, etol=0.2) @@ -156,7 +156,7 @@ def check_fusion(net_original, data_shape, attrs_dict, check_fp32_fusion=True, c out_types=['uint8', 'int8', 'auto'], dedup_subgraph=True): net_original.initialize() net_original.hybridize(static_alloc=False, static_shape=False) - data = mx.random.uniform(shape=data_shape, dtype='float32', ctx=mx.current_context()) + data = mx.np.random.uniform(size=data_shape, dtype='float32', ctx=mx.current_context()) net_original(data) net_fusion = copy.copy(net_original) sym, params = 
net_original.export(None) @@ -184,7 +184,7 @@ def check_fusion(net_original, data_shape, attrs_dict, check_fp32_fusion=True, c assert v[attr_name].lower() == attr_value.lower() assert found - data = mx.nd.random.uniform(shape=data_shape, low=data_min, high=data_max) + data = mx.np.random.uniform(size=data_shape, low=data_min, high=data_max) out_unfused = net_original(data) net_fusion.optimize_for(data, backend=SG_PASS_NAME) @@ -201,7 +201,7 @@ def check_neg_fusion(net_original, attrs_name=None, excluded_attrs=None, data_shapes=(4,4,10,10), name='conv'): op_name = config[name][OP_NAME] - data_nd = mx.nd.random.uniform(shape=data_shapes) + data_nd = mx.np.random.uniform(size=data_shapes) net_original.initialize() net_original.hybridize() net_original(data_nd) diff --git a/tests/python/mkl/subgraphs/test_conv_subgraph.py b/tests/python/mkl/subgraphs/test_conv_subgraph.py index 18ebc736ef25..da08c8117e2d 100644 --- a/tests/python/mkl/subgraphs/test_conv_subgraph.py +++ b/tests/python/mkl/subgraphs/test_conv_subgraph.py @@ -25,6 +25,9 @@ from mxnet.gluon import nn from mxnet.test_utils import assert_almost_equal, assert_almost_equal_with_err +mx.npx.reset_np() + +@mx.util.use_np def test_float64_fallback(): class ConvWithDtype(nn.HybridBlock): def __init__(self, dtype='float32', **kwargs): @@ -32,19 +35,26 @@ def __init__(self, dtype='float32', **kwargs): self.weight = mx.gluon.Parameter('weight', dtype=dtype, allow_deferred_init=True) self.bias = mx.gluon.Parameter('bias', dtype=dtype, allow_deferred_init=True) - def hybrid_forward(self, F, x, weight, bias): - out = F.Convolution(x, kernel=(1,1), num_filter=3, weight=weight, no_bias=False, bias=bias) + def forward(self, x): + out = mx.npx.convolution(x, kernel=(1,1), num_filter=3, + weight=self.weight.data(x.ctx), no_bias=False, + bias=self.bias.data(x.ctx)) return out + + def infer_shape(self, x): + self.weight.shape = (3, 3, 1, 1) + self.bias.shape = (3,) dtype = 'float64' net = ConvWithDtype(dtype=dtype) - in_data = mx.nd.random.normal(shape=[3,3,3,3], dtype=dtype) + in_data = mx.np.random.normal(size=[3,3,3,3], dtype=dtype) net.initialize() out = net(in_data) out.wait_to_read() assert in_data.dtype == out.dtype +@mx.util.use_np @pytest.mark.parametrize('data_shape', DATA_SHAPE) @pytest.mark.parametrize('use_bias', [True, False]) def test_pos_single_conv(use_bias, data_shape): @@ -54,7 +64,7 @@ def __init__(self, **kwargs): super(Conv, self).__init__(**kwargs) self.conv0 = nn.Conv2D(channels=64, kernel_size=(3, 3), strides=1, use_bias=use_bias) - def hybrid_forward(self, F, x): + def forward(self, x): out = self.conv0(x) return out @@ -63,6 +73,7 @@ def hybrid_forward(self, F, x): check_fusion(net, data_shape, attr) +@mx.util.use_np @pytest.mark.parametrize('data_shape', DATA_SHAPE) @pytest.mark.parametrize('use_bias', [True, False]) def test_pos_conv_add(use_bias, data_shape): @@ -74,7 +85,7 @@ def __init__(self, use_bias, **kwargs): self.conv1 = nn.Conv2D(channels=64, kernel_size=(3, 3), strides=1) self.pool = nn.AvgPool2D(pool_size=(1,1)) - def hybrid_forward(self, F, x): + def forward(self, x): out = self.conv0(x) + self.pool(self.conv1(x)) return out @@ -83,6 +94,7 @@ def hybrid_forward(self, F, x): check_fusion(net, data_shape, attr) +@mx.util.use_np @pytest.mark.parametrize('data_shape', DATA_SHAPE) @pytest.mark.parametrize('no_bias', [True, False]) def test_pos_conv_add2(no_bias, data_shape): @@ -94,7 +106,7 @@ def __init__(self, use_bias, **kwargs): self.conv1 = nn.Conv2D(channels=64, kernel_size=(3, 3), strides=1) self.pool = 
nn.AvgPool2D(pool_size=(1,1)) - def hybrid_forward(self, F, x): + def forward(self, x): out = self.pool(self.conv1(x)) + self.conv0(x) return out @@ -103,6 +115,7 @@ def hybrid_forward(self, F, x): check_fusion(net, data_shape, attr) +@mx.util.use_np @pytest.mark.parametrize('data_shape', DATA_SHAPE) @pytest.mark.parametrize('alg,quantize', [ ("relu", False), #TODO(bgawrych): investigate @@ -134,7 +147,7 @@ def __init__(self, use_bias, alg, **kwargs): self.conv1 = nn.Conv2D(channels=64, kernel_size=(3, 3), strides=1, use_bias=use_bias) self.conv1.share_parameters(self.conv0.collect_params()) - def hybrid_forward(self, F, x): + def forward(self, x): out = self.act(self.conv0(x)) + self.conv1(x) return out @@ -145,6 +158,7 @@ def hybrid_forward(self, F, x): check_fusion(net, data_shape, attrs, check_quantization=quantize) +@mx.util.use_np @pytest.mark.parametrize('data_shape', DATA_SHAPE) @pytest.mark.parametrize('alg,quantize', [ ("relu", True), @@ -174,7 +188,7 @@ def __init__(self, alg, use_bias, **kwargs): else: self.act = nn.Activation(activation = alg) - def hybrid_forward(self, F, x): + def forward(self, x): out = self.act(self.bn(self.conv0(x))) return out @@ -183,6 +197,7 @@ def hybrid_forward(self, F, x): check_fusion(net, data_shape, attr, check_quantization=quantize) +@mx.util.use_np @pytest.mark.parametrize('data_shape', DATA_SHAPE) @pytest.mark.parametrize('alg,quantize', [ ("relu", True), @@ -214,7 +229,7 @@ def __init__(self, alg, use_bias, **kwargs): else: self.act = nn.Activation(activation = alg) - def hybrid_forward(self, F, x): + def forward(self, x): out = self.bn(self.conv0(x)) + self.conv1(x) out = self.act(out) return out @@ -224,6 +239,7 @@ def hybrid_forward(self, F, x): check_fusion(net, data_shape, attr, check_quantization=quantize) +@mx.util.use_np @pytest.mark.parametrize('data_shape', DATA_SHAPE) @pytest.mark.parametrize('input_num,dim', [ (2, -1), @@ -241,7 +257,7 @@ def __init__(self, input_num, dim, **kwargs): for i in range(input_num): self.concat.add(nn.Identity()) - def hybrid_forward(self, F, x): + def forward(self, x): out = self.concat(x) return out @@ -250,6 +266,7 @@ def hybrid_forward(self, F, x): check_calibration=False) +@mx.util.use_np @pytest.mark.parametrize('data_shape', DATA_SHAPE) @pytest.mark.parametrize('out_type', ['int8', 'auto']) def test_pos_single_concat_pos_neg(data_shape, out_type): @@ -260,37 +277,48 @@ def __init__(self, dim, **kwargs): self.act = nn.Activation(activation = 'relu') self.concat_dim = dim - def hybrid_forward(self, F, x): + def forward(self, x): relu_out = self.act(self.conv0(x)) - out = F.concat(x, relu_out, dim=self.concat_dim) + out = mx.np.concatenate([x, relu_out], axis=self.concat_dim) return out concat = ConvDataConcat(dim=1) check_quantize(concat, data_shape, out_type, name='', check_calibration=False) +@mx.util.use_np @pytest.mark.parametrize('data_shape', DATA_SHAPE) @pytest.mark.parametrize('out_type', ['int8', 'auto']) +@pytest.mark.skip("Scale doesn't align in numpy for numpy operators") def test_pos_concat_scale_align(data_shape, out_type): # concat scale alignment case class ConcatScaleAlign(nn.HybridBlock): def __init__(self, **kwargs): - super(ConcatScaleAlign, self).__init__(**kwargs) - self.shared_weight = mx.gluon.Parameter('shared_weight', init=mx.init.Xavier(magnitude=2.24), - dtype='float32', allow_deferred_init=True) - - def hybrid_forward(self, F, x, shared_weight): - conv1 = F.Convolution(x, kernel=(3,3), num_filter=64, weight=shared_weight, no_bias=True) - conv2 = F.Convolution(x, 
kernel=(3,3), num_filter=64, weight=shared_weight*2, no_bias=True) - conv3 = F.Convolution(x, kernel=(3,3), num_filter=64, weight=shared_weight*3, no_bias=True) - conv4 = F.Convolution(x, kernel=(3,3), num_filter=64, weight=shared_weight*4, no_bias=True) - return F.concat(conv1, conv2, conv3, conv4, dim=1) + super(ConcatScaleAlign, self).__init__(**kwargs) + self.shared_weight = mx.gluon.Parameter('shared_weight', shape=(64, data_shape[1], 3, 3), + init=mx.init.Xavier(magnitude=2.24), + dtype='float32', allow_deferred_init=True) + + def forward(self, x): + conv1 = mx.npx.convolution(x, kernel=(3,3), num_filter=64, + weight=self.shared_weight.data(x.ctx), no_bias=True) + conv2 = mx.npx.convolution(x, kernel=(3,3), num_filter=64, + weight=self.shared_weight.data(x.ctx)*2, no_bias=True) + conv3 = mx.npx.convolution(x, kernel=(3,3), num_filter=64, + weight=self.shared_weight.data(x.ctx)*3, no_bias=True) + conv4 = mx.npx.convolution(x, kernel=(3,3), num_filter=64, + weight=self.shared_weight.data(x.ctx)*4, no_bias=True) + return mx.np.concatenate([conv1, conv2, conv3, conv4], axis=1) + + def infer_shape(self, x, *args): + self.shared_weight.weight = (64, data_shape[1], 3, 3) concat = ConcatScaleAlign() check_quantize(concat, data_shape, out_type, check_calibration=True, check_scale_align=True) +@mx.util.use_np @pytest.mark.parametrize('data_shape', DATA_SHAPE) @pytest.mark.parametrize('alg,quantize', [ ("relu", True), @@ -319,7 +347,7 @@ def __init__(self, use_bias, alg, **kwargs): else: self.act = nn.Activation(activation = alg) - def hybrid_forward(self, F, x): + def forward(self, x): out = self.act(self.conv0(x)) return out @@ -330,6 +358,7 @@ def hybrid_forward(self, F, x): check_fusion(net, data_shape, attrs, check_quantization=quantize) +@mx.util.use_np @pytest.mark.parametrize('data_shape', DATA_SHAPE) @pytest.mark.parametrize('use_bias', [True, False]) def test_pos_conv_bn(use_bias, data_shape): @@ -340,7 +369,7 @@ def __init__(self, use_bias, **kwargs): self.conv0 = nn.Conv2D(channels=64, kernel_size=(3, 3), strides=1, use_bias=use_bias) self.bn = nn.BatchNorm() - def hybrid_forward(self, F, x): + def forward(self, x): out = self.bn(self.conv0(x)) return out @@ -357,13 +386,14 @@ def __init__(self, channels, reverse_sum_order, **kwargs): self.bn = nn.BatchNorm() self.reverse = reverse_sum_order - def hybrid_forward(self, F, x): + def forward(self, x): if self.reverse: return self.bn(self.conv0(x)) + x else: return x + self.bn(self.conv0(x)) +@mx.util.use_np @pytest.mark.parametrize('data_shape', DATA_SHAPE) @pytest.mark.parametrize('reverse_sum_order', [True, False]) @pytest.mark.parametrize('dedup_subgraph', [True, False]) @@ -384,7 +414,7 @@ def __init__(self, reverse_sum_order, **kwargs): self.bn2 = nn.BatchNorm() self.reverse = reverse_sum_order - def hybrid_forward(self, F, x): + def forward(self, x): out = self.bn1(self.conv1(x)) if self.reverse: return self.bn2(self.conv2(out)) + out @@ -392,6 +422,7 @@ def hybrid_forward(self, F, x): return out + self.bn2(self.conv2(out)) +@mx.util.use_np @pytest.mark.parametrize('data_shape', DATA_SHAPE) @pytest.mark.parametrize('reverse_sum_order', [True, False]) @pytest.mark.parametrize('dedup_subgraph', [True, False]) @@ -401,11 +432,12 @@ def test_mobilenetv2_struct(data_shape, reverse_sum_order, dedup_subgraph): check_fusion(net, data_shape, attr, out_types=['int8', 'auto'], dedup_subgraph=dedup_subgraph) +@mx.util.use_np @pytest.mark.parametrize('data_shape', DATA_SHAPE) @pytest.mark.parametrize('reverse_sum_order', [False, True]) 
@pytest.mark.parametrize('model_name', ['conv_bn_sum', 'mobilenetv2_struct']) def test_deduplication(data_shape, reverse_sum_order, model_name): - data_nd = mx.random.uniform(-1, 1, shape=data_shape, ctx=mx.cpu()) + data_nd = mx.np.random.uniform(-1, 1, size=data_shape, ctx=mx.cpu()) if (model_name == 'mobilenetv2_struct'): model_dedup = MobileNetV2Struct(reverse_sum_order=reverse_sum_order) else: @@ -424,6 +456,7 @@ def test_deduplication(data_shape, reverse_sum_order, model_name): assert_almost_equal(out.asnumpy(), out_dedup.asnumpy(), rtol=1e-3, atol=1e-1) +@mx.util.use_np @pytest.mark.parametrize('data_shape', DATA_SHAPE) def test_neg_conv_bn(data_shape): # conv + bn can't be fusion case @@ -440,7 +473,7 @@ def __init__(self, **kwargs): self.pool = nn.AvgPool2D(pool_size=(4,4)) self.tailneg = TailNegBlock() - def hybrid_forward(self, F, x): + def forward(self, x): conv = self.conv1(x) bn = self.bn1(conv) pool = self.pool(conv) @@ -453,6 +486,7 @@ def hybrid_forward(self, F, x): check_neg_fusion(net, attrs, excluded_attrs, data_shape) +@mx.util.use_np @pytest.mark.parametrize('data_shape', DATA_SHAPE) def test_neg_conv_relu(data_shape): # conv + relu can't be fusion case @@ -469,7 +503,7 @@ def __init__(self, **kwargs): self.pool = nn.AvgPool2D(pool_size=(4,4)) self.tailneg = TailNegBlock() - def hybrid_forward(self, F, x): + def forward(self, x): conv = self.conv1(x) bn = self.act(conv) pool = self.pool(conv) @@ -481,6 +515,7 @@ def hybrid_forward(self, F, x): check_neg_fusion(net, attrs, excluded_attrs, data_shape) +@mx.util.use_np @pytest.mark.parametrize('data_shape', DATA_SHAPE) def test_neg_conv_add(data_shape): # conv + add can't be fusion case @@ -502,18 +537,22 @@ def __init__(self, **kwargs): self.add_value = mx.gluon.Parameter('add_value', init=mx.init.Xavier(magnitude=2.24), dtype='float32', allow_deferred_init=True) - def hybrid_forward(self, F, x, add_value): + def forward(self, x): conv = self.conv1(x) - sum1 = conv + add_value + print(conv.shape) + sum1 = conv + self.add_value.data(x.ctx) pool = self.pool(conv) return self.tailneg(sum1, pool) + + def infer_shape(self, x): + self.add_value.shape = (data_shape[0], 64, data_shape[2]-2, data_shape[3]-2) attrs = [] excluded_attrs = ['with_sum'] net = NegConvAdd() check_neg_fusion(net, attrs, excluded_attrs, data_shape) - +@mx.util.use_np @pytest.mark.parametrize('data_shape', DATA_SHAPE) def test_neg_conv_bn_relu(data_shape): # conv + bn + relu can't be fusion case @@ -536,7 +575,7 @@ def __init__(self, batchnorm_pool = False, **kwargs): self.tailneg = TailNegBlock() self.batchnorm_pool = batchnorm_pool - def hybrid_forward(self, F, x): + def forward(self, x): conv = self.conv1(x) bn = self.bn(conv) relu = self.act(bn) @@ -556,6 +595,7 @@ def hybrid_forward(self, F, x): check_neg_fusion(net2, attrs2, excluded_attrs2, data_shape) +@mx.util.use_np @pytest.mark.parametrize('data_shape', DATA_SHAPE) def test_neg_conv_bn_add_relu(data_shape): # conv + bn + add + relu can't be fusion case @@ -586,10 +626,11 @@ def __init__(self, connect_mode = "conv_customop", **kwargs): self.add_value = mx.gluon.Parameter('add_value', init=mx.init.Xavier(magnitude=2.24), dtype='float32', allow_deferred_init=True) - def hybrid_forward(self, F, x, add_value): + def forward(self, x): conv = self.conv1(x) bn = self.bn(conv) - sum1 = bn + add_value + print(bn.shape) + sum1 = bn + self.add_value.data(x.ctx) relu = self.act(sum1) if self.connect_mode == "conv_customop": pool = self.pool(conv) @@ -599,6 +640,9 @@ def hybrid_forward(self, F, x, add_value): 
pool = self.pool(sum1) return self.tailneg(relu, pool) + def infer_shape(self, x): + self.add_value.shape = (data_shape[0], 64, data_shape[2]-2, data_shape[3]-2) + # eg.1 net1 = NegConvBNAddRelu(connect_mode = "conv_customop") attrs1 = [] @@ -619,6 +663,7 @@ def hybrid_forward(self, F, x, add_value): +@mx.util.use_np @pytest.mark.parametrize('data_min,data_max,weight_min,weight_max', [ (-1, 1, 0, 0), (-1, 1, -1e-6, +1e-6), @@ -629,9 +674,9 @@ def hybrid_forward(self, F, x, add_value): ]) def test_quantized_conv_bias_overflow(data_min, data_max, weight_min, weight_max): data_shape = (1, 32, 2, 2) - data_nd = mx.random.uniform(data_min, data_max, shape=data_shape, ctx=mx.cpu()) - weight_nd = mx.random.uniform(weight_min, weight_max, shape=[64, 32, 1, 1], ctx=mx.cpu()) - bias_nd = mx.random.uniform(-1, +1, shape=[64], ctx=mx.cpu()) + data_nd = mx.np.random.uniform(data_min, data_max, size=data_shape, ctx=mx.cpu()) + weight_nd = mx.np.random.uniform(weight_min, weight_max, size=[64, 32, 1, 1], ctx=mx.cpu()) + bias_nd = mx.np.random.uniform(-1, +1, size=[64], ctx=mx.cpu()) class ConvBiasOverflow(nn.HybridBlock): def __init__(self, dtype='float32', **kwargs): @@ -639,9 +684,15 @@ def __init__(self, dtype='float32', **kwargs): self.weight = mx.gluon.Parameter('weight', dtype=dtype, allow_deferred_init=True) self.bias = mx.gluon.Parameter('bias', dtype=dtype, allow_deferred_init=True) - def hybrid_forward(self, F, x, weight, bias): - conv1 = F.Convolution(x, num_filter=64, kernel=(1,1), weight=weight, no_bias=False, bias=bias) + def forward(self, x): + conv1 = mx.npx.convolution(x, num_filter=64, kernel=(1,1), + weight=self.weight.data(x.ctx), + no_bias=False, bias=self.bias.data(x.ctx)) return conv1 + + def infer_shape(self, x): + self.weight.shape = (64, x.shape[1], 1, 1) + self.bias.shape = (64,) net = ConvBiasOverflow() net.initialize() @@ -714,9 +765,10 @@ def test_quantized_fc_bias_overflow(data_min, data_max, weight_min, weight_max): assert_almost_equal_with_err(ex.outputs[0].asnumpy(), qex.outputs[0].asnumpy(), rtol=1e-2, atol=1e-2, etol=0.01) +@mx.util.use_np @pytest.mark.parametrize('axis', [0, 1, 2, 3]) def test_bn_relu_fusion(axis): - dummy_data = mx.nd.uniform(-1.0, 1.0, shape=(32, 3, 224, 224)) + dummy_data = mx.np.random.uniform(-1.0, 1.0, size=(32, 3, 224, 224)) net = mx.gluon.nn.HybridSequential() net.add(mx.gluon.nn.BatchNorm(axis=axis)) diff --git a/tests/python/mkl/subgraphs/test_fc_subgraph.py b/tests/python/mkl/subgraphs/test_fc_subgraph.py index 39c7959d3b4a..1bcd332e3b8c 100644 --- a/tests/python/mkl/subgraphs/test_fc_subgraph.py +++ b/tests/python/mkl/subgraphs/test_fc_subgraph.py @@ -29,12 +29,13 @@ def test_float64_fallback(): dtype = 'float64' net = nn.Dense(units=3, dtype=dtype) - in_data = mx.nd.random.normal(shape=[3,3,3,3], dtype=dtype) + in_data = mx.np.random.normal(size=[3,3,3,3], dtype=dtype) net.initialize() out = net(in_data) out.wait_to_read() assert in_data.dtype == out.dtype +@mx.util.use_np @pytest.mark.parametrize('data_shape', DATA_SHAPE) @pytest.mark.parametrize('use_bias', [True, False]) @pytest.mark.parametrize('flatten', [True, False]) @@ -45,7 +46,7 @@ def __init__(self, use_bias, flatten, **kwargs): super(SingleFC, self).__init__(**kwargs) self.fc = nn.Dense(units=64, use_bias=use_bias, flatten=flatten) - def hybrid_forward(self, F, x): + def forward(self, x): return self.fc(x) attrs = {'fc': {}} @@ -53,36 +54,38 @@ def hybrid_forward(self, F, x): check_fusion(net, data_shape, attrs, check_quantization=flatten) +@mx.util.use_np 
@pytest.mark.parametrize('data_shape', DATA_SHAPE) @pytest.mark.parametrize('use_bias', [True, False]) @pytest.mark.parametrize('flatten', [True, False]) @pytest.mark.parametrize('alg', fc_post_ops_list) +@pytest.mark.skip("Operator square, square_root, abs, exp cannot be found in numpy mode") def test_fc_eltwise(data_shape, use_bias, flatten, alg): # fc + eltwise fusion case class FCEltwise(nn.HybridBlock): def __init__(self, use_bias, flatten, alg, **kwargs): super(FCEltwise, self).__init__(**kwargs) self.fc = nn.Dense(units=64, use_bias=use_bias, flatten=flatten, - weight_initializer=CustomNormalInit(mean=0.5, sigma=0.1) if alg == 'square_root' else None) + weight_initializer=CustomNormalInit(mean=0.5, sigma=0.1) if alg == 'square_root' else None) #avoid calculating square root of negative values self.alg = alg - def hybrid_forward(self, F, x): + def forward(self, x): fc_out = self.fc(x) if self.alg in ['relu', 'sigmoid', 'log_sigmoid', 'mish', 'tanh', 'softrelu']: - out = F.Activation(fc_out, act_type=self.alg) + out = mx.npx.activation(fc_out, act_type=self.alg) elif self.alg in ['gelu', 'elu', 'leaky']: - out = F.LeakyReLU(fc_out, act_type=self.alg) + out = mx.npx.leaky_relu(fc_out, act_type=self.alg) elif self.alg == 'square': - out = F.square(fc_out) + out = mx.np.square(fc_out) elif self.alg == 'square_root': - out = F.sqrt(fc_out) + out = mx.np.sqrt(fc_out) elif self.alg == 'abs': - out = F.abs(fc_out) + out = mx.np.abs(fc_out) elif self.alg == 'exp': - out = F.exp(fc_out) + out = mx.np.exp(fc_out) else: - out = F.clip(fc_out, 0, 1.0) + out = mx.np.clip(fc_out, 0, 1.0) return out attrs = {'fc': {'with_eltwise': 'true'}} @@ -90,6 +93,7 @@ def hybrid_forward(self, F, x): check_fusion(net, data_shape, attrs, check_quantization=flatten) +@mx.util.use_np @pytest.mark.parametrize('data_shape', DATA_SHAPE) @pytest.mark.parametrize('use_bias', [True, False]) @pytest.mark.parametrize('flatten', [True, False]) @@ -108,7 +112,7 @@ def __init__(self, use_bias, flatten, **kwargs): self.act2 = nn.Activation('sigmoid') self.tail_neg = TailNegBlock() - def hybrid_forward(self, F, x): + def forward(self, x): fc_out = self.fc(x) return self.tail_neg(self.act1(fc_out), self.act2(fc_out)) @@ -118,6 +122,7 @@ def hybrid_forward(self, F, x): check_neg_fusion(net, attrs, excluded_attrs, data_shape, name='fc') +@mx.util.use_np @pytest.mark.parametrize('data_min,data_max,weight_min,weight_max', [ (-1, 1, 0, 0), (-1, 1, -1e-6, +1e-6), @@ -128,9 +133,9 @@ def hybrid_forward(self, F, x): ]) def test_quantized_fc_bias_overflow(data_min, data_max, weight_min, weight_max): data_shape = (1, 32) - data_nd = mx.random.uniform(data_min, data_max, shape=data_shape, ctx=mx.cpu()) - weight_nd = mx.random.uniform(weight_min, weight_max, shape=[64, 32], ctx=mx.cpu()) - bias_nd = mx.random.uniform(-1, +1, shape=[64], ctx=mx.cpu()) + data_nd = mx.np.random.uniform(data_min, data_max, size=data_shape, ctx=mx.cpu()) + weight_nd = mx.np.random.uniform(weight_min, weight_max, size=[64, 32], ctx=mx.cpu()) + bias_nd = mx.np.random.uniform(-1, +1, size=[64], ctx=mx.cpu()) class FCBiasOverflow(nn.HybridBlock): def __init__(self, dtype='float32', **kwargs): @@ -138,9 +143,14 @@ def __init__(self, dtype='float32', **kwargs): self.weight = mx.gluon.Parameter('weight', dtype=dtype, allow_deferred_init=True) self.bias = mx.gluon.Parameter('bias', dtype=dtype, allow_deferred_init=True) - def hybrid_forward(self, F, x, weight, bias): - conv1 = F.FullyConnected(x, num_hidden=64, weight=weight, no_bias=False, bias=bias) + def 
forward(self, x): + conv1 = mx.npx.fully_connected(x, num_hidden=64, weight=self.weight.data(x.ctx), + no_bias=False, bias=self.bias.data(x.ctx)) return conv1 + + def infer_shape(self, x, *args): + self.weight.shape = (64, x.shape[x.ndim-1]) + self.bias.shape = (64,) net = FCBiasOverflow() net.initialize() diff --git a/tests/python/mkl/test_mkldnn.py b/tests/python/mkl/test_mkldnn.py index 781ca421b712..aa7a8f4be40b 100644 --- a/tests/python/mkl/test_mkldnn.py +++ b/tests/python/mkl/test_mkldnn.py @@ -24,25 +24,27 @@ import mxnet as mx import pytest from mxnet.test_utils import rand_ndarray, assert_almost_equal -from mxnet import gluon, context +from mxnet import gluon, context, use_np from mxnet.gluon import nn from mxnet.test_utils import * curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.append(os.path.join(curr_path, '../unittest/')) import itertools +@use_np @pytest.mark.seed(1234) def test_mkldnn_ndarray_slice(): ctx = mx.cpu() net = gluon.nn.HybridSequential() net.add(gluon.nn.Conv2D(channels=32, kernel_size=3, activation=None)) net.initialize(ctx=ctx) - x = mx.nd.array(np.ones([32, 3, 224, 224]), ctx) + x = mx.np.array(np.ones([32, 3, 224, 224]), ctx=ctx) y = net(x) # trigger computation on ndarray slice assert_almost_equal(y[0].asnumpy()[0, 0, 0], np.array(0.056331709)) +@use_np @pytest.mark.seed(1234) def test_mkldnn_engine_threading(): net = gluon.nn.HybridSequential() @@ -58,12 +60,12 @@ def __getitem__(self, key): X = (32, 3, 32, 32) # trigger mkldnn execution thread - y = net(mx.nd.array(np.ones(X))).asnumpy() + y = net(mx.np.array(np.ones(X))).asnumpy() # Use Gluon dataloader to trigger different thread. # below line triggers different execution thread for _ in loader: - y = net(mx.nd.array(np.ones(X))).asnumpy() + y = net(mx.np.array(np.ones(X))).asnumpy() # output should be 056331709 (non-mkldnn mode output) assert_almost_equal(y[0, 0, 0, 0], np.array(0.056331709)) break @@ -99,6 +101,7 @@ def test_reshape_after_conv(dst_shape): test_reshape_after_conv(test_case) +@use_np def test_reshape_before_conv(): class Net(gluon.HybridBlock): """ @@ -109,13 +112,14 @@ def __init__(self, **kwargs): self.conv0 = nn.Conv2D(10, (3, 3)) self.conv1 = nn.Conv2D(5, (3, 3)) - def hybrid_forward(self, F, x, *args, **kwargs): - x_reshape = x.reshape((0, 0, 20, 5)) + def forward(self, x, *args, **kwargs): + x_reshape = x.reshape((2, 4, 20, 5)) y = self.conv0(x_reshape) - y_reshape = y.reshape((0, 0, 9, 6)) + y_reshape = y.reshape((2, 10, 9, 6)) out = self.conv1(y_reshape) return out - x = mx.nd.random.uniform(shape=(2, 4, 10, 10)) + + x = mx.np.random.uniform(size=(2, 4, 10, 10)) x.attach_grad() net = Net() net.initialize() @@ -131,6 +135,7 @@ def hybrid_forward(self, F, x, *args, **kwargs): assert_almost_equal(out1, out2, rtol=1e-5, atol=1e-6) +@use_np def test_slice_before_conv(): class Net(gluon.HybridBlock): """ @@ -141,13 +146,14 @@ def __init__(self, **kwargs): self.conv0 = nn.Conv2D(4, (3, 3)) self.conv1 = nn.Conv2D(4, (3, 3)) - def hybrid_forward(self, F, x, *args, **kwargs): - x_slice = x.slice(begin=(0, 0, 0, 0), end=(2, 4, 10, 10)) + def forward(self, x, *args, **kwargs): + x_slice = mx.npx.slice(x, begin=(0, 0, 0, 0), end=(2, 4, 10, 10)) y = self.conv0(x_slice) - y_slice = y.slice(begin=(1, 0, 2, 2), end=(2, 1, 7, 7)) + y_slice = mx.npx.slice(y, begin=(1, 0, 2, 2), end=(2, 1, 7, 7)) out = self.conv1(y_slice) return out - x = mx.nd.random.uniform(shape=(2, 10, 10, 10)) + + x = mx.np.random.uniform(size=(2, 10, 10, 10)) x.attach_grad() net = Net() 
net.initialize() @@ -163,6 +169,7 @@ def hybrid_forward(self, F, x, *args, **kwargs): assert_almost_equal(out1, out2, rtol=1e-5, atol=1e-6) +@use_np def test_slice_reshape_before_conv(): class Net(gluon.HybridBlock): """ @@ -173,13 +180,14 @@ def __init__(self, **kwargs): self.conv0 = nn.Conv2D(4, (3, 3)) self.conv1 = nn.Conv2D(4, (3, 3)) - def hybrid_forward(self, F, x, *args, **kwargs): - x_slice = x.slice(begin=(0, 0, 0, 0), end=(2, 4, 8, 9)) + def forward(self, x, *args, **kwargs): + x_slice = mx.npx.slice(x, begin=(0, 0, 0, 0), end=(2, 4, 8, 9)) y = self.conv0(x_slice) - y_reshape = y.reshape((0, 0, 14, 3)) + y_reshape = y.reshape((2, 4, 14, 3)) out = self.conv1(y_reshape) return out - x = mx.nd.random.uniform(shape=(2, 10, 10, 10)) + + x = mx.np.random.uniform(size=(2, 10, 10, 10)) x.attach_grad() net = Net() net.initialize() @@ -322,7 +330,7 @@ def forward(self, x): unfused_net = BNNet(fuse_relu=False) fused_net.initialize() unfused_net.initialize() - in_data = mx.nd.random.normal(shape=shape) + in_data = mx.np.random.normal(size=shape) no_fuse_outputs = unfused_net.forward(in_data) fuse_outputs = fused_net.forward(in_data) @@ -550,19 +558,23 @@ def test_conv_transpose(): # This test case is contributed by @awsbillz in https://github.com/apache/incubator-mxnet/issues/14766 +@use_np def test_reshape_transpose_6d(): class Reshape2D(gluon.HybridBlock): def __init__(self, factor): super(Reshape2D, self).__init__() self._factors = (int(factor),) * 2 - def hybrid_forward(self, F, x): + def forward(self, x): f1, f2 = self._factors + N = 1 + C = 2 + H = W = 596 # (N, f1*f2*C, H, W) - x = F.reshape(x, (0, -4, -1, f1 * f2, 0, 0)) # (N, C, f1*f2, H, W) - x = F.reshape(x, (0, 0, -4, f1, f2, 0, 0)) # (N, C, f1, f2, H, W) - x = F.transpose(x, (0, 1, 4, 2, 5, 3)) # (N, C, H, f1, W, f2) - x = F.reshape(x, (0, 0, -3, -3)) # (N, C, H*f1, W*f2) + x = mx.np.reshape(x, (N, C, f1 * f2, H, W)) # (N, C, f1*f2, H, W) + x = mx.np.reshape(x, (N, C, f1, f2, H, W)) # (N, C, f1, f2, H, W) + x = mx.np.transpose(x, (0, 1, 4, 2, 5, 3)) # (N, C, H, f1, W, f2) + x = mx.np.reshape(x, (N, C, H*f1, W*f2)) # (N, C, H*f1, W*f2) return x @@ -572,7 +584,7 @@ def __init__(self, **kwargs): self.conv1 = nn.Conv2D(8, kernel_size=5) self.reshape2D = Reshape2D(2) - def hybrid_forward(self, F, x): + def forward(self, x): x = self.conv1(x) x = self.reshape2D(x) return x @@ -580,7 +592,7 @@ def hybrid_forward(self, F, x): net = Net() net.initialize(mx.init.Xavier(), ctx=mx.cpu()) net.hybridize() - data = mx.nd.random_normal(shape=(1, 3, 600, 600)) + data = mx.np.random.normal(size=(1, 3, 600, 600)) output = net(data) a = output.asnumpy() @@ -705,4 +717,4 @@ def batch_check(seq_length, state_size, batch_size, input_size): assert type(state[0]).__name__ == 'ndarray' for sl, ss, bs, in_s in itertools.product(SEQ_LENGTH, STATE_SIZE, BATCH_SIZE, INPUT_SIZE): - batch_check(sl, ss, bs, in_s) \ No newline at end of file + batch_check(sl, ss, bs, in_s) diff --git a/tests/python/mkl/test_quantization_mkldnn.py b/tests/python/mkl/test_quantization_mkldnn.py index 055996acb96a..a578dbe0b56f 100644 --- a/tests/python/mkl/test_quantization_mkldnn.py +++ b/tests/python/mkl/test_quantization_mkldnn.py @@ -24,6 +24,8 @@ sys.path.insert(0, os.path.join(curr_path, '../quantization')) from test_quantization import * +mx.npx.reset_np() + if __name__ == '__main__': import pytest pytest.main() diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py index 21866f078d76..640469e0e7f6 100644 --- 
a/tests/python/quantization/test_quantization.py +++ b/tests/python/quantization/test_quantization.py @@ -20,7 +20,9 @@ """ import os import mxnet as mx -import numpy as np +import numpy as onp +from mxnet import npx +from mxnet.util import use_np from mxnet.gluon.model_zoo import vision from mxnet.test_utils import assert_almost_equal, assert_exception, rand_ndarray, rand_shape_nd, same, DummyIter from common import xfail_when_nonstandard_decimal_separator @@ -28,6 +30,7 @@ import unittest import operator +npx.reset_np() def collect_block_args_aux(block, sym): arg_params, aux_params = dict(), dict() @@ -61,24 +64,24 @@ def test_quantize_float32_to_int8(): data_np = data.asnumpy() min_range = min_range.asscalar() max_range = max_range.asscalar() - real_range = np.maximum(np.abs(min_range), np.abs(max_range)) + real_range = onp.maximum(onp.abs(min_range), onp.abs(max_range)) quantized_range = 127.0 scale = quantized_range / real_range - assert qdata.dtype == np.int8 - assert min_val.dtype == np.float32 - assert max_val.dtype == np.float32 + assert qdata.dtype == onp.int8 + assert min_val.dtype == onp.float32 + assert max_val.dtype == onp.float32 assert same(min_val.asscalar(), -real_range) assert same(max_val.asscalar(), real_range) - qdata_np = (np.sign(data_np) * np.minimum(np.abs(data_np) * scale + 0.5, quantized_range)).astype(np.int8) + qdata_np = (onp.sign(data_np) * onp.minimum(onp.abs(data_np) * scale + 0.5, quantized_range)).astype(onp.int8) assert_almost_equal(qdata.asnumpy(), qdata_np, atol = 1) def test_dequantize_int8_to_float32(): def get_test_data(real_range, qdata_np): - qdata = mx.nd.array(qdata_np, dtype=np.int8) - min_range = mx.nd.array([-real_range], dtype=np.float32) - max_range = mx.nd.array([real_range], dtype=np.float32) + qdata = mx.nd.array(qdata_np, dtype=onp.int8) + min_range = mx.nd.array([-real_range], dtype=onp.float32) + max_range = mx.nd.array([real_range], dtype=onp.float32) return qdata, min_range, max_range def baseline_dequantization(qdata, real_range, qdata_np): @@ -89,7 +92,7 @@ def baseline_dequantization(qdata, real_range, qdata_np): def test_nd_array_dequantization(qdata, min_range, max_range, expected_result): data = mx.nd.contrib.dequantize(qdata, min_range, max_range, out_type='float32') - assert data.dtype == np.float32 + assert data.dtype == onp.float32 assert_almost_equal(data.asnumpy(), expected_result, atol = 1) def test_symbolic_api_dequantization(qdata, min_range, max_range, expected_result): @@ -101,12 +104,12 @@ def test_symbolic_api_dequantization(qdata, min_range, max_range, expected_resul out = dequant._bind(ctx=mx.current_context(), args={'data':qdata, 'min_range':min_range, 'max_range':max_range}) data = out.forward()[0] - assert data.dtype == np.float32 + assert data.dtype == onp.float32 assert_almost_equal(data.asnumpy(), expected_result, atol = 1) real_range = 128 shape = rand_shape_nd(4) - qdata_np = np.random.uniform(low=-127, high=127, size=shape).astype(dtype=np.int8) + qdata_np = onp.random.uniform(low=-127, high=127, size=shape).astype(dtype=onp.int8) qdata, min_range, max_range = get_test_data(real_range, qdata_np) expected_result = baseline_dequantization(qdata, real_range, qdata_np) # test nd array implementation. 
@@ -118,17 +121,17 @@ def test_symbolic_api_dequantization(qdata, min_range, max_range, expected_resul def test_requantize_int32_to_int8(): def quantized_int32_to_float(qdata, min_range, max_range): assert qdata.dtype == 'int32' - quantized_range = np.iinfo('int32').max - real_range = np.maximum(np.abs(min_range), np.abs(max_range)) + quantized_range = onp.iinfo('int32').max + real_range = onp.maximum(onp.abs(min_range), onp.abs(max_range)) scale = float(real_range) / float(quantized_range) return qdata.astype('float32') * scale def float_to_quantized_int8(data, min_range, max_range): assert data.dtype == 'float32' - real_range = np.maximum(np.abs(min_range), np.abs(max_range)) - quantized_range = np.iinfo('int8').max + real_range = onp.maximum(onp.abs(min_range), onp.abs(max_range)) + quantized_range = onp.iinfo('int8').max scale = float(quantized_range) / float(real_range) - return (np.sign(data) * np.minimum(np.abs(data) * scale + 0.5, quantized_range)).astype('int8') + return (onp.sign(data) * onp.minimum(onp.abs(data) * scale + 0.5, quantized_range)).astype('int8') def requantize(qdata, min_data, max_data, real_range): data = quantized_int32_to_float(qdata, min_data, max_data) @@ -137,12 +140,12 @@ def requantize(qdata, min_data, max_data, real_range): def requantize_baseline(qdata, min_data, max_data, min_calib_range=None, max_calib_range=None): if min_calib_range is not None and max_calib_range is not None: - real_range = np.maximum(np.abs(min_calib_range), np.abs(max_calib_range)) + real_range = onp.maximum(onp.abs(min_calib_range), onp.abs(max_calib_range)) return requantize(qdata, min_data, max_data, real_range) else: - min_range = quantized_int32_to_float(np.min(qdata), min_data, max_data) - max_range = quantized_int32_to_float(np.max(qdata), min_data, max_data) - return requantize(qdata, min_data, max_data, np.maximum(np.abs(min_range), np.abs(max_range))) + min_range = quantized_int32_to_float(onp.min(qdata), min_data, max_data) + max_range = quantized_int32_to_float(onp.max(qdata), min_data, max_data) + return requantize(qdata, min_data, max_data, onp.maximum(onp.abs(min_range), onp.abs(max_range))) def check_requantize(shape, min_calib_range=None, max_calib_range=None): qdata = mx.nd.random.uniform(low=-1000.0, high=1000.0, shape=shape).astype('int32') @@ -160,13 +163,14 @@ def check_requantize(shape, min_calib_range=None, max_calib_range=None): min_calib_range=min_calib_range, max_calib_range=max_calib_range) assert_almost_equal(qdata_int8.asnumpy(), qdata_int8_np, atol = 1) - assert_almost_equal(min_output.asnumpy(), np.array([min_output_np])) - assert_almost_equal(max_output.asnumpy(), np.array([max_output_np])) + assert_almost_equal(min_output.asnumpy(), onp.array([min_output_np])) + assert_almost_equal(max_output.asnumpy(), onp.array([max_output_np])) + @use_np def check_requantize_with_gluon(shape, min_calib_range=None, max_calib_range=None): - qdata = mx.nd.random.uniform(low=-1000.0, high=1000.0, shape=shape).astype('int32') - min_range = mx.nd.array([-1010.0]) - max_range = mx.nd.array([1020.0]) + qdata = mx.np.random.uniform(low=-1000.0, high=1000.0, size=shape).astype('int32') + min_range = mx.np.array([-1010.0]) + max_range = mx.np.array([1020.0]) class RequantizeBlock(mx.gluon.nn.HybridBlock): def __init__(self, min_calib_range=None, max_calib_range=None, **kwargs): @@ -174,25 +178,25 @@ def __init__(self, min_calib_range=None, max_calib_range=None, **kwargs): self.min_calib_range = min_calib_range self.max_calib_range = max_calib_range - def 
hybrid_forward(self, F, x, min_range, max_range): + def forward(self, x, min_range, max_range): if self.min_calib_range is not None and self.max_calib_range is not None: - out = F.contrib.requantize(x, min_range, max_range, - min_calib_range=self.min_calib_range, - max_calib_range=self.max_calib_range) + out = npx.requantize(x, min_range, max_range, + min_calib_range=self.min_calib_range, + max_calib_range=self.max_calib_range) else: - out = F.contrib.requantize(x, min_range, max_range) + out = npx.requantize(x, min_range, max_range) return out requant = RequantizeBlock(min_calib_range, max_calib_range) # m*_calib_ranges can be None qdata_int8, min_output, max_output = requant(qdata, min_range, max_range) - qdata_int8_np, min_output_np, max_output_np = requantize_baseline(qdata.asnumpy(), min_range.asscalar(), - max_range.asscalar(), + qdata_int8_np, min_output_np, max_output_np = requantize_baseline(qdata.asnumpy(), min_range.item(), + max_range.item(), min_calib_range=min_calib_range, max_calib_range=max_calib_range) assert_almost_equal(qdata_int8.asnumpy(), qdata_int8_np, atol = 1) - assert_almost_equal(min_output.asnumpy(), np.array([min_output_np])) - assert_almost_equal(max_output.asnumpy(), np.array([max_output_np])) + assert_almost_equal(min_output.asnumpy(), onp.array([min_output_np])) + assert_almost_equal(max_output.asnumpy(), onp.array([max_output_np])) # test with gluon API. check_requantize_with_gluon((3, 4, 10, 10)) @@ -206,6 +210,7 @@ def hybrid_forward(self, F, x, min_range, max_range): check_requantize((32, 3, 23, 23), min_calib_range=-134.349, max_calib_range=523.43) +@use_np def test_quantized_conv(): def check_quantized_conv(data_shape, kernel, num_filter, pad, stride, dilate, use_bias, qdtype): if is_test_for_native_cpu(): @@ -241,22 +246,23 @@ def check_quantized_conv(data_shape, kernel, num_filter, pad, stride, dilate, us data_high = 127.0 convfp32.initialize() - input_data = mx.nd.random.uniform(low=data_low, + input_data = mx.np.random.uniform(low=data_low, high=data_high, - shape=data_shape + size=data_shape ).astype('int32').astype('float32') convfp32(input_data) # initialize params - mx.nd.waitall() + npx.waitall() fp32_params = convfp32.collect_params() + weight_shape = fp32_params['weight'].shape new_args = dict() - new_args['weight'] = mx.nd.random.uniform(low=-127.0, + new_args['weight'] = mx.np.random.uniform(low=-127.0, high=127.0, - shape=fp32_params['weight'].shape + size=weight_shape ).astype('int32').astype('float32') if use_bias: - new_args['bias'] = mx.nd.random.uniform(low=-127.0, + new_args['bias'] = mx.np.random.uniform(low=-127.0, high=127.0, - shape=fp32_params['bias'].shape + size=fp32_params['bias'].shape ).astype('int32').astype('float32') convfp32.load_dict(new_args, cast_dtype=True, dtype_source='saved') @@ -272,25 +278,33 @@ def __init__(self, channels, kernel_size, strides=(1, 1), 'pad': padding, 'num_filter': channels, 'no_bias': not use_bias, 'num_group': 1, 'layout': 'NCHW'} - self.min_data = mx.gluon.Parameter('min_data', dtype='float32', allow_deferred_init=True) - self.max_data = mx.gluon.Parameter('max_data', dtype='float32', allow_deferred_init=True) + self.min_data = mx.gluon.Parameter('min_data', dtype='float32', shape=(1), allow_deferred_init=True) + self.max_data = mx.gluon.Parameter('max_data', dtype='float32', shape=(1), allow_deferred_init=True) + + self.weight = mx.gluon.Parameter('weight', dtype='int8', shape=weight_shape, allow_deferred_init=True) + self.min_weight = mx.gluon.Parameter('min_weight', dtype='float32', 
shape=(1), allow_deferred_init=True) + self.max_weight = mx.gluon.Parameter('max_weight', dtype='float32', shape=(1), allow_deferred_init=True) - self.weight = mx.gluon.Parameter('weight', dtype='int8', allow_deferred_init=True) - self.min_weight = mx.gluon.Parameter('min_weight', dtype='float32', allow_deferred_init=True) - self.max_weight = mx.gluon.Parameter('max_weight', dtype='float32', allow_deferred_init=True) - if use_bias: - self.bias = mx.gluon.Parameter('bias', dtype='int8', allow_deferred_init=True) - self.min_bias = mx.gluon.Parameter('min_bias', dtype='float32', allow_deferred_init=True) - self.max_bias = mx.gluon.Parameter('max_bias', dtype='float32', allow_deferred_init=True) - - def hybrid_forward(self, F, x, weight, bias=None, min_data=None, max_data=None, - min_weight=None, max_weight=None, min_bias=None, max_bias=None): - out = F.contrib.quantized_conv(data=x, weight=weight, bias=bias, - min_data=min_data, max_data=max_data, - min_weight=min_weight, max_weight=max_weight, - min_bias=min_bias, max_bias=max_bias, - **self._kwargs) + self.bias = mx.gluon.Parameter('bias', dtype='int8', shape=(num_filter,), allow_deferred_init=True) + self.min_bias = mx.gluon.Parameter('min_bias', dtype='float32', shape=(1), allow_deferred_init=True) + self.max_bias = mx.gluon.Parameter('max_bias', dtype='float32', shape=(1), allow_deferred_init=True) + + def forward(self, x): + ctx = x.ctx + weight = self.weight.data().as_in_ctx(ctx) + bias = self.bias.data().as_in_ctx(ctx) if self.use_bias else None + min_data = self.min_data.data().as_in_ctx(ctx) + max_data = self.max_data.data().as_in_ctx(ctx) + min_weight = self.min_weight.data().as_in_ctx(ctx) + max_weight = self.max_weight.data().as_in_ctx(ctx) + min_bias = self.min_bias.data().as_in_ctx(ctx) if self.use_bias else None + max_bias = self.max_bias.data().as_in_ctx(ctx) if self.use_bias else None + out = npx.quantized_conv(data=x, weight=weight, bias=bias, + min_data=min_data, max_data=max_data, + min_weight=min_weight, max_weight=max_weight, + min_bias=min_bias, max_bias=max_bias, + **self._kwargs) return out convint8 = QuantConv(channels=num_filter, kernel_size=kernel, strides=stride, @@ -299,16 +313,16 @@ def hybrid_forward(self, F, x, weight, bias=None, min_data=None, max_data=None, quantized_range = 127.0 qargs = { 'weight': new_args['weight'].astype('int8'), - 'min_data': mx.nd.array([-quantized_range]), - 'max_data': mx.nd.array([quantized_range]), - 'min_weight': mx.nd.array([-quantized_range]), - 'max_weight': mx.nd.array([quantized_range]) + 'min_data': mx.np.array([-quantized_range]), + 'max_data': mx.np.array([quantized_range]), + 'min_weight': mx.np.array([-quantized_range]), + 'max_weight': mx.np.array([quantized_range]) } if use_bias: qargs.update({ 'bias': new_args['bias'].astype('int8'), - 'min_bias': mx.nd.array([-quantized_range]), - 'max_bias': mx.nd.array([quantized_range]), + 'min_bias': mx.np.array([-quantized_range]), + 'max_bias': mx.np.array([quantized_range]), }) convint8.load_dict(qargs, cast_dtype=True, dtype_source='saved') @@ -317,8 +331,8 @@ def hybrid_forward(self, F, x, weight, bias=None, min_data=None, max_data=None, if use_bias: # with adding bias, accuracy loss should not be greater than one - diff = mx.nd.abs(output - qoutput.astype(output.dtype)) - cond = mx.nd.lesser(2, diff).sum().asscalar() + diff = mx.np.abs(output - qoutput.astype(output.dtype)) + cond = mx.np.less(2, diff).sum().item() assert cond == 0 else: assert_almost_equal(output.asnumpy(), qoutput.asnumpy(), atol = 1) @@ -332,6 +346,7 
@@ def hybrid_forward(self, F, x, weight, bias=None, min_data=None, max_data=None, check_quantized_conv((1, 3, 4, 28, 28), (1, 3, 3), 128, (1, 1, 1), (1, 1, 1), (2, 2, 2), True, qdtype) +@use_np def test_quantized_elemwise_add(): def check_quantized_elemwise_add(data_shape, qtype): if is_test_for_native_cpu(): @@ -348,15 +363,15 @@ class ElemwiseSumBlock(mx.gluon.nn.HybridBlock): def __init__(self, **kwargs): super(ElemwiseSumBlock, self).__init__(**kwargs) - def hybrid_forward(self, F, dataA, dataB): - return F.elemwise_add(dataA, dataB) + def forward(self, dataA, dataB): + return dataA + dataB class QuantElemwiseSumBlock(mx.gluon.nn.HybridBlock): def __init__(self, **kwargs): super(QuantElemwiseSumBlock, self).__init__(**kwargs) - def hybrid_forward(self, F, dataA, dataB, dataA_min, dataA_max, dataB_min, dataB_max): - return F.contrib.quantized_elemwise_add(dataA, dataB, dataA_min, dataA_max, dataB_min, dataB_max) + def forward(self, dataA, dataB, dataA_min, dataA_max, dataB_min, dataB_max): + return npx.quantized_elemwise_add(dataA, dataB, dataA_min, dataA_max, dataB_min, dataB_max) elemwise_add_fp32 = ElemwiseSumBlock() @@ -367,8 +382,8 @@ def hybrid_forward(self, F, dataA, dataB, dataA_min, dataA_max, dataB_min, dataB data_low = -127.0 data_high = 127.0 - dataA_val = mx.nd.random.uniform(low=data_low, high=data_high, shape=data_shape).astype('int32').astype('float32') - dataB_val = mx.nd.random.uniform(low=data_low, high=data_high, shape=data_shape).astype('int32').astype('float32') + dataA_val = mx.np.random.uniform(low=data_low, high=data_high, size=data_shape).astype('int32').astype('float32') + dataB_val = mx.np.random.uniform(low=data_low, high=data_high, size=data_shape).astype('int32').astype('float32') output = elemwise_add_fp32(dataA_val, dataB_val) @@ -377,16 +392,16 @@ def hybrid_forward(self, F, dataA, dataB, dataA_min, dataA_max, dataB_min, dataB dataA_val_int8 = dataA_val.astype(qtype) dataB_val_int8 = dataB_val.astype(qtype) quantized_range = 127.0 - min_dataA = mx.nd.array([data_low]) - max_dataA = mx.nd.array([data_high]) - min_dataB = mx.nd.array([data_low]) - max_dataB = mx.nd.array([data_high]) + min_dataA = mx.np.array([data_low]) + max_dataA = mx.np.array([data_high]) + min_dataB = mx.np.array([data_low]) + max_dataB = mx.np.array([data_high]) qoutput, min_range, max_range = quantized_elemwise_add(dataA_val_int8, dataB_val_int8, min_dataA, max_dataA, min_dataB, max_dataB) int8_rslt = qoutput.astype(output.dtype) * max_range / 0x7fffffff - diff = mx.nd.abs(output - int8_rslt) - cond = mx.nd.lesser(2, diff).sum().asscalar() + diff = mx.np.abs(output - int8_rslt) + cond = mx.np.less(2, diff).sum().item() assert cond == 0 for qtype in ['int8', 'uint8']: @@ -396,6 +411,7 @@ def hybrid_forward(self, F, dataA, dataB, dataA_min, dataA_max, dataB_min, dataB check_quantized_elemwise_add((32, 56, 64, 11), qtype) +@use_np def test_quantized_elemwise_mul(): def check_quantized_elemwise_mul(data_shape, qtype): if is_test_for_native_cpu(): @@ -412,15 +428,15 @@ class ElemwiseMulBlock(mx.gluon.nn.HybridBlock): def __init__(self, **kwargs): super(ElemwiseMulBlock, self).__init__(**kwargs) - def hybrid_forward(self, F, dataA, dataB): - return F.elemwise_mul(dataA, dataB) + def forward(self, dataA, dataB): + return mx.np.multiply(dataA, dataB) class QuantElemwiseMulBlock(mx.gluon.nn.HybridBlock): def __init__(self, **kwargs): super(QuantElemwiseMulBlock, self).__init__(**kwargs) - def hybrid_forward(self, F, dataA, dataB, dataA_min, dataA_max, dataB_min, dataB_max): - return 
F.contrib.quantized_elemwise_mul(dataA, dataB, dataA_min, dataA_max, dataB_min, dataB_max) + def forward(self, dataA, dataB, dataA_min, dataA_max, dataB_min, dataB_max): + return npx.quantized_elemwise_mul(dataA, dataB, dataA_min, dataA_max, dataB_min, dataB_max) elemwise_mul_fp32 = ElemwiseMulBlock() if qtype == 'uint8': @@ -430,8 +446,8 @@ def hybrid_forward(self, F, dataA, dataB, dataA_min, dataA_max, dataB_min, dataB data_low = -127.0 data_high = 127.0 - dataA_val = mx.nd.random.uniform(low=data_low, high=data_high, shape=data_shape).astype('int32').astype('float32') - dataB_val = mx.nd.random.uniform(low=data_low, high=data_high, shape=data_shape).astype('int32').astype('float32') + dataA_val = mx.np.random.uniform(low=data_low, high=data_high, size=data_shape).astype('int32').astype('float32') + dataB_val = mx.np.random.uniform(low=data_low, high=data_high, size=data_shape).astype('int32').astype('float32') output = elemwise_mul_fp32(dataA_val, dataB_val) @@ -439,10 +455,10 @@ def hybrid_forward(self, F, dataA, dataB, dataA_min, dataA_max, dataB_min, dataB dataA_val_int8 = dataA_val.astype(qtype) dataB_val_int8 = dataB_val.astype(qtype) quantized_range = 127.0 - min_dataA = mx.nd.array([data_low]) - max_dataA = mx.nd.array([data_high]) - min_dataB = mx.nd.array([data_low]) - max_dataB = mx.nd.array([data_high]) + min_dataA = mx.np.array([data_low]) + max_dataA = mx.np.array([data_high]) + min_dataB = mx.np.array([data_low]) + max_dataB = mx.np.array([data_high]) qoutput, min_range, max_range = quantized_elemwise_mul(dataA_val_int8, dataB_val_int8, min_dataA, max_dataA, min_dataB, max_dataB) @@ -458,6 +474,7 @@ def hybrid_forward(self, F, dataA, dataB, dataA_min, dataA_max, dataB_min, dataB check_quantized_elemwise_mul((32, 56, 64, 11), qtype) +@use_np def test_quantized_pooling(): def check_quantized_pooling(data_shape, kernel, pool_type, pad, stride, global_pool, qdtype, convention='valid'): if is_test_for_native_cpu(): @@ -479,8 +496,8 @@ def __init__(self, kernel=kernel, pad=pad, stride=stride, 'pool_type': pool_type, 'global_pool': global_pool, 'cudnn_off': False, 'pooling_convention': convention} - def hybrid_forward(self, F, data): - return F.Pooling(data, **self._kwargs) + def forward(self, data): + return npx.pooling(data, **self._kwargs) class QuantPoolingBlock(mx.gluon.nn.HybridBlock): def __init__(self, kernel=kernel, pad=pad, stride=stride, @@ -492,8 +509,8 @@ def __init__(self, kernel=kernel, pad=pad, stride=stride, 'pool_type': pool_type, 'global_pool': global_pool, 'cudnn_off': False, 'pooling_convention':convention} - def hybrid_forward(self, F, data, min_data, max_data): - return F.contrib.quantized_pooling(data, min_data, max_data, **self._kwargs) + def forward(self, data, min_data, max_data): + return npx.quantized_pooling(data, min_data, max_data, **self._kwargs) pooling_fp32 = PoolingBlock() if qdtype == 'uint8': @@ -503,9 +520,9 @@ def hybrid_forward(self, F, data, min_data, max_data): data_low = -127.0 data_high = 127.0 - input_data = mx.nd.random.uniform(low=data_low, + input_data = mx.np.random.uniform(low=data_low, high=data_high, - shape=data_shape + size=data_shape ).astype('int32').astype('float32') output = pooling_fp32(input_data) @@ -515,16 +532,16 @@ def hybrid_forward(self, F, data, min_data, max_data): int8_input_data = input_data.astype(qdtype) quantized_range = 127.0 - min_data = mx.nd.array([-quantized_range]) - max_data = mx.nd.array([quantized_range]) + min_data = mx.np.array([-quantized_range]) + max_data = mx.np.array([quantized_range]) 
qoutput, min_range, max_range = quantized_pooling(int8_input_data, min_data, max_data) if pool_type == 'max': assert_almost_equal(output.asnumpy(), qoutput.asnumpy()) elif pool_type == 'avg': # for avg pooling, fp32 and int8 may be different due to rounding errors - diff = mx.nd.abs(output - qoutput.astype(output.dtype)) - cond = mx.nd.lesser(2, diff).sum().asscalar() + diff = mx.np.abs(output - qoutput.astype(output.dtype)) + cond = mx.np.less(2, diff).sum().item() assert cond == 0 for qdtype in ['int8', 'uint8']: @@ -547,6 +564,7 @@ def hybrid_forward(self, F, data, min_data, max_data): check_quantized_pooling((3, 512, 3, 7, 7), (1, 7, 7), 'avg', (0, 0, 0), (1, 2, 2), True, qdtype, 'full') +@use_np def test_quantized_fc(): def check_quantized_fc(data_shape, num_hidden, use_bias, qdtype, flatten=True): if is_test_for_native_cpu(): @@ -564,7 +582,7 @@ def check_quantized_fc(data_shape, num_hidden, use_bias, qdtype, flatten=True): return def maxabs(a, b): - return mx.nd.maximum(mx.nd.abs(a), mx.nd.abs(b)) + return mx.np.maximum(mx.np.abs(a), mx.np.abs(b)) int8_range = 127.0 if qdtype == 'uint8': @@ -576,42 +594,43 @@ def maxabs(a, b): data_high = 63.0 quantized_range = 127.0 - data = mx.nd.random.uniform(low=data_low, + data = mx.np.random.uniform(low=data_low, high=data_high, - shape=data_shape + size=data_shape ).astype('int32').astype('float32') fc_fp32 = mx.gluon.nn.Dense(units=num_hidden, use_bias=use_bias, flatten=flatten) fc_fp32.initialize() fc_fp32(data) - mx.nd.waitall() + npx.waitall() fp32_params = fc_fp32.collect_params() + weight_shape = fp32_params['weight'].shape new_args = dict() - new_args['weight'] = mx.nd.random.uniform(low=data_low, + new_args['weight'] = mx.np.random.uniform(low=data_low, high=data_high, - shape=fp32_params['weight'].shape + size=fp32_params['weight'].shape ).astype('int32').astype('float32') - data_min = mx.nd.min(data).astype('float32') - data_max = mx.nd.max(data).astype('float32') - weight_min = mx.nd.min(new_args['weight']).astype('float32') - weight_max = mx.nd.max(new_args['weight']).astype('float32') + data_min = mx.np.min(data).astype('float32') + data_max = mx.np.max(data).astype('float32') + weight_min = mx.np.min(new_args['weight']).astype('float32') + weight_max = mx.np.max(new_args['weight']).astype('float32') data_range = maxabs(data_min, data_max) weight_range = maxabs(weight_min, weight_max) if use_bias: - bias = mx.nd.random.uniform(low=data_low, + bias = mx.np.random.uniform(low=data_low, high=data_high, - shape=fp32_params['bias'].shape + size=fp32_params['bias'].shape ).astype('int32').astype('float32') - bias_min = mx.nd.min(bias).astype('float32') - bias_max = mx.nd.max(bias).astype('float32') + bias_min = mx.np.min(bias).astype('float32') + bias_max = mx.np.max(bias).astype('float32') bias_range = maxabs(bias_min, bias_max) bias_scale = int8_range / bias_range data_scale = quantized_range / data_range weight_scale = int8_range / weight_range bias_int32_rescale = data_scale * weight_scale / bias_scale - new_bias = mx.nd.cast(bias, dtype='float32') * bias_int32_rescale + new_bias = bias.astype('float32') * bias_int32_rescale new_args['bias'] = new_bias.astype('int32').astype('float32') fc_fp32.load_dict(new_args, cast_dtype=True, dtype_source='saved') @@ -623,40 +642,48 @@ def __init__(self, num_hidden, use_bias, flatten, **kwargs): self.use_bias = use_bias self._kwargs = {'num_hidden': num_hidden, 'no_bias': not use_bias, 'flatten': flatten} - self.min_data = mx.gluon.Parameter('min_data', dtype='float32', 
allow_deferred_init=True) - self.max_data = mx.gluon.Parameter('max_data', dtype='float32', allow_deferred_init=True) + self.min_data = mx.gluon.Parameter('min_data', dtype='float32', shape=(1), allow_deferred_init=True) + self.max_data = mx.gluon.Parameter('max_data', dtype='float32', shape=(1), allow_deferred_init=True) - self.weight = mx.gluon.Parameter('weight', dtype='int8', allow_deferred_init=True) - self.min_weight = mx.gluon.Parameter('min_weight', dtype='float32', allow_deferred_init=True) - self.max_weight = mx.gluon.Parameter('max_weight', dtype='float32', allow_deferred_init=True) + self.weight = mx.gluon.Parameter('weight', dtype='int8', shape=weight_shape, allow_deferred_init=True) + self.min_weight = mx.gluon.Parameter('min_weight', dtype='float32', shape=(1), allow_deferred_init=True) + self.max_weight = mx.gluon.Parameter('max_weight', dtype='float32', shape=(1), allow_deferred_init=True) if use_bias: - self.bias = mx.gluon.Parameter('bias', dtype='int8', allow_deferred_init=True) - self.min_bias = mx.gluon.Parameter('min_bias', dtype='float32', allow_deferred_init=True) - self.max_bias = mx.gluon.Parameter('max_bias', dtype='float32', allow_deferred_init=True) - - def hybrid_forward(self, F, x, weight, bias=None, min_data=None, max_data=None, - min_weight=None, max_weight=None, min_bias=None, max_bias=None): - out = F.contrib.quantized_fully_connected(data=x, weight=weight, bias=bias, - min_data=min_data, max_data=max_data, - min_weight=min_weight, max_weight=max_weight, - min_bias=min_bias, max_bias=max_bias, - **self._kwargs) + self.bias = mx.gluon.Parameter('bias', dtype='int8', shape=(num_hidden,), allow_deferred_init=True) + self.min_bias = mx.gluon.Parameter('min_bias', dtype='float32', shape=(1), allow_deferred_init=True) + self.max_bias = mx.gluon.Parameter('max_bias', dtype='float32', shape=(1), allow_deferred_init=True) + + def forward(self, x): + ctx = x.ctx + weight = self.weight.data().as_in_ctx(ctx) + bias = self.bias.data().as_in_ctx(ctx) if self.use_bias else None + min_data = self.min_data.data().as_in_ctx(ctx) + max_data = self.max_data.data().as_in_ctx(ctx) + min_weight = self.min_weight.data().as_in_ctx(ctx) + max_weight = self.max_weight.data().as_in_ctx(ctx) + min_bias = self.min_bias.data().as_in_ctx(ctx) if self.use_bias else None + max_bias = self.max_bias.data().as_in_ctx(ctx) if self.use_bias else None + out = npx.quantized_fully_connected(data=x, weight=weight, bias=bias, + min_data=min_data, max_data=max_data, + min_weight=min_weight, max_weight=max_weight, + min_bias=min_bias, max_bias=max_bias, + **self._kwargs) return out fc_int8 = QuantFC(num_hidden=num_hidden, use_bias=use_bias, flatten=flatten) qargs = { 'weight': new_args['weight'].astype('int8'), - 'min_data': mx.nd.array(-data_range), - 'max_data': mx.nd.array(data_range), - 'min_weight': mx.nd.array(-weight_range), - 'max_weight': mx.nd.array(weight_range) + 'min_data': mx.np.array([-data_range]), + 'max_data': mx.np.array([data_range]), + 'min_weight': mx.np.array([-weight_range]), + 'max_weight': mx.np.array([weight_range]) } if use_bias: qargs.update({ 'bias': bias.astype('int8'), - 'min_bias': mx.nd.array(-bias_range), - 'max_bias': mx.nd.array(bias_range), + 'min_bias': mx.np.array([-bias_range]), + 'max_bias': mx.np.array([bias_range]), }) fc_int8.load_dict(qargs, cast_dtype=True, dtype_source='saved') @@ -665,8 +692,8 @@ def hybrid_forward(self, F, x, weight, bias=None, min_data=None, max_data=None, if use_bias: # with adding bias, accuracy loss should not be greater than one 
- diff = mx.nd.abs(output - qoutput.astype(output.dtype)) - cond = mx.nd.lesser(2, diff).sum().asscalar() + diff = mx.np.abs(output - qoutput.astype(output.dtype)) + cond = mx.np.less(2, diff).sum().item() assert cond == 0 else: assert_almost_equal(output.asnumpy(), qoutput.asnumpy()) @@ -687,6 +714,7 @@ def hybrid_forward(self, F, x, weight, bias=None, min_data=None, max_data=None, check_quantized_fc((256, 111, 2, 2), 800, False, qdtype) +@use_np def test_quantized_embedding(): def check_quantized_embedding(data_shape, input_dim, output_dim): if is_test_for_gpu(): @@ -694,30 +722,31 @@ def check_quantized_embedding(data_shape, input_dim, output_dim): return def maxabs(a, b): - return mx.nd.maximum(mx.nd.abs(a), mx.nd.abs(b)) + return mx.np.maximum(mx.np.abs(a), mx.np.abs(b)) - data = mx.nd.random.uniform(low=0, + data = mx.np.random.uniform(low=0, high=input_dim, - shape=data_shape + size=data_shape ).astype('int32').astype('float32') embedding_fp32 = mx.gluon.nn.Embedding(input_dim=input_dim, output_dim=output_dim) embedding_fp32.initialize() embedding_fp32(data) - mx.nd.waitall() + npx.waitall() fp32_params = embedding_fp32.collect_params() + weight_shape = fp32_params['weight'].shape int8_range = 127.0 new_params = dict() - weight = mx.nd.random.uniform(low=-int8_range, + weight = mx.np.random.uniform(low=-int8_range, high=int8_range, - shape=fp32_params['weight'].shape + size=weight_shape ).astype('int32').astype('float32') new_params['weight'] = weight embedding_fp32.load_dict(new_params, cast_dtype=True, dtype_source='saved') output = embedding_fp32(data) - weight_min = mx.nd.min(weight).astype('float32') - weight_max = mx.nd.max(weight).astype('float32') + weight_min = mx.np.min(weight).astype('float32') + weight_max = mx.np.max(weight).astype('float32') weight_range = maxabs(weight_min, weight_max) class QuantEmbedding(mx.gluon.nn.HybridBlock): @@ -725,22 +754,26 @@ def __init__(self, input_dim=input_dim, output_dim=output_dim, **kwargs): super(QuantEmbedding, self).__init__(**kwargs) self._kwargs = {'input_dim': input_dim, 'output_dim': output_dim} - self.weight = mx.gluon.Parameter('weight', dtype='float32', allow_deferred_init=True) - self.min_weight = mx.gluon.Parameter('min_weight', dtype='float32', allow_deferred_init=True) - self.max_weight = mx.gluon.Parameter('max_weight', dtype='float32', allow_deferred_init=True) - - def hybrid_forward(self, F, x, weight, min_weight=None, max_weight=None): - out = F.contrib.quantized_embedding(data=x, weight=weight, - min_weight=min_weight, - max_weight=max_weight, - **self._kwargs) + self.weight = mx.gluon.Parameter('weight', dtype='float32', shape=weight_shape, allow_deferred_init=True) + self.min_weight = mx.gluon.Parameter('min_weight', dtype='float32', shape=(1), allow_deferred_init=True) + self.max_weight = mx.gluon.Parameter('max_weight', dtype='float32', shape=(1), allow_deferred_init=True) + + def forward(self, x): + ctx = x.ctx + weight = self.weight.data().as_in_ctx(ctx) + min_weight = self.min_weight.data().as_in_ctx(ctx) + max_weight = self.max_weight.data().as_in_ctx(ctx) + out = npx.quantized_embedding(data=x, weight=weight, + min_weight=min_weight, + max_weight=max_weight, + **self._kwargs) return out embedding_int8 = QuantEmbedding(input_dim=input_dim, output_dim=output_dim) qargs = { 'weight': weight.astype('int8'), - 'min_weight': mx.nd.array(-weight_range), - 'max_weight': mx.nd.array(weight_range) + 'min_weight': mx.np.array([-weight_range]), + 'max_weight': mx.np.array([weight_range]) } 
         embedding_int8.load_dict(qargs, cast_dtype=True, dtype_source='saved')
@@ -756,6 +789,7 @@ def hybrid_forward(self, F, x, weight, min_weight=None, max_weight=None):
     check_quantized_embedding((32,), 1024, 512)
+@use_np
 def test_quantized_flatten():
     def check_quantized_flatten(shape, qdtype):
         if qdtype == 'uint8':
@@ -764,13 +798,13 @@ def check_quantized_flatten(shape, qdtype):
         else:
             data_low = -127.0
             data_high = 127.0
-        qdata = mx.nd.random.uniform(low=data_low, high=data_high, shape=shape).astype(qdtype)
-        min_data = mx.nd.array([-1023.343], dtype='float32')
-        max_data = mx.nd.array([2343.324275], dtype='float32')
-        qoutput, min_output, max_output = mx.nd.contrib.quantized_flatten(qdata, min_data, max_data)
+        qdata = mx.np.random.uniform(low=data_low, high=data_high, size=shape).astype(qdtype)
+        min_data = mx.np.array([-1023.343], dtype='float32')
+        max_data = mx.np.array([2343.324275], dtype='float32')
+        qoutput, min_output, max_output = npx.quantized_flatten(qdata, min_data, max_data)
         assert qoutput.ndim == 2
         assert qoutput.shape[0] == qdata.shape[0]
-        assert qoutput.shape[1] == np.prod(qdata.shape[1:])
+        assert qoutput.shape[1] == onp.prod(qdata.shape[1:])
         assert same(qdata.asnumpy().flatten(), qoutput.asnumpy().flatten())
         assert same(min_data.asnumpy(), min_output.asnumpy())
         assert same(max_data.asnumpy(), max_output.asnumpy())
@@ -782,6 +816,7 @@ def check_quantized_flatten(shape, qdtype):
         check_quantized_flatten((3, 4, 23, 23), qdtype)
+@use_np
 def test_quantized_act():
     def check_quantized_act(data_shape, qdtype):
         if is_test_for_native_cpu():
@@ -803,9 +838,9 @@ def check_quantized_act(data_shape, qdtype):
         data_low = -127.0
         data_high = 127.0
-        data = mx.nd.random.uniform(low=data_low,
+        data = mx.np.random.uniform(low=data_low,
                                     high=data_high,
-                                    shape=data_shape
+                                    size=data_shape
                                     ).astype(qdtype).astype('float32')
         output = act_fp32(data)
@@ -814,20 +849,20 @@ def __init__(self, activation, **kwargs):
                 super(QuantActivation, self).__init__(**kwargs)
                 self._kwargs = {'act_type': activation}
-            def hybrid_forward(self, F, x, min_data, max_data):
-                out = F.contrib.quantized_act(data=x, min_data=min_data, max_data=max_data, **self._kwargs)
+            def forward(self, x, min_data, max_data):
+                out = npx.quantized_act(data=x, min_data=min_data, max_data=max_data, **self._kwargs)
                 return out
         quantized_act = QuantActivation(activation='relu')
         qdata = data.astype(qdtype)
-        quantized_range_min = mx.nd.min(data).astype('float32')
-        quantized_range_max = mx.nd.max(data).astype('float32')
+        quantized_range_min = mx.np.array([mx.np.min(data).astype('float32').item()])
+        quantized_range_max = mx.np.array([mx.np.max(data).astype('float32').item()])
         qoutput, min_range, max_range = quantized_act(qdata, quantized_range_min, quantized_range_max)
         assert_almost_equal(output.asnumpy(), qoutput.asnumpy())
-        assert_almost_equal(min_range.asscalar(), quantized_range_min.asscalar())
-        assert_almost_equal(max_range.asscalar(), quantized_range_max.asscalar())
+        assert_almost_equal(min_range.item(), quantized_range_min.item())
+        assert_almost_equal(max_range.item(), quantized_range_max.item())
     for qdtype in ['int8', 'uint8']:
         check_quantized_act((10,), qdtype)
@@ -836,15 +871,20 @@ def hybrid_forward(self, F, x, min_data, max_data):
         check_quantized_act((3, 4, 23, 23), qdtype)
+@use_np
 def test_quantized_bn():
     def get_mean_var(data):
-        mean = mx.ndarray.mean(data, axis=1, exclude=1)
-        mean_broad = mx.ndarray.expand_dims(mean, axis=0)
-        mean_broad = mx.ndarray.expand_dims(mean_broad, axis=2)
-        mean_broad =
mx.ndarray.expand_dims(mean_broad, axis=3) - mean_broad = mx.ndarray.broadcast_like(mean_broad, data) - var = mx.ndarray.multiply(data - mean_broad, data - mean_broad) - var = mx.ndarray.mean(var, axis=1, exclude=1) + axes = list(range(data.ndim)) + del axes[1] + mean = mx.np.mean(data, axis=axes) + mean_broad = mx.np.expand_dims(mean, axis=0) + mean_broad = mx.np.expand_dims(mean_broad, axis=2) + mean_broad = mx.np.expand_dims(mean_broad, axis=3) + mean_broad = mx.npx.broadcast_like(mean_broad, data) + var = mx.np.multiply(data - mean_broad, data - mean_broad) + axes = list(range(var.ndim)) + del axes[1] + var = mx.np.mean(var, axis=axes) return mean, var def check_quantized_bn(data_shape, qdtype): @@ -865,15 +905,15 @@ def check_quantized_bn(data_shape, qdtype): # run fp32 bn bn_fp32 = mx.gluon.nn.BatchNorm(use_global_stats=True, scale=True) - data = mx.nd.random.uniform(low=data_low, high=data_high, shape=data_shape) + data = mx.np.random.uniform(low=data_low, high=data_high, size=data_shape) bn_fp32.initialize() bn_fp32.hybridize() bn_fp32(data) fp32_params = bn_fp32.collect_params() - data = mx.nd.random.uniform(low=data_low, high=data_high, shape=data_shape) - gamma = mx.nd.random.uniform(low=data_low, high=data_high, shape=fp32_params['gamma'].shape) - beta = mx.nd.random.uniform(low=data_low, high=data_high, shape=fp32_params['beta'].shape) + data = mx.np.random.uniform(low=data_low, high=data_high, size=data_shape) + gamma = mx.np.random.uniform(low=data_low, high=data_high, size=fp32_params['gamma'].shape) + beta = mx.np.random.uniform(low=data_low, high=data_high, size=fp32_params['beta'].shape) running_mean, running_var = get_mean_var(data) new_params = { 'gamma':gamma, @@ -940,13 +980,13 @@ def __init__(self, **kwargs): self.pool = mx.gluon.nn.AvgPool2D(pool_size=(4,4)) self.fc = mx.gluon.nn.Dense(units=10, flatten=True) - def hybrid_forward(self, F, x): + def forward(self, x): out = self.conv(x) out = self.bn(out) out = self.act(out) out = self.pool(out) out = self.fc(out) - return F.softmax(out) + return npx.softmax(out) class FP32MultipleOutputs(mx.gluon.nn.HybridBlock): @@ -956,17 +996,17 @@ def __init__(self, length, **kwargs): self.convs = mx.gluon.nn.Conv2D(channels=16, kernel_size=(1,1)) self.fc = mx.gluon.nn.Dense(units=10, flatten=True) - def hybrid_forward(self, F, x): - res = F.SliceChannel(x, num_outputs=self.length, - axis=1, squeeze_axis=1) + def forward(self, x): + res = npx.slice_channel(x, num_outputs=self.length, + axis=1, squeeze_axis=1) out = [] for i in range(self.length): out.append(self.convs(res[i])) - out[i] = F.expand_dims(out[i], axis=0) - out = F.concat(*out) - out = F.reshape(out, shape=((self.length, -1))) + out[i] = mx.np.expand_dims(out[i], axis=0) + out = mx.np.concatenate(out) + out = mx.np.reshape(out, ((self.length, -1))) out = self.fc(out) - return F.softmax(out) + return npx.softmax(out) class FP32MultipleInputs(mx.gluon.nn.HybridBlock): def __init__(self, **kwargs): @@ -976,13 +1016,14 @@ def __init__(self, **kwargs): self.conv2 = mx.gluon.nn.Conv2D(channels=64, kernel_size=(1,1), use_bias=False) self.bn2 = mx.gluon.nn.BatchNorm() - def hybrid_forward(self, F, data0, data1): + def forward(self, data0, data1): out0 = self.conv1(data0) out0 = self.bn1(out0) out1 = self.conv2(data1) out1 = self.bn2(out1) return out1 + out0 +@use_np @xfail_when_nonstandard_decimal_separator def test_quantize_model(): def check_params(params, qparams, qsym=None): @@ -1046,7 +1087,7 @@ def check_quantize_model(qdtype): multi_out_data_shape = (length, 4, 4, 10, 
10) for net, dshape in zip((standard_net, multi_out_net), (data_shape, multi_out_data_shape)): - data = mx.nd.random.uniform(low=0, high=1, shape=dshape) + data = mx.np.random.uniform(low=0, high=1, size=dshape) net.hybridize() net(data) sym, _ = net.export(None) @@ -1062,7 +1103,7 @@ def check_quantize_model(qdtype): check_params(arg_params, qarg_params, qsym) check_params(aux_params, qaux_params) - calib_data = mx.nd.random.uniform(shape=dshape) + calib_data = mx.np.random.uniform(size=dshape) calib_data = mx.gluon.data.DataLoader(calib_data, batch_size=batch_size) qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym, arg_params=arg_params, @@ -1086,8 +1127,8 @@ def check_quantize_model_multiple_inputs(qdtype): net.initialize() net.hybridize() dshape = (64, 4, 10, 10) - data = [mx.nd.random.uniform(low=0, high=1, shape=dshape), - mx.nd.random.uniform(low=0, high=1, shape=dshape)] + data = [mx.np.random.uniform(low=0, high=1, size=dshape), + mx.np.random.uniform(low=0, high=1, size=dshape)] net(*data) sym, _ = net.export(None) arg_params, aux_params = collect_block_args_aux(net, sym) @@ -1102,8 +1143,8 @@ def check_quantize_model_multiple_inputs(qdtype): check_params(arg_params, qarg_params, qsym) check_params(aux_params, qaux_params) - calib_data = [mx.nd.random.uniform(shape=dshape), - mx.nd.random.uniform(shape=dshape)] + calib_data = [mx.np.random.uniform(size=dshape), + mx.np.random.uniform(size=dshape)] calib_data = mx.gluon.data.DataLoader(mx.gluon.data.ArrayDataset(*calib_data), batch_size=4) qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym, arg_params=arg_params, @@ -1126,6 +1167,7 @@ def check_quantize_model_multiple_inputs(qdtype): check_quantize_model_multiple_inputs(qdtype) +@mx.util.use_np def test_quantize_gluon_with_forward(): def check_quantize_net(qdtype): if is_test_for_native_cpu(): @@ -1144,7 +1186,7 @@ def check_quantize_net(qdtype): excluded_names_match += ['activation', 'relu', 'conv0'] num_calib_batches = 1 - random_data = mx.random.uniform(shape=data_shape) + random_data = mx.np.random.uniform(size=data_shape) calib_data = mx.gluon.data.DataLoader(random_data, batch_size=batch_size) quantized_resnet18_v1 = mx.contrib.quant.quantize_net(resnet18_v1, quantized_dtype=qdtype, @@ -1197,8 +1239,8 @@ def get_fp32_sym(): qsym, _ = mx.contrib.quant._quantize_symbol(sym, ctx=mx.current_context(), offline_params=offline_params, quantize_mode='full') requantize_op_names = ['requantize_conv', 'requantize_fc'] - min_max_dict = {'conv_output': (np.random.uniform(low=100.0, high=200.0), np.random.uniform(low=100.0, high=200.0)), - 'fc_output': (np.random.uniform(low=100.0, high=200.0), np.random.uniform(low=100.0, high=200.0))} + min_max_dict = {'conv_output': (onp.random.uniform(low=100.0, high=200.0), onp.random.uniform(low=100.0, high=200.0)), + 'fc_output': (onp.random.uniform(low=100.0, high=200.0), onp.random.uniform(low=100.0, high=200.0))} op_name_to_th_name = {'requantize_conv': 'conv_output', 'requantize_fc': 'fc_output'} cqsym = mx.contrib.quant._calibrate_quantized_sym(qsym, min_max_dict) attr_dict = cqsym.attr_dict() @@ -1206,12 +1248,13 @@ def get_fp32_sym(): assert name in attr_dict lhs = float(attr_dict[name]['min_calib_range']) rhs = min_max_dict[op_name_to_th_name[name]][0] - assert_almost_equal(np.array([lhs]), np.array([rhs])) + assert_almost_equal(onp.array([lhs]), onp.array([rhs])) lhs = float(attr_dict[name]['max_calib_range']) rhs = min_max_dict[op_name_to_th_name[name]][1] - assert_almost_equal(np.array([lhs]), 
np.array([rhs]), rtol=1e-3, atol=1e-4) + assert_almost_equal(onp.array([lhs]), onp.array([rhs]), rtol=1e-3, atol=1e-4) +@use_np def test_quantization_net_with_different_data_inputs_options(): if is_test_for_native_cpu(): print('skipped testing test_quantization_net_with_different_data_inputs_options for native cpu since it is not supported yet') @@ -1225,7 +1268,7 @@ def test_quantization_net_with_different_data_inputs_options(): batch_size = 32 data_shape = (batch_size, 3, 224, 224) - random_data = mx.random.uniform(shape=data_shape) + random_data = mx.np.random.uniform(size=data_shape) # pass data_shapes as list of tuples quantized_net = mx.contrib.quant.quantize_net(net, @@ -1292,15 +1335,15 @@ def get_threshold(nd): return mx.nd.maximum(mx.nd.abs(min_nd), mx.nd.abs(max_nd)).asnumpy() for dtype in ['uint8', 'int8', 'auto']: - nd = mx.nd.uniform(low=-10.532, high=11.3432, shape=(8, 3, 23, 23), dtype=np.float64) + nd = mx.nd.uniform(low=-10.532, high=11.3432, shape=(8, 3, 23, 23), dtype=onp.float64) expected_threshold = get_threshold(nd) arr = nd.asnumpy() - min_range = np.min(arr) - max_range = np.max(arr) + min_range = onp.min(arr) + max_range = onp.max(arr) th = max(abs(min_range), abs(max_range)) - hist, hist_edges = np.histogram(arr, bins=8001, range=(-th, th)) + hist, hist_edges = onp.histogram(arr, bins=8001, range=(-th, th)) hist_dict = {'layer1' : (hist, hist_edges, min_range, max_range, th)} min_max_dict = mx.contrib.quant._LayerHistogramCollector.get_optimal_thresholds(hist_dict, dtype) assert 'layer1' in min_max_dict - assert_almost_equal(np.array([min_max_dict['layer1'][1]]), expected_threshold, rtol=1e-2, atol=1e-4) + assert_almost_equal(onp.array([min_max_dict['layer1'][1]]), expected_threshold, rtol=1e-2, atol=1e-4) diff --git a/tests/python/train/test_autograd.py b/tests/python/train/test_autograd.py index 87548489a5b0..d54c764f9132 100644 --- a/tests/python/train/test_autograd.py +++ b/tests/python/train/test_autograd.py @@ -28,6 +28,7 @@ from mxnet import autograd logging.basicConfig(level=logging.DEBUG) +@mx.util.use_np def test_autograd(tmpdir): # define network def get_net(): diff --git a/tests/python/unittest/test_contrib_control_flow.py b/tests/python/unittest/test_contrib_control_flow.py index e538dd4b1633..1a2a6ccf1c63 100644 --- a/tests/python/unittest/test_contrib_control_flow.py +++ b/tests/python/unittest/test_contrib_control_flow.py @@ -15,130 +15,13 @@ # specific language governing permissions and limitations # under the License. -import copy import pytest -import numpy as np import mxnet as mx -from mxnet import gluon -from numpy.testing import assert_allclose, assert_array_equal -from collections import defaultdict +from numpy.testing import assert_allclose from mxnet.test_utils import * from mxnet.base import _as_list -from mxnet.attribute import AttrScope - - -def test_while_loop_simple_forward(): - - class _TestBlock(gluon.HybridBlock): - - def __init__(self, cond, func, max_iterations): - super(_TestBlock, self).__init__() - self.cond = cond - self.func = func - self.max_iterations = max_iterations - - def hybrid_forward(self, F, *loop_vars): - return F.contrib.while_loop( - cond=self.cond, - func=self.func, - loop_vars=loop_vars, - max_iterations=self.max_iterations - ) - - for hybridize in [False, True]: - # Case 1.1: result should be sum([1, 2, 3 ... 
100]) - model = _TestBlock( - cond=lambda i, s: i <= 5, - func=lambda i, s: (None, (i + 1, s + i)), - max_iterations=10, - ) - if hybridize: - model.hybridize() - _, result = model( - mx.nd.array([1], dtype="int64"), # i - mx.nd.array([0], dtype="int64"), # s - ) - assert result[0].asscalar() == 6 - assert result[1].asscalar() == 15 - # Case 1.2: result should be sum([1, 2, 3 ... 1000]) - model = _TestBlock( - cond=lambda i, s, true: true, - func=lambda i, s, true: (None, (i + 1, s + i, true)), - max_iterations=1000, - ) - if hybridize: - model.hybridize() - _, result = model( - mx.nd.array([1], dtype="int64"), # i - mx.nd.array([0], dtype="int64"), # s - mx.nd.array([1], dtype="int64"), # true - ) - assert result[0].asscalar() == 1001 - assert result[1].asscalar() == 500500 - assert result[2].asscalar() == 1 - # Case 1.3: result should be sum([]) - model = _TestBlock( - cond=lambda i, s, false: false, - func=lambda i, s, false: (None, (i + 1, s + i, false)), - max_iterations=1000, - ) - if hybridize: - model.hybridize() - _, result = model( - mx.nd.array([1], dtype="int64"), # i - mx.nd.array([0], dtype="int64"), # s - mx.nd.array([0], dtype="int64"), # false - ) - assert result[0].asscalar() == 1 - assert result[1].asscalar() == 0 - assert result[2].asscalar() == 0 - # Case 2.1: result should be sum([1, 2, 3 ... 100]) - model = _TestBlock( - cond=lambda i, s: i <= 100, - func=lambda i, s: (i, (i + 1, s + i)), - max_iterations=1000, - ) - if hybridize: - model.hybridize() - outputs, (result_i, result_s) = model( - mx.nd.array([1], dtype="int64"), # i - mx.nd.array([0], dtype="int64"), # s - ) - assert all(outputs.asnumpy()[ : 100] == np.arange(1, 101).reshape(100, 1)) - assert result_i.asscalar() == 101 - assert result_s.asscalar() == 5050 - # Case 2.2: result should be sum([1, 2, 3 ... 
1000]) - model = _TestBlock( - cond=lambda i, s, true: true, - func=lambda i, s, true: (i, (i + 1, s + i, true)), - max_iterations=1000, - ) - if hybridize: - model.hybridize() - outputs, (result_i, result_s, _) = model( - mx.nd.array([1], dtype="int64"), # i - mx.nd.array([0], dtype="int64"), # s - mx.nd.array([1], dtype="int64"), # true - ) - assert all(outputs.asnumpy() == np.arange(1, 1001).reshape(1000, 1)) - assert result_i.asscalar() == 1001 - assert result_s.asscalar() == 500500 - # Case 2.3: a corner case, in which loop body is never executed - model = _TestBlock( - cond=lambda i, s, false: false, - func=lambda i, s, false: (i, (i + 1, s + i, false)), - max_iterations=1000, - ) - if hybridize: - model.hybridize() - _, (result_i, result_s, _) = model( - mx.nd.array([1], dtype="int64"), # i - mx.nd.array([0], dtype="int64"), # s - mx.nd.array([0], dtype="int64"), # false - ) - assert result_i.asscalar() == 1 - assert result_s.asscalar() == 0 +mx.npx.reset_np() def _verify_while_loop(cond, func, loop_var_shapes, free_var_shapes, is_train, max_iterations, is_for, n_steps): @@ -1053,66 +936,6 @@ def cond(inputs, free): ] ) -class RNNLayer(gluon.HybridBlock): - def __init__(self, cell_type, hidden_size): - super(RNNLayer, self).__init__() - self.cell = cell_type(hidden_size) - - def hybrid_forward(self, F, inputs, states): - out, states = F.contrib.foreach(self.cell, inputs, states) - return out - -def check_contrib_rnn(cell_type, num_states): - batch_size = 10 - hidden_size = 100 - rnn_data = mx.nd.normal(loc=0, scale=1, shape=(5, batch_size, 50)) - state_shape = (batch_size, hidden_size) - states = [mx.nd.normal(loc=0, scale=1, shape=state_shape) for i in range(num_states)] - layer = RNNLayer(cell_type, hidden_size) - layer.initialize(ctx=default_context()) - res1 = layer(rnn_data, states) - params1 = layer.collect_params() - orig_params1 = copy.deepcopy(params1) - - trainer = gluon.Trainer(params1, 'sgd', {'learning_rate' : 0.03}) - with mx.autograd.record(): - res1 = layer(rnn_data, states) - res1.backward() - trainer.step(batch_size) - - configs = [ - {}, - {'inline_limit': 0}, - {'static_alloc': True}, - {'static_alloc': True, 'static_shape': True} ] - for config in configs: - layer = RNNLayer(cell_type, hidden_size) - layer.initialize(ctx=default_context()) - layer.hybridize(**config) - res2 = layer(rnn_data, states) - params2 = layer.collect_params() - for key, val in orig_params1.items(): - params2[key].set_data(copy.deepcopy(val.data())) - trainer = gluon.Trainer(params2, 'sgd', {'learning_rate' : 0.03}) - with mx.autograd.record(): - res2 = layer(rnn_data, states) - assert_almost_equal(res1.asnumpy(), res2.asnumpy(), rtol=1e-3, atol=1e-3) - res2.backward() - trainer.step(batch_size) - - for key, val in params1.items(): - weight1 = val.data() - weight2 = params2[key].data() - assert_almost_equal(weight1.asnumpy(), weight2.asnumpy(), - rtol=1e-3, atol=1e-3) - - -def test_contrib_rnn(): - cell_types = [(gluon.rnn.RNNCell, 1), (gluon.rnn.LSTMCell, 2), - (gluon.rnn.GRUCell, 1)] - for cell_type, num_states in cell_types: - check_contrib_rnn(cell_type, num_states) - @pytest.mark.garbage_expected def test_foreach(): @@ -1487,476 +1310,6 @@ def step_nd(in1, states): assert_almost_equal(state.grad.asnumpy(), state_grad.asnumpy(), rtol=1e-3, atol=1e-3) -def test_cut_subgraph_foreach(): - class TestLayer(gluon.HybridBlock): - def __init__(self): - super(TestLayer, self).__init__() - - def hybrid_forward(self, F, inputs, states): - def step1(data, states): - return data + 1, states - 
out1, states1 = F.contrib.foreach(step1, inputs, states) - out2, states2 = F.contrib.foreach(step1, out1, states) - def step2(data, states): - return data + states[0], states1 - out, states = F.contrib.foreach(step2, out2, states) - return out - - data = mx.nd.normal(loc=0, scale=1, shape=(5, 10)) - states = mx.nd.normal(loc=0, scale=1, shape=(10)) - layer = TestLayer() - layer.initialize(ctx=default_context()) - res1 = layer(data, [states]) - - with mx.autograd.record(): - res1 = layer(data, [states]) - - layer = TestLayer() - layer.initialize(ctx=default_context()) - layer.hybridize() - res2 = layer(data, [states]) - - with mx.autograd.record(): - res2 = layer(data, [states]) - assert_almost_equal(res1.asnumpy(), res2.asnumpy(), rtol=1e-3, atol=1e-3) - - -def test_uniq_name(): - class ForeachLayer1(gluon.HybridBlock): - def __init__(self): - super(ForeachLayer1, self).__init__() - - def hybrid_forward(self, F, inputs, states): - def step1(data, states): - return data + 1, states - out1, states1 = F.contrib.foreach(step1, inputs, states) - # The input variables have the same symbol name. - out, states = F.contrib.foreach(step1, out1, states1) - return out - - class ForeachLayer2(gluon.HybridBlock): - def __init__(self): - super(ForeachLayer2, self).__init__() - - def hybrid_forward(self, F, inputs, states): - def step1(data, states): - return data + 1, states - out1, states1 = F.contrib.foreach(step1, inputs, states) - def step2(data, states): - return data, [states[0] + states1[0] + F.squeeze(out1.slice_axis(axis=0, begin=0, end=1))] - # The input variables have the same symbol names. - # The free variables have the same symbol names as the input variables. - out, states = F.contrib.foreach(step2, out1, states1) - return out - - class WhileLayer1(gluon.HybridBlock): - def __init__(self): - super(WhileLayer1, self).__init__() - - def hybrid_forward(self, F, inputs, states): - def cond(state1, state2): - s = F.squeeze(state1.slice_axis(axis=0, begin=0, end=1)) - return s == s - def step(state1, state2): - return state1 + 1, [state1, state2] - states = [states[0], states[0] + 1] - out1, states1 = F.contrib.while_loop(cond, step, states, max_iterations=5) - # The input variables have the same symbol name. - out, states = F.contrib.while_loop(cond, step, states1, max_iterations=5) - return out - - class WhileLayer2(gluon.HybridBlock): - def __init__(self): - super(WhileLayer2, self).__init__() - - def hybrid_forward(self, F, inputs, states): - def cond(state1, state2): - s = F.squeeze(state1.slice_axis(axis=0, begin=0, end=1)) - return s == s - def step1(state1, state2): - return state1 + 1, [state1, state2] - states = [states[0], states[0] + 1] - out1, states1 = F.contrib.while_loop(cond, step1, states, max_iterations=5) - def step2(state1, state2): - return state1 + 1, [state1 + states1[0], state2 + states1[1]] - # The input variables have the same symbol name. 
- out, states = F.contrib.while_loop(cond, step2, states1, max_iterations=5) - return out - - TestLayers = [ForeachLayer1, ForeachLayer2, - WhileLayer1, WhileLayer2] - - data = mx.nd.normal(loc=0, scale=1, shape=(2, 5)) - states = mx.nd.normal(loc=0, scale=1, shape=(5)) - for TestLayer in TestLayers: - layer = TestLayer() - layer.initialize(ctx=default_context()) - res1 = layer(data, [states]) - - with mx.autograd.record(): - res1 = layer(data, [states]) - - layer = TestLayer() - layer.initialize(ctx=default_context()) - layer.hybridize() - res2 = layer(data, [states]) - - with mx.autograd.record(): - res2 = layer(data, [states]) - assert_almost_equal(res1.asnumpy(), res2.asnumpy(), rtol=0.001, atol=0.0001) - - -def test_cut_subgraph_while_loop(): - class TestLayer(gluon.HybridBlock): - def __init__(self): - super(TestLayer, self).__init__() - def hybrid_forward(self, F, data): - out1, data1 = F.contrib.while_loop( - cond=lambda i: i <= 5, - func=lambda i: (None, (i + 1, )), - loop_vars=(data, ), - max_iterations=10, - ) - out2, data2 = F.contrib.while_loop( - cond=lambda i: data1[0], - func=lambda i: (None, (i + 1, )), - loop_vars=data1[0], - max_iterations=10, - ) - return data2[0] - data = mx.nd.normal(loc=0, scale=1, shape=(1, )) - layer = TestLayer() - layer.initialize(ctx=default_context()) - res1 = layer(data) - with mx.autograd.record(): - res1 = layer(data) - layer = TestLayer() - layer.initialize(ctx=default_context()) - layer.hybridize() - res2 = layer(data) - with mx.autograd.record(): - res2 = layer(data) - assert_almost_equal(res1.asnumpy(), res2.asnumpy(), rtol=1e-3, atol=1e-3) - - -def test_cut_subgraph_cond(): - class TestLayer(gluon.HybridBlock): - def __init__(self): - super(TestLayer, self).__init__() - def hybrid_forward(self, F, data): - data1 = F.contrib.cond( - data > 0.5, - then_func=lambda: data * 2, - else_func=lambda: data * 3, - ) - data2 = F.contrib.cond( - data1 > 0.5, - then_func=lambda: data1 * 2, - else_func=lambda: data1 * 3, - ) - return data2 - data = mx.nd.normal(loc=0, scale=1, shape=(1, )) - layer = TestLayer() - layer.initialize(ctx=default_context()) - res1 = layer(data) - with mx.autograd.record(): - res1 = layer(data) - layer = TestLayer() - layer.initialize(ctx=default_context()) - layer.hybridize() - res2 = layer(data) - with mx.autograd.record(): - res2 = layer(data) - assert_almost_equal(res1.asnumpy(), res2.asnumpy(), rtol=1e-3, atol=1e-3) - - -def test_scope(): - class TestBlock1(gluon.HybridBlock): - def __init__(self): - super(TestBlock1, self).__init__() - def hybrid_forward(self, F, data): - (new_data, ) = F.contrib.cond( - data > 0.5, - then_func=lambda: data * 2, - else_func=lambda: data * 3, - name="my_cond", - ) - return new_data - class TestBlock2(gluon.HybridBlock): - def __init__(self): - super(TestBlock2, self).__init__() - def hybrid_forward(self, F, data): - (new_data, ) = F.contrib.cond( - data > 0.5, - then_func=lambda: data * 2, - else_func=lambda: data * 3, - name="my_cond", - ) - return new_data - AttrScope._subgraph_names = defaultdict(int) - data = mx.nd.normal(loc=0, scale=1, shape=(1, )) - block1 = TestBlock1() - block1.initialize(ctx=default_context()) - block1.hybridize() - _ = block1(data) - block2 = TestBlock2() - block2.initialize(ctx=default_context()) - block2.hybridize() - _ = block2(data) - assert len(AttrScope._subgraph_names) == 3 - assert AttrScope._subgraph_names['my_cond_else'] == 2 - assert AttrScope._subgraph_names['my_cond_pred'] == 2 - assert AttrScope._subgraph_names['my_cond_then'] == 2 - - -def 
test_output_format_foreach(): - class TestLayer1(gluon.HybridBlock): - def __init__(self, step): - super(TestLayer1, self).__init__() - self.step = step - def hybrid_forward(self, F, ins, states): - out, states = F.contrib.foreach(self.step, ins, states) - return out, states - - def step1(data, state): - return data, state - def step2(data, state): - return [data], state - def step3(data, state): - if isinstance(state, list): - return [], [state[0] + data] - else: - return [], state + data - def step4(data, state): - if isinstance(state, list): - return [data, state[0]], state - else: - return [data, state], state - - steps = [step1, step2, step3, step4] - data = mx.nd.normal(loc=0, scale=1, shape=(10, 2)) - state = mx.nd.normal(loc=0, scale=1, shape=(2)) - for step in steps: - layer1 = TestLayer1(step) - layer1.initialize(ctx=default_context()) - layer2 = TestLayer1(step) - layer2.initialize(ctx=default_context()) - layer2.hybridize() - out1, state1 = layer1(data, [state]) - out2, state2 = layer2(data, [state]) - step_out, step_state = step(data, [state]) - assert type(out1) == type(step_out) - assert type(out2) == type(step_out) - assert type(state1) == type(step_state) - assert type(state2) == type(step_state) - out1 = _as_list(out1) - out2 = _as_list(out2) - state1 = _as_list(state1) - state2 = _as_list(state2) - for i in range(len(out1)): - assert_almost_equal(out1[i].asnumpy(), out2[i].asnumpy(), rtol=0.001, atol=0.0001) - for i in range(len(state1)): - assert_almost_equal(state1[i].asnumpy(), state2[i].asnumpy(), rtol=0.001, atol=0.0001) - - layer1 = TestLayer1(step) - layer1.initialize(ctx=default_context()) - layer2 = TestLayer1(step) - layer2.initialize(ctx=default_context()) - layer2.hybridize() - out1, state1 = layer1(data, state) - out2, state2 = layer2(data, state) - step_out, step_state = step(data, state) - assert type(out1) == type(step_out) - assert type(out2) == type(step_out) - assert type(state1) == type(step_state) - assert type(state2) == type(step_state) - out1 = _as_list(out1) - out2 = _as_list(out2) - state1 = _as_list(state1) - state2 = _as_list(state2) - for i in range(len(out1)): - assert_almost_equal(out1[i].asnumpy(), out2[i].asnumpy(), rtol=0.001, atol=0.0001) - for i in range(len(state1)): - assert_almost_equal(state1[i].asnumpy(), state2[i].asnumpy(), rtol=0.001, atol=0.0001) - - if step == step3: - continue - layer1 = TestLayer1(step) - layer1.initialize(ctx=default_context()) - layer2 = TestLayer1(step) - layer2.initialize(ctx=default_context()) - layer2.hybridize() - out1, state1 = layer1(data, [state, [state + 1]]) - out2, state2 = layer2(data, [state, [state + 1]]) - step_out, step_state = step(data, [state, [state + 1]]) - assert type(out1) == type(step_out) - assert type(out2) == type(step_out) - assert type(state1) == type(step_state) - assert type(state2) == type(step_state) - out1 = _as_list(out1) - out2 = _as_list(out2) - state1 = _as_list(state1) - state2 = _as_list(state2) - for i in range(len(out1)): - assert_almost_equal(out1[i].asnumpy(), out2[i].asnumpy(), rtol=0.001, atol=0.0001) - for i in range(len(state1)): - if isinstance(state1[i], list): - assert_almost_equal(state1[i][0].asnumpy(), state2[i][0].asnumpy(), - rtol=0.001, atol=0.0001) - else: - assert_almost_equal(state1[i].asnumpy(), state2[i].asnumpy(), - rtol=0.001, atol=0.0001) - - -def test_output_format_while(): - class TestLayer1(gluon.HybridBlock): - def __init__(self, step, use_list, nested_list=False): - super(TestLayer1, self).__init__() - self.step = step - self.use_list = 
use_list - self.nested_list = nested_list - def hybrid_forward(self, F, states): - def cond(state1): - scalar = state1.slice_axis(axis=0, begin=0, end=1) - return scalar == scalar - cond_func = cond - if self.use_list: - states = [states] - elif self.nested_list: - def cond2(state1, state2): - scalar = state1.slice_axis(axis=0, begin=0, end=1) - return scalar == scalar - cond_func = cond2 - states = [states, [states + 1]] - out, states = F.contrib.while_loop(cond_func, self.step, states, max_iterations=5) - return out, states - - def step1(state): - return state, state - def step2(state): - if isinstance(state, list): - return state, state - else: - return [state], state - def step3(state): - return [], state - - steps = [step1, step2, step3] - state = mx.nd.normal(loc=0, scale=1, shape=(2)) - for step in steps: - layer1 = TestLayer1(step, False) - layer1.initialize(ctx=default_context()) - layer2 = TestLayer1(step, False) - layer2.initialize(ctx=default_context()) - layer2.hybridize() - out1, state1 = layer1(state) - out2, state2 = layer2(state) - assert type(out1) == type(out2) - assert type(state1) == type(state1) - out1 = _as_list(out1) - out2 = _as_list(out2) - state1 = _as_list(state1) - state2 = _as_list(state2) - for i in range(len(out1)): - assert_almost_equal(out1[i].asnumpy(), out2[i].asnumpy(), rtol=0.001, atol=0.0001) - for i in range(len(state1)): - assert_almost_equal(state1[i].asnumpy(), state2[i].asnumpy(), rtol=0.001, atol=0.0001) - - layer1 = TestLayer1(step, True) - layer1.initialize(ctx=default_context()) - layer2 = TestLayer1(step, True) - layer2.initialize(ctx=default_context()) - layer2.hybridize() - out1, state1 = layer1(state) - out2, state2 = layer2(state) - assert type(out1) == type(out2) - assert type(state1) == type(state2) - out1 = _as_list(out1) - out2 = _as_list(out2) - state1 = _as_list(state1) - state2 = _as_list(state2) - for i in range(len(out1)): - assert_almost_equal(out1[i].asnumpy(), out2[i].asnumpy(), rtol=0.001, atol=0.0001) - for i in range(len(state1)): - assert_almost_equal(state1[i].asnumpy(), state2[i].asnumpy(), rtol=0.001, atol=0.0001) - - def step4(state, state2): - states = _as_list(state) - states.append(state2) - return state, states - def step5(state, state2): - states = _as_list(state) - states.append(state2) - if isinstance(state, list): - return state, states - else: - return [state], states - def step6(state, state2): - states = _as_list(state) - states.append(state2) - return [], states - - steps = [step4, step5, step6] - for step in steps: - layer1 = TestLayer1(step, False, True) - layer1.initialize(ctx=default_context()) - layer2 = TestLayer1(step, False, True) - layer2.initialize(ctx=default_context()) - layer2.hybridize() - out1, state1 = layer1(state) - out2, state2 = layer2(state) - assert type(out1) == type(out2) - assert type(state1) == type(state2) - out1 = _as_list(out1) - out2 = _as_list(out2) - state1 = _as_list(state1) - state2 = _as_list(state2) - for i in range(len(out1)): - assert_almost_equal(out1[i].asnumpy(), out2[i].asnumpy(), rtol=0.001, atol=0.0001) - for i in range(len(state1)): - if not isinstance(state1[i], list): - assert_almost_equal(state1[i].asnumpy(), state2[i].asnumpy(), - rtol=0.001, atol=0.0001) - - -def test_output_format_cond(): - class TestLayer1(gluon.HybridBlock): - def __init__(self, func): - super(TestLayer1, self).__init__() - self.func = func - def hybrid_forward(self, F, data): - def then_func(): - return self.func(data) - def else_func(): - return self.func(data) - return 
F.contrib.cond(data.slice_axis(axis=0, begin=0, end=1), - then_func, else_func) - - def func1(data): - return data - def func2(data): - return [data] - def func3(data): - return [data, data] - - funcs = [func1, func2, func3] - data = mx.nd.normal(loc=0, scale=1, shape=(2)) - for func in funcs: - layer1 = TestLayer1(func) - layer1.initialize(ctx=default_context()) - layer2 = TestLayer1(func) - layer2.initialize(ctx=default_context()) - layer2.hybridize() - out1 = layer1(data) - out2 = layer2(data) - func_out = func(data) - assert type(out1) == type(func_out) - assert type(out2) == type(func_out) - out1 = _as_list(out1) - out2 = _as_list(out2) - for i in range(len(out1)): - assert_almost_equal(out1[i].asnumpy(), out2[i].asnumpy(), rtol=0.001, atol=0.0001) - def test_foreach_with_unkown_dim(): # MXNet supports using 0 as placeholder for unknown dimensions in shape step = lambda data, states: (data + states[0], [states[0] * 2]) @@ -1966,4 +1319,3 @@ def test_foreach_with_unkown_dim(): outs, states = mx.sym.contrib.foreach(step, data, states) _, output_shape, _ = outs.infer_shape_partial() assert_allclose((0, 3, 32, 32), output_shape[0]) - diff --git a/tests/python/unittest/test_contrib_gluon_data_vision.py b/tests/python/unittest/test_contrib_gluon_data_vision.py index 1bbf5e8dd747..83b14471af3a 100644 --- a/tests/python/unittest/test_contrib_gluon_data_vision.py +++ b/tests/python/unittest/test_contrib_gluon_data_vision.py @@ -49,6 +49,7 @@ def _generate_objects(): return [2, 5] + label +@use_np class TestImage(unittest.TestCase): IMAGES_URL = "https://repo.mxnet.io/gluon/dataset/test/test_images-9cebe48a.tar.gz" @@ -142,4 +143,4 @@ def test_bbox_augmenters(self): max_attempts=50) for batch in det_iter: pass - mx.nd.waitall() + mx.npx.waitall() diff --git a/tests/python/unittest/test_contrib_io.py b/tests/python/unittest/test_contrib_io.py index dbae69fe7294..ada65aef18bb 100644 --- a/tests/python/unittest/test_contrib_io.py +++ b/tests/python/unittest/test_contrib_io.py @@ -21,6 +21,8 @@ from mxnet.contrib.io import * from mxnet.test_utils import * +mx.npx.reset_np() + def test_contrib_DataLoaderIter(): def test_mnist_batches(batch_size, expected, last_batch='discard'): dataset = MNIST(train=False) diff --git a/tests/python/unittest/test_contrib_stes_op.py b/tests/python/unittest/test_contrib_stes_op.py index 5d20fd15f98c..a5b38e7b5661 100644 --- a/tests/python/unittest/test_contrib_stes_op.py +++ b/tests/python/unittest/test_contrib_stes_op.py @@ -21,49 +21,51 @@ from mxnet.test_utils import default_context +@mx.util.use_np class RoundSTENET(gluon.HybridBlock): def __init__(self, w_init, **kwargs): super(RoundSTENET, self).__init__(**kwargs) self.w = gluon.Parameter('w', shape=30, init=mx.initializer.Constant(w_init), grad_req='write') @staticmethod - def expected_grads(in_data, w_init): - return (in_data * w_init).round() + (in_data * w_init) + def expected_grads(self, in_data, w_init): + return mx.np.round(in_data * w_init) + (in_data * w_init) @staticmethod - def expected_output(in_data, w_init): - return (in_data * w_init).round() * w_init + def expected_output(self, in_data, w_init): + return mx.np.round(in_data * w_init) * w_init - def hybrid_forward(self, F, x, w): + def forward(self, x): # Simple forward function: round_ste(w*x)*w - out = w * x - out = F.contrib.round_ste(out) + out = self.w.data(x.ctx) * x + out = mx.npx.round_ste(out) # Uncomment to see how test fails with round # out = F.round(out) - out = out * w + out = out * self.w.data(x.ctx) return out +@mx.util.use_np class 
SignSTENET(gluon.HybridBlock):
     def __init__(self, w_init, **kwargs):
         super(SignSTENET, self).__init__(**kwargs)
         self.w = gluon.Parameter('w', shape=30, init=mx.initializer.Constant(w_init), grad_req='write')
     @staticmethod
-    def expected_grads(in_data, w_init):
-        return (in_data * w_init).sign() + (in_data * w_init)
+    def expected_grads(self, in_data, w_init):
+        return mx.np.sign(in_data * w_init) + (in_data * w_init)
     @staticmethod
-    def expected_output(in_data, w_init):
-        return (in_data * w_init).sign() * w_init
+    def expected_output(self, in_data, w_init):
+        return mx.np.sign(in_data * w_init) * w_init
-    def hybrid_forward(self, F, x, w):
+    def forward(self, x):
         # Simple forward function: sign_ste(w*x)*w
-        out = w * x
-        out = F.contrib.sign_ste(out)
+        out = self.w.data(x.ctx) * x
+        out = mx.npx.sign_ste(out)
         # Uncomment to see how test fails with sign
         # out = F.sign(out)
-        out = out * w
+        out = out * self.w.data(x.ctx)
         return out
@@ -99,19 +101,19 @@ def check_ste(net_type_str, w_init, hybridize, in_data, ctx=None):
 @xfail_when_nonstandard_decimal_separator
 def test_contrib_round_ste():
     # Test with random data
-    in_data = nd.uniform(-10, 10, shape=30) # 10 and 30 are arbitrary numbers
-    w_init = float(nd.uniform(-10, 10, shape=1).asscalar())
+    in_data = mx.np.random.uniform(-10, 10, size=30) # 10 and 30 are arbitrary numbers
+    w_init = float(mx.np.random.uniform(-10, 10, size=1).item())
     check_ste(net_type_str="RoundSTENET", w_init=w_init, hybridize=True, in_data=in_data)
     check_ste(net_type_str="RoundSTENET", w_init=w_init, hybridize=False, in_data=in_data)
     # Test 1.5 (verifies that .5 rounds the same as in round)
-    in_data = nd.array([1.5]*30) # 10 and 30 are arbitrary numbers
+    in_data = mx.np.array([1.5]*30) # 10 and 30 are arbitrary numbers
     w_init = 1.
     check_ste(net_type_str="RoundSTENET", w_init=w_init, hybridize=True, in_data=in_data)
     check_ste(net_type_str="RoundSTENET", w_init=w_init, hybridize=False, in_data=in_data)
     # Test 0
-    in_data = nd.array([0]*30) # 10 and 30 are arbitrary numbers
+    in_data = mx.np.array([0]*30) # 10 and 30 are arbitrary numbers
     w_init = 0.
     check_ste(net_type_str="RoundSTENET", w_init=w_init, hybridize=True, in_data=in_data)
     check_ste(net_type_str="RoundSTENET", w_init=w_init, hybridize=False, in_data=in_data)
@@ -119,13 +121,13 @@ def test_contrib_round_ste():
 @xfail_when_nonstandard_decimal_separator
 def test_contrib_sign_ste():
-    in_data = nd.uniform(-10, 10, shape=30) # 10 and 30 are arbitrary numbers
-    w_init = float(nd.uniform(-10, 10, shape=1).asscalar())
+    in_data = mx.np.random.uniform(-10, 10, size=30) # 10 and 30 are arbitrary numbers
+    w_init = float(mx.np.random.uniform(-10, 10, size=1).item())
     check_ste(net_type_str="SignSTENET", w_init=w_init, hybridize=True, in_data=in_data)
     check_ste(net_type_str="SignSTENET", w_init=w_init, hybridize=False, in_data=in_data)
     # Test 0
-    in_data = nd.array([0]*30) # 10 and 30 are arbitrary numbers
+    in_data = mx.np.array([0]*30) # 10 and 30 are arbitrary numbers
     w_init = 0.
check_ste(net_type_str="SignSTENET", w_init=w_init, hybridize=True, in_data=in_data) check_ste(net_type_str="SignSTENET", w_init=w_init, hybridize=False, in_data=in_data) diff --git a/tests/python/unittest/test_deferred_compute.py b/tests/python/unittest/test_deferred_compute.py index a5237b164a84..dcdbd9bc9a86 100644 --- a/tests/python/unittest/test_deferred_compute.py +++ b/tests/python/unittest/test_deferred_compute.py @@ -421,7 +421,8 @@ def _assert_dc_gluon(setup, net, setup_is_deterministic=True, numpy=True, autogr _all_same(ys_np, ys_hybrid_np) with tempfile.TemporaryDirectory() as root: - net.export(root) + with mx.util.np_shape(True), mx.util.np_array(True): + net.export(root) def _dc_gluon_simple_setup(shape=(8, 10), *, nd): return [nd.ones(shape=shape, ctx=mx.context.current_context())] @@ -445,11 +446,7 @@ def forward(self, x): for ctx in contexts: net = MyBlock() net.initialize(ctx=contexts) - _assert_dc_gluon(_dc_gluon_simple_setup, net, numpy=False, ctx=ctx) - with mx.util.np_shape(True), mx.util.np_array(True): - net = MyBlock() - net.initialize(ctx=contexts) - _assert_dc_gluon(_dc_gluon_simple_setup, net, numpy=True, ctx=ctx) + _assert_dc_gluon(_dc_gluon_simple_setup, net, numpy=True, ctx=ctx) def test_dc_hybridblock_wrapped(): @@ -481,7 +478,7 @@ def forward(self, x): net = MyBlock() net.initialize() - data = mx.nd.ones(shape=(8, 10), ctx=mx.context.current_context()) + data = mx.np.ones(shape=(8, 10), ctx=mx.context.current_context()) with pytest.raises(RuntimeError): net(data) @@ -501,11 +498,7 @@ def forward(self, x): net = MyBlock() net.initialize() - _assert_dc_gluon(_dc_gluon_simple_setup, net, numpy=False) - with mx.util.np_shape(True), mx.util.np_array(True): - net = MyBlock() - net.initialize() - _assert_dc_gluon(_dc_gluon_simple_setup, net, numpy=True) + _assert_dc_gluon(_dc_gluon_simple_setup, net, numpy=True) def test_dc_hybridblock_dynamic_shape(): @@ -534,50 +527,17 @@ def __init__(self): self.dense = mx.gluon.nn.Dense(units=4) def forward(self, x, idx): - return mx.nd.sum(mx.nd.sum(mx.nd.contrib.boolean_mask(self.dense(x), idx))) + mask = mx.nd.np._internal.boolean_mask(self.dense(x), idx) + return mx.np.sum(mask) def setup(*, nd): - x = mx.nd.array([[0, 1], [2, 3], [4, 5], [6, 7]]) - idx = mx.nd.array([1, 1, 1, 1]) + x = mx.np.array([[0, 1], [2, 3], [4, 5], [6, 7]]) + idx = mx.np.array([1, 1, 1, 1]) return [x, idx] net = MyBlock() net.initialize() - _assert_dc_gluon(setup, net, numpy=False, autograd=False) - -def test_dc_hybridblock_symbolblock_error(): - model = mx.gluon.nn.HybridSequential() - model.add(mx.gluon.nn.Dense(128, activation='tanh')) - model.add(mx.gluon.nn.Dropout(0.5)) - model.add(mx.gluon.nn.Dense(64, activation='tanh'), - mx.gluon.nn.Dense(32, in_units=64)) - model.add(mx.gluon.nn.Activation('relu')) - model.initialize() - - inputs = mx.sym.var('data') - outputs = model(inputs).get_internals() - smodel = mx.gluon.SymbolBlock(outputs, inputs) - smodel.initialize() - - assert len(smodel(mx.nd.zeros((16, 10)))) == 14 - - class Net(mx.gluon.HybridBlock): - def __init__(self, model): - super(Net, self).__init__() - self.model = model - - def forward(self, x): - out = self.model(x) - return mx.nd.add_n(*[i.sum() for i in out]) - - net = Net(smodel) - data = mx.nd.zeros((16, 10)) - out = net(data) - out.asnumpy() - - net.hybridize() - with pytest.raises(RuntimeError): - out_hybrid = net(data) # Raises RuntimeError + _assert_dc_gluon(setup, net, numpy=True, autograd=False) def test_indexing_shape_change(): diff --git 
a/tests/python/unittest/test_dynamic_shape.py b/tests/python/unittest/test_dynamic_shape.py index 44d9010b4718..41cabeaa4527 100644 --- a/tests/python/unittest/test_dynamic_shape.py +++ b/tests/python/unittest/test_dynamic_shape.py @@ -17,6 +17,7 @@ import numpy as np import mxnet as mx +import mxnet.ndarray.numpy._internal as _npi from mxnet import gluon from numpy.testing import assert_allclose, assert_array_equal from mxnet.test_utils import * @@ -24,6 +25,7 @@ from mxnet.attribute import AttrScope +@mx.util.use_np def test_dynamic_shape(): class _TestBlock(gluon.HybridBlock): @@ -31,13 +33,13 @@ class _TestBlock(gluon.HybridBlock): def __init__(self): super(_TestBlock, self).__init__() - def hybrid_forward(self, F, data, index): - return F.contrib.boolean_mask(data, index) + def forward(self, data, index): + return _npi.boolean_mask(data, index) block = _TestBlock() block.hybridize() - data = mx.nd.array([[1, 2, 3],[4, 5, 6],[7, 8, 9]]) - index = mx.nd.array([0, 1, 1]) + data = mx.np.array([[1, 2, 3],[4, 5, 6],[7, 8, 9]]) + index = mx.np.array([0, 1, 1]) data.attach_grad() with mx.autograd.record(): result = block(data, index) @@ -47,6 +49,7 @@ def hybrid_forward(self, F, data, index): assert_almost_equal(result.asnumpy(), result_nd) assert_almost_equal(data.grad.asnumpy(), data_grad_nd) +@mx.util.use_np def test_dynamic_shape_with_reshape(): # test dynamic shape op followed by reshape op class _TestBlock(gluon.HybridBlock): @@ -54,13 +57,13 @@ class _TestBlock(gluon.HybridBlock): def __init__(self): super(_TestBlock, self).__init__() - def hybrid_forward(self, F, data, index): - return F.contrib.boolean_mask(data, index).reshape((-1, )) + def forward(self, data, index): + return _npi.boolean_mask(data, index).reshape((-1, )) block = _TestBlock() block.hybridize() - data = mx.nd.array([[1, 2, 3],[4, 5, 6],[7, 8, 9]]) - index = mx.nd.array([0, 1, 1]) + data = mx.np.array([[1, 2, 3],[4, 5, 6],[7, 8, 9]]) + index = mx.np.array([0, 1, 1]) data.attach_grad() with mx.autograd.record(): result = block(data, index) @@ -70,6 +73,7 @@ def hybrid_forward(self, F, data, index): assert_almost_equal(result.asnumpy(), result_nd) assert_almost_equal(data.grad.asnumpy(), data_grad_nd) +@mx.util.use_np def test_dynamic_shape_multiple_hybridize(): # test multiple hybridize calls for the same block class _TestBlock(gluon.HybridBlock): @@ -77,12 +81,12 @@ class _TestBlock(gluon.HybridBlock): def __init__(self): super(_TestBlock, self).__init__() - def hybrid_forward(self, F, data, index): - return F.sum(F.contrib.boolean_mask(data, index)) - 5 + def forward(self, data, index): + return mx.np.sum(_npi.boolean_mask(data, index)) - 5 block = _TestBlock() - data = mx.nd.array([[1, 2, 3],[4, 5, 6],[7, 8, 9]]) - index = mx.nd.array([0, 1, 0]) + data = mx.np.array([[1, 2, 3],[4, 5, 6],[7, 8, 9]]) + index = mx.np.array([0, 1, 0]) result_nd = np.array([10]) block.hybridize() @@ -97,18 +101,19 @@ def hybrid_forward(self, F, data, index): result = block(data, index) assert_almost_equal(result.asnumpy(), result_nd) +@mx.util.use_np def test_dynamic_shape_switch_hybridize(): # test hybridize switch on and off for the same block class _TestBlock(gluon.HybridBlock): def __init__(self): super(_TestBlock, self).__init__() - def hybrid_forward(self, F, data, index): - return F.sum(F.contrib.boolean_mask(data, index)) - 5 + def forward(self, data, index): + return mx.np.sum(_npi.boolean_mask(data, index)) - 5 block = _TestBlock() - data = mx.nd.array([[1, 2, 3],[4, 5, 6],[7, 8, 9]]) - index = mx.nd.array([0, 1, 0]) + data = 
mx.np.array([[1, 2, 3],[4, 5, 6],[7, 8, 9]]) + index = mx.np.array([0, 1, 0]) result_nd = np.array([10]) block.hybridize() @@ -123,26 +128,27 @@ def hybrid_forward(self, F, data, index): result = block(data, index) assert_almost_equal(result.asnumpy(), result_nd) +@mx.util.use_np def test_dynamic_shape_backward(): # test dynamic shape ops with backward prop class _TestBlock(gluon.HybridBlock): def __init__(self): super(_TestBlock, self).__init__() - def hybrid_forward(self, F, data, index): - return F.contrib.boolean_mask(F.sum(F.transpose(data)), index) + def forward(self, data, index): + return mx.np.sum(_npi.boolean_mask(data, index)) - 5 block = _TestBlock() for static_alloc in [True, False]: - block.hybridize(static_alloc=static_alloc) - data = mx.nd.array([[1, 2, 3],[4, 5, 6],[7, 8, 9]]) - index = mx.nd.array([1]) - data.attach_grad() - with mx.autograd.record(): - result = block(data, index) - result.backward() - result_nd = np.array([45.]) - data_grad_nd = np.array([[1., 1., 1.], [1., 1., 1.], [1., 1., 1.]]) - assert_almost_equal(result.asnumpy(), result_nd) - assert_almost_equal(data.grad.asnumpy(), data_grad_nd) + block.hybridize(static_alloc=static_alloc) + data = mx.np.array([[1, 2, 3],[4, 5, 6],[7, 8, 9]]) + index = mx.np.array([0, 1, 0]) + data.attach_grad() + with mx.autograd.record(): + result = block(data, index) + result.backward() + result_nd = np.array([10.]) + data_grad_nd = np.array([[0., 0., 0.], [1., 1., 1.], [0., 0., 0.]]) + assert_almost_equal(result.asnumpy(), result_nd) + assert_almost_equal(data.grad.asnumpy(), data_grad_nd) diff --git a/tests/python/unittest/test_exc_handling.py b/tests/python/unittest/test_exc_handling.py index 938a86957c71..9009f5f2265a 100644 --- a/tests/python/unittest/test_exc_handling.py +++ b/tests/python/unittest/test_exc_handling.py @@ -24,6 +24,7 @@ from mxnet.test_utils import assert_exception, default_context, set_default_context, use_np import pytest +mx.npx.reset_np() @pytest.mark.skipif(os.environ.get('MXNET_ENGINE_TYPE') == 'NaiveEngine', reason="This test assumes asynchronous execution.") @@ -75,28 +76,6 @@ def symbolic(exec_backward=True, waitall=True): pytest.raises(MXNetError, symbolic, exec_backward=False, waitall=True) pytest.raises(MXNetError, symbolic, exec_backward=True, waitall=True) -@pytest.mark.skipif(os.environ.get('MXNET_ENGINE_TYPE') == 'NaiveEngine', - reason="This test assumes asynchronous execution.") -def test_exc_gluon(): - def gluon(exec_wait=True, waitall=False): - model = nn.Sequential() - model.add(nn.Dense(128, activation='tanh', in_units=10, flatten=False)) - model.add(nn.Dropout(1)) - model.add(nn.Dense(64, activation='tanh', in_units=256), - nn.Dense(32, in_units=64)) - model.initialize(ctx=[default_context()]) - x = mx.sym.var('data') - y = model(x) - z = model(mx.nd.random.normal(10, -10, (32, 2, 10), ctx=default_context())) - if waitall: - mx.nd.waitall() - elif exec_wait: - z.wait_to_read() - - gluon(exec_wait=False) - pytest.raises(MXNetError, gluon, exec_wait=True) - - pytest.raises(MXNetError, gluon, waitall=True) def test_exc_multiple_waits(): def multiple_waits(waitall=False): diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index f7d2c4489df6..6d5e40c31ecf 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -27,10 +27,9 @@ from mxnet.util import is_np_array from mxnet.ndarray.ndarray import _STORAGE_TYPE_STR_TO_ID from mxnet.test_utils import use_np -import mxnet.numpy as _mx_np from common import assertRaises, 
assert_raises_cudnn_not_satisfied, \ xfail_when_nonstandard_decimal_separator, environment -import numpy as np +import numpy as onp from numpy.testing import assert_array_equal import pytest from copy import deepcopy @@ -39,6 +38,8 @@ import random import tempfile +mx.npx.reset_np() + def test_parameter(): p = gluon.Parameter('weight', shape=(10, 10)) p.initialize(init='xavier', ctx=[mx.cpu(0), mx.cpu(1)]) @@ -63,7 +64,7 @@ def test_invalid_parameter_grad_stype(): def test_sparse_parameter(): p = gluon.Parameter('weight', shape=(10, 10), stype='row_sparse', grad_stype='row_sparse') p.initialize(init='xavier', ctx=[mx.cpu(0), mx.cpu(1)]) - row_id = mx.nd.arange(0, 10, ctx=mx.cpu(1)) + row_id = mx.np.arange(0, 10, ctx=mx.cpu(1)) assert len(p.list_grad()) == 2 # getting row_sparse data without trainer throws an exception assertRaises(RuntimeError, p.list_row_sparse_data, row_id) @@ -85,7 +86,7 @@ def test_parameter_invalid_access(): p0.initialize(init='xavier', ctx=[mx.cpu(0), mx.cpu(1)]) assertRaises(RuntimeError, p0.data) assertRaises(RuntimeError, p0.list_data) - row_id = mx.nd.arange(0, 10) + row_id = mx.np.arange(0, 10) # cannot call row_sparse_data on dense parameters p1 = gluon.Parameter('weight', shape=(10, 10)) p1.initialize(init='xavier', ctx=[mx.cpu(0), mx.cpu(1)]) @@ -118,15 +119,16 @@ def test_parameter_row_sparse_data(): mx.test_utils.assert_almost_equal(retained_2[0].asnumpy(), retained_target_2.asnumpy()) +@use_np def test_constant(): class Test(gluon.HybridBlock): def __init__(self, **kwargs): super(Test, self).__init__(**kwargs) - self.value = np.asarray([[1,2], [3,4]]) + self.value = onp.asarray([[1,2], [3,4]]) self.const = gluon.Constant(self.value) - def hybrid_forward(self, F, x, const): - return x + const + def forward(self, x): + return x + self.const.data() test = Test() test.initialize() @@ -134,7 +136,7 @@ def hybrid_forward(self, F, x, const): {'learning_rate': 1.0, 'momentum': 0.5}) with mx.autograd.record(): - x = mx.nd.ones((2,2)) + x = mx.np.ones((2,2)) x.attach_grad() y = test(x) y.backward() @@ -145,6 +147,7 @@ def hybrid_forward(self, F, x, const): assert (x.grad.asnumpy() == 1).all() +@use_np def test_parameter_sharing(): class Net(gluon.Block): def __init__(self, in_units=0, **kwargs): @@ -158,7 +161,7 @@ def forward(self, x): net1 = Net(in_units=5) net2 = Net().share_parameters(net1.collect_params()) net1.initialize() - net2(mx.nd.zeros((3, 5))) + net2(mx.np.zeros((3, 5))) net1.save_parameters('net1.params') @@ -168,7 +171,7 @@ def forward(self, x): net4 = Net() net5 = Net(in_units=5).share_parameters(net4.collect_params()) net4.initialize() - net5(mx.nd.zeros((3, 5))) + net5(mx.np.zeros((3, 5))) net4.save_parameters('net4.params') @@ -201,6 +204,7 @@ def test_collect_parameters(): assert set(net.collect_params('0.bias|1.bias').keys()) == \ set(['0.bias', '1.bias']) +@use_np def test_basic(): model = nn.Sequential() model.add(nn.Dense(128, activation='tanh', in_units=10, flatten=False)) @@ -208,14 +212,10 @@ def test_basic(): model.add(nn.Dense(64, activation='tanh', in_units=256), nn.Dense(32, in_units=64)) model.add(nn.Activation('relu')) - # symbol - x = mx.sym.var('data') - y = model(x) - assert len(y.list_arguments()) == 7 # ndarray model.initialize(mx.init.Xavier(magnitude=2.24)) - x = model(mx.nd.zeros((32, 2, 10))) + x = model(mx.np.zeros((32, 2, 10))) assert x.shape == (32, 32) x.wait_to_read() @@ -225,119 +225,6 @@ def test_basic(): assert list(model.collect_params().values())[0]._grad is not None -def test_dense(): - model = nn.Dense(128, 
activation='tanh', in_units=10, flatten=False) - inputs = mx.sym.Variable('data') - outputs = model(inputs) - assert set(model.collect_params().keys()) == set(['weight', 'bias']) - args, outs, auxs = outputs.infer_shape(data=(2, 3, 10)) - assert outs == [(2, 3, 128)] - - model = nn.Dense(128, activation='relu', in_units=30, flatten=True) - inputs = mx.sym.Variable('data') - outputs = model(inputs) - assert set(model.collect_params().keys()) == set(['weight', 'bias']) - args, outs, auxs = outputs.infer_shape(data=(17, 2, 5, 3)) - assert outs == [(17, 128)] - - -def test_hybrid_sequential_unique_internals(): - net = mx.gluon.nn.HybridSequential() - net.add(mx.gluon.nn.Dense(100, activation='relu'), mx.gluon.nn.Dense(10)) - assert len(set(s.name for s in net(mx.sym.Variable('data')).get_internals())) == 8 - - -@pytest.mark.parametrize('compute_before_cast', [True, False]) -def test_symbol_block(tmpdir, compute_before_cast): - model = nn.HybridSequential() - model.add(nn.Dense(128, activation='tanh')) - model.add(nn.Dropout(0.5)) - model.add(nn.Dense(64, activation='tanh'), - nn.Dense(32, in_units=64)) - model.add(nn.Activation('relu')) - - model.initialize() - - inputs = mx.sym.var('data') - outputs = model(inputs).get_internals() - params = {p.var().name: p for p in model.collect_params().values()} - smodel = gluon.SymbolBlock(outputs, inputs, params=params) - - assert len(smodel(mx.nd.zeros((16, 10)))) == 14 - - out = smodel(mx.sym.var('in')) - assert len(out) == len(outputs.list_outputs()) - - class Net(nn.HybridBlock): - def __init__(self, model): - super(Net, self).__init__() - self.model = model - - def hybrid_forward(self, F, x): - out = self.model(x) - return F.add_n(*[i.sum() for i in out]) - - net = Net(smodel) - net.hybridize() - assert isinstance(net(mx.nd.zeros((16, 10))), mx.nd.NDArray) - - inputs = mx.sym.var('data') - outputs = model(inputs) - params = {p.var().name: p for p in model.collect_params().values()} - smodel = gluon.SymbolBlock(outputs, inputs, params=params) - net = Net(smodel) - net.hybridize() - assert isinstance(net(mx.nd.zeros((16, 10))), mx.nd.NDArray) - - # Test case to verify if initializing the SymbolBlock from a model with params - # other than fp32 param dtype. - - # 1. Load a resnet model, cast it to fp64 and export - tmp = str(tmpdir) - tmpfile = os.path.join(tmp, 'resnet34_fp64') - ctx = mx.cpu(0) - - net_fp32 = mx.gluon.model_zoo.vision.resnet34_v2(pretrained=True, ctx=ctx, root=tmp) - if compute_before_cast: - # Compute before casting to catch bugs where symbol dtype isn't casted correctly GH-18843 - net_fp32.initialize() - net_fp32(mx.nd.zeros((1,3,224,224), ctx=ctx)) - net_fp32.cast('float64') - net_fp32.hybridize() - data = mx.nd.zeros((1,3,224,224), dtype='float64', ctx=ctx) - net_fp32(data) - sym_file, params_file = net_fp32.export(tmpfile, 0) - - # 2.a Load the saved model and verify if all the params are loaded correctly. - # and choose one of the param to verify the type if fp64.\ - sm = mx.sym.load(sym_file) - inputs = mx.sym.var('data', dtype='float64') - net_fp64 = mx.gluon.SymbolBlock(sm, inputs) - net_fp64.load_parameters(params_file, ctx=ctx) - # Get a conv layer's weight parameter name. Conv layer's weight param is - # expected to be of dtype casted, fp64. 
- for param_name in net_fp64.params.keys(): - if 'conv' in param_name and 'weight' in param_name: - break - assert np.dtype(net_fp64.params[param_name].dtype) == np.dtype(np.float64) - - # 3.b Verify same functionnality with the imports API - net_fp_64 = mx.gluon.SymbolBlock.imports(sym_file, 'data', params_file, ctx=ctx) - - # Get a conv layer's weight parameter name. Conv layer's weight param is - # expected to be of dtype casted, fp64. - for param_name in net_fp_64.params.keys(): - if 'conv' in param_name and 'weight' in param_name: - break - assert np.dtype(net_fp_64.params[param_name].dtype) == np.dtype(np.float64) - - # Cast the symbol block to FP32 and try to forward a FP32 data. - # This will verify SymbolBlock.cast() functionality. - net_fp64.cast('float32') - fp32_data = mx.nd.zeros((1,3,224,224), dtype='float32', ctx=ctx) - prediction = net_fp64.forward(fp32_data) - assert np.dtype(prediction.dtype) == np.dtype(np.float32) - def test_sparse_symbol_block(): data = mx.sym.var('data') weight = mx.sym.var('weight', stype='row_sparse') @@ -353,14 +240,16 @@ def test_sparse_hybrid_block(): params['bias'] = gluon.Parameter('bias', shape=(5), dtype='float32') net = gluon.nn.Dense(5).share_parameters(params) net.initialize() - x = mx.nd.ones((2,5)) + x = mx.np.ones((2,5)) with pytest.raises(RuntimeError): # an exception is expected when forwarding a HybridBlock w/ sparse param y = net(x) + +@use_np def test_hybrid_block_none_args(): class Foo(gluon.HybridBlock): - def hybrid_forward(self, F, a, b): + def forward(self, a, b): if a is None and b is not None: return b elif b is None and a is not None: @@ -371,7 +260,7 @@ def hybrid_forward(self, F, a, b): raise NotImplementedError class FooDefault(gluon.HybridBlock): - def hybrid_forward(self, F, a, b=None): + def forward(self, a, b=None): if a is None and b is not None: return b elif b is None and a is not None: @@ -389,15 +278,15 @@ def __init__(self): self.f2 = Foo() self.f3 = Foo() - def hybrid_forward(self, F, a, b): + def forward(self, a, b): data = self.f1(a, b) data = self.f2(a, data) data = self.f3(data, b) return data - for arg_inputs in [(None, mx.nd.ones((10,))), - (mx.nd.ones((10,)), mx.nd.ones((10,))), - (mx.nd.ones((10,)), None)]: + for arg_inputs in [(None, mx.np.ones((10,))), + (mx.np.ones((10,)), mx.np.ones((10,))), + (mx.np.ones((10,)), None)]: foo1 = FooNested() foo1.hybridize() foo2 = FooNested() @@ -409,6 +298,7 @@ def hybrid_forward(self, F, a, b): assert_almost_equal(lhs.asnumpy(), rhs.asnumpy()) else: assert_almost_equal(out1.asnumpy(), out2.asnumpy()) + for do_hybridize in [True, False]: foo = FooNested() if do_hybridize: @@ -418,35 +308,36 @@ def hybrid_forward(self, F, a, b): # Make sure the ValueError is correctly raised foo = FooNested() foo.hybridize() - foo(None, mx.nd.ones((10,))) # Pass for the first time to initialize the cached op - pytest.raises(ValueError, lambda: foo(mx.nd.ones((10,)), mx.nd.ones((10,)))) + foo(None, mx.np.ones((10,))) # Pass for the first time to initialize the cached op + pytest.raises(ValueError, lambda: foo(mx.np.ones((10,)), mx.np.ones((10,)))) foo = FooNested() - pytest.raises(ValueError, lambda: foo(mx.nd.ones((10,)), mx.sym.var('a'))) + pytest.raises(TypeError, lambda: foo(mx.np.ones((10,)), mx.sym.var('a'))) foo = FooNested() - pytest.raises(ValueError, lambda: foo(mx.sym.var('a'), mx.nd.ones((10,)))) + pytest.raises(TypeError, lambda: foo(mx.sym.var('a'), mx.np.ones((10,)))) # Test the case of the default values foo1 = FooDefault() foo1.hybridize() foo2 = FooDefault() - out1 
= foo1(mx.nd.ones((10,))) - out2 = foo2(mx.nd.ones((10,))) - out3 = foo1(mx.nd.ones((10,)), None) - out4 = foo2(mx.nd.ones((10,)), None) + out1 = foo1(mx.np.ones((10,))) + out2 = foo2(mx.np.ones((10,))) + out3 = foo1(mx.np.ones((10,)), None) + out4 = foo2(mx.np.ones((10,)), None) assert_almost_equal(out1.asnumpy(), out2.asnumpy()) assert_almost_equal(out1.asnumpy(), out3.asnumpy()) assert_almost_equal(out1.asnumpy(), out4.asnumpy()) foo1 = FooDefault() foo1.hybridize() - out1 = foo1(mx.nd.ones((10,)), None) - out2 = foo1(mx.nd.ones((10,))) + out1 = foo1(mx.np.ones((10,)), None) + out2 = foo1(mx.np.ones((10,))) assert_almost_equal(out1.asnumpy(), out2.asnumpy()) - pytest.raises(ValueError, lambda: foo1(mx.nd.ones((10,)), mx.nd.ones((10,)))) + pytest.raises(ValueError, lambda: foo1(mx.np.ones((10,)), mx.np.ones((10,)))) +@use_np def test_hybrid_block_hybrid_no_hybrid(): class FooHybrid(gluon.HybridBlock): - def hybrid_forward(self, F, a, b): + def forward(self, a, b): if isinstance(a, (list, tuple)): a = sum(a) if isinstance(b, (list, tuple)): @@ -463,35 +354,35 @@ def forward(self, a, b): # When hybridize is not called, HybridBlock acts the same as Block foo_hybrid = FooHybrid() foo = Foo() - for a, b in [(mx.nd.ones((10,)), 1), - (mx.nd.ones((20,)), 2), - ([mx.nd.ones((10,)), mx.nd.ones((10,))], - [mx.nd.ones((10)), mx.nd.ones((10,)), mx.nd.ones((10,))]), - ([mx.nd.ones((10,)), mx.nd.ones((10,))], 3)]: + for a, b in [(mx.np.ones((10,)), 1), + (mx.np.ones((20,)), 2), + ([mx.np.ones((10,)), mx.np.ones((10,))], + [mx.np.ones((10)), mx.np.ones((10,)), mx.np.ones((10,))]), + ([mx.np.ones((10,)), mx.np.ones((10,))], 3)]: hybrid_block_out = foo_hybrid(a, b) block_out = foo(a, b) assert_almost_equal(hybrid_block_out.asnumpy(), block_out.asnumpy()) # When hybridize is called, we need to make sure that the model raises for the unsupported cases # 1. Scalar values in the input - # 2. No mixing of sym/ndarray + # 2. No sym in the input # 3. No mixing of cpu ndarray and gpu ndarray (Tested in gpu/test_gluon_gpu.py) # 4. 
Allow mixing of cpu_pinned and cpu foo_hybrid = FooHybrid() foo_hybrid.hybridize() - pytest.raises(ValueError, lambda: foo_hybrid(mx.nd.ones((10,)), 1)) + pytest.raises(ValueError, lambda: foo_hybrid(mx.np.ones((10,)), 1)) foo_hybrid = FooHybrid() foo_hybrid.hybridize() - pytest.raises(ValueError, lambda: foo_hybrid(mx.nd.ones((10,)), mx.sym.var('a'))) + pytest.raises(TypeError, lambda: foo_hybrid(mx.np.ones((10,)), mx.sym.var('a'))) foo_hybrid = FooHybrid() foo_hybrid.hybridize() - pytest.raises(ValueError, lambda: foo_hybrid(mx.nd.ones((10,), ctx=mx.cpu(1)), - mx.nd.ones((10,), ctx=mx.cpu(2)))) + pytest.raises(ValueError, lambda: foo_hybrid(mx.np.ones((10,), ctx=mx.cpu(1)), + mx.np.ones((10,), ctx=mx.cpu(2)))) def check_layer_forward(layer, dshape): print("checking layer {}\nshape: {}.".format(layer, dshape)) layer.initialize() - x = mx.nd.ones(shape=dshape) + x = mx.np.ones(shape=dshape) x.attach_grad() with mx.autograd.record(): out = layer(x) @@ -502,7 +393,7 @@ def check_layer_forward(layer, dshape): layer.hybridize() - x = mx.nd.ones(shape=dshape) + x = mx.np.ones(shape=dshape) x.attach_grad() with mx.autograd.record(): out = layer(x) @@ -643,7 +534,7 @@ def transpose(shape): noceil_out_shape = transpose(noceil_out_shape) ceil_out_shape = transpose(ceil_out_shape) - x = mx.nd.zeros(xshape) + x = mx.np.zeros(xshape) layer = nn.MaxPool2D(3, ceil_mode=False, layout=layout) layer.initialize() @@ -666,7 +557,7 @@ def test_batchnorm_backward_synchronization(variable): layer = nn.BatchNorm() layer.initialize(ctx=ctx) for _ in range(3): - data = mx.nd.random.normal(loc=10, scale=2, shape=(1, 3, 10, 10), ctx=ctx) + data = mx.np.random.normal(loc=10, scale=2, size=(1, 3, 10, 10), ctx=ctx) with mx.autograd.record(): out = layer(data) out.backward() @@ -684,6 +575,7 @@ def test_batchnorm(): check_layer_forward(layer, (2, 10, 10, 10)) +@use_np @xfail_when_nonstandard_decimal_separator def test_sync_batchnorm(): def _check_batchnorm_result(input, num_devices=1, cuda=False): @@ -737,23 +629,23 @@ def _syncParameters(bn1, bn2, ctx): mx.autograd.backward(loss1) mx.autograd.backward(loss2) - output2 = mx.nd.concat(*[output.as_in_context(input.context) - for output in output2], dim=0) + output2 = mx.np.concatenate([output.as_in_context(input.context) + for output in output2], axis=1) # check bn1 momentum = 0.9 epsilon = 1e-5 axis = 1 data = input1 - running_mean = mx.nd.zeros(nch, ctx=data.context) - running_var = mx.nd.ones(nch, ctx=data.context) + running_mean = mx.np.zeros(nch, ctx=data.context) + running_var = mx.np.ones(nch, ctx=data.context) - data_mean = data.mean( - axis=axis, exclude=True, keepdims=True) - data_var = (data - data_mean).square().mean(axis=axis, - exclude=True, keepdims=True) + axes = list(range(data.ndim)) + del axes[axis] + data_mean = data.mean(axis=axes, keepdims=True) + data_var = mx.np.square(data - data_mean).mean(axis=axes, keepdims=True) - target_output = (data - data_mean) / (data_var + epsilon).sqrt() + target_output = (data - data_mean) / mx.np.sqrt(data_var + epsilon) # squeeze data_mean and data_var data_mean_flat = data_mean.squeeze() @@ -785,8 +677,8 @@ def _syncParameters(bn1, bn2, ctx): assert_almost_equal(_find_bn(bn1).running_var.data(ctx_list[0]).asnumpy(), _find_bn(bn2).running_var.data(ctx_list[0]).asnumpy(), atol=atol, rtol=rtol) - input2grad = mx.nd.concat( - *[output.grad.as_in_context(input.context) for output in inputs2], dim=0) + input2grad = mx.np.concatenate( + [output.grad.as_in_context(input.ctx) for output in inputs2], axis=0) 
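As an aside, the sync-batchnorm hunk above replaces the legacy `mean(axis=1, exclude=True, keepdims=True)` reduction with a reduction over every axis except the channel axis, since `mxnet.np` follows NumPy semantics and has no `exclude` flag. A small sketch of that equivalence, assuming an illustrative NCHW shape and epsilon:

```
import mxnet as mx

data = mx.np.random.uniform(size=(4, 3, 8, 8))    # illustrative NCHW batch
axis = 1                                          # channel axis
axes = [a for a in range(data.ndim) if a != axis] # reduce over all other axes
data_mean = data.mean(axis=tuple(axes), keepdims=True)
data_var = mx.np.square(data - data_mean).mean(axis=tuple(axes), keepdims=True)
normalized = (data - data_mean) / mx.np.sqrt(data_var + 1e-5)
assert data_mean.shape == (1, 3, 1, 1)
assert normalized.shape == data.shape
```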
assert_almost_equal(input1.grad.asnumpy(), input2grad.asnumpy(), atol=atol, rtol=rtol) @@ -801,7 +693,7 @@ def _syncParameters(bn1, bn2, ctx): for shape in [(batch_size, 2), (batch_size, 3, 4), (batch_size, 4, 4, 4), (batch_size, 5, 6, 4, 4)]: print(str((ndev, cuda, shape))) for i in range(10): - _check_batchnorm_result(mx.nd.random.uniform(shape=shape, + _check_batchnorm_result(mx.np.random.uniform(size=shape, ctx=mx.cpu(0)), num_devices=ndev, cuda=cuda) @@ -819,7 +711,7 @@ def test_layernorm(): layer.initialize() if hybridize: layer.hybridize() - pytest.raises(MXNetError, lambda: layer(mx.nd.ones((2, 11)))) + pytest.raises(AssertionError, lambda: layer(mx.np.ones((2, 11)))) def test_groupnorm(): layer = nn.GroupNorm() @@ -835,7 +727,7 @@ def test_reflectionpad(): def test_reshape(): - x = mx.nd.ones((2, 4, 10, 10)) + x = mx.np.ones((2, 4, 10, 10)) layer = nn.Conv2D(10, 2, in_channels=4) layer.initialize() with mx.autograd.record(): @@ -846,7 +738,7 @@ def test_reshape(): def test_slice(): - x = mx.nd.ones((5, 4, 10, 10)) + x = mx.np.ones((5, 4, 10, 10)) layer = nn.Conv2D(10, 2, in_channels=4) layer.initialize() with mx.autograd.record(): @@ -857,7 +749,7 @@ def test_slice(): def test_at(): - x = mx.nd.ones((5, 4, 10, 10)) + x = mx.np.ones((5, 4, 10, 10)) layer = nn.Conv2D(10, 2, in_channels=4) layer.initialize() with mx.autograd.record(): @@ -868,23 +760,20 @@ def test_at(): def test_deferred_init(): - x = mx.nd.ones((5, 4, 10, 10)) + x = mx.np.ones((5, 4, 10, 10)) layer = nn.Conv2D(10, 2) layer.initialize() layer(x) +@use_np def check_split_data(x, num_slice, batch_axis, **kwargs): res = gluon.utils.split_data(x, num_slice, batch_axis, **kwargs) assert len(res) == num_slice - if not is_np_array(): - mx.test_utils.assert_almost_equal(mx.nd.concat(*res, dim=batch_axis).asnumpy(), - x.asnumpy()) - else: - mx.test_utils.assert_almost_equal(_mx_np.concatenate(res, axis=batch_axis).asnumpy(), - x.asnumpy()) - np_res = np.array_split(x.asnumpy(), num_slice, axis=batch_axis) + mx.test_utils.assert_almost_equal(mx.np.concatenate(res, axis=batch_axis).asnumpy(), + x.asnumpy()) + np_res = onp.array_split(x.asnumpy(), num_slice, axis=batch_axis) res_asnp = [s.asnumpy() for s in res] for r1, r2 in zip(np_res, res_asnp): assert all(r1.reshape(-1) == r2.reshape(-1)) @@ -892,7 +781,7 @@ def check_split_data(x, num_slice, batch_axis, **kwargs): @use_np def test_split_data_np(): - x = _mx_np.random.uniform(size=(128, 33, 64)) + x = mx.np.random.uniform(size=(128, 33, 64)) check_split_data(x, 8, 0) check_split_data(x, 3, 1) check_split_data(x, 4, 1, even_split=False) @@ -904,7 +793,7 @@ def test_split_data_np(): assert False, "Should have failed" def test_split_data(): - x = mx.nd.random.uniform(shape=(128, 33, 64)) + x = mx.np.random.uniform(size=(128, 33, 64)) check_split_data(x, 8, 0) check_split_data(x, 3, 1) check_split_data(x, 4, 1, even_split=False) @@ -917,11 +806,11 @@ def test_split_data(): def test_flatten(): flatten = nn.Flatten() - x = mx.nd.zeros((3,4,5,6)) + x = mx.np.zeros((3,4,5,6)) assert flatten(x).shape == (3, 4*5*6) - x = mx.nd.zeros((3,6)) + x = mx.np.zeros((3,6)) assert flatten(x).shape == (3, 6) - x = mx.nd.zeros((3,)) + x = mx.np.zeros((3,)) assert flatten(x).shape == (3, 1) def test_block_attr_hidden(): @@ -1013,7 +902,7 @@ def check_sequential(net): net.add(dense3) net.initialize() - net(mx.nd.zeros((10, 10))) + net(mx.np.zeros((10, 10))) net.hybridize() assert net[1] is dense2 assert net[-1] is dense3 @@ -1021,6 +910,7 @@ def check_sequential(net): assert len(slc) == 2 and 
slc[0] is dense2 and slc[1] is dense3 assert isinstance(slc, type(net)) +@use_np def check_sequential_dc(net): class MyBlock(mx.gluon.HybridBlock): def __init__(self): @@ -1040,13 +930,14 @@ def forward(self, x): net.initialize() net.hybridize() - net(mx.nd.zeros((10, 10))) + net(mx.np.zeros((10, 10))) assert net[1] is dense2 assert net[-1] is dense3 slc = net[1:3] assert len(slc) == 2 and slc[0] is dense2 and slc[1] is dense3 assert isinstance(slc, type(net)) +@use_np @pytest.mark.garbage_expected def test_sequential(): check_sequential(gluon.nn.Sequential()) @@ -1063,52 +954,50 @@ def test_sequential_warning(): assert len(w) == 1 +@use_np def test_global_norm_clip(): - stypes = ['default', 'row_sparse'] - def check_global_norm_clip(stype, check_isfinite): - x1 = mx.nd.ones((3,3)).tostype(stype) - x2 = mx.nd.ones((4,4)).tostype(stype) + def check_global_norm_clip(check_isfinite): + x1 = mx.np.ones((3,3)) + x2 = mx.np.ones((4,4)) norm = gluon.utils.clip_global_norm([x1, x2], 1.0, check_isfinite=check_isfinite) assert norm == 5.0 - assert_almost_equal(x1.asnumpy(), np.ones((3,3))/5) - assert_almost_equal(x2.asnumpy(), np.ones((4,4))/5) + assert_almost_equal(x1.asnumpy(), onp.ones((3,3))/5) + assert_almost_equal(x2.asnumpy(), onp.ones((4,4))/5) - x3 = mx.nd.array([1.0, 2.0, float('nan')]).tostype(stype) + x3 = mx.np.array([1.0, 2.0, float('nan')]) with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") gluon.utils.clip_global_norm([x1, x3], 2.0, check_isfinite=check_isfinite) assert len(w) == check_isfinite - for stype in stypes: - for check_isfinite in [True, False]: - check_global_norm_clip(stype, check_isfinite) + for check_isfinite in [True, False]: + check_global_norm_clip(check_isfinite) + def test_embedding(): - def check_embedding(sparse_grad): - layer = gluon.nn.Embedding(10, 100, sparse_grad=sparse_grad) + def check_embedding(): + layer = gluon.nn.Embedding(10, 100) layer.initialize() - x = mx.nd.array([3,4,2,0,1]) + x = mx.np.array([3,4,2,0,1]) with mx.autograd.record(): y = layer(x) y.backward() assert (layer.weight.grad().asnumpy()[:5] == 1).all() assert (layer.weight.grad().asnumpy()[5:] == 0).all() - def check_embedding_large_input(sparse_grad): - embedding = mx.gluon.nn.Embedding(10, 1, sparse_grad=True) + def check_embedding_large_input(): + embedding = mx.gluon.nn.Embedding(10, 1) embedding.initialize() embedding.hybridize() shape = (20481,) with mx.autograd.record(): - emb_in = embedding(mx.nd.ones(shape)) + emb_in = embedding(mx.np.ones(shape)) loss = emb_in.sum() loss.backward() - assert embedding.weight.grad().data.sum().asscalar() == 20481 + assert embedding.weight.grad().sum().item() == 20481 - check_embedding(True) - check_embedding(False) - check_embedding_large_input(True) - check_embedding_large_input(False) + check_embedding() + check_embedding_large_input() def test_export(tmpdir): tmpfile = os.path.join(str(tmpdir), 'gluon') @@ -1117,20 +1006,21 @@ def test_export(tmpdir): ctx=ctx, pretrained=False) model.initialize() model.hybridize() - data = mx.nd.random.normal(shape=(1, 3, 32, 32)) + data = mx.np.random.normal(size=(1, 3, 32, 32)) out = model(data) symbol_filename, params_filename = model.export(tmpfile) assert symbol_filename == tmpfile+'-symbol.json' assert params_filename == tmpfile+'-0000.params' +@use_np def test_import(): ctx = mx.context.current_context() net1 = gluon.model_zoo.vision.resnet18_v1( ctx=ctx, pretrained=False) net1.initialize() net1.hybridize() - data = mx.nd.random.normal(shape=(1, 3, 32, 32)) + data = 
mx.np.random.normal(size=(1, 3, 32, 32)) out1 = net1(data) net1.export('net1', epoch=1) @@ -1152,10 +1042,10 @@ def test_hybrid_stale_cache(): net.hybridize() net.initialize() - net(mx.nd.ones((2,3,5))) + net(mx.np.ones((2,3,5))) net.add(mx.gluon.nn.Flatten()) - assert net(mx.nd.ones((2,3,5))).shape == (2, 30) + assert net(mx.np.ones((2,3,5))).shape == (2, 30) net = mx.gluon.nn.HybridSequential() net.fc1 = mx.gluon.nn.Dense(10, weight_initializer='zeros', @@ -1164,12 +1054,12 @@ def test_hybrid_stale_cache(): bias_initializer='ones', flatten=False) net.hybridize() net.initialize() - net(mx.nd.ones((2,3,5))) + net(mx.np.ones((2,3,5))) net.fc2 = mx.gluon.nn.Dense(10, weight_initializer='zeros', bias_initializer='ones', flatten=True) net.initialize() - assert net(mx.nd.ones((2,3,5))).shape == (2, 10) + assert net(mx.np.ones((2,3,5))).shape == (2, 10) def test_lambda(): @@ -1178,21 +1068,22 @@ def test_lambda(): nn.LeakyReLU(0.1)) net2 = mx.gluon.nn.HybridSequential() - op3 = lambda F, x, *args: F.LeakyReLU(x, *args, slope=0.1) + op3 = lambda x, *args: mx.npx.leaky_relu(x, *args, slope=0.1) net2.add(nn.HybridLambda('tanh'), nn.HybridLambda(op3)) - op4 = lambda x: mx.nd.LeakyReLU(x, slope=0.1) + op4 = lambda x: mx.npx.leaky_relu(x, slope=0.1) net3 = mx.gluon.nn.Sequential() net3.add(nn.Lambda('tanh'), nn.Lambda(op4)) - input_data = mx.nd.random.uniform(shape=(2, 3, 5, 7)) + input_data = mx.np.random.uniform(size=(2, 3, 5, 7)) out1, out2, out3 = net1(input_data), net2(input_data), net3(input_data) assert_almost_equal(out1.asnumpy(), out2.asnumpy(), rtol=1e-3, atol=1e-3) assert_almost_equal(out1.asnumpy(), out3.asnumpy(), rtol=1e-3, atol=1e-3) +@use_np def test_fill_shape_deferred(): net = nn.HybridSequential() net.add(nn.Conv2D(64, kernel_size=2, padding=1), @@ -1201,47 +1092,48 @@ def test_fill_shape_deferred(): net net.hybridize() net.initialize() - net(mx.nd.ones((2,3,5,7))) + net(mx.np.ones((2,3,5,7))) assert net[0].weight.shape[1] == 3, net[0].weight.shape[1] assert net[1].gamma.shape[0] == 64, net[1].gamma.shape[0] assert net[2].weight.shape[1] == 3072, net[2].weight.shape[1] +@use_np def test_dtype(): net = mx.gluon.model_zoo.vision.resnet18_v1() net.initialize() net.cast('float64') with mx.autograd.record(): - y = net(mx.nd.ones((16, 3, 32, 32), dtype='float64')) + y = net(mx.np.ones((16, 3, 32, 32), dtype='float64')) y.backward() net = mx.gluon.model_zoo.vision.resnet18_v1() net.initialize() net.hybridize() - net(mx.nd.ones((16, 3, 32, 32), dtype='float32')) + net(mx.np.ones((16, 3, 32, 32), dtype='float32')) net.cast('float64') - net(mx.nd.ones((16, 3, 32, 32), dtype='float64')) + net(mx.np.ones((16, 3, 32, 32), dtype='float64')) - mx.nd.waitall() + mx.npx.waitall() class Net(gluon.Block): def __init__(self, in_dim, output_dim): super(Net, self).__init__() - self.embed = gluon.nn.Embedding(input_dim=in_dim, output_dim=output_dim,dtype=np.float64) - self.dense = gluon.nn.Dense(2, dtype=np.float64) + self.embed = gluon.nn.Embedding(input_dim=in_dim, output_dim=output_dim,dtype=onp.float64) + self.dense = gluon.nn.Dense(2, dtype=onp.float64) def forward(self, x): e = self.embed(x) - assert(e.dtype == np.float64) + assert(e.dtype == onp.float64) y = self.dense(e) - assert(y.dtype == np.float64) + assert(y.dtype == onp.float64) return y net = Net(5, 10) net.initialize() - out = net(mx.nd.ones((3,), dtype=np.float64)) - mx.nd.waitall() + out = net(mx.np.ones((3,), dtype=onp.float64)) + mx.npx.waitall() def test_fill_shape_load(): ctx = mx.context.current_context() @@ -1252,7 +1144,7 @@ def 
test_fill_shape_load(): net1 net1.hybridize() net1.initialize(ctx=ctx) - net1(mx.nd.ones((2,3,5,7), ctx)) + net1(mx.np.ones((2,3,5,7), ctx=ctx)) net1.save_parameters('net_fill.params') net2 = nn.HybridSequential() @@ -1276,14 +1168,14 @@ def test_inline(): net.initialize() net.hybridize(inline_limit=3) with mx.autograd.record(): - y = net(mx.nd.zeros((1,10))) + y = net(mx.np.zeros((1,10))) len_1 = len(json.loads(mx.autograd.get_symbol(y).tojson())['nodes']) y.backward() net.hybridize(inline_limit=0) with mx.autograd.record(): - y = net(mx.nd.zeros((1,10))) + y = net(mx.np.zeros((1,10))) len_2 = len(json.loads(mx.autograd.get_symbol(y).tojson())['nodes']) y.backward() @@ -1293,18 +1185,18 @@ def test_inline(): @xfail_when_nonstandard_decimal_separator def test_activations(): - point_to_validate = mx.nd.array([-0.1, 0.1] * 3) + point_to_validate = mx.np.array([-0.1, 0.1] * 3) swish = mx.gluon.nn.Swish() def swish_test(x): - return x * mx.nd.sigmoid(x) + return x * mx.npx.sigmoid(x) for test_point, ref_point in zip(swish_test(point_to_validate), swish(point_to_validate)): assert test_point == ref_point silu = mx.gluon.nn.SiLU() def silu_test(x): - return x * mx.nd.sigmoid(x) + return x * mx.npx.sigmoid(x) for test_point, ref_point in zip(silu_test(point_to_validate), silu(point_to_validate)): assert test_point == ref_point @@ -1312,7 +1204,7 @@ def silu_test(x): elu = mx.gluon.nn.ELU() def elu_test(x): def elu(x): - return mx.nd.expm1(x) if x <= 0.0 else x + return mx.np.expm1(x) if x <= 0.0 else x return [elu(x_i) for x_i in x] for test_point, ref_point in zip(elu_test(point_to_validate), elu(point_to_validate)): @@ -1322,7 +1214,7 @@ def elu(x): def selu_test(x): def selu(x): scale, alpha = 1.0507009873554804934193349852946, 1.6732632423543772848170429916717 - return scale * x if x >= 0 else scale * alpha * mx.nd.expm1(x) + return scale * x if x >= 0 else scale * alpha * mx.np.expm1(x) return [selu(x_i) for x_i in x] for test_point, ref_point in zip(selu_test(point_to_validate), selu(point_to_validate)): @@ -1331,12 +1223,12 @@ def selu(x): prelu = mx.gluon.nn.PReLU() prelu.initialize() x = point_to_validate.reshape((1, 3, 2)) - assert_almost_equal(prelu(x).asnumpy(), mx.nd.where(x >= 0, x, 0.25 * x).asnumpy()) + assert_almost_equal(prelu(x).asnumpy(), mx.np.where(x >= 0, x, 0.25 * x).asnumpy()) - multichannel_init = mx.initializer.Constant(mx.nd.array([0.1, 0.25, 0.5])) + multichannel_init = mx.initializer.Constant(mx.np.array([0.1, 0.25, 0.5])) prelu_multichannel = mx.gluon.nn.PReLU(alpha_initializer=multichannel_init, in_channels=3) prelu_multichannel.initialize() - assert_almost_equal(prelu_multichannel(x).asnumpy(), np.array([[-0.01, 0.1], [-0.025, 0.1], [-0.05, 0.1]])) + assert_almost_equal(prelu_multichannel(x).asnumpy(), onp.array([[-0.01, 0.1], [-0.025, 0.1], [-0.05, 0.1]])) # https://github.com/apache/incubator-mxnet/issues/18381 # gelu = mx.gluon.nn.GELU() @@ -1355,6 +1247,7 @@ def selu(x): # assert test_point == ref_point +@use_np def test_dropout(): def get_slice(x, axis, idx): ix = () @@ -1369,7 +1262,7 @@ def check_dropout_axes(ratio, shape, axes): compactshape = list(shape) for axis in axes: compactshape[axis] = 1 - compactx = mx.random.uniform(shape=tuple(compactshape)) + compactx = mx.np.random.uniform(size=tuple(compactshape)) broadcastx = compactx.broadcast_to(shape) dropouty = mx.gluon.nn.Dropout(rate=ratio, axes=axes)(broadcastx) for axis in axes: @@ -1394,8 +1287,8 @@ def check_dropout_axes(ratio, shape, axes): check_dropout_axes(0.25, nshape, axes = (1, 2, 3)) def 
test_req(): - data = mx.nd.random.uniform(shape=(1,3,224,224)) - label = mx.nd.random.uniform(shape=(1)) + data = mx.np.random.uniform(size=(1,3,224,224)) + label = mx.np.random.uniform(size=(1)) label[:] = 1 loss = gluon.loss.SoftmaxCrossEntropyLoss() @@ -1429,10 +1322,11 @@ def test_req(): assert_almost_equal(grad * 2, grad_double) +@use_np def test_save_load(tmpdir): net = mx.gluon.model_zoo.vision.get_resnet(1, 18, pretrained=False, root=str(tmpdir)) net.initialize() - net(mx.nd.ones((1,3,224,224))) + net(mx.np.ones((1,3,224,224))) net.save_parameters(os.path.join(str(tmpdir), 'test_save_load.params')) net = mx.gluon.model_zoo.vision.get_resnet(1, 18) @@ -1440,10 +1334,10 @@ def test_save_load(tmpdir): net.load_parameters(os.path.join(str(tmpdir), 'test_save_load.params')) - class Network(gluon.Block): + class Network(gluon.HybridBlock): def __init__(self, **kwargs): super(Network, self).__init__(**kwargs) - self.encoders = gluon.nn.Sequential() + self.encoders = gluon.nn.HybridSequential() for _ in range(2): lstm = mx.gluon.rnn.LSTM(200, 1, bidirectional=True) self.encoders.add(lstm) @@ -1455,14 +1349,16 @@ def forward(self, x): net = Network() net.initialize(mx.init.Xavier(), ctx=mx.cpu()) net.hybridize() - x = np.random.rand(32, 10, 10) - x = mx.nd.array(x).as_in_context(mx.cpu()) + x = onp.random.rand(32, 10, 10) + x = mx.np.array(x).as_in_context(mx.cpu()) net(x) - _, param_path = tempfile.mkstemp(suffix='.params', dir=str(tmpdir)) + # _, param_path = tempfile.mkstemp(suffix='.params', dir=str(tmpdir)) + param_path = os.path.join(str(tmpdir), 'test_save_load_network.params') net.save_parameters(param_path) net2 = Network() net2.load_parameters(param_path) +@use_np def test_save_load_deduplicate_with_shared_params(tmpdir): class B(mx.gluon.Block): def __init__(self): @@ -1479,10 +1375,11 @@ def __init__(self, b1, b2): b2 = B().share_parameters(b1.collect_params()) c = C(b1, b2) c.initialize() - _, param_path = tempfile.mkstemp(suffix='.params', dir=str(tmpdir)) + # _, param_path = tempfile.mkstemp(suffix='.params', dir=str(tmpdir)) + param_path = os.path.join(str(tmpdir), 'test_save_load_deduplicate_with_shared_params.params') c.save_parameters(param_path, deduplicate=True) - params = mx.nd.load(param_path) + params = mx.npx.load(param_path) assert len(params) == 1 # Only a single copy of the shared parameter is saved b1 = B() @@ -1493,7 +1390,7 @@ def __init__(self, b1, b2): # Test default behavior c.save_parameters(param_path, deduplicate=False) - params = mx.nd.load(param_path) + params = mx.npx.load(param_path) assert len(params) == 2 # Only a single copy of the shared parameter is saved b1 = B() @@ -1501,47 +1398,16 @@ def __init__(self, b1, b2): c = C(b1, b2) c.load_parameters(param_path) -def test_symbol_block_save_load(tmpdir): - tmp = str(tmpdir) - tmpfile = os.path.join(tmp, 'resnet34_fp64') - - class Net(gluon.HybridBlock): - def __init__(self): - super(Net, self).__init__() - backbone = gluon.model_zoo.vision.resnet18_v1() - backbone.initialize() - backbone.hybridize() - backbone(mx.nd.random.normal(shape=(1, 3, 32, 32))) - sym, params = backbone.export(None) - data = mx.sym.var('data') - self.backbone = gluon.SymbolBlock(sym, data) - self.backbone.load_dict(params) - self.body = nn.Conv2D(3, 1) - - def hybrid_forward(self, F, x): - x = self.body(x) - return self.backbone(x) - - net1 = Net() - net1.initialize(mx.init.Normal()) - net1.hybridize() - net1(mx.nd.random.normal(shape=(1, 3, 32, 32))) - - params_file = os.path.join(tmp, './test_symbol_block_save_load.params') 
- net1.save_parameters(params_file) - net2 = Net() - net2.load_parameters(params_file) - def test_hybrid_multi_context(): net = mx.gluon.model_zoo.vision.get_resnet(1, 18) net.initialize(ctx=[mx.cpu(0), mx.cpu(1)]) net.hybridize() - net(mx.nd.zeros((1, 3, 32, 32), ctx=mx.cpu(0))).asnumpy() + net(mx.np.zeros((1, 3, 32, 32), ctx=mx.cpu(0))).asnumpy() def test_zero_grad(): def _test_grad_reset(ctx, dtype='float32', sparse=False, embeddingType=None): - data = mx.nd.random.uniform(shape=(3,3), dtype=dtype, ctx=ctx) + data = mx.np.random.uniform(size=(3,3), dtype=dtype, ctx=ctx) if embeddingType is None: embeddingType = dtype net = nn.Embedding(3, 4, sparse_grad=sparse, dtype=embeddingType) @@ -1559,8 +1425,8 @@ def _test_multi_reset(nArrays, dtype, ctx): for _ in range(nArrays): arrType = random.choice(dtype) if isinstance(dtype, list) else dtype shape = () - for _ in range(np.random.randint(1, 5)): - shape = shape + (np.random.randint(1, 10),) + for _ in range(onp.random.randint(1, 5)): + shape = shape + (onp.random.randint(1, 10),) arr.append(mx.nd.random.uniform(shape=shape, dtype=arrType, ctx=ctx)) # Reset all arrays @@ -1579,13 +1445,12 @@ def _test_multi_reset(nArrays, dtype, ctx): testedTypes = ['float16', 'float32', 'float64'] for _ in range(10): for type in [testedTypes] + testedTypes: - _test_multi_reset(np.random.randint(1, 50), type, ctx) + _test_multi_reset(onp.random.randint(1, 50), type, ctx) with environment('MXNET_STORAGE_FALLBACK_LOG_VERBOSE', '0'): for type in ['float16', 'float32', 'float64']: for embType in ['float32', 'float64']: - for sparse in [True, False]: - _test_grad_reset(ctx, dtype=type, sparse=sparse, embeddingType=embType) + _test_grad_reset(ctx, dtype=type, sparse=False, embeddingType=embType) @pytest.mark.parametrize('static_alloc', [False, True]) @@ -1593,7 +1458,7 @@ def _test_multi_reset(nArrays, dtype, ctx): def test_hybrid_static_memory(static_alloc, static_shape): if static_shape and not static_alloc: pytest.skip() - x = mx.nd.random.uniform(shape=(2, 3, 32, 32)) + x = mx.np.random.uniform(size=(2, 3, 32, 32)) x.attach_grad() net = gluon.model_zoo.vision.get_resnet( @@ -1629,17 +1494,17 @@ def test_hybrid_static_memory_switching(static_alloc, static_shape): net.initialize() net.hybridize(static_alloc=static_alloc, static_shape=static_shape) - x = mx.nd.random.uniform(shape=(4, 3, 32, 32)) + x = mx.np.random.uniform(size=(4, 3, 32, 32)) net(x) with mx.autograd.record(): y = net(x) y.backward() - x = mx.nd.random.uniform(shape=(2, 3, 32, 32)) + x = mx.np.random.uniform(size=(2, 3, 32, 32)) net(x) with mx.autograd.record(): y = net(x) y.backward() - mx.nd.waitall() + mx.npx.waitall() def test_hook(): global hook_call_count @@ -1659,22 +1524,23 @@ def call_pre_hook(block, x): block.initialize() handle = block.register_forward_hook(call_hook) pre_handle = block.register_forward_pre_hook(call_pre_hook) - block(mx.nd.ones((3, 5))) + block(mx.np.ones((3, 5))) assert hook_call_count == 1 assert pre_hook_call_count == 1 handle.detach() - block(mx.nd.ones((3, 5))) + block(mx.np.ones((3, 5))) assert hook_call_count == 1 assert pre_hook_call_count == 2 pre_handle.detach() - block(mx.nd.ones((3, 5))) + block(mx.np.ones((3, 5))) assert hook_call_count == 1 assert pre_hook_call_count == 2 +@use_np def test_op_hook_output_names(): def check_name(block, expected_names, inputs=None, expected_opr_names=None, monitor_all=False): opr_names = [] @@ -1687,13 +1553,16 @@ def mon_callback(node_name, opr_name, arr): block.register_op_hook(mon_callback, monitor_all) if not inputs: - 
block(mx.nd.ones((2, 3, 4))) + block(mx.np.ones((2, 3, 4))) else: block(inputs) for output_name, expected_name in zip(output_names, expected_names): - print(output_name) - assert output_name == expected_name + output_name_list = output_name.split('_') + output_name_list.pop(1) + expected_name_list = expected_name.split('_') + expected_name_list.pop(1) + assert output_name_list == expected_name_list if expected_opr_names: for opr_name, expected_opr_name in zip(opr_names, expected_opr_names): @@ -1704,21 +1573,21 @@ def mon_callback(node_name, opr_name, arr): model.add(mx.gluon.nn.Dense(2)) model.initialize() model.hybridize() - check_name(model, ["hybridsequential_dense0_fwd_output"]) + check_name(model, ["node_0_output"]) # Test with Activation, FListInputNames not registered, input name will have _input appended model = mx.gluon.nn.HybridSequential() model.add(mx.gluon.nn.Activation("relu")) model.initialize() model.hybridize() - check_name(model, ["hybridsequential_activation0_fwd_output"]) + check_name(model, ["node_1_output"]) # Test with Pooling, monitor_all is set to True model = mx.gluon.nn.HybridSequential() model.add(mx.gluon.nn.AvgPool1D()) model.initialize() model.hybridize() - check_name(model, ['hybridsequential_avgpool1d0_fwd_data', 'hybridsequential_avgpool1d0_fwd_output'], + check_name(model, ['node_2_data', 'node_2_output'], expected_opr_names=["Pooling"], monitor_all=True) # stack two layers and test @@ -1728,16 +1597,16 @@ def mon_callback(node_name, opr_name, arr): model.initialize() model.hybridize() check_name(model, - ['hybridsequential_dense0_fwd_data', 'hybridsequential_dense0_fwd_weight', - 'hybridsequential_dense0_fwd_bias', 'hybridsequential_dense0_fwd_output', - 'hybridsequential_activation0_fwd_input0', 'hybridsequential_activation0_fwd_output'], monitor_all=True) + ['node_3_data', 'node_3_weight', + 'node_3_bias', 'node_3_output', + 'node_4_input0', 'node_4_output'], monitor_all=True) # check with different hybridize modes model.hybridize(static_alloc=True) check_name(model, - ['hybridsequential_dense0_fwd_data', 'hybridsequential_dense0_fwd_weight', - 'hybridsequential_dense0_fwd_bias', 'hybridsequential_dense0_fwd_output', - 'hybridsequential_activation0_fwd_input0', 'hybridsequential_activation0_fwd_output'], monitor_all=True) + ['node_5_data', 'node_5_weight', + 'node_5_bias', 'node_5_output', + 'node_6_input0', 'node_6_output'], monitor_all=True) def test_apply(): global called_blocks @@ -1755,27 +1624,31 @@ def record_name(block): assert called_blocks == [type(block[0]), type(block[1]), type(block)] +@use_np @assert_raises_cudnn_not_satisfied(min_version='5.1.10') def test_summary(): net = gluon.model_zoo.vision.resnet50_v1() net.initialize() - net.summary(mx.nd.ones((32, 3, 224, 224))) + net.summary(mx.np.ones((32, 3, 224, 224))) net2 = nn.Sequential() net2.add(nn.Embedding(40, 30)) net2.add(gluon.rnn.LSTM(30)) net2.add(nn.Dense(40, flatten=False).share_parameters(net2[0].params)) net2.initialize() - net2.summary(mx.nd.ones((80, 32))) + with mx.util.np_shape(True), mx.util.np_array(True): + net2.summary(mx.np.ones((80, 32))) net3 = gluon.rnn.LSTM(30) net3.initialize() begin_state = net3.begin_state(32) - net3.summary(mx.nd.ones((80, 32, 5)), begin_state) + net3.summary(mx.np.ones((80, 32, 5)), begin_state) net.hybridize() - pytest.raises(AssertionError, net.summary, mx.nd.ones((32, 3, 224, 224))) + pytest.raises(AssertionError, net.summary, mx.np.ones((32, 3, 224, 224))) +@use_np +@pytest.mark.skip(reason='Currently, sparse feature is not supported in 
Gluon2.0') def test_sparse_hybrid_block_grad(): class Embedding(mx.gluon.HybridBlock): def __init__(self, num_tokens, embedding_size): @@ -1785,44 +1658,46 @@ def __init__(self, num_tokens, embedding_size): self.embedding = mx.gluon.nn.Embedding( num_tokens, embedding_size, sparse_grad=True) - def hybrid_forward(self, F, words): + def forward(self, words): emb = self.embedding(words) - return emb + F.ones_like(emb) + return emb + mx.np.ones_like(emb) embedding = Embedding(20, 3) embedding.initialize() embedding.hybridize() with mx.autograd.record(): - emb0 = embedding(mx.nd.arange(10)).sum() - emb1 = embedding(mx.nd.arange(10)).sum() + emb0 = embedding(mx.np.arange(10)).sum() + emb1 = embedding(mx.np.arange(10)).sum() loss = emb0 + emb1 loss.backward() grad = embedding.embedding.weight.grad().asnumpy() assert (grad[:10] == 2).all() assert (grad[10:] == 0).all() +@use_np +@pytest.mark.skip(reason='Currently, sparse feature is not supported in Gluon2.0') def test_sparse_hybrid_block(): class Linear(mx.gluon.HybridBlock): def __init__(self, units): super(Linear, self).__init__() self.w = gluon.Parameter('w', shape=(units, units)) - def hybrid_forward(self, F, x, w): - return F.dot(x, w) + def forward(self, x, w): + return mx.np.dot(x, w) class SparseBlock(mx.gluon.HybridBlock): def __init__(self, units): super(SparseBlock, self).__init__() self.net = Linear(units) - def hybrid_forward(self, F, x): + def forward(self, x): return self.net(x) * x block = SparseBlock(2) block.initialize() block.hybridize() - x = mx.nd.ones((2,2)).tostype('csr') + x = mx.np.ones((2,2)).tostype('csr') with mx.autograd.record(): z = block(x) + block(x) z.backward() @@ -1834,29 +1709,30 @@ def test_hybrid_static_memory_recording(): net.initialize() net.hybridize(static_alloc=True) - x = mx.nd.random.uniform(shape=(1, 3, 32, 32)) + x = mx.np.random.uniform(size=(1, 3, 32, 32)) with mx.autograd.record(True): net(x) net(x) +@use_np def test_share_inputs_outputs(): class TestIOBackward(gluon.HybridBlock): def __init__(self): super(TestIOBackward, self).__init__() - def hybrid_forward(self, F, in1, in2): + def forward(self, in1, in2): return in1 + in2 class TestIOForward(gluon.HybridBlock): def __init__(self): super(TestIOForward, self).__init__() - def hybrid_forward(self, F, in1): + def forward(self, in1): return in1 - d1 = mx.nd.arange(10) - d2 = mx.nd.arange(10) + d1 = mx.np.arange(10) + d2 = mx.np.arange(10) params=[{'inline_limit':0}, {'inline_limit':0, 'static_alloc':True}, @@ -1867,7 +1743,7 @@ def hybrid_forward(self, F, in1): t.hybridize(**param) for i in range(5): d1.attach_grad() - out_grad = mx.nd.random.uniform(shape=(10)) + out_grad = mx.np.random.uniform(size=(10)) res = t(d1) assert_almost_equal(res.asnumpy(), d1.asnumpy()) @@ -1878,7 +1754,7 @@ def hybrid_forward(self, F, in1): for i in range(5): d1.attach_grad() d2.attach_grad() - out_grad = mx.nd.random.uniform(shape=(10)) + out_grad = mx.np.random.uniform(size=(10)) with mx.autograd.record(): res = t(d1, d2) res.backward(out_grad=out_grad) @@ -1886,13 +1762,14 @@ def hybrid_forward(self, F, in1): assert_almost_equal(out_grad.asnumpy(), d2.grad.asnumpy()) +@use_np def test_grad_graph_change(): class Model(mx.gluon.HybridBlock): - def hybrid_forward(self, F, array, index): + def forward(self, array, index): row = array.take(index) return row, index - array = mx.nd.arange(3) - index = mx.nd.array([2]) + array = mx.np.arange(3) + index = mx.np.array([2]) array.attach_grad() model = Model() model.hybridize(inline_limit=0) @@ -1907,15 +1784,16 @@ def 
check_layer_forward_withinput(net, x): x_hybrid.attach_grad() net.initialize() with mx.autograd.record(): - out1 = net(x) + out1 = net(x_hybrid) out1.backward() net.hybridize() with mx.autograd.record(): - out2 = net(x_hybrid) + out2 = net(x) out2.backward() mx.test_utils.assert_almost_equal(x.grad.asnumpy(), x_hybrid.grad.asnumpy(), rtol=1e-5, atol=1e-6) mx.test_utils.assert_almost_equal(out1.asnumpy(), out2.asnumpy(), rtol=1e-5, atol=1e-6) +@use_np @pytest.mark.parametrize('chn_num', [16, 256]) @pytest.mark.parametrize('kernel', [1, 3, 224]) def test_conv2d_16c(chn_num, kernel): @@ -1928,18 +1806,19 @@ def __init__(self, super(Net, self).__init__(**kwargs) self.conv0 = gluon.nn.Conv2D(chn_num, (kernel, kernel)) - def hybrid_forward(self, F, x): + def forward(self, x): out = self.conv0(x) return out - x = mx.nd.random.uniform(-1.0, 1.0, shape=(batch_size, 3, 224, 224)) + x = mx.np.random.uniform(-1.0, 1.0, size=(batch_size, 3, 224, 224)) net = Net(chn_num, kernel) check_layer_forward_withinput(net, x) +@use_np @pytest.mark.parametrize('grp', [16]) @pytest.mark.parametrize('kernel_size', [1, 3]) def test_group_conv2d_16c(grp, kernel_size): - input_size_list = np.random.randint(low=3, high=65, size=10).tolist() + input_size_list = onp.random.randint(low=3, high=65, size=10).tolist() batch_size = 4 class Net(gluon.HybridBlock): def __init__(self, @@ -1950,16 +1829,17 @@ def __init__(self, self.conv0 = gluon.nn.Conv2D(chn_num, (1, 1)) self.conv1 = gluon.nn.Conv2D(chn_num, (kernel, kernel), groups=chn_num) - def hybrid_forward(self, F, x): + def forward(self, x): y = self.conv0(x) out = self.conv1(y) return out for i in range(len(input_size_list)): - x = mx.nd.random.uniform(-1.0, 1.0, shape=(batch_size, 3, input_size_list[i], input_size_list[i])) + x = mx.np.random.uniform(-1.0, 1.0, size=(batch_size, 3, input_size_list[i], input_size_list[i])) net = Net(grp, kernel_size) check_layer_forward_withinput(net, x) +@use_np @pytest.mark.skip(reason='skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') def test_deconv2d_16c(): in_chn_list = [1024, 512, 256, 128, 64, 32, 16] @@ -1972,20 +1852,21 @@ def __init__(self, chn_num, kernel, **kwargs): super(Net, self).__init__(**kwargs) self.deconv0 = gluon.nn.Conv2DTranspose(chn_num, (kernel, kernel)) - def hybrid_forward(self, F, x): + def forward(self, x): out = self.deconv0(x) return out for i in range(len(in_shape)): - x = mx.nd.random.uniform(-1.0, 1.0, shape=(batch_size, in_chn_list[i], in_shape[i], in_shape[i])) + x = mx.np.random.uniform(-1.0, 1.0, size=(batch_size, in_chn_list[i], in_shape[i], in_shape[i])) for j in range(len(kernel_list)): net = Net(out_chn_list[i], kernel_list[j]) check_layer_forward_withinput(net, x) +@use_np @pytest.mark.skip(reason='skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') def test_batchnorm_16c(): chn_list = [16, 1024] - shape = np.random.randint(low=1, high=300, size=10) + shape = onp.random.randint(low=1, high=300, size=10) shape_list = [] for i in range(len(shape)): shape_list.append((shape[i], shape[i])) @@ -2000,7 +1881,7 @@ def __init__(self, self.conv0 = gluon.nn.Conv2D(chn_num, (kernel, kernel)) self.bn0 = gluon.nn.BatchNorm(axis=axis) - def hybrid_forward(self, F, x): + def forward(self, x): conv = self.conv0(x) out = self.bn0(conv) return out @@ -2008,11 +1889,12 @@ def hybrid_forward(self, F, x): for i in range(len(chn_list)): for j in range(len(shape_list)): shape = (batch_size, ) + (3,) + shape_list[j] - x = 
mx.nd.random.uniform(-1.0, 1.0, shape=shape) + x = mx.np.random.uniform(-1.0, 1.0, size=shape) net = Net(chn_list[i], 1, 1) check_layer_forward_withinput(net, x) +@use_np def test_batchnorm_chnls(): chn_list = [1024, 512, 256, 128, 64, 45, 32, 16, 3] class Net(gluon.HybridBlock): @@ -2033,7 +1915,7 @@ def __init__(self, ) self.bn1 = gluon.nn.BatchNorm(in_channels=chn_num, **({} if norm_kwargs is None else norm_kwargs)) - def hybrid_forward(self, F, x): + def forward(self, x): """Hybrid forward of R2+1D net""" conv = self.conv1(x) out = self.bn1(conv) @@ -2042,14 +1924,15 @@ def hybrid_forward(self, F, x): for i in range(len(chn_list)): net = Net(chn_list[i]) net.initialize(init=init.Constant(1)) - x = mx.nd.zeros((1, 3, 8, 160, 160)) + x = mx.np.zeros((1, 3, 8, 160, 160)) net(x).asnumpy() +@use_np def test_concat(): chn_list = [16, 64] shapes = [1, 3, 5] - input_num = np.random.randint(low=2, high=11) + input_num = onp.random.randint(low=2, high=11) shape_list = [] for i in range(len(shapes)): shape_list.append((shapes[i], shapes[i])) @@ -2066,32 +1949,34 @@ def __init__(self, for i in range(input_num): self.concat.add(gluon.nn.Conv2D(chn_num, (kernel, kernel))) - def hybrid_forward(self, F, x): + def forward(self, x): return self.concat(x) for s in range(len(shape_list)): shape = (batch_size,) + (3,) + shape_list[i] - x = mx.nd.random.uniform(-1.0, 1.0, shape=shape) + x = mx.np.random.uniform(-1.0, 1.0, size=shape) for i in range(len(chn_list)): for axis in range(4): net = Net(axis, input_num, chn_list[i], 1) check_layer_forward_withinput(net, x) +@use_np def test_reshape_conv(): class Net(gluon.HybridBlock): def __init__(self, **kwargs): super(Net, self).__init__(**kwargs) self.conv0 = nn.Conv2D(64, (3, 3)) - def hybrid_forward(self, F, x): - x_reshape = x.reshape((0, 0, 128, 32)) + def forward(self, x): + x_reshape = x.reshape((-1, 3, 128, 32)) out = self.conv0(x_reshape) return out - x = mx.nd.random.uniform(shape=(4, 3, 64, 64)) + x = mx.np.random.uniform(size=(4, 3, 64, 64)) net = Net() check_layer_forward_withinput(net, x) +@use_np @pytest.mark.skip(reason='skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') def test_reshape_conv_reshape_conv(): class Net(gluon.HybridBlock): @@ -2100,32 +1985,34 @@ def __init__(self, **kwargs): self.conv0 = nn.Conv2D(64, (3, 3)) self.conv1 = nn.Conv2D(128, (3, 3)) - def hybrid_forward(self, F, x): + def forward(self, x): x_reshape = x.reshape((0, 0, 128, 32)) y = self.conv0(x_reshape) "spatial shape of y is (62, 62)" y_reshape = y.reshape((0, 0, 124, 31)) out = self.conv1(y_reshape) return out - x = mx.nd.random.uniform(shape=(4, 3, 64, 64)) + x = mx.np.random.uniform(size=(4, 3, 64, 64)) net = Net() check_layer_forward_withinput(net, x) +@use_np def test_slice_conv(): class Net(gluon.HybridBlock): def __init__(self, **kwargs): super(Net, self).__init__(**kwargs) self.conv0 = nn.Conv2D(16, (3, 3)) - def hybrid_forward(self, F, x): - x_slice = x.slice(begin=(0, 2, 0, 0), end=(4, 5, 32, 32)) + def forward(self, x): + x_slice = mx.npx.slice(x, begin=(0, 2, 0, 0), end=(4, 5, 32, 32)) out = self.conv0(x_slice) return out - x = mx.nd.random.uniform(shape=(8, 6, 32, 32)) + x = mx.np.random.uniform(size=(8, 6, 32, 32)) net = Net() check_layer_forward_withinput(net, x) +@use_np def test_slice_conv_slice_conv(): class Net(gluon.HybridBlock): def __init__(self, **kwargs): @@ -2133,18 +2020,19 @@ def __init__(self, **kwargs): self.conv0 = nn.Conv2D(32, (3, 3)) self.conv1 = nn.Conv2D(16, (1, 1)) - def hybrid_forward(self, 
F, x): - x_slice = x.slice(begin=(0, 0, 0, 0), end=(4, 16, 16, 16)) + def forward(self, x): + x_slice = mx.npx.slice(x, begin=(0, 0, 0, 0), end=(4, 16, 16, 16)) y = self.conv0(x_slice) "shape of y is (4, 32, 14, 14)" - y_slice = y.slice(begin=(0, 0, 0, 0), end=(4, 16, 3, 3)) + y_slice = mx.npx.slice(y, begin=(0, 0, 0, 0), end=(4, 16, 3, 3)) out = self.conv1(y_slice) return out - x = mx.nd.random.uniform(shape=(4, 32, 32, 32)) + x = mx.np.random.uniform(size=(4, 32, 32, 32)) net = Net() check_layer_forward_withinput(net, x) +@use_np @pytest.mark.skip(reason='skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') def test_slice_conv_reshape_conv(): class Net(gluon.HybridBlock): @@ -2153,18 +2041,19 @@ def __init__(self, **kwargs): self.conv0 = nn.Conv2D(64, (3, 3)) self.conv1 = nn.Conv2D(128, (3, 3)) - def hybrid_forward(self, F, x): - x_slice = x.slice(begin=(0, 0, 1, 1), end=(4, 16, 33, 33)) + def forward(self, x): + x_slice = mx.npx.slice(x, begin=(0, 0, 1, 1), end=(4, 16, 33, 33)) y = self.conv0(x_slice) "shape of y is (4, 64, 30, 30)" y_reshape = y.reshape((0, 0, 60, 15)) out = self.conv1(y_reshape) return out - x = mx.nd.random.uniform(shape=(4, 32, 64, 64)) + x = mx.np.random.uniform(size=(4, 32, 64, 64)) net = Net() check_layer_forward_withinput(net, x) +@use_np def test_reshape_conv_slice_conv(): """ This test will test gluon Conv2d computation with ndarray reshape and slice @@ -2175,140 +2064,147 @@ def __init__(self, **kwargs): self.conv0 = nn.Conv2D(16, (3, 3)) self.conv1 = nn.Conv2D(32, (3, 3)) - def hybrid_forward(self, F, x): - x_reshape = x.reshape((0, 0, 64, 16)) + def forward(self, x): + x_reshape = x.reshape((-1, 3, 64, 16)) y = self.conv0(x_reshape) "shape of y is (4, 16, 62, 14)" - y_slice = y.slice(begin=(0, 0, 0, 0), end=(2, 16, 14, 14)) + y_slice = mx.npx.slice(y, begin=(0, 0, 0, 0), end=(2, 16, 14, 14)) out = self.conv1(y_slice) return out - x = mx.nd.random.uniform(shape=(4, 3, 32, 32)) + x = mx.np.random.uniform(size=(4, 3, 32, 32)) net = Net() check_layer_forward_withinput(net, x) +@use_np def test_reshape_dense(): class Net(gluon.HybridBlock): def __init__(self, **kwargs): super(Net, self).__init__(**kwargs) - channel0 = np.random.randint(1, 17) + channel0 = onp.random.randint(1, 17) self.dense0 = nn.Dense(channel0) - def hybrid_forward(self, F, x): + def forward(self, x): x_reshape = x.reshape((8, 64, 128, -1)) out = self.dense0(x_reshape) return out - x = mx.nd.random.uniform(shape=(4, 32, 64, 64)) + x = mx.np.random.uniform(size=(4, 32, 64, 64)) net = Net() check_layer_forward_withinput(net, x) +@use_np def test_slice_dense(): class Net(gluon.HybridBlock): def __init__(self, slice, **kwargs): super(Net, self).__init__(**kwargs) - channel0 = np.random.randint(1, 17) + channel0 = onp.random.randint(1, 17) self.dense0 = nn.Dense(channel0) self.slice = slice - def hybrid_forward(self, F, x): - x_slice = x.slice(begin=tuple(self.slice[0]), + def forward(self, x): + x_slice = mx.npx.slice(x, begin=tuple(self.slice[0]), end=tuple(self.slice[1])) out = self.dense0(x_slice) return out - x = mx.nd.random.uniform(shape=(16, 32, 64, 64)) + x = mx.np.random.uniform(size=(16, 32, 64, 64)) slice = [[0, 16, 0, 0], [4, 32, 32, 32]] net = Net(slice) check_layer_forward_withinput(net, x) +@use_np def test_slice_dense_slice_dense(): class Net(gluon.HybridBlock): def __init__(self, slice, **kwargs): super(Net, self).__init__(**kwargs) channel0 = 32 - channel1 = np.random.randint(1, 17) + channel1 = onp.random.randint(1, 17) self.dense0 = 
nn.Dense(channel0) self.dense1 = nn.Dense(channel1) self.slice = slice - def hybrid_forward(self, F, x): - x_slice = x.slice(begin=tuple(self.slice[0]), end=tuple(self.slice[1])) + def forward(self, x): + x_slice = mx.npx.slice(x, begin=tuple(self.slice[0]), end=tuple(self.slice[1])) y = self.dense0(x_slice) - y_slice = y.slice(begin=(1, 0), end=(3, 10)) + y_slice = mx.npx.slice(y, begin=(1, 0), end=(3, 10)) out = self.dense1(y_slice) return out - x = mx.nd.random.uniform(shape=(16, 32, 64, 64)) + x = mx.np.random.uniform(size=(16, 32, 64, 64)) slice = [[0, 16, 0, 0], [4, 32, 32, 32]] net = Net(slice) check_layer_forward_withinput(net, x) +@use_np def test_reshape_dense_reshape_dense(): class Net(gluon.HybridBlock): def __init__(self, **kwargs): super(Net, self).__init__(**kwargs) - channel0 = np.random.randint(1, 17) - channel1 = np.random.randint(1, 33) + channel0 = onp.random.randint(1, 17) + channel1 = onp.random.randint(1, 33) self.dense0 = nn.Dense(channel0) self.dense1 = nn.Dense(channel1) - def hybrid_forward(self, F, x): + def forward(self, x): x_reshape = x.reshape((4, 16, 128, 32)) y = self.dense0(x_reshape) y_reshape = y.reshape((1, -1)) out = self.dense1(y_reshape) return out - x = mx.nd.random.uniform(shape=(4, 16, 64, 64)) + x = mx.np.random.uniform(size=(4, 16, 64, 64)) net = Net() check_layer_forward_withinput(net, x) +@use_np def test_slice_dense_reshape_dense(): class Net(gluon.HybridBlock): def __init__(self, slice, **kwargs): super(Net, self).__init__(**kwargs) - channel0 = np.random.randint(1, 17) - channel1 = np.random.randint(1, 17) + channel0 = onp.random.randint(1, 17) + channel1 = onp.random.randint(1, 17) self.dense0 = nn.Dense(channel0) self.dense1 = nn.Dense(channel1) self.slice = slice - def hybrid_forward(self, F, x): - x_slice = x.slice(begin=tuple(self.slice[0]), end=tuple(self.slice[1])) + def forward(self, x): + x_slice = mx.npx.slice(x, begin=tuple(self.slice[0]), end=tuple(self.slice[1])) y = self.dense0(x_slice) y_reshape = y.reshape((1, -1)) out = self.dense1(y_reshape) return out - x = mx.nd.random.uniform(shape=(16, 32, 64, 64)) + x = mx.np.random.uniform(size=(16, 32, 64, 64)) slice = [[0, 16, 0, 0], [4, 32, 32, 32]] net = Net(slice) check_layer_forward_withinput(net, x) +@use_np def test_reshape_dense_slice_dense(): class Net(gluon.HybridBlock): def __init__(self, **kwargs): super(Net, self).__init__(**kwargs) channel0 = 64 - channel1 = np.random.randint(1, 17) + channel1 = onp.random.randint(1, 17) self.dense0 = nn.Dense(channel0) self.dense1 = nn.Dense(channel1) - def hybrid_forward(self, F, x): + def forward(self, x): x_reshape = x.reshape((4, 16, 128, 32)) y = self.dense0(x_reshape) - y_slice = y.slice(begin=(1, 32), end=(3, 64)) + y_slice = mx.npx.slice(y, begin=(1, 32), end=(3, 64)) out = self.dense1(y_slice) return out - x = mx.nd.random.uniform(shape=(4, 16, 64, 64)) + x = mx.np.random.uniform(size=(4, 16, 64, 64)) net = Net() check_layer_forward_withinput(net, x) +@use_np @pytest.mark.skip(reason='skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') def test_reshape_batchnorm(): class Net(gluon.HybridBlock): @@ -2318,18 +2214,19 @@ def __init__(self, shape, **kwargs): self.bn0 = nn.BatchNorm() self.reshape = shape - def hybrid_forward(self, F, x): + def forward(self, x): x_in = self.conv0(x) x_reshape = x_in.reshape(self.reshape) out = self.bn0(x_reshape) return out - x = mx.nd.random.uniform(shape=(4, 32, 64, 64)) + x = mx.np.random.uniform(size=(4, 32, 64, 64)) shape = (4, 64, 64, -1) net = 
Net(shape) check_layer_forward_withinput(net, x) +@use_np @pytest.mark.serial def test_slice_batchnorm(): class Net(gluon.HybridBlock): @@ -2339,19 +2236,20 @@ def __init__(self, slice, **kwargs): self.bn0 = nn.BatchNorm() self.slice = slice - def hybrid_forward(self, F, x): + def forward(self, x): x_in = self.conv0(x) - x_slice = x_in.slice(begin=tuple(self.slice[0]), + x_slice = mx.npx.slice(x_in, begin=tuple(self.slice[0]), end=tuple(self.slice[1])) out = self.bn0(x_slice) return out - x = mx.nd.random.uniform(shape=(16, 128, 256, 256)) + x = mx.np.random.uniform(size=(16, 128, 256, 256)) slice = [[0, 0, 0, 0], [4, 32, 32, 32]] net = Net(slice) check_layer_forward_withinput(net, x) +@use_np @pytest.mark.skip(reason='skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') @pytest.mark.serial def test_slice_batchnorm_slice_batchnorm(): @@ -2363,20 +2261,21 @@ def __init__(self, slice, **kwargs): self.bn1 = nn.BatchNorm() self.slice = slice - def hybrid_forward(self, F, x): + def forward(self, x): x_in = self.conv0(x) - x_slice = x_in.slice(begin=tuple(self.slice[0][0]), end=tuple(self.slice[0][1])) + x_slice = mx.npx.slice(x_in, begin=tuple(self.slice[0][0]), end=tuple(self.slice[0][1])) y = self.bn0(x_slice) - y_slice = y.slice(begin=tuple(self.slice[1][0]), end=tuple(self.slice[1][1])) + y_slice = mx.npx.slice(y, begin=tuple(self.slice[1][0]), end=tuple(self.slice[1][1])) out = self.bn1(y_slice) return out - x = mx.nd.random.uniform(shape=(16, 128, 256, 256)) + x = mx.np.random.uniform(size=(16, 128, 256, 256)) slice = [[[0, 0, 0, 0], [4, 32, 32, 32]], [[0, 0, 0, 0], [2, 64, 16, 16]]] net = Net(slice) check_layer_forward_withinput(net, x) +@use_np @pytest.mark.skip(reason='skippping temporarily, tracked by https://github.com/apache/incubator-mxnet/issues/11164') def test_reshape_batchnorm_reshape_batchnorm(): class Net(gluon.HybridBlock): @@ -2387,7 +2286,7 @@ def __init__(self, shape, **kwargs): self.bn1 = nn.BatchNorm() self.reshape = shape - def hybrid_forward(self, F, x): + def forward(self, x): x_in = self.conv0(x) x_reshape = x_in.reshape(self.reshape[0]) y = self.bn0(x_reshape) @@ -2395,12 +2294,13 @@ def hybrid_forward(self, F, x): out = self.bn1(y_reshape) return out - x = mx.nd.random.uniform(shape=(4, 32, 64, 64)) + x = mx.np.random.uniform(size=(4, 32, 64, 64)) shape = [(4, 64, 64, -1), (4, 128, -1, 32)] net = Net(shape) check_layer_forward_withinput(net, x) +@use_np @pytest.mark.serial def test_slice_batchnorm_reshape_batchnorm(): class Net(gluon.HybridBlock): @@ -2412,15 +2312,15 @@ def __init__(self, shape, slice, **kwargs): self.reshape = shape self.slice = slice - def hybrid_forward(self, F, x): + def forward(self, x): x_in = self.conv0(x) - x_slice = x_in.slice(begin=tuple(self.slice[0]), end=tuple(self.slice[1])) + x_slice = mx.npx.slice(x_in, begin=tuple(self.slice[0]), end=tuple(self.slice[1])) y = self.bn0(x_slice) y_reshape = y.reshape(self.reshape) out = self.bn1(y_reshape) return out - x = mx.nd.random.uniform(shape=(16, 128, 256, 256)) + x = mx.np.random.uniform(size=(16, 128, 256, 256)) slice = [[0, 0, 0, 0], [4, 32, 32, 32]] shape = (1, 128, 64, -1) net = Net(shape, slice) @@ -2438,7 +2338,7 @@ def __init__(self, shape, slice, **kwargs): self.reshape = shape self.slice = slice - def hybrid_forward(self, F, x): + def forward(self, x): x_in = self.conv0(x) x_reshape = x_in.reshape(self.reshape) y = self.bn0(x_reshape) @@ -2446,7 +2346,7 @@ def hybrid_forward(self, F, x): out = self.bn1(y_slice) return out - x = 
mx.nd.random.uniform(shape=(4, 32, 64, 64)) + x = mx.np.random.uniform(size=(4, 32, 64, 64)) slice = [[0, 0, 0, 0], [2, 64, 32, 32]] shape = (4, 64, 64, -1) net = Net(shape, slice) @@ -2468,12 +2368,12 @@ def __init__(self, self.reshape = shape self.pool0 = pooling_layer - def hybrid_forward(self, F, x): + def forward(self, x): x_reshape = x.reshape(self.reshape) out = self.pool0(x_reshape) return out - x = mx.nd.random.uniform(shape=(4, 32, 32, 32)) + x = mx.np.random.uniform(size=(4, 32, 32, 32)) shape = (4, 64, 64, -1) for i in range(len(pooling_layers)): net = Net(shape, pooling_layers[i]) @@ -2500,8 +2400,8 @@ def __init__(self, self.slice = slice self.pool0 = pooling_layer - def hybrid_forward(self, F, x): - x_slice = x.slice(begin=self.slice[0], end=self.slice[1]) + def forward(self, x): + x_slice = mx.npx.slice(x, begin=self.slice[0], end=self.slice[1]) out = self.pool0(x_slice) return out @@ -2510,7 +2410,7 @@ def hybrid_forward(self, F, x): if layout == 'NHWC': xshape = transpose(xshape) slice_shape = transpose(slice_shape) - x = mx.nd.random.uniform(shape=xshape) + x = mx.np.random.uniform(size=xshape) slice = [(0, 0, 0, 0), slice_shape] for i in range(len(pooling_layers)): net = Net(slice, pooling_layers[i]) @@ -2534,14 +2434,14 @@ def __init__(self, self.pool0 = pooling_layer1 self.pool1 = pooling_layer2 - def hybrid_forward(self, F, x): + def forward(self, x): x_reshape = x.reshape(self.reshape[0]) y = self.pool0(x_reshape) y_reshape = y.reshape(self.reshape[1]) out = self.pool1(y_reshape) return out - x = mx.nd.random.uniform(shape=(16, 128, 256, 256)) + x = mx.np.random.uniform(size=(16, 128, 256, 256)) shape = [(128, 256, 64, -1), (128, 256, 11, -1)] for i in range(len(pooling_layers)): for j in range(len(pooling_layers)): @@ -2568,14 +2468,14 @@ def __init__(self, self.pool0 = pooling_layer1 self.pool1 = pooling_layer2 - def hybrid_forward(self, F, x): - x_slice = x.slice(begin=self.slice[0][0], end=self.slice[0][1]) + def forward(self, x): + x_slice = mx.npx.slice(x, begin=self.slice[0][0], end=self.slice[0][1]) y = self.pool0(x_slice) - y_slice = y.slice(begin=self.slice[1][0], end=self.slice[1][1]) + y_slice = mx.npx.slice(y, begin=self.slice[1][0], end=self.slice[1][1]) out = self.pool1(y_slice) return out - x = mx.nd.random.uniform(shape=(16, 128, 256, 256)) + x = mx.np.random.uniform(size=(16, 128, 256, 256)) slice = [[(8, 0, 100, 50), (16, -1, -1, -1)], [(0, 64, 0, 50), (2, -1, -1, -1)]] for i in range(len(pooling_layers)): for j in range(len(pooling_layers)): @@ -2604,14 +2504,14 @@ def __init__(self, self.pool0 = pooling_layer1 self.pool1 = pooling_layer2 - def hybrid_forward(self, F, x): + def forward(self, x): x_slice = x.slice(begin=self.slice[0], end=self.slice[1]) y = self.pool0(x_slice) y_reshape = y.reshape(self.reshape) out = self.pool1(y_reshape) return out - x = mx.nd.random.uniform(shape=(16, 128, 256, 256)) + x = mx.np.random.uniform(size=(16, 128, 256, 256)) slice = [(8, 0, 100, 50), (16, 128, 256, 256)] shape = (32, -1, 0, 0) for i in range(len(pooling_layers)): @@ -2640,14 +2540,14 @@ def __init__(self, self.pool0 = pooling_layer1 self.pool1 = pooling_layer2 - def hybrid_forward(self, F, x): + def forward(self, x): x_reshape = x.reshape(self.reshape) y = self.pool0(x_reshape) y_slice = y.slice(begin=self.slice[0], end=self.slice[1]) out = self.pool1(y_slice) return out - x = mx.nd.random.uniform(shape=(16, 128, 256, 256)) + x = mx.np.random.uniform(size=(16, 128, 256, 256)) shape = (0, 512, 64, -1) slice = [(8, 256, 10, 20), (-1, -1, -1, 70)] for i 
in range(len(pooling_layers)): @@ -2666,11 +2566,11 @@ def __init__(self, shape, **kwargs): self.reshape = shape self.conv0 = nn.Conv2DTranspose(64, (3, 3)) - def hybrid_forward(self, F, x): + def forward(self, x): x_reshape = x.reshape(self.reshape) out = self.conv0(x_reshape) return out - x = mx.nd.random.uniform(shape=(4, 16, 32, 32)) + x = mx.np.random.uniform(size=(4, 16, 32, 32)) shape = (4, 16, 64, -1) net = Net(shape) check_layer_forward_withinput(net, x) @@ -2684,11 +2584,11 @@ def __init__(self, slice, **kwargs): self.slice = slice self.conv0 = nn.Conv2DTranspose(64, (3, 3)) - def hybrid_forward(self, F, x): + def forward(self, x): x_slice = x.slice(begin=self.slice[0], end=self.slice[1]) out = self.conv0(x_slice) return out - x = mx.nd.random.uniform(shape=(8, 32, 64, 64)) + x = mx.np.random.uniform(size=(8, 32, 64, 64)) slice = [(0, 16, 0, 0), (4, 32, 32, 32)] net = Net(slice) check_layer_forward_withinput(net, x) @@ -2703,14 +2603,14 @@ def __init__(self, shape, **kwargs): self.conv0 = nn.Conv2DTranspose(32, (3, 3)) self.conv1 = nn.Conv2DTranspose(64, (3, 3), strides=(2, 2)) - def hybrid_forward(self, F, x): + def forward(self, x): x_reshape = x.reshape(self.reshape[0]) y = self.conv0(x_reshape) "shape of y is (4, 32, 66, 18)" y_reshape = y.reshape(self.reshape[1]) out = self.conv1(y_reshape) return out - x = mx.nd.random.uniform(shape=(4, 16, 32, 32)) + x = mx.np.random.uniform(size=(4, 16, 32, 32)) shape = [(4, 16, 64, -1), (4, 32, 33, -1)] net = Net(shape) check_layer_forward_withinput(net, x) @@ -2725,14 +2625,14 @@ def __init__(self, slice, **kwargs): self.conv0 = nn.Conv2DTranspose(32, (3, 3)) self.conv1 = nn.Conv2DTranspose(64, (3, 3), strides=(2, 2)) - def hybrid_forward(self, F, x): + def forward(self, x): x_slice = x.slice(begin=self.slice[0][0], end=self.slice[0][1]) y = self.conv0(x_slice) "shape of y is (4, 32, 66, 18)" y_slice = y.slice(begin=self.slice[1][0], end=self.slice[1][1]) out = self.conv1(y_slice) return out - x = mx.nd.random.uniform(shape=(8, 32, 64, 64)) + x = mx.np.random.uniform(size=(8, 32, 64, 64)) slice = [[(0, 0, 0, 0), (4, 16, 32, 32)], [(0, 0, 0, 0), (2, 16, 16, 16)]] net = Net(slice) check_layer_forward_withinput(net, x) @@ -2748,14 +2648,14 @@ def __init__(self, shape, slice, **kwargs): self.conv0 = nn.Conv2DTranspose(32, (3, 3)) self.conv1 = nn.Conv2DTranspose(64, (3, 3), strides=(2, 2)) - def hybrid_forward(self, F, x): + def forward(self, x): x_reshape = x.reshape(self.reshape) y = self.conv0(x_reshape) "shape of y is (4, 32, 66, 18)" y_slice = y.slice(begin=self.slice[0], end=self.slice[1]) out = self.conv1(y_slice) return out - x = mx.nd.random.uniform(shape=(4, 16, 32, 32)) + x = mx.np.random.uniform(size=(4, 16, 32, 32)) shape = (4, 16, 64, -1) slice = [(0, 0, 0, 0), (2, 16, 16, 16)] net = Net(shape, slice) @@ -2772,19 +2672,20 @@ def __init__(self, shape, slice, **kwargs): self.conv0 = nn.Conv2DTranspose(32, (3, 3)) self.conv1 = nn.Conv2DTranspose(96, (3, 3), strides=(2, 2)) - def hybrid_forward(self, F, x): + def forward(self, x): x_slice = x.slice(begin=self.slice[0], end=self.slice[1]) y = self.conv0(x_slice) "shape of y is (4, 32, 34, 34)" y_reshape = y.reshape(self.reshape) out = self.conv1(y_reshape) return out - x = mx.nd.random.uniform(shape=(8, 32, 64, 64)) + x = mx.np.random.uniform(size=(8, 32, 64, 64)) shape = (4, 64, 34, -1) slice = [(4, 0, 0, 0), (8, 16, 32, 32)] net = Net(shape, slice) check_layer_forward_withinput(net, x) +@use_np @pytest.mark.serial def test_reshape_activation(): class Net(gluon.HybridBlock): @@ 
-2793,18 +2694,19 @@ def __init__(self, act, shape, **kwargs): self.reshape = shape self.act = nn.Activation(act) - def hybrid_forward(self, F, x): + def forward(self, x): x_reshape = x.reshape(self.reshape) out = self.act(x_reshape) return out acts = ["relu", "sigmoid", "tanh", "softrelu", "softsign"] for act in acts: - x = mx.nd.random.uniform(-1, 1, shape=(4, 16, 32, 32)) + x = mx.np.random.uniform(-1, 1, size=(4, 16, 32, 32)) shape = (4, 32, 32, -1) net = Net(act, shape) check_layer_forward_withinput(net, x) +@use_np @pytest.mark.serial def test_slice_activation(): class Net(gluon.HybridBlock): @@ -2813,19 +2715,20 @@ def __init__(self, act, slice, **kwargs): self.slice = slice self.act = nn.Activation(act) - def hybrid_forward(self, F, x): - x_slice = x.slice(begin=self.slice[0], end=self.slice[1]) + def forward(self, x): + x_slice = mx.npx.slice(x, begin=self.slice[0], end=self.slice[1]) out = self.act(x_slice) return out acts = ["relu", "sigmoid", "tanh", "softrelu", "softsign"] for act in acts: - x = mx.nd.random.uniform(-1, 1, shape=(8, 32, 64, 64)) + x = mx.np.random.uniform(-1, 1, size=(8, 32, 64, 64)) slice = [(0, 16, 32, 32), (4, 32, 64, 64)] net = Net(act, slice) check_layer_forward_withinput(net, x) +@use_np @pytest.mark.serial def test_reshape_activation_reshape_activation(): class Net(gluon.HybridBlock): @@ -2835,7 +2738,7 @@ def __init__(self, act0, act1, shape, **kwargs): self.act0 = nn.Activation(act0) self.act1 = nn.Activation(act1) - def hybrid_forward(self, F, x): + def forward(self, x): x_reshape = x.reshape(self.reshape[0]) y = self.act0(x_reshape) y_reshape = y.reshape(self.reshape[1]) @@ -2846,12 +2749,13 @@ def hybrid_forward(self, F, x): for idx1, act1 in enumerate(acts): if idx1 == idx0: continue - x = mx.nd.random.uniform(-1, 1, shape=(4, 16, 32, 32)) + x = mx.np.random.uniform(-1, 1, size=(4, 16, 32, 32)) shape = [(4, 32, 32, -1), (4, 32, 16, -1)] net = Net(act0, act1, shape) check_layer_forward_withinput(net, x) +@use_np @pytest.mark.serial def test_slice_activation_slice_activation(): class Net(gluon.HybridBlock): @@ -2861,10 +2765,10 @@ def __init__(self, act0, act1, slice, **kwargs): self.act0 = nn.Activation(act0) self.act1 = nn.Activation(act1) - def hybrid_forward(self, F, x): - x_slice = x.slice(begin=self.slice[0][0], end=self.slice[0][1]) + def forward(self, x): + x_slice = mx.npx.slice(x, begin=self.slice[0][0], end=self.slice[0][1]) y = self.act0(x_slice) - y_slice = y.slice(begin=self.slice[1][0], end=self.slice[1][1]) + y_slice = mx.npx.slice(y, begin=self.slice[1][0], end=self.slice[1][1]) out = self.act1(y_slice) return out acts = ["relu", "sigmoid", "tanh", "softrelu", "softsign"] @@ -2872,12 +2776,13 @@ def hybrid_forward(self, F, x): for idx1, act1 in enumerate(acts): if idx1 == idx0: continue - x = mx.nd.random.uniform(-1, 1, shape=(8, 32, 64, 64)) + x = mx.np.random.uniform(-1, 1, size=(8, 32, 64, 64)) slice = [[(0, 16, 32, 32), (4, 32, 64, 64)], [(2, 0, 16, 16), (4, 16, 32, 32)]] net = Net(act0, act1, slice) check_layer_forward_withinput(net, x) +@use_np @pytest.mark.serial def test_reshape_activation_slice_activation(): class Net(gluon.HybridBlock): @@ -2888,10 +2793,10 @@ def __init__(self, act0, act1, shape, slice, **kwargs): self.act0 = nn.Activation(act0) self.act1 = nn.Activation(act1) - def hybrid_forward(self, F, x): + def forward(self, x): x_reshape = x.reshape(self.reshape) y = self.act0(x_reshape) - y_slice = y.slice(begin=self.slice[0], end=self.slice[1]) + y_slice = mx.npx.slice(y, begin=self.slice[0], end=self.slice[1]) 
out = self.act1(y_slice) return out acts = ["relu", "sigmoid", "tanh", "softrelu", "softsign"] @@ -2899,13 +2804,14 @@ def hybrid_forward(self, F, x): for idx1, act1 in enumerate(acts): if idx1 == idx0: continue - x = mx.nd.random.uniform(-1, 1, shape=(4, 16, 32, 32)) + x = mx.np.random.uniform(-1, 1, size=(4, 16, 32, 32)) shape = (4, 32, 32, -1) slice = [(0, 0, 0, 0), (2, 16, 16, 16)] net = Net(act0, act1, shape, slice) check_layer_forward_withinput(net, x) +@use_np @pytest.mark.serial def test_slice_activation_reshape_activation(): class Net(gluon.HybridBlock): @@ -2916,8 +2822,8 @@ def __init__(self, act0, act1, shape, slice, **kwargs): self.act0 = nn.Activation(act0) self.act1 = nn.Activation(act1) - def hybrid_forward(self, F, x): - x_slice = x.slice(begin=self.slice[0], end=self.slice[1]) + def forward(self, x): + x_slice = mx.npx.slice(x, begin=self.slice[0], end=self.slice[1]) y = self.act0(x_slice) y_reshape = y.reshape(self.reshape) out = self.act1(y_reshape) @@ -2927,12 +2833,13 @@ def hybrid_forward(self, F, x): for idx1, act1 in enumerate(acts): if idx1 == idx0: continue - x = mx.nd.random.uniform(-1, 1, shape=(8, 32, 64, 64)) + x = mx.np.random.uniform(-1, 1, size=(8, 32, 64, 64)) slice = [(0, 16, 32, 32), (4, 32, 64, 64)] shape = (4, 32, 32, -1) net = Net(act0, act1, shape, slice) check_layer_forward_withinput(net, x) +@use_np @pytest.mark.serial def test_np_shape_parameters(): class Foo(gluon.Block): @@ -2943,7 +2850,7 @@ def forward(self, x): return self.dense(x) with mx.np_shape(True): - z = mx.nd.zeros((2,2016)) + z = mx.np.zeros((2,2016)) print(z.shape) foo = Foo() foo.initialize() @@ -2955,7 +2862,7 @@ def test_gluon_param_load(): net.save_parameters('test_gluon_param_load.params') net.cast('float16') net.load_parameters('test_gluon_param_load.params', cast_dtype=True) - mx.nd.waitall() + mx.npx.waitall() def test_gluon_param_load_dtype_source(): net = mx.gluon.nn.Dense(10, in_units=10) @@ -2964,23 +2871,22 @@ def test_gluon_param_load_dtype_source(): net.save_parameters('test_gluon_param_load_dtype_source.params') net.cast('float32') net.load_parameters('test_gluon_param_load_dtype_source.params', cast_dtype=True, dtype_source="saved") - assert net.weight.dtype == np.float16 - mx.nd.waitall() + assert net.weight.dtype == onp.float16 + mx.npx.waitall() +@use_np def test_squeeze_consistency(): class Foo(gluon.HybridBlock): - def __init__(self, inplace, **kwargs): + def __init__(self, **kwargs): super(Foo, self).__init__(**kwargs) - self.inplace = inplace def forward(self, x): - return x.squeeze(inplace=self.inplace) + return x.squeeze() - for inplace in (True, False): - block = Foo(inplace) - block.hybridize() - shape = (np.random.randint(1, 10), np.random.randint(1, 10), 1) - block(mx.nd.ones(shape)) + block = Foo() + block.hybridize() + shape = (onp.random.randint(1, 10), onp.random.randint(1, 10), 1) + block(mx.np.ones(shape)) def test_shared_parameters_with_non_default_initializer(): class MyBlock(gluon.HybridBlock): @@ -2996,20 +2902,21 @@ def __init__(self, **kwargs): assert bl.param is not bl3.param assert bl.param.init == bl3.param.init +@use_np def test_reqs_switching_training_inference(): class Foo(gluon.HybridBlock): def __init__(self, **kwargs): super(Foo, self).__init__(**kwargs) - def hybrid_forward(self, F, x): + def forward(self, x): y = 2 * x - return F.sqrt(x) + F.sqrt(y) + return mx.np.sqrt(x) + mx.np.sqrt(y) f = Foo() f.hybridize(static_alloc=True) - x = mx.nd.ones(shape=(10,10)) + x = mx.np.ones(shape=(10,10)) x.attach_grad() - x2 = 
mx.nd.ones(shape=x.shape) * 2 + x2 = mx.np.ones(shape=x.shape) * 2 x2.attach_grad() # Call first in training mode @@ -3052,7 +2959,7 @@ def test_DeformableConvolution(): """ try: ctx = mx.gpu() - _ = mx.nd.array([0], ctx=ctx) + _ = mx.np.array([0], ctx=ctx) except mx.base.MXNetError: pytest.skip("deformable_convolution only supports GPU") net = nn.HybridSequential() @@ -3073,7 +2980,7 @@ def test_DeformableConvolution(): net.initialize(force_reinit=True, ctx=ctx) net.hybridize() - x = mx.nd.random.uniform(shape=(8, 5, 30, 31), ctx=ctx) + x = mx.np.random.uniform(size=(8, 5, 30, 31), ctx=ctx) with mx.autograd.record(): y = net(x) y.backward() @@ -3105,11 +3012,12 @@ def test_ModulatedDeformableConvolution(): net.initialize(force_reinit=True, ctx=ctx) net.hybridize() - x = mx.nd.random.uniform(shape=(8, 5, 30, 31), ctx=ctx) + x = mx.np.random.uniform(size=(8, 5, 30, 31), ctx=ctx) with mx.autograd.record(): y = net(x) +@use_np @pytest.mark.parametrize('dc', [True, False]) @pytest.mark.parametrize('hybridize', [True, False]) @pytest.mark.garbage_expected @@ -3134,20 +3042,14 @@ def forward(self, x): model2.add(MyBlock(64, activation='tanh', in_units=10)) model2.add(MyBlock(32, in_units=10)) - # symbol - if not dc: - x = mx.sym.var('data') - y = model(x) - assert len(y.list_arguments()) == 7 - # ndarray model.initialize(mx.init.Xavier(magnitude=2.24)) model2.initialize(mx.init.Xavier(magnitude=2.24)) if hybridize: model.hybridize() model2.hybridize() - x = model(mx.nd.zeros((32, 10))) - x2 = model2(mx.nd.zeros((32, 10))) + x = model(mx.np.zeros((32, 10))) + x2 = model2(mx.np.zeros((32, 10))) assert x.shape == (32, 224) assert x2.shape == (32, 224) x.wait_to_read() @@ -3155,7 +3057,7 @@ def forward(self, x): def test_identity(): model = nn.Identity() - x = mx.nd.random.uniform(shape=(128, 33, 64)) + x = mx.np.random.uniform(size=(128, 33, 64)) assert_almost_equal(model(x), x) def test_pixelshuffle1d(): @@ -3165,7 +3067,7 @@ def test_pixelshuffle1d(): shape_before = (1, nchan * up_x, nx) shape_after = (1, nchan, nx * up_x) layer = nn.PixelShuffle1D(up_x) - x = mx.nd.arange(np.prod(shape_before)).reshape(shape_before) + x = mx.np.arange(onp.prod(shape_before)).reshape(shape_before) y = layer(x) assert y.shape == shape_after assert_allclose( @@ -3183,7 +3085,7 @@ def test_pixelshuffle2d(): shape_before = (1, nchan * up_x * up_y, nx, ny) shape_after = (1, nchan, nx * up_x, ny * up_y) layer = nn.PixelShuffle2D((up_x, up_y)) - x = mx.nd.arange(np.prod(shape_before)).reshape(shape_before) + x = mx.np.arange(onp.prod(shape_before)).reshape(shape_before) y = layer(x) assert y.shape == shape_after # - Channels are reshaped to form 2x3 blocks @@ -3215,7 +3117,7 @@ def test_pixelshuffle3d(): shape_before = (1, nchan * up_x * up_y * up_z, nx, ny, nz) shape_after = (1, nchan, nx * up_x, ny * up_y, nz * up_z) layer = nn.PixelShuffle3D((up_x, up_y, up_z)) - x = mx.nd.arange(np.prod(shape_before)).reshape(shape_before) + x = mx.np.arange(onp.prod(shape_before)).reshape(shape_before) y = layer(x) assert y.shape == shape_after # - Channels are reshaped to form 2x1x2 blocks diff --git a/tests/python/unittest/test_gluon_batch_processor.py b/tests/python/unittest/test_gluon_batch_processor.py index bff80813bb12..0f0aa222658c 100644 --- a/tests/python/unittest/test_gluon_batch_processor.py +++ b/tests/python/unittest/test_gluon_batch_processor.py @@ -29,6 +29,8 @@ from mxnet.gluon.contrib.estimator.batch_processor import BatchProcessor import pytest +mx.npx.reset_np() + def _get_test_network(): net = nn.Sequential() 
net.add(nn.Dense(4, activation='relu', flatten=False)) @@ -37,14 +39,15 @@ def _get_test_network(): def _get_test_data(): batch_size = 4 - in_data = mx.nd.random.uniform(shape=(10, 3)) - out_data = mx.nd.random.uniform(shape=(10, 4)) + in_data = mx.np.random.uniform(size=(10, 3)) + out_data = mx.np.random.uniform(size=(10, 4)) # Input dataloader dataset = gluon.data.dataset.ArrayDataset(in_data, out_data) dataloader = gluon.data.DataLoader(dataset, batch_size=batch_size) dataiter = mx.io.NDArrayIter(data=in_data, label=out_data, batch_size=batch_size) return dataloader, dataiter +@mx.util.use_np def test_batch_processor_fit(): ''' test estimator with different train data types ''' net = _get_test_network() @@ -76,6 +79,7 @@ def test_batch_processor_fit(): epochs=num_epochs) +@mx.util.use_np def test_batch_processor_validation(): ''' test different validation data types''' net = _get_test_network() diff --git a/tests/python/unittest/test_gluon_control_flow.py b/tests/python/unittest/test_gluon_control_flow.py new file mode 100644 index 000000000000..9eaa33cbae6e --- /dev/null +++ b/tests/python/unittest/test_gluon_control_flow.py @@ -0,0 +1,694 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import copy +import numpy as np +import mxnet as mx +from mxnet import gluon +from mxnet.test_utils import * +from mxnet.base import _as_list +from collections import defaultdict +from mxnet.attribute import AttrScope + +@mx.util.use_np +def test_while_loop_simple_forward(): + + class _TestBlock(gluon.HybridBlock): + + def __init__(self, cond, func, max_iterations): + super(_TestBlock, self).__init__() + self.cond = cond + self.func = func + self.max_iterations = max_iterations + + def forward(self, *loop_vars): + return mx.npx.while_loop( + cond=self.cond, + func=self.func, + loop_vars=loop_vars, + max_iterations=self.max_iterations + ) + + for hybridize in [False, True]: + # Case 1.1: result should be sum([1, 2, 3 ... 100]) + model = _TestBlock( + cond=lambda i, s: i <= 5, + func=lambda i, s: (None, (i + 1, s + i)), + max_iterations=10, + ) + if hybridize: + model.hybridize() + _, result = model( + mx.np.array([1], dtype="int64"), # i + mx.np.array([0], dtype="int64"), # s + ) + assert result[0].item() == 6 + assert result[1].item() == 15 + # Case 1.2: result should be sum([1, 2, 3 ... 
1000]) + model = _TestBlock( + cond=lambda i, s, true: true, + func=lambda i, s, true: (None, (i + 1, s + i, true)), + max_iterations=1000, + ) + if hybridize: + model.hybridize() + _, result = model( + mx.np.array([1], dtype="int64"), # i + mx.np.array([0], dtype="int64"), # s + mx.np.array([1], dtype="int64"), # true + ) + assert result[0].item() == 1001 + assert result[1].item() == 500500 + assert result[2].item() == 1 + # Case 1.3: result should be sum([]) + model = _TestBlock( + cond=lambda i, s, false: false, + func=lambda i, s, false: (None, (i + 1, s + i, false)), + max_iterations=1000, + ) + if hybridize: + model.hybridize() + _, result = model( + mx.np.array([1], dtype="int64"), # i + mx.np.array([0], dtype="int64"), # s + mx.np.array([0], dtype="int64"), # false + ) + assert result[0].item() == 1 + assert result[1].item() == 0 + assert result[2].item() == 0 + # Case 2.1: result should be sum([1, 2, 3 ... 100]) + model = _TestBlock( + cond=lambda i, s: i <= 100, + func=lambda i, s: (i, (i + 1, s + i)), + max_iterations=1000, + ) + if hybridize: + model.hybridize() + outputs, (result_i, result_s) = model( + mx.np.array([1], dtype="int64"), # i + mx.np.array([0], dtype="int64"), # s + ) + assert all(outputs.asnumpy()[ : 100] == np.arange(1, 101).reshape(100, 1)) + assert result_i.item() == 101 + assert result_s.item() == 5050 + # Case 2.2: result should be sum([1, 2, 3 ... 1000]) + model = _TestBlock( + cond=lambda i, s, true: true, + func=lambda i, s, true: (i, (i + 1, s + i, true)), + max_iterations=1000, + ) + if hybridize: + model.hybridize() + outputs, (result_i, result_s, _) = model( + mx.np.array([1], dtype="int64"), # i + mx.np.array([0], dtype="int64"), # s + mx.np.array([1], dtype="int64"), # true + ) + assert all(outputs.asnumpy() == np.arange(1, 1001).reshape(1000, 1)) + assert result_i.item() == 1001 + assert result_s.item() == 500500 + # Case 2.3: a corner case, in which loop body is never executed + model = _TestBlock( + cond=lambda i, s, false: false, + func=lambda i, s, false: (i, (i + 1, s + i, false)), + max_iterations=1000, + ) + if hybridize: + model.hybridize() + _, (result_i, result_s, _) = model( + mx.np.array([1], dtype="int64"), # i + mx.np.array([0], dtype="int64"), # s + mx.np.array([0], dtype="int64"), # false + ) + assert result_i.item() == 1 + assert result_s.item() == 0 + + +def test_cut_subgraph_foreach(): + class TestLayer(gluon.HybridBlock): + def __init__(self): + super(TestLayer, self).__init__() + + def forward(self, inputs, states): + def step1(data, states): + return data + 1, states + out1, states1 = mx.npx.foreach(step1, inputs, states) + out2, states2 = mx.npx.foreach(step1, out1, states) + def step2(data, states): + return data + states[0], states + out, states = mx.npx.foreach(step2, out2, states1) + return out + + data = mx.np.random.normal(loc=0, scale=1, size=(5, 10)) + states = mx.np.random.normal(loc=0, scale=1, size=(10)) + layer = TestLayer() + layer.initialize(ctx=default_context()) + res1 = layer(data, [states]) + + with mx.autograd.record(): + res1 = layer(data, [states]) + + layer = TestLayer() + layer.initialize(ctx=default_context()) + layer.hybridize() + res2 = layer(data, [states]) + + with mx.autograd.record(): + res2 = layer(data, [states]) + assert_almost_equal(res1.asnumpy(), res2.asnumpy(), rtol=1e-3, atol=1e-3) + + +@mx.util.use_np +def test_uniq_name(): + class ForeachLayer1(gluon.HybridBlock): + def __init__(self): + super(ForeachLayer1, self).__init__() + + def forward(self, inputs, states): + def step1(data, 
states): + return data + 1, states + out1, states1 = mx.npx.foreach(step1, inputs, states) + # The input variables have the same symbol name. + out, states = mx.npx.foreach(step1, out1, states1) + return out + + class ForeachLayer2(gluon.HybridBlock): + def __init__(self): + super(ForeachLayer2, self).__init__() + + def forward(self, inputs, states): + def step1(data, states): + return data + 1, states + out1, states1 = mx.npx.foreach(step1, inputs, states) + def step2(data, states): + return data, [states[0] + states[0] + mx.np.squeeze(mx.npx.slice(data, begin=0, end=1))] + # The input variables have the same symbol names. + # The free variables have the same symbol names as the input variables. + out, states = mx.npx.foreach(step2, out1, states1) + return out + + class WhileLayer1(gluon.HybridBlock): + def __init__(self): + super(WhileLayer1, self).__init__() + + def forward(self, inputs, states): + def cond(state1, state2): + s = mx.np.squeeze(mx.npx.slice(state1, begin=0, end=1)) + return s == s + def step(state1, state2): + return state1 + 1, [state1 + 1, state2 + 1] + states = [states[0], states[0] + 1] + out1, states1 = mx.npx.while_loop(cond, step, states, max_iterations=5) + # The input variables have the same symbol name. + out, states = mx.npx.while_loop(cond, step, states1, max_iterations=5) + return out + + class WhileLayer2(gluon.HybridBlock): + def __init__(self): + super(WhileLayer2, self).__init__() + + def forward(self, inputs, states): + def cond(state1, state2): + s = mx.np.squeeze(mx.npx.slice(state1, begin=0, end=1)) + return s == s + def step1(state1, state2): + return state1 + 1, [state1, state2] + states = [states[0], states[0] + 1] + out1, states1 = mx.npx.while_loop(cond, step1, states, max_iterations=5) + def step2(state1, state2): + return state1 + 1, [state1 + state1[0], state2 + state1[1]] + # The input variables have the same symbol name. 
+ out, states = mx.npx.while_loop(cond, step2, states1, max_iterations=5) + return out + + TestLayers = [ForeachLayer1, ForeachLayer2, + WhileLayer1, WhileLayer2] + # TestLayers = [WhileLayer1] + + data = mx.np.random.normal(loc=0, scale=1, size=(2, 5)) + states = mx.np.random.normal(loc=0, scale=1, size=(5)) + for TestLayer in TestLayers: + layer = TestLayer() + layer.initialize(ctx=default_context()) + res1 = layer(data, [states]) + + with mx.autograd.record(): + res1 = layer(data, [states]) + + layer = TestLayer() + layer.initialize(ctx=default_context()) + layer.hybridize() + res2 = layer(data, [states]) + + with mx.autograd.record(): + res2 = layer(data, [states]) + assert_almost_equal(res1.asnumpy(), res2.asnumpy(), rtol=0.001, atol=0.0001) + + +@mx.util.use_np +def test_cut_subgraph_while_loop(): + class TestLayer(gluon.HybridBlock): + def __init__(self): + super(TestLayer, self).__init__() + def forward(self, data): + out1, data1 = mx.npx.while_loop( + cond=lambda i: i <= 5, + func=lambda i: (None, (i + 1, )), + loop_vars=(data, ), + max_iterations=10, + ) + out2, data2 = mx.npx.while_loop( + cond=lambda i: i, + func=lambda i: (None, (i + 1, )), + loop_vars=data1[0], + max_iterations=10, + ) + return data2[0] + data = mx.np.random.normal(loc=0, scale=1, size=(1, )) + layer = TestLayer() + layer.initialize(ctx=default_context()) + res1 = layer(data) + with mx.autograd.record(): + res1 = layer(data) + layer = TestLayer() + layer.initialize(ctx=default_context()) + layer.hybridize() + res2 = layer(data) + with mx.autograd.record(): + res2 = layer(data) + assert_almost_equal(res1.asnumpy(), res2.asnumpy(), rtol=1e-3, atol=1e-3) + + +@mx.util.use_np +def test_cut_subgraph_cond(): + class TestLayer(gluon.HybridBlock): + def __init__(self): + super(TestLayer, self).__init__() + def forward(self, data): + data1 = mx.npx.cond( + pred=lambda data: data > 0.5, + then_func=lambda data: data * 2, + else_func=lambda data: data * 3, + inputs=data, + ) + data2 = mx.npx.cond( + pred=lambda data: data > 0.5, + then_func=lambda data: data * 2, + else_func=lambda data: data * 3, + inputs=data1, + ) + return data2 + data = mx.np.random.normal(loc=0, scale=1, size=(1, )) + layer = TestLayer() + layer.initialize(ctx=default_context()) + res1 = layer(data) + with mx.autograd.record(): + res1 = layer(data) + layer = TestLayer() + layer.initialize(ctx=default_context()) + layer.hybridize() + res2 = layer(data) + with mx.autograd.record(): + res2 = layer(data) + assert_almost_equal(res1.asnumpy(), res2.asnumpy(), rtol=1e-3, atol=1e-3) + + +@mx.util.use_np +def test_output_format_foreach(): + class TestLayer1(gluon.HybridBlock): + def __init__(self, step): + super(TestLayer1, self).__init__() + self.step = step + def forward(self, ins, states): + out, states = mx.npx.foreach(self.step, ins, states) + return out, states + + def step1(data, state): + return data, state + def step2(data, state): + return [data], state + def step3(data, state): + if isinstance(state, list): + return [], [state[0] + data] + else: + return [], state + data + def step4(data, state): + if isinstance(state, list): + return [data, state[0]], state + else: + return [data, state], state + + steps = [step1, step2, step3, step4] + data = mx.np.random.normal(loc=0, scale=1, size=(10, 2)) + state = mx.np.random.normal(loc=0, scale=1, size=(2)) + for step in steps: + layer1 = TestLayer1(step) + layer1.initialize(ctx=default_context()) + layer2 = TestLayer1(step) + layer2.initialize(ctx=default_context()) + layer2.hybridize() + out1, state1 = 
layer1(data, [state]) + out2, state2 = layer2(data, [state]) + step_out, step_state = step(data, [state]) + assert type(out1) == type(step_out) + assert type(out2) == type(step_out) + assert type(state1) == type(step_state) + assert type(state2) == type(step_state) + out1 = _as_list(out1) + out2 = _as_list(out2) + state1 = _as_list(state1) + state2 = _as_list(state2) + for i in range(len(out1)): + assert_almost_equal(out1[i].asnumpy(), out2[i].asnumpy(), rtol=0.001, atol=0.0001) + for i in range(len(state1)): + assert_almost_equal(state1[i].asnumpy(), state2[i].asnumpy(), rtol=0.001, atol=0.0001) + + layer1 = TestLayer1(step) + layer1.initialize(ctx=default_context()) + layer2 = TestLayer1(step) + layer2.initialize(ctx=default_context()) + layer2.hybridize() + out1, state1 = layer1(data, state) + out2, state2 = layer2(data, state) + step_out, step_state = step(data, state) + assert type(out1) == type(step_out) + assert type(out2) == type(step_out) + assert type(state1) == type(step_state) + assert type(state2) == type(step_state) + out1 = _as_list(out1) + out2 = _as_list(out2) + state1 = _as_list(state1) + state2 = _as_list(state2) + for i in range(len(out1)): + assert_almost_equal(out1[i].asnumpy(), out2[i].asnumpy(), rtol=0.001, atol=0.0001) + for i in range(len(state1)): + assert_almost_equal(state1[i].asnumpy(), state2[i].asnumpy(), rtol=0.001, atol=0.0001) + + if step == step3: + continue + layer1 = TestLayer1(step) + layer1.initialize(ctx=default_context()) + layer2 = TestLayer1(step) + layer2.initialize(ctx=default_context()) + layer2.hybridize() + out1, state1 = layer1(data, [state, [state + 1]]) + out2, state2 = layer2(data, [state, [state + 1]]) + step_out, step_state = step(data, [state, [state + 1]]) + assert type(out1) == type(step_out) + assert type(out2) == type(step_out) + assert type(state1) == type(step_state) + assert type(state2) == type(step_state) + out1 = _as_list(out1) + out2 = _as_list(out2) + state1 = _as_list(state1) + state2 = _as_list(state2) + for i in range(len(out1)): + assert_almost_equal(out1[i].asnumpy(), out2[i].asnumpy(), rtol=0.001, atol=0.0001) + for i in range(len(state1)): + if isinstance(state1[i], list): + assert_almost_equal(state1[i][0].asnumpy(), state2[i][0].asnumpy(), + rtol=0.001, atol=0.0001) + else: + assert_almost_equal(state1[i].asnumpy(), state2[i].asnumpy(), + rtol=0.001, atol=0.0001) + + +@mx.util.use_np +def test_output_format_while(): + class TestLayer1(gluon.HybridBlock): + def __init__(self, step, use_list, nested_list=False): + super(TestLayer1, self).__init__() + self.step = step + self.use_list = use_list + self.nested_list = nested_list + def forward(self, states): + def cond(state1): + scalar = mx.npx.slice(state1, begin=0, end=1) + return scalar == scalar + cond_func = cond + if self.use_list: + states = [states] + elif self.nested_list: + def cond2(state1, state2): + scalar = mx.npx.slice(state1, begin=0, end=1) + return scalar == scalar + cond_func = cond2 + states = [states, [states + 1]] + out, states = mx.npx.while_loop(cond_func, self.step, states, max_iterations=5) + return out, states + + def step1(state): + return state, state + def step2(state): + if isinstance(state, list): + return state, state + else: + return [state], state + def step3(state): + return [], state + + steps = [step1, step2, step3] + state = mx.np.random.normal(loc=0, scale=1, size=(2)) + for step in steps: + layer1 = TestLayer1(step, False) + layer1.initialize(ctx=default_context()) + layer2 = TestLayer1(step, False) + 
layer2.initialize(ctx=default_context()) + layer2.hybridize() + out1, state1 = layer1(state) + out2, state2 = layer2(state) + assert type(out1) == type(out2) + assert type(state1) == type(state2) + out1 = _as_list(out1) + out2 = _as_list(out2) + state1 = _as_list(state1) + state2 = _as_list(state2) + for i in range(len(out1)): + assert_almost_equal(out1[i].asnumpy(), out2[i].asnumpy(), rtol=0.001, atol=0.0001) + for i in range(len(state1)): + assert_almost_equal(state1[i].asnumpy(), state2[i].asnumpy(), rtol=0.001, atol=0.0001) + + layer1 = TestLayer1(step, True) + layer1.initialize(ctx=default_context()) + layer2 = TestLayer1(step, True) + layer2.initialize(ctx=default_context()) + layer2.hybridize() + out1, state1 = layer1(state) + out2, state2 = layer2(state) + assert type(out1) == type(out2) + assert type(state1) == type(state2) + out1 = _as_list(out1) + out2 = _as_list(out2) + state1 = _as_list(state1) + state2 = _as_list(state2) + for i in range(len(out1)): + assert_almost_equal(out1[i].asnumpy(), out2[i].asnumpy(), rtol=0.001, atol=0.0001) + for i in range(len(state1)): + assert_almost_equal(state1[i].asnumpy(), state2[i].asnumpy(), rtol=0.001, atol=0.0001) + + def step4(state, state2): + states = _as_list(state) + states.append(state2) + return state, states + def step5(state, state2): + states = _as_list(state) + states.append(state2) + if isinstance(state, list): + return state, states + else: + return [state], states + def step6(state, state2): + states = _as_list(state) + states.append(state2) + return [], states + + steps = [step4, step5, step6] + for step in steps: + layer1 = TestLayer1(step, False, True) + layer1.initialize(ctx=default_context()) + layer2 = TestLayer1(step, False, True) + layer2.initialize(ctx=default_context()) + layer2.hybridize() + out1, state1 = layer1(state) + out2, state2 = layer2(state) + assert type(out1) == type(out2) + assert type(state1) == type(state2) + out1 = _as_list(out1) + out2 = _as_list(out2) + state1 = _as_list(state1) + state2 = _as_list(state2) + for i in range(len(out1)): + assert_almost_equal(out1[i].asnumpy(), out2[i].asnumpy(), rtol=0.001, atol=0.0001) + for i in range(len(state1)): + if not isinstance(state1[i], list): + assert_almost_equal(state1[i].asnumpy(), state2[i].asnumpy(), + rtol=0.001, atol=0.0001) + + +@mx.util.use_np +def test_output_format_cond(): + class TestLayer1(gluon.HybridBlock): + def __init__(self, func): + super(TestLayer1, self).__init__() + self.func = func + def forward(self, data): + def then_func(data): + return self.func(data) + def else_func(data): + return self.func(data) + return mx.npx.cond(lambda data: mx.npx.slice(data, begin=0, end=1), + then_func, else_func, data) + + def func1(data): + return data + def func2(data): + return [data] + def func3(data): + return [data, data] + + funcs = [func1, func2, func3] + data = mx.np.random.normal(loc=0, scale=1, size=(2)) + for func in funcs: + layer1 = TestLayer1(func) + layer1.initialize(ctx=default_context()) + layer2 = TestLayer1(func) + layer2.initialize(ctx=default_context()) + layer2.hybridize() + out1 = layer1(data) + out2 = layer2(data) + func_out = func(data) + assert type(out1) == type(func_out) + assert type(out2) == type(func_out) + out1 = _as_list(out1) + out2 = _as_list(out2) + for i in range(len(out1)): + assert_almost_equal(out1[i].asnumpy(), out2[i].asnumpy(), rtol=0.001, atol=0.0001) + + +@mx.util.use_np +def test_scope(): + class TestBlock1(gluon.HybridBlock): + def __init__(self): + super(TestBlock1, self).__init__() + + def
forward(self, data): + (new_data, ) = mx.npx.cond( + pred=lambda data: data > 0.5, + then_func=lambda data: data * 2, + else_func=lambda data: data * 3, + inputs=data, + name="my_cond", + ) + return new_data + + class TestBlock2(gluon.HybridBlock): + def __init__(self): + super(TestBlock2, self).__init__() + + def forward(self, data): + (new_data, ) = mx.npx.cond( + pred=lambda data: data > 0.5, + then_func=lambda data: data * 2, + else_func=lambda data: data * 3, + inputs=data, + name="my_cond", + ) + return new_data + + AttrScope._subgraph_names = defaultdict(int) + data = mx.np.random.normal(loc=0, scale=1, size=(1, )) + with AttrScope(__subgraph_name__="my_cond"): + block1 = TestBlock1() + block1.initialize(ctx=default_context()) + block1.hybridize() + _ = block1(data) + block2 = TestBlock2() + block2.initialize(ctx=default_context()) + block2.hybridize() + _ = block2(data) + assert len(AttrScope._subgraph_names) == 3 + assert AttrScope._subgraph_names['my_cond$my_cond_else'] == 2 + assert AttrScope._subgraph_names['my_cond$my_cond_pred'] == 2 + assert AttrScope._subgraph_names['my_cond$my_cond_then'] == 2 + + +class RNNLayer(gluon.HybridBlock): + def __init__(self, cell_type, hidden_size): + super(RNNLayer, self).__init__() + self.cell = cell_type(hidden_size) + + def forward(self, inputs, states): + out, states = mx.npx.foreach(self.cell, inputs, states) + return out + + def infer_shape(self, input, *args): + self.cell.infer_shape(0, input, False) + +@mx.util.use_np +def check_rnn(cell_type, num_states): + batch_size = 10 + hidden_size = 100 + rnn_data = mx.np.random.normal(loc=0, scale=1, size=(5, batch_size, 50)) + state_shape = (batch_size, hidden_size) + states = [mx.np.random.normal(loc=0, scale=1, size=state_shape) for i in range(num_states)] + layer = RNNLayer(cell_type, hidden_size) + layer.infer_shape(rnn_data) + layer.initialize(ctx=default_context()) + res1 = layer(rnn_data, states) + params1 = layer.collect_params() + orig_params1 = copy.deepcopy(params1) + + trainer = gluon.Trainer(params1, 'sgd', {'learning_rate' : 0.03}) + with mx.autograd.record(): + res1 = layer(rnn_data, states) + res1.backward() + trainer.step(batch_size) + + configs = [ + {}, + {'inline_limit': 0}, + {'static_alloc': True}, + {'static_alloc': True, 'static_shape': True} ] + for config in configs: + layer = RNNLayer(cell_type, hidden_size) + layer.infer_shape(rnn_data) + layer.initialize(ctx=default_context()) + layer.hybridize(**config) + res2 = layer(rnn_data, states) + params2 = layer.collect_params() + for key, val in orig_params1.items(): + params2[key].set_data(copy.deepcopy(val.data())) + trainer = gluon.Trainer(params2, 'sgd', {'learning_rate' : 0.03}) + with mx.autograd.record(): + res2 = layer(rnn_data, states) + assert_almost_equal(res1.asnumpy(), res2.asnumpy(), rtol=1e-3, atol=1e-3) + res2.backward() + trainer.step(batch_size) + + for key, val in params1.items(): + weight1 = val.data() + weight2 = params2[key].data() + assert_almost_equal(weight1.asnumpy(), weight2.asnumpy(), + rtol=1e-3, atol=1e-3) + + +def test_rnn(): + cell_types = [(gluon.rnn.RNNCell, 1), (gluon.rnn.LSTMCell, 2), + (gluon.rnn.GRUCell, 1)] + for cell_type, num_states in cell_types: + check_rnn(cell_type, num_states) + diff --git a/tests/python/unittest/test_gluon_data.py b/tests/python/unittest/test_gluon_data.py index 59c67e2b44f7..fe0cf7dfe665 100644 --- a/tests/python/unittest/test_gluon_data.py +++ b/tests/python/unittest/test_gluon_data.py @@ -72,18 +72,20 @@ def test_recordimage_dataset(prepare_record): assert 
x.shape[0] == 1 and x.shape[3] == 3 assert y.asscalar() == i +@mx.util.use_np def test_recordimage_dataset_handle(prepare_record): recfile = prepare_record class TmpTransform(mx.gluon.HybridBlock): - def hybrid_forward(self, F, x): + def forward(self, x): return x + fn = TmpTransform() dataset = gluon.data.vision.ImageRecordDataset(recfile).transform_first(fn).__mx_handle__() loader = gluon.data.DataLoader(dataset, 1) for i, (x, y) in enumerate(loader): assert x.shape[0] == 1 and x.shape[3] == 3 - assert y.asscalar() == i + assert y.item() == i def _dataset_transform_fn(x, y): """Named transform function since lambda function cannot be pickled.""" @@ -539,6 +541,7 @@ def test_mx_data_loader(): for _ in dl: pass +@mx.util.use_np def test_mx_data_loader_nopython(): from mxnet.gluon.data.dataloader import DataLoader from mxnet.gluon.data.vision.transforms import ToTensor diff --git a/tests/python/unittest/test_gluon_data_vision.py b/tests/python/unittest/test_gluon_data_vision.py deleted file mode 100644 index a7c9e163db4b..000000000000 --- a/tests/python/unittest/test_gluon_data_vision.py +++ /dev/null @@ -1,433 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-from collections import namedtuple - -import mxnet as mx -import mxnet.ndarray as nd -from mxnet.base import MXNetError -from mxnet import gluon -from mxnet.base import MXNetError -from mxnet.gluon.data.vision import transforms -from mxnet import image -from mxnet.test_utils import * -from common import assertRaises, xfail_when_nonstandard_decimal_separator - -import numpy as np -import pytest - -def test_to_tensor(): - # 3D Input - data_in = np.random.uniform(0, 255, (300, 300, 3)).astype(dtype=np.uint8) - out_nd = transforms.ToTensor()(nd.array(data_in, dtype='uint8')) - assert_almost_equal(out_nd.asnumpy(), np.transpose( - data_in.astype(dtype=np.float32) / 255.0, (2, 0, 1))) - - # 4D Input - data_in = np.random.uniform(0, 255, (5, 300, 300, 3)).astype(dtype=np.uint8) - out_nd = transforms.ToTensor()(nd.array(data_in, dtype='uint8')) - assert_almost_equal(out_nd.asnumpy(), np.transpose( - data_in.astype(dtype=np.float32) / 255.0, (0, 3, 1, 2))) - - # Invalid Input - invalid_data_in = nd.random.uniform(0, 255, (5, 5, 300, 300, 3)).astype(dtype=np.uint8) - transformer = transforms.ToTensor() - assertRaises(MXNetError, transformer, invalid_data_in) - - # Bounds (0->0, 255->1) - data_in = np.zeros((10, 20, 3)).astype(dtype=np.uint8) - out_nd = transforms.ToTensor()(nd.array(data_in, dtype='uint8')) - assert same(out_nd.asnumpy(), np.transpose(np.zeros(data_in.shape, dtype=np.float32), (2, 0, 1))) - - data_in = np.full((10, 20, 3), 255).astype(dtype=np.uint8) - out_nd = transforms.ToTensor()(nd.array(data_in, dtype='uint8')) - assert same(out_nd.asnumpy(), np.transpose(np.ones(data_in.shape, dtype=np.float32), (2, 0, 1))) - - -def test_normalize(): - # 3D Input - data_in_3d = nd.random.uniform(0, 1, (3, 300, 300)) - out_nd_3d = transforms.Normalize(mean=(0, 1, 2), std=(3, 2, 1))(data_in_3d) - data_expected_3d = data_in_3d.asnumpy() - data_expected_3d[:][:][0] = data_expected_3d[:][:][0] / 3.0 - data_expected_3d[:][:][1] = (data_expected_3d[:][:][1] - 1.0) / 2.0 - data_expected_3d[:][:][2] = data_expected_3d[:][:][2] - 2.0 - assert_almost_equal(data_expected_3d, out_nd_3d.asnumpy()) - - # 4D Input - data_in_4d = nd.random.uniform(0, 1, (2, 3, 300, 300)) - out_nd_4d = transforms.Normalize(mean=(0, 1, 2), std=(3, 2, 1))(data_in_4d) - data_expected_4d = data_in_4d.asnumpy() - data_expected_4d[0][:][:][0] = data_expected_4d[0][:][:][0] / 3.0 - data_expected_4d[0][:][:][1] = (data_expected_4d[0][:][:][1] - 1.0) / 2.0 - data_expected_4d[0][:][:][2] = data_expected_4d[0][:][:][2] - 2.0 - data_expected_4d[1][:][:][0] = data_expected_4d[1][:][:][0] / 3.0 - data_expected_4d[1][:][:][1] = (data_expected_4d[1][:][:][1] - 1.0) / 2.0 - data_expected_4d[1][:][:][2] = data_expected_4d[1][:][:][2] - 2.0 - assert_almost_equal(data_expected_4d, out_nd_4d.asnumpy()) - - # Invalid Input - Neither 3D or 4D input - invalid_data_in = nd.random.uniform(0, 1, (5, 5, 3, 300, 300)) - normalize_transformer = transforms.Normalize(mean=(0, 1, 2), std=(3, 2, 1)) - assertRaises(MXNetError, normalize_transformer, invalid_data_in) - - # Invalid Input - Channel neither 1 or 3 - invalid_data_in = nd.random.uniform(0, 1, (5, 4, 300, 300)) - normalize_transformer = transforms.Normalize(mean=(0, 1, 2), std=(3, 2, 1)) - assertRaises(MXNetError, normalize_transformer, invalid_data_in) - - -def test_resize(): - def _test_resize_with_diff_type(dtype): - # test normal case - data_in = nd.random.uniform(0, 255, (300, 200, 3)).astype(dtype) - out_nd = transforms.Resize(200)(data_in) - data_expected = mx.image.imresize(data_in, 200, 200, 
1) - assert_almost_equal(out_nd.asnumpy(), data_expected.asnumpy()) - # test 4D input - data_bath_in = nd.random.uniform(0, 255, (3, 300, 200, 3)).astype(dtype) - out_batch_nd = transforms.Resize(200)(data_bath_in) - for i in range(len(out_batch_nd)): - assert_almost_equal(mx.image.imresize(data_bath_in[i], 200, 200, 1).asnumpy(), - out_batch_nd[i].asnumpy()) - # test interp = 2 - out_nd = transforms.Resize(200, interpolation=2)(data_in) - data_expected = mx.image.imresize(data_in, 200, 200, 2) - assert_almost_equal(out_nd.asnumpy(), data_expected.asnumpy()) - # test height not equals to width - out_nd = transforms.Resize((200, 100))(data_in) - data_expected = mx.image.imresize(data_in, 200, 100, 1) - assert_almost_equal(out_nd.asnumpy(), data_expected.asnumpy()) - # test keep_ratio - out_nd = transforms.Resize(150, keep_ratio=True)(data_in) - data_expected = mx.image.imresize(data_in, 150, 225, 1) - assert_almost_equal(out_nd.asnumpy(), data_expected.asnumpy()) - # test size below zero - invalid_transform = transforms.Resize(-150, keep_ratio=True) - assertRaises(MXNetError, invalid_transform, data_in) - # test size more than 2: - invalid_transform = transforms.Resize((100, 100, 100), keep_ratio=True) - assertRaises(MXNetError, invalid_transform, data_in) - - for dtype in ['uint8', 'float32', 'float64']: - _test_resize_with_diff_type(dtype) - - -def test_crop_resize(): - def _test_crop_resize_with_diff_type(dtype): - # test normal case - data_in = nd.arange(60).reshape((5, 4, 3)).astype(dtype) - out_nd = transforms.CropResize(0, 0, 3, 2)(data_in) - out_np = out_nd.asnumpy() - assert(out_np.sum() == 180) - assert((out_np[0:2,1,1].flatten() == [4, 16]).all()) - # test 4D input - data_bath_in = nd.arange(180).reshape((2, 6, 5, 3)).astype(dtype) - out_batch_nd = transforms.CropResize(1, 2, 3, 4)(data_bath_in) - out_batch_np = out_batch_nd.asnumpy() - assert(out_batch_np.sum() == 7524) - assert((out_batch_np[0:2,0:4,1,1].flatten() == [37, 52, 67, 82, 127, 142, 157, 172]).all()) - # test normal case with resize - data_in = nd.random.uniform(0, 255, (300, 200, 3)).astype(dtype) - out_nd = transforms.CropResize(0, 0, 100, 50, (25, 25), 1)(data_in) - data_expected = transforms.Resize(size=25, interpolation=1)(nd.slice(data_in, (0, 0, 0), (50, 100, 3))) - assert_almost_equal(out_nd.asnumpy(), data_expected.asnumpy()) - # test 4D input with resize - data_bath_in = nd.random.uniform(0, 255, (3, 300, 200, 3)).astype(dtype) - out_batch_nd = transforms.CropResize(0, 0, 100, 50, (25, 25), 1)(data_bath_in) - for i in range(len(out_batch_nd)): - actual = transforms.Resize(size=25, interpolation=1)(nd.slice(data_bath_in[i], (0, 0, 0), (50, 100, 3))).asnumpy() - expected = out_batch_nd[i].asnumpy() - assert_almost_equal(expected, actual) - # test with resize height and width should be greater than 0 - transformer = transforms.CropResize(0, 0, 100, 50, (-25, 25), 1) - assertRaises(MXNetError, transformer, data_in) - # test height and width should be greater than 0 - transformer = transforms.CropResize(0, 0, -100, -50) - assertRaises(MXNetError, transformer, data_in) - # test cropped area is bigger than input data - transformer = transforms.CropResize(150, 200, 200, 500) - assertRaises(MXNetError, transformer, data_in) - assertRaises(MXNetError, transformer, data_bath_in) - - for dtype in ['uint8', 'float32', 'float64']: - _test_crop_resize_with_diff_type(dtype) - - # test nd.image.crop backward - def test_crop_backward(test_nd_arr, TestCase): - a_np = test_nd_arr.asnumpy() - b_np = a_np[(slice(TestCase.y, 
TestCase.y + TestCase.height), slice(TestCase.x, TestCase.x + TestCase.width), slice(0, 3))] - - data = mx.sym.Variable('data') - crop_sym = mx.sym.image.crop(data, TestCase.x, TestCase.y, TestCase.width, TestCase.height) - - expected_in_grad = np.zeros_like(a_np) - expected_in_grad[(slice(TestCase.y, TestCase.y + TestCase.height), slice(TestCase.x, TestCase.x + TestCase.width), slice(0, 3))] = b_np - check_symbolic_backward(crop_sym, [a_np], [b_np], [expected_in_grad]) - - TestCase = namedtuple('TestCase', ['x', 'y', 'width', 'height']) - test_list = [TestCase(0, 0, 3, 3), TestCase(2, 1, 1, 2), TestCase(0, 1, 3, 2)] - - for dtype in ['uint8', 'float32', 'float64']: - data_in = nd.arange(60).reshape((5, 4, 3)).astype(dtype) - for test_case in test_list: - test_crop_backward(data_in, test_case) - - -def test_flip_left_right(): - for width in range(3, 301, 7): - data_in = np.random.uniform(0, 255, (300, width, 3)).astype(dtype=np.uint8) - flip_in = data_in[:, ::-1, :] - data_trans = nd.image.flip_left_right(nd.array(data_in, dtype='uint8')) - assert_almost_equal(flip_in, data_trans.asnumpy()) - - -def test_flip_top_bottom(): - for height in range(3, 301, 7): - data_in = np.random.uniform(0, 255, (height, 300, 3)).astype(dtype=np.uint8) - flip_in = data_in[::-1, :, :] - data_trans = nd.image.flip_top_bottom(nd.array(data_in, dtype='uint8')) - assert_almost_equal(flip_in, data_trans.asnumpy()) - - -def test_transformer(): - from mxnet.gluon.data.vision import transforms - - transform = transforms.Compose([ - transforms.Resize(300), - transforms.Resize(300, keep_ratio=True), - transforms.CenterCrop(256), - transforms.RandomCrop(256, pad=16), - transforms.RandomResizedCrop(224), - transforms.RandomFlipLeftRight(), - transforms.RandomColorJitter(0.1, 0.1, 0.1, 0.1), - transforms.RandomBrightness(0.1), - transforms.RandomContrast(0.1), - transforms.RandomSaturation(0.1), - transforms.RandomHue(0.1), - transforms.RandomLighting(0.1), - transforms.ToTensor(), - transforms.RandomRotation([-10., 10.]), - transforms.Normalize([0, 0, 0], [1, 1, 1])]) - - transform(mx.nd.ones((245, 480, 3), dtype='uint8')).wait_to_read() - -def test_random_crop(): - x = mx.nd.ones((245, 480, 3), dtype='uint8') - y = mx.nd.image.random_crop(x, width=100, height=100) - assert y.shape == (100, 100, 3) - -def test_random_resize_crop(): - x = mx.nd.ones((245, 480, 3), dtype='uint8') - y = mx.nd.image.random_resized_crop(x, width=100, height=100) - assert y.shape == (100, 100, 3) - -def test_hybrid_transformer(): - from mxnet.gluon.data.vision import transforms - - transform = transforms.HybridCompose([ - transforms.Resize(300), - transforms.Resize(300, keep_ratio=True), - transforms.CenterCrop(256), - transforms.RandomCrop(256, pad=16), - transforms.RandomFlipLeftRight(), - transforms.RandomColorJitter(0.1, 0.1, 0.1, 0.1), - transforms.RandomBrightness(0.1), - transforms.RandomContrast(0.1), - transforms.RandomSaturation(0.1), - transforms.RandomHue(0.1), - transforms.RandomLighting(0.1), - transforms.ToTensor(), - transforms.Normalize([0, 0, 0], [1, 1, 1])]) - - transform(mx.nd.ones((245, 480, 3), dtype='uint8')).wait_to_read() - -def test_rotate(): - transformer = transforms.Rotate(10.) 
- assertRaises(TypeError, transformer, mx.nd.ones((3, 30, 60), dtype='uint8')) - single_image = mx.nd.ones((3, 30, 60), dtype='float32') - single_output = transformer(single_image) - assert same(single_output.shape, (3, 30, 60)) - batch_image = mx.nd.ones((3, 3, 30, 60), dtype='float32') - batch_output = transformer(batch_image) - assert same(batch_output.shape, (3, 3, 30, 60)) - - input_image = nd.array([[[0., 0., 0.], - [0., 0., 1.], - [0., 0., 0.]]]) - rotation_angles_expected_outs = [ - (90., nd.array([[[0., 1., 0.], - [0., 0., 0.], - [0., 0., 0.]]])), - (180., nd.array([[[0., 0., 0.], - [1., 0., 0.], - [0., 0., 0.]]])), - (270., nd.array([[[0., 0., 0.], - [0., 0., 0.], - [0., 1., 0.]]])), - (360., nd.array([[[0., 0., 0.], - [0., 0., 1.], - [0., 0., 0.]]])), - ] - for rot_angle, expected_result in rotation_angles_expected_outs: - transformer = transforms.Rotate(rot_angle) - ans = transformer(input_image) - print(ans, expected_result) - assert_almost_equal(ans, expected_result, atol=1e-6) - - -def test_random_rotation(): - # test exceptions for probability input outside of [0,1] - assertRaises(ValueError, transforms.RandomRotation, [-10, 10.], rotate_with_proba=1.1) - assertRaises(ValueError, transforms.RandomRotation, [-10, 10.], rotate_with_proba=-0.3) - # test `forward` - transformer = transforms.RandomRotation([-10, 10.]) - assertRaises(TypeError, transformer, mx.nd.ones((3, 30, 60), dtype='uint8')) - single_image = mx.nd.ones((3, 30, 60), dtype='float32') - single_output = transformer(single_image) - assert same(single_output.shape, (3, 30, 60)) - batch_image = mx.nd.ones((3, 3, 30, 60), dtype='float32') - batch_output = transformer(batch_image) - assert same(batch_output.shape, (3, 3, 30, 60)) - # test identity (rotate_with_proba = 0) - transformer = transforms.RandomRotation([-100., 100.], rotate_with_proba=0.0) - data = mx.nd.random_normal(shape=(3, 30, 60)) - assert_almost_equal(data, transformer(data)) - - -@xfail_when_nonstandard_decimal_separator -def test_rotate(): - transformer = transforms.Rotate(10.) 
-    assertRaises(TypeError, transformer, mx.nd.ones((3, 30, 60), dtype='uint8'))
-    single_image = mx.nd.ones((3, 30, 60), dtype='float32')
-    single_output = transformer(single_image)
-    assert same(single_output.shape, (3, 30, 60))
-    batch_image = mx.nd.ones((3, 3, 30, 60), dtype='float32')
-    batch_output = transformer(batch_image)
-    assert same(batch_output.shape, (3, 3, 30, 60))
-
-    input_image = nd.array([[[0., 0., 0.],
-                             [0., 0., 1.],
-                             [0., 0., 0.]]])
-    rotation_angles_expected_outs = [
-        (90., nd.array([[[0., 1., 0.],
-                         [0., 0., 0.],
-                         [0., 0., 0.]]])),
-        (180., nd.array([[[0., 0., 0.],
-                          [1., 0., 0.],
-                          [0., 0., 0.]]])),
-        (270., nd.array([[[0., 0., 0.],
-                          [0., 0., 0.],
-                          [0., 1., 0.]]])),
-        (360., nd.array([[[0., 0., 0.],
-                          [0., 0., 1.],
-                          [0., 0., 0.]]])),
-    ]
-    for rot_angle, expected_result in rotation_angles_expected_outs:
-        transformer = transforms.Rotate(rot_angle)
-        ans = transformer(input_image)
-        print(ans, expected_result)
-        assert_almost_equal(ans, expected_result, atol=1e-6)
-
-
-def test_random_rotation():
-    # test exceptions for probability input outside of [0,1]
-    assertRaises(ValueError, transforms.RandomRotation, [-10, 10.], rotate_with_proba=1.1)
-    assertRaises(ValueError, transforms.RandomRotation, [-10, 10.], rotate_with_proba=-0.3)
-    # test `forward`
-    transformer = transforms.RandomRotation([-10, 10.])
-    assertRaises(TypeError, transformer, mx.nd.ones((3, 30, 60), dtype='uint8'))
-    single_image = mx.nd.ones((3, 30, 60), dtype='float32')
-    single_output = transformer(single_image)
-    assert same(single_output.shape, (3, 30, 60))
-    batch_image = mx.nd.ones((3, 3, 30, 60), dtype='float32')
-    batch_output = transformer(batch_image)
-    assert same(batch_output.shape, (3, 3, 30, 60))
-    # test identity (rotate_with_proba = 0)
-    transformer = transforms.RandomRotation([-100., 100.], rotate_with_proba=0.0)
-    data = mx.nd.random_normal(shape=(3, 30, 60))
-    assert_almost_equal(data, transformer(data))
-
-
-def test_random_transforms():
-    from mxnet.gluon.data.vision import transforms
-
-    counter = 0
-    def transform_fn(x):
-        nonlocal counter
-        counter += 1
-        return x
-    transform = transforms.Compose([transforms.RandomApply(transform_fn, 0.5)])
-
-    img = mx.np.ones((10, 10, 3), dtype='uint8')
-    iteration = 10000
-    num_apply = 0
-    for _ in range(iteration):
-        out = transform(img)
-    assert counter == pytest.approx(5000, 1e-1)
-
-@xfail_when_nonstandard_decimal_separator
-@pytest.mark.flaky
-def test_random_gray():
-    from mxnet.gluon.data.vision import transforms
-
-    transform = transforms.RandomGray(0.5)
-    img = mx.nd.ones((4, 4, 3), dtype='uint8')
-    pixel = img[0, 0, 0].asnumpy()
-    iteration = 1000
-    num_apply = 0
-    for _ in range(iteration):
-        out = transform(img)
-        if out[0][0][0].asnumpy() != pixel:
-            num_apply += 1
-    assert_almost_equal(num_apply/float(iteration), 0.5, 0.1)
-
-    transform = transforms.RandomGray(0.5)
-    transform.hybridize()
-    img = mx.nd.ones((4, 4, 3), dtype='uint8')
-    pixel = img[0, 0, 0].asnumpy()
-    iteration = 1000
-    num_apply = 0
-    for _ in range(iteration):
-        out = transform(img)
-        if out[0][0][0].asnumpy() != pixel:
-            num_apply += 1
-    assert_almost_equal(num_apply/float(iteration), 0.5, 0.1)
-
-def test_bbox_random_flip():
-    from mxnet.gluon.contrib.data.vision.transforms.bbox import ImageBboxRandomFlipLeftRight
-
-    transform = ImageBboxRandomFlipLeftRight(0.5)
-    iteration = 200
-    num_apply = 0
-    for _ in range(iteration):
-        img = mx.nd.ones((10, 10, 3), dtype='uint8')
-        img[0, 0, 0] = 10
-        bbox = mx.nd.array([[1, 2, 3, 4, 0]])
-        im_out, im_bbox = transform(img, bbox)
-        if im_bbox[0][0].asnumpy() != 1 and im_out[0, 0, 0].asnumpy() != 10:
-            num_apply += 1
-    assert_almost_equal(np.array([num_apply])/float(iteration), 0.5, 0.5)
-
-def test_bbox_crop():
-    from mxnet.gluon.contrib.data.vision.transforms.bbox import ImageBboxCrop
-
-    transform = ImageBboxCrop((0, 0, 3, 3))
-    img = mx.nd.ones((10, 10, 3), dtype='uint8')
-    bbox = mx.nd.array([[0, 1, 3, 4, 0]])
-    im_out, im_bbox = transform(img, bbox)
-    assert im_out.shape == (3, 3, 3)
-    assert im_bbox[0][2] == 3
diff --git a/tests/python/unittest/test_gluon_estimator.py b/tests/python/unittest/test_gluon_estimator.py
index 8c12b5d2a13b..713a242687f6 100644
--- a/tests/python/unittest/test_gluon_estimator.py
+++ b/tests/python/unittest/test_gluon_estimator.py
@@ -28,6 +28,7 @@
 from mxnet.gluon.contrib.estimator import *
 from mxnet.gluon.contrib.estimator.event_handler import *

+mx.npx.reset_np()

 def _get_test_network(params=None):
     net = nn.Sequential()
@@ -37,8 +38,8 @@ def _get_test_network(params=None):

 def _get_test_data():
     batch_size = 4
-    in_data = mx.nd.random.uniform(shape=(10, 3))
-    out_data = mx.nd.random.uniform(shape=(10, 4))
+    in_data = mx.np.random.uniform(size=(10, 3))
+    out_data = mx.np.random.uniform(size=(10, 4))
     # Input dataloader
     dataset = gluon.data.dataset.ArrayDataset(in_data, out_data)
     dataloader = gluon.data.DataLoader(dataset, batch_size=batch_size)
@@ -46,6 +47,7 @@ def _get_test_data():
     return dataloader, dataiter


+@mx.util.use_np
 def test_fit():
     ''' test estimator with different train data types '''
     net = _get_test_network()
@@ -71,10 +73,11 @@ def test_fit():
     # Input NDArray
     with pytest.raises(ValueError):
-        est.fit(train_data=[mx.nd.ones(shape=(10, 3))],
+        est.fit(train_data=[mx.np.ones(shape=(10, 3))],
                 epochs=num_epochs)


+@mx.util.use_np
 def test_validation():
     ''' test different validation data types'''
     net = _get_test_network()
@@ -108,11 +111,12 @@ def test_validation():
                 epochs=num_epochs)
     # Input NDArray
     with pytest.raises(ValueError):
-        est.fit(train_data=[mx.nd.ones(shape=(10, 3))],
-                val_data=[mx.nd.ones(shape=(10, 3))],
+        est.fit(train_data=[mx.np.ones(shape=(10, 3))],
+                val_data=[mx.np.ones(shape=(10, 3))],
                 epochs=num_epochs)


+@mx.util.use_np
 def test_initializer():
     ''' test with no initializer, inconsistent initializer '''
     net = _get_test_network()
@@ -146,15 +150,16 @@ def test_initializer():
     # net partially initialized, fine tuning use case
     net = gluon.model_zoo.vision.resnet18_v1(pretrained=False, ctx=ctx)
     net.features.initialize(ctx=ctx)
-    net.features(mx.nd.zeros((1, 3, 224, 224)))
+    net.features(mx.np.zeros((1, 3, 224, 224)))
     net.output = gluon.nn.Dense(10) #last layer not initialized
     est = Estimator(net, loss=loss, train_metrics=acc, context=ctx)
-    dataset = gluon.data.ArrayDataset(mx.nd.zeros((10, 3, 224, 224)), mx.nd.zeros((10, 10)))
+    dataset = gluon.data.ArrayDataset(mx.np.zeros((10, 3, 224, 224)), mx.np.zeros((10, 10)))
     train_data = gluon.data.DataLoader(dataset=dataset, batch_size=5)
     est.fit(train_data=train_data, epochs=num_epochs)


+@mx.util.use_np
 def test_trainer():
     ''' test with no trainer and invalid trainer '''
     net = _get_test_network()
@@ -185,6 +190,7 @@ def test_trainer():
             context=ctx)


+@mx.util.use_np
 def test_metric():
     ''' test with no metric, list of metrics, invalid metric '''
     net = _get_test_network()
@@ -227,6 +233,7 @@ def test_metric():
     assert isinstance(est.train_metrics[0], mx.gluon.metric.Accuracy)


+@mx.util.use_np
 def test_loss():
     ''' test with invalid loss '''
     net = _get_test_network()
@@ -243,6 +250,7 @@ def test_loss():
             context=ctx)


+@mx.util.use_np
 def test_context():
     ''' test with no context, list of context, invalid context '''
     net = _get_test_network()
@@ -274,6 +282,7 @@ def test_context():
                         context=[mx.gpu(0), mx.gpu(100)])


+@mx.util.use_np
 def test_categorize_handlers():

     class CustomHandler1(TrainBegin):
@@ -319,6 +328,7 @@ def train_end(self):
     assert len(train_end) == 2


+@mx.util.use_np
 def test_default_handlers():
     net = _get_test_network()
     train_data, _ = _get_test_data()
@@ -370,6 +380,7 @@ def test_default_handlers():
     assert isinstance(handlers[1], MetricHandler)
     assert isinstance(handlers[4], LoggingHandler)

+@mx.util.use_np
 def test_val_net():
     ''' test estimator with different training and validation networks '''
     net = _get_test_network()
@@ -399,7 +410,7 @@ def test_val_net():
     net.output = gluon.nn.Dense(10)
     val_net = gluon.model_zoo.vision.resnet18_v1(pretrained=False, ctx=ctx)
     val_net.output = net.output
-    dataset = gluon.data.ArrayDataset(mx.nd.zeros((10, 3, 224, 224)), mx.nd.zeros((10, 10)))
+    dataset = gluon.data.ArrayDataset(mx.np.zeros((10, 3, 224, 224)), mx.np.zeros((10, 10)))
     dataloader = gluon.data.DataLoader(dataset=dataset, batch_size=5)
     net.initialize(ctx=ctx)
     val_net.initialize(ctx=ctx)
@@ -416,6 +427,7 @@ def test_val_net():
             val_data=dataloader, epochs=num_epochs)


+@mx.util.use_np
 def test_val_handlers():
     net = _get_test_network()
     train_data, _ = _get_test_data()
diff --git a/tests/python/unittest/test_gluon_event_handler.py b/tests/python/unittest/test_gluon_event_handler.py
index 4cadc9466ed1..4f0f0e2f7295 100644
--- a/tests/python/unittest/test_gluon_event_handler.py
+++ b/tests/python/unittest/test_gluon_event_handler.py
@@ -22,7 +22,7 @@
 import mxnet as mx
 from common import TemporaryDirectory
-from mxnet import nd
+from mxnet import np
 from mxnet.gluon import nn, loss
 from mxnet.gluon.contrib.estimator import estimator, event_handler
 from mxnet.gluon.contrib.estimator.event_handler import LoggingHandler
@@ -34,6 +34,8 @@
 except ImportError:
     from io import StringIO

+mx.npx.reset_np()
+
 class AxisArrayDataset(Dataset):
     def __init__(self, * args):
         self._length = len(args[1])
@@ -65,17 +67,18 @@ def _get_test_network(net=nn.Sequential()):

 def _get_test_data(in_size=32):
-    data = nd.ones((in_size, 100))
-    label = nd.zeros((in_size, 1))
+    data = np.ones((in_size, 100))
+    label = np.zeros((in_size, 1))
     data_arr = mx.gluon.data.dataset.ArrayDataset(data, label)
     return mx.gluon.data.DataLoader(data_arr, batch_size=8)

 def _get_batch_axis_test_data(in_size=32):
-    data = nd.ones((100, in_size))
-    label = nd.zeros((1, in_size))
+    data = np.ones((100, in_size))
+    label = np.zeros((1, in_size))
     data_arr = AxisArrayDataset(data, label)
     return mx.gluon.data.DataLoader(data_arr, batch_size=8)

+@mx.util.use_np
 def test_checkpoint_handler():
     with TemporaryDirectory() as tmpdir:
         model_prefix = 'test_epoch'
@@ -122,6 +125,7 @@ def test_checkpoint_handler():
         assert os.path.isfile(file_path + '-epoch2batch9.params')
         assert os.path.isfile(file_path + '-epoch2batch9.states')

+@mx.util.use_np
 def test_resume_checkpoint():
     with TemporaryDirectory() as tmpdir:
         model_prefix = 'test_net'
@@ -150,6 +154,7 @@ def test_resume_checkpoint():
         assert os.path.isfile(file_path + '-epoch4batch20.states')


+@mx.util.use_np
 def test_early_stopping():
     test_data = _get_test_data()
@@ -171,6 +176,7 @@ def test_early_stopping():
     assert early_stopping.current_epoch == 1


+@mx.util.use_np
 def test_logging():
     with TemporaryDirectory() as tmpdir:
         test_data = _get_test_data()
@@ -194,6 +200,7 @@ def test_logging():
         del est # Clean up estimator and logger before deleting tmpdir

+@mx.util.use_np
 def test_custom_handler():
     class CustomStopHandler(event_handler.TrainBegin,
                             event_handler.BatchEnd,
@@ -237,6 +244,7 @@ def epoch_end(self, estimator, *args, **kwargs):
     assert custom_handler.num_batch == 5 * 4
     assert custom_handler.num_epoch == 5

+@mx.util.use_np
 def test_logging_interval():
     ''' test different options for logging handler '''
     ''' test case #1: log interval is 1 '''
@@ -294,6 +302,7 @@ def test_logging_interval():
     assert(info_len == int(data_size/batch_size/log_interval) + 1)


+@mx.util.use_np
 def test_validation_handler_batch_axis():
     # test case #1: test batch_axis=0
     test_data = _get_test_data()
@@ -310,6 +319,7 @@ def test_validation_handler_batch_axis():
     est.fit(test_data, val_data=val_data, epochs=3, batch_axis=1)


+@mx.util.use_np
 def test_validation_handler():
     test_data = _get_test_data()
diff --git a/tests/python/unittest/test_gluon_indexing.py b/tests/python/unittest/test_gluon_indexing.py
index 6886331f03d9..0aa54d5bb76c 100644
--- a/tests/python/unittest/test_gluon_indexing.py
+++ b/tests/python/unittest/test_gluon_indexing.py
@@ -20,43 +20,42 @@
 import mxnet as mx
 from mxnet.gluon import HybridBlock

+@mx.util.use_np
 def test_getitem_hybridized():
     class picking_np(HybridBlock):
         def __init__(self, **kwargs):
             super(picking_np, self).__init__(**kwargs)
-        def hybrid_forward(self, F, sequence, pick_ids):
+
+        def forward(self, sequence, pick_ids):
             """
             new implementation in deep numpy
             """
-            idx_arange = F.npx.arange_like(pick_ids.reshape((-1, )), axis=0)
-            batch_idx = F.np.floor(idx_arange / 2).astype(np.int32)
-
+            idx_arange = mx.npx.arange_like(pick_ids.reshape((-1, )), axis=0)
+            batch_idx = mx.np.floor(idx_arange / 2).astype(np.int32)
             encoded = sequence[batch_idx, pick_ids.reshape((-1,))]
-            encoded = F.npx.reshape_like(encoded, pick_ids, lhs_begin=-2, lhs_end=-1, rhs_begin=0)
+            encoded = mx.npx.reshape_like(encoded, pick_ids, lhs_begin=-2, lhs_end=-1, rhs_begin=0)
             return encoded

-    sequence = mx.nd.array(np.random.normal(0, 1, (8, 32, 768)), dtype=np.float32)
+    sequence = mx.np.array(np.random.normal(0, 1, (8, 32, 768)), dtype=np.float32)
     # pick_ids: [batch_size, picked_index]
-    pick_ids = mx.nd.random.randint(0, 32, (8,2), dtype=np.int32)
+    pick_ids = mx.np.random.randint(0, 32, (8,2), dtype=np.int32)

-    mx.npx.set_np()
     picker_np = picking_np()
-    seq_np = sequence.as_np_ndarray()
-    np_output = picker_np(seq_np, pick_ids.as_np_ndarray())
+    seq_np = sequence
+    np_output = picker_np(seq_np, pick_ids)
     seq_np.attach_grad()
     with mx.autograd.record():
-        z = picker_np(seq_np, pick_ids.as_np_ndarray())
+        z = picker_np(seq_np, pick_ids)
         z.backward()

     picker_np.initialize()
     picker_np.hybridize()
-    nd_output_hybridized = picker_np(sequence.as_np_ndarray(), pick_ids.as_np_ndarray())
-    seq_np_hybridized = sequence.as_np_ndarray()
+    nd_output_hybridized = picker_np(sequence, pick_ids)
+    seq_np_hybridized = sequence
     seq_np_hybridized.attach_grad()
     with mx.autograd.record():
         z_hybridized = picker_np(seq_np_hybridized, pick_ids.as_np_ndarray())
         z_hybridized.backward()

-    mx.npx.reset_np()
     mx.test_utils.assert_almost_equal(nd_output_hybridized.asnumpy(), np_output.asnumpy())
     mx.test_utils.assert_almost_equal(seq_np.grad.asnumpy(), seq_np_hybridized.grad.asnumpy())
diff --git a/tests/python/unittest/test_gluon_model_zoo.py b/tests/python/unittest/test_gluon_model_zoo.py
index cc2623aee22d..a9181a865a78 100644
--- a/tests/python/unittest/test_gluon_model_zoo.py
+++ b/tests/python/unittest/test_gluon_model_zoo.py
@@ -22,6 +22,7 @@
 import multiprocessing
 import pytest

+mx.npx.reset_np()

 def eprint(*args, **kwargs):
     print(*args, file=sys.stderr, **kwargs)
@@ -48,7 +49,7 @@ def test_models(model_name):
     print(model)
     if not test_pretrain:
         model.initialize()
-    model(mx.nd.random.uniform(shape=data_shape)).wait_to_read()
+    model(mx.np.random.uniform(size=data_shape)).wait_to_read()

 def parallel_download(model_name):
     model = get_model(model_name, pretrained=True, root='./parallel_download')
diff --git a/tests/python/unittest/test_gluon_probability_v1.py b/tests/python/unittest/test_gluon_probability_v1.py
deleted file mode 100644
index 0ae72cfec400..000000000000
--- a/tests/python/unittest/test_gluon_probability_v1.py
+++ /dev/null
@@ -1,2401 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Test gluon.probability with HybridBlock.hybrid_forward api
-"""
-import mxnet as mx
-import numpy as _np
-from mxnet import np, npx, autograd
-from mxnet import gluon
-import mxnet.gluon.probability as mgp
-from mxnet.gluon.probability import StochasticBlock, StochasticSequential
-from mxnet.gluon import HybridBlock
-from mxnet.test_utils import use_np, assert_almost_equal
-
-from numpy.testing import assert_array_equal
-import pytest
-import scipy.stats as ss
-import scipy.special as scipy_special
-import itertools
-from numbers import Number
-
-
-def prob_to_logit(prob):
-    return np.log(prob) - np.log1p(-prob)
-
-
-def _distribution_method_invoker(dist, func, *args):
-    """Wrapper for invoking different types of class methods with one unified
-    interface.
- - Parameters - ---------- - dist : Distribution - func : method - """ - if (len(args) == 0): - out = getattr(dist, func) - if callable(out): - return out() - else: - return out - return getattr(dist, func)(*args) - - -def test_mgp_getF_v1(): - # Test getF - getF = mgp.utils.getF - nd = mx.nd - sym = mx.sym - assert getF(nd.ones((2, 2)), nd.ones((2, 2))) == nd - assert getF(sym.ones((2, 2)), sym.ones((2, 2))) == sym - assert getF(1.0, 2.0) == nd - - # Test exception - with pytest.raises(TypeError): - getF(nd.ones((2, 2)), sym.ones((2, 2))) - getF(sym.ones((2, 2)), nd.ones((2, 2))) - - -@use_np -def test_gluon_uniform_v1(): - class TestUniform(HybridBlock): - def __init__(self, func): - super(TestUniform, self).__init__() - self._func = func - - def hybrid_forward(self, F, low, high, *args): - uniform = mgp.Uniform(low, high, validate_args=True) - return _distribution_method_invoker(uniform, self._func, *args) - - shapes = [(), (1,), (2, 3), 6] - - # Test log_prob - for shape, hybridize in itertools.product(shapes, [True, False]): - low = np.random.uniform(-1, 1, shape) - high = low + np.random.uniform(0.5, 1.5, shape) - samples = np.random.uniform(low, high) - net = TestUniform("log_prob") - if hybridize: - net.hybridize() - for i in range(2): - mx_out = net(low, high, samples).asnumpy() - np_out = ss.uniform(low.asnumpy(), - (high - low).asnumpy()).logpdf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test cdf - for shape, hybridize in itertools.product(shapes, [True, False]): - low = np.random.uniform(-1, 1, shape) - high = low + np.random.uniform(0.5, 1.5, shape) - samples = np.random.uniform(low, high) - net = TestUniform("cdf") - if hybridize: - net.hybridize() - mx_out = net(low, high, samples).asnumpy() - np_out = ss.uniform(low.asnumpy(), - (high - low).asnumpy()).cdf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test icdf - for shape, hybridize in itertools.product(shapes, [True, False]): - low = np.random.uniform(-1, 1, shape) - high = low + np.random.uniform(0.5, 1.5, shape) - samples = np.random.uniform(size=shape) - net = TestUniform("icdf") - if hybridize: - net.hybridize() - mx_out = net(low, high, samples).asnumpy() - np_out = ss.uniform(low.asnumpy(), - (high - low).asnumpy()).ppf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test entropy - for shape, hybridize in itertools.product(shapes, [True, False]): - low = np.random.uniform(-1, 1, shape) - high = low + np.random.uniform(0.5, 1.5, shape) - net = TestUniform("entropy") - if hybridize: - net.hybridize() - mx_out = net(low, high).asnumpy() - np_out = ss.uniform(low.asnumpy(), - (high - low).asnumpy()).entropy() - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - -@use_np -def test_gluon_normal_v1(): - class TestNormal(HybridBlock): - def __init__(self, func): - super(TestNormal, self).__init__() - self._func = func - - def hybrid_forward(self, F, loc, scale, *args): - normal = mgp.Normal(loc, scale, validate_args=True) - return _distribution_method_invoker(normal, self._func, *args) - - shapes = [(), (1,), (2, 3), 6] - - # Test log_prob - for shape, hybridize in itertools.product(shapes, [True, False]): - loc = np.random.uniform(-1, 1, shape) - scale = np.random.uniform(0.5, 1.5, shape) - samples = np.random.normal(size=shape) - net = TestNormal("log_prob") - if hybridize: - net.hybridize() - mx_out = 
net(loc, scale, samples).asnumpy() - np_out = ss.norm(loc.asnumpy(), - scale.asnumpy()).logpdf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test cdf - for shape, hybridize in itertools.product(shapes, [True, False]): - loc = np.random.uniform(-1, 1, shape) - scale = np.random.uniform(0.5, 1.5, shape) - samples = np.random.normal(size=shape) - net = TestNormal("cdf") - if hybridize: - net.hybridize() - mx_out = net(loc, scale, samples).asnumpy() - np_out = ss.norm(loc.asnumpy(), - scale.asnumpy()).cdf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test icdf - for shape, hybridize in itertools.product(shapes, [True, False]): - loc = np.random.uniform(-1, 1, shape) - scale = np.random.uniform(0.5, 1.5, shape) - samples = np.random.uniform(size=shape) - net = TestNormal("icdf") - if hybridize: - net.hybridize() - mx_out = net(loc, scale, samples).asnumpy() - np_out = ss.norm(loc.asnumpy(), - scale.asnumpy()).ppf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test entropy - for shape, hybridize in itertools.product(shapes, [True, False]): - loc = np.random.uniform(-1, 1, shape) - scale = np.random.uniform(0.5, 1.5, shape) - net = TestNormal("entropy") - if hybridize: - net.hybridize() - mx_out = net(loc, scale).asnumpy() - np_out = ss.norm(loc.asnumpy(), - scale.asnumpy()).entropy() - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - -@use_np -def test_gluon_laplace_v1(): - class TestLaplace(HybridBlock): - def __init__(self, func): - super(TestLaplace, self).__init__() - self._func = func - - def hybrid_forward(self, F, loc, scale, *args): - laplace = mgp.Laplace(loc, scale, validate_args=True) - return _distribution_method_invoker(laplace, self._func, *args) - - shapes = [(), (1,), (2, 3), 6] - - # Test log_prob - for shape, hybridize in itertools.product(shapes, [True, False]): - loc = np.random.uniform(-1, 1, shape) - scale = np.random.uniform(0.5, 1.5, shape) - samples = np.random.laplace(size=shape) - net = TestLaplace("log_prob") - if hybridize: - net.hybridize() - mx_out = net(loc, scale, samples).asnumpy() - np_out = ss.laplace(loc.asnumpy(), - scale.asnumpy()).logpdf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test cdf - for shape, hybridize in itertools.product(shapes, [True, False]): - loc = np.random.uniform(-1, 1, shape) - scale = np.random.uniform(0.5, 1.5, shape) - samples = np.random.laplace(size=shape) - net = TestLaplace("cdf") - if hybridize: - net.hybridize() - mx_out = net(loc, scale, samples).asnumpy() - np_out = ss.laplace(loc.asnumpy(), - scale.asnumpy()).cdf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test icdf - for shape, hybridize in itertools.product(shapes, [True, False]): - loc = np.random.uniform(-1, 1, shape) - scale = np.random.uniform(0.5, 1.5, shape) - samples = np.random.uniform(size=shape) - net = TestLaplace("icdf") - if hybridize: - net.hybridize() - mx_out = net(loc, scale, samples).asnumpy() - np_out = ss.laplace(loc.asnumpy(), - scale.asnumpy()).ppf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test entropy - for shape, hybridize in itertools.product(shapes, [True, False]): - loc = np.random.uniform(-1, 1, shape) - scale = np.random.uniform(0.5, 
1.5, shape) - net = TestLaplace("entropy") - if hybridize: - net.hybridize() - mx_out = net(loc, scale).asnumpy() - np_out = ss.laplace(loc.asnumpy(), - scale.asnumpy()).entropy() - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - -@use_np -def test_gluon_cauchy_v1(): - class TestCauchy(HybridBlock): - def __init__(self, func): - self._func = func - super(TestCauchy, self).__init__() - - def hybrid_forward(self, F, loc, scale, *args): - cauchy = mgp.Cauchy(loc, scale, F, validate_args=True) - return _distribution_method_invoker(cauchy, self._func, *args) - - shapes = [(), (1,), (2, 3), 6] - - # Test sampling - for shape, hybridize in itertools.product(shapes, [True, False]): - loc = np.random.uniform(-1, 1, shape) - scale = np.random.uniform(0.5, 1.5, shape) - samples = np.random.normal(size=shape) - net = TestCauchy("sample") - if hybridize: - net.hybridize() - mx_out = net(loc, scale) - desired_shape = (shape,) if isinstance(shape, Number) else shape - assert mx_out.shape == desired_shape - - # Test log_prob - for shape, hybridize in itertools.product(shapes, [True, False]): - loc = np.random.uniform(-1, 1, shape) - scale = np.random.uniform(0.5, 1.5, shape) - samples = np.random.normal(size=shape) - net = TestCauchy("log_prob") - if hybridize: - net.hybridize() - mx_out = net(loc, scale, samples).asnumpy() - np_out = ss.cauchy(loc.asnumpy(), - scale.asnumpy()).logpdf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test cdf - for shape, hybridize in itertools.product(shapes, [True, False]): - loc = np.random.uniform(-1, 1, shape) - scale = np.random.uniform(0.5, 1.5, shape) - samples = np.random.normal(size=shape) - net = TestCauchy("cdf") - if hybridize: - net.hybridize() - mx_out = net(loc, scale, samples).asnumpy() - np_out = ss.cauchy(loc.asnumpy(), - scale.asnumpy()).cdf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test icdf - for shape, hybridize in itertools.product(shapes, [True, False]): - loc = np.random.uniform(-1, 1, shape) - scale = np.random.uniform(0.5, 1.5, shape) - samples = np.random.uniform(size=shape, low=1e-4, high=1.0-1e-4) - net = TestCauchy("icdf") - if hybridize: - net.hybridize() - mx_out = net(loc, scale, samples).asnumpy() - np_out = ss.cauchy(loc.asnumpy(), - scale.asnumpy()).ppf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test entropy - for shape, hybridize in itertools.product(shapes, [True, False]): - loc = np.random.uniform(-1, 1, shape) - scale = np.random.uniform(0.5, 1.5, shape) - net = TestCauchy("entropy") - if hybridize: - net.hybridize() - mx_out = net(loc, scale).asnumpy() - np_out = ss.cauchy(loc.asnumpy(), - scale.asnumpy()).entropy() - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - -@use_np -def test_gluon_half_cauchy_v1(): - class TestHalfCauchy(HybridBlock): - def __init__(self, func): - super(TestHalfCauchy, self).__init__() - self._func = func - - def hybrid_forward(self, F, scale, *args): - half_normal = mgp.HalfCauchy(scale, F, validate_args=True) - return getattr(half_normal, self._func)(*args) - - shapes = [(), (1,), (2, 3), 6] - - # Test sampling - for shape, hybridize in itertools.product(shapes, [True, False]): - scale = np.random.uniform(0.5, 1.5, shape) - net = TestHalfCauchy("sample") - if hybridize: - net.hybridize() - mx_out = net(scale).asnumpy() - if isinstance(shape, 
Number): - shape = (shape,) - assert mx_out.shape == shape - - # Test log_prob - for shape, hybridize in itertools.product(shapes, [True, False]): - scale = np.random.uniform(0.5, 1.5, shape) - samples = np.abs(np.random.normal(size=shape)) - net = TestHalfCauchy("log_prob") - if hybridize: - net.hybridize() - mx_out = net(scale, samples).asnumpy() - np_out = ss.halfcauchy(0, scale.asnumpy()).logpdf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test cdf - for shape, hybridize in itertools.product(shapes, [True, False]): - scale = np.random.uniform(0.5, 1.5, shape) - samples = np.abs(np.random.normal(size=shape)) - net = TestHalfCauchy("cdf") - if hybridize: - net.hybridize() - mx_out = net(scale, samples).asnumpy() - np_out = ss.halfcauchy(0, scale.asnumpy()).cdf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test icdf - for shape, hybridize in itertools.product(shapes, [True, False]): - scale = np.random.uniform(0.5, 1.5, shape) - samples = np.random.uniform(size=shape, high=1.0-1e-4) - net = TestHalfCauchy("icdf") - if hybridize: - net.hybridize() - mx_out = net(scale, samples).asnumpy() - np_out = ss.halfcauchy(0, scale.asnumpy()).ppf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - -@use_np -def test_gluon_poisson_v1(): - class TestPoisson(HybridBlock): - def __init__(self, func): - self._func = func - super(TestPoisson, self).__init__() - - def hybrid_forward(self, F, rate, *args): - poisson = mgp.Poisson(rate, F, validate_args=True) - return _distribution_method_invoker(poisson, self._func, *args) - - shapes = [(1,), (2, 3), 6] - # Test sampling - for shape, hybridize in itertools.product(shapes, [False]): - rate = np.random.uniform(0.5, 1.5, shape) - net = TestPoisson("sample") - if hybridize: - net.hybridize() - mx_out = net(rate).asnumpy() - assert mx_out.shape == rate.shape - - # Test log_prob - for shape, hybridize in itertools.product(shapes, [True, False]): - rate = np.random.uniform(0.5, 1.5, shape) - samples = np.random.randint(0, 5, shape).astype('float') - net = TestPoisson("log_prob") - if hybridize: - net.hybridize() - mx_out = net(rate, samples).asnumpy() - np_out = ss.poisson(mu=rate.asnumpy()).logpmf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - -@use_np -def test_gluon_geometric_v1(): - class TestGeometric(HybridBlock): - def __init__(self, func, is_logit=False): - super(TestGeometric, self).__init__() - self._is_logit = is_logit - self._func = func - - def hybrid_forward(self, F, params, *args): - dist = mgp.Geometric(logit=params, validate_args=True) if self._is_logit else \ - mgp.Geometric(prob=params, validate_args=True) - return _distribution_method_invoker(dist, self._func, *args) - - shapes = [(), (1,), (2, 3), 6] - # Test log_prob - for shape, hybridize, use_logit in itertools.product(shapes, [True, False], [True, False]): - prob = np.random.uniform(size=shape) - sample = np.random.randint(0, 10, size=shape).astype('float32') - param = prob - if use_logit: - param = prob_to_logit(param) - net = TestGeometric("log_prob", use_logit) - if hybridize: - net.hybridize() - mx_out = net(param, sample).asnumpy() - np_out = ss.geom.logpmf(sample.asnumpy() + 1, prob.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test variance - for shape, hybridize, use_logit in 
itertools.product(shapes, [True, False], [True, False]): - prob = np.random.uniform(size=shape) - param = prob - if use_logit: - param = prob_to_logit(param) - net = TestGeometric("variance", use_logit) - if hybridize: - net.hybridize() - mx_out = net(param).asnumpy() - np_out = ss.geom(prob.asnumpy()).var() - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test entropy - for shape, hybridize, use_logit in itertools.product(shapes, [True, False], [True, False]): - # Add lower bound constraint, otherwise scipy would raise warning. - prob = np.random.uniform(low=0.1, size=shape) - param = prob - if use_logit: - param = prob_to_logit(param) - net = TestGeometric("entropy", use_logit) - if hybridize: - net.hybridize() - mx_out = net(param).asnumpy() - np_out = ss.geom(prob.asnumpy()).entropy() - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - -@use_np -def test_gluon_negative_binomial_v1(): - class TestNegativeBinomial(HybridBlock): - def __init__(self, func, is_logit=False): - super(TestNegativeBinomial, self).__init__() - self._is_logit = is_logit - self._func = func - - def hybrid_forward(self, F, n, params, *args): - dist = mgp.NegativeBinomial(n=n, logit=params, validate_args=True) if self._is_logit else \ - mgp.NegativeBinomial(n=n, prob=params, validate_args=True) - return _distribution_method_invoker(dist, self._func, *args) - - shapes = [(), (1,), (2, 3), 6] - # Test log_prob - for shape, hybridize, use_logit in itertools.product(shapes, [True, False], [True, False]): - n = np.random.randint(1, 10, size=shape).astype('float32') - prob = np.random.uniform(low=0.2, high=0.6, size=shape).astype('float32') - sample = np.random.randint(0, 10, size=shape).astype('float32') - param = prob - if use_logit: - param = prob_to_logit(param) - net = TestNegativeBinomial("log_prob", use_logit) - if hybridize: - net.hybridize() - mx_out = net(n, param, sample).asnumpy() - np_out = ss.nbinom(n=n.asnumpy(), p=prob.asnumpy() - ).logpmf(sample.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test mean and variance - for shape, hybridize in itertools.product(shapes, [True, False]): - for func in ['mean', 'variance']: - for use_logit in [True, False]: - n = np.random.randint(1, 10, size=shape).astype('float32') - prob = np.random.uniform(low=0.2, high=0.6, size=shape).astype('float32') - net = TestNegativeBinomial(func, use_logit) - param = prob - if use_logit: - param = prob_to_logit(param) - if hybridize: - net.hybridize() - mx_out = net(n, param).asnumpy() - ss_nbinom = ss.nbinom(n=n.asnumpy(), p=1 - prob.asnumpy()) - if func == 'mean': - np_out = ss_nbinom.mean() - else: - np_out = ss_nbinom.var() - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - -@use_np -def test_gluon_exponential_v1(): - class TestExponential(HybridBlock): - def __init__(self, func): - self._func = func - super(TestExponential, self).__init__() - - def hybrid_forward(self, F, scale, *args): - exponential = mgp.Exponential(scale, F, validate_args=True) - return _distribution_method_invoker(exponential, self._func, *args) - - shapes = [(), (1,), (2, 3), 6] - # Test log_prob - for shape, hybridize in itertools.product(shapes, [True, False]): - scale = np.random.uniform(0.5, 1.5, shape) - samples = np.random.uniform(0.2, 1.2, size=shape) - net = TestExponential("log_prob") - if hybridize: - net.hybridize() - mx_out = net(scale, samples).asnumpy() - np_out = 
ss.expon(scale=scale.asnumpy()).logpdf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test cdf - for shape, hybridize in itertools.product(shapes, [True, False]): - scale = np.random.uniform(0.5, 1.5, shape) - samples = np.random.uniform(0.2, 1.2, size=shape) - net = TestExponential("cdf") - if hybridize: - net.hybridize() - mx_out = net(scale, samples).asnumpy() - np_out = ss.expon(scale=scale.asnumpy()).cdf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test icdf - for shape, hybridize in itertools.product(shapes, [True, False]): - scale = np.random.uniform(0.5, 1.5, shape) - samples = np.random.uniform(0.0, 1.0, size=shape) - net = TestExponential("icdf") - if hybridize: - net.hybridize() - mx_out = net(scale, samples).asnumpy() - np_out = ss.expon(scale=scale.asnumpy()).ppf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test entropy - for shape, hybridize in itertools.product(shapes, [True, False]): - scale = np.random.uniform(0.5, 1.5, shape) - net = TestExponential("entropy") - if hybridize: - net.hybridize() - mx_out = net(scale).asnumpy() - np_out = ss.expon(scale=scale.asnumpy()).entropy() - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - -@use_np -def test_gluon_weibull_v1(): - class TestWeibull(HybridBlock): - def __init__(self, func): - super(TestWeibull, self).__init__() - self._func = func - - def hybrid_forward(self, F, concentration, scale, *args): - weibull = mgp.Weibull(concentration, scale, F, validate_args=True) - return _distribution_method_invoker(weibull, self._func, *args) - - shapes = [(), (1,), (2, 3), 6] - - # Test log_prob - for shape, hybridize in itertools.product(shapes, [True, False]): - concentration = np.random.uniform(size=shape) - scale = np.random.uniform(size=shape) - samples = np.random.uniform(size=shape) - net = TestWeibull("log_prob") - if hybridize: - net.hybridize() - mx_out = net(concentration, scale, samples).asnumpy() - np_out = ss.weibull_min(c=concentration.asnumpy( - ), scale=scale.asnumpy()).logpdf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test cdf - for shape, hybridize in itertools.product(shapes, [True, False]): - concentration = np.random.uniform(size=shape) - scale = np.random.uniform(size=shape) - samples = np.random.uniform(size=shape) - net = TestWeibull("cdf") - if hybridize: - net.hybridize() - mx_out = net(concentration, scale, samples).asnumpy() - np_out = ss.weibull_min(c=concentration.asnumpy( - ), scale=scale.asnumpy()).cdf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test icdf - for shape, hybridize in itertools.product(shapes, [True, False]): - concentration = np.random.uniform(size=shape) - scale = np.random.uniform(size=shape) - samples = np.random.uniform(size=shape) - net = TestWeibull("icdf") - if hybridize: - net.hybridize() - mx_out = net(concentration, scale, samples).asnumpy() - np_out = ss.weibull_min(c=concentration.asnumpy( - ), scale=scale.asnumpy()).ppf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test entropy - for shape, hybridize in itertools.product(shapes, [True, False]): - concentration = np.random.uniform(size=shape) - scale = np.random.uniform(size=shape) - net = TestWeibull("entropy") - if 
hybridize: - net.hybridize() - mx_out = net(concentration, scale).asnumpy() - np_out = ss.weibull_min(c=concentration.asnumpy(), - scale=scale.asnumpy()).entropy() - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - -@use_np -def test_gluon_pareto_v1(): - class TestPareto(HybridBlock): - def __init__(self, func): - super(TestPareto, self).__init__() - self._func = func - - def hybrid_forward(self, F, alpha, scale, *args): - pareto = mgp.Pareto(alpha, scale, F, validate_args=True) - return _distribution_method_invoker(pareto, self._func, *args) - - shapes = [(), (1,), (2, 3), 6] - - # Test log_prob - for shape, hybridize in itertools.product(shapes, [True, False]): - alpha = np.random.uniform(size=shape) - scale = np.random.uniform(size=shape) - samples = np.random.uniform(1, 2, size=shape) - net = TestPareto("log_prob") - if hybridize: - net.hybridize() - mx_out = net(alpha, scale, samples).asnumpy() - np_out = ss.pareto(b=alpha.asnumpy(), scale=scale.asnumpy()).logpdf( - samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test cdf - for shape, hybridize in itertools.product(shapes, [True, False]): - alpha = np.random.uniform(size=shape) - scale = np.random.uniform(size=shape) - samples = np.random.uniform(1.0, 2.0, size=shape) - net = TestPareto("cdf") - if hybridize: - net.hybridize() - mx_out = net(alpha, scale, samples).asnumpy() - np_out = ss.pareto(b=alpha.asnumpy(), scale=scale.asnumpy()).cdf( - samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test icdf - for shape, hybridize in itertools.product(shapes, [True, False]): - alpha = np.random.uniform(size=shape) - scale = np.random.uniform(size=shape) - samples = np.random.uniform(size=shape) - net = TestPareto("icdf") - if hybridize: - net.hybridize() - mx_out = net(alpha, scale, samples).asnumpy() - np_out = ss.pareto(b=alpha.asnumpy(), scale=scale.asnumpy()).ppf( - samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test entropy - for shape, hybridize in itertools.product(shapes, [True, False]): - alpha = np.random.uniform(size=shape) - scale = np.random.uniform(size=shape) - net = TestPareto("entropy") - if hybridize: - net.hybridize() - mx_out = net(alpha, scale).asnumpy() - np_out = ss.pareto(b=alpha.asnumpy(), scale=scale.asnumpy()).entropy() - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - -@use_np -def test_gluon_gamma_v1(): - class TestGamma(HybridBlock): - def __init__(self, func): - super(TestGamma, self).__init__() - self._func = func - - def hybrid_forward(self, F, shape, scale, *args): - gamma = mgp.Gamma(shape, scale, F, validate_args=True) - return _distribution_method_invoker(gamma, self._func, *args) - - shapes = [(), (1,), (2, 3), 6] - - # Test log_prob - for shape, hybridize in itertools.product(shapes, [True, False]): - alpha = np.random.uniform(0.5, 1.5, shape) - scale = np.random.uniform(0.5, 1.5, shape) - samples = np.random.uniform(size=shape) - net = TestGamma("log_prob") - if hybridize: - net.hybridize() - mx_out = net(alpha, scale, samples).asnumpy() - np_out = ss.gamma(a=alpha.asnumpy(), loc=0, - scale=scale.asnumpy()).logpdf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test `mean`, `var` and `entropy` - for shape, hybridize in itertools.product(shapes, [True, False]): - for func in ['mean', 'variance', 
'entropy']: - alpha = np.random.uniform(0.5, 1.5, shape) - scale = np.random.uniform(0.5, 1.5, shape) - net = TestGamma(func) - if hybridize: - net.hybridize() - mx_out = net(alpha, scale).asnumpy() - ss_gamma = ss.gamma(a=alpha.asnumpy(), loc=0, - scale=scale.asnumpy()) - if func == 'mean': - np_out = ss_gamma.mean() - elif func == 'variance': - np_out = ss_gamma.var() - else: - np_out = ss_gamma.entropy() - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - -@use_np -def test_gluon_dirichlet_v1(): - class TestDirichlet(HybridBlock): - def __init__(self, func): - super(TestDirichlet, self).__init__() - self._func = func - - def hybrid_forward(self, F, alpha, *args): - dirichlet = mgp.Dirichlet(alpha, F, validate_args=True) - return _distribution_method_invoker(dirichlet, self._func, *args) - - event_shapes = [2, 4, 6] - batch_shapes = [None, (2, 3)] - - # Test sampling - for event_shape, batch_shape in itertools.product(event_shapes, batch_shapes): - for hybridize in [True, False]: - desired_shape = ( - batch_shape if batch_shape is not None else ()) + (event_shape,) - alpha = np.random.uniform(1.0, 5.0, size=desired_shape) - net = TestDirichlet("sample") - if hybridize: - net.hybridize() - mx_out = net(alpha).asnumpy() - # Check shape - assert mx_out.shape == desired_shape - # Check simplex - assert_almost_equal(mx_out.sum(-1), _np.ones_like(mx_out.sum(-1)), atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test log_prob - # Scipy does not support batch `alpha`, thus we skip multi-dimensional batch_shape case. - for event_shape, batch_shape in itertools.product(event_shapes, batch_shapes[:1]): - for hybridize in [True, False]: - desired_shape = ( - batch_shape if batch_shape is not None else ()) + (event_shape,) - alpha = np.random.uniform(1.0, 5.0, desired_shape) - np_samples = _np.random.dirichlet( - [10.0 / event_shape] * event_shape, size=batch_shape) - net = TestDirichlet("log_prob") - if hybridize: - net.hybridize() - mx_out = net(alpha, np.array(np_samples)).asnumpy() - np_out = ss.dirichlet(alpha=alpha.asnumpy()).logpdf(np_samples) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test `mean`, `var` and `entropy` - for event_shape, batch_shape in itertools.product(event_shapes, batch_shapes[:1]): - for hybridize in [False]: - for func in ['mean', 'variance', 'entropy']: - desired_shape = ( - batch_shape if batch_shape is not None else ()) + (event_shape,) - alpha = np.random.uniform(1.0, 5.0, desired_shape) - net = TestDirichlet(func) - if hybridize: - net.hybridize() - mx_out = net(alpha).asnumpy() - ss_dir = ss.dirichlet(alpha=alpha.asnumpy()) - if func == 'mean': - np_out = ss_dir.mean() - elif func == 'variance': - np_out = ss_dir.var() - else: - np_out = ss_dir.entropy() - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - -@use_np -def test_gluon_beta_v1(): - class TestBeta(HybridBlock): - def __init__(self, func): - super(TestBeta, self).__init__() - self._func = func - - def hybrid_forward(self, F, alpha, beta, *args): - beta_dist = mgp.Beta(alpha, beta, F, validate_args=True) - return _distribution_method_invoker(beta_dist, self._func, *args) - - shapes = [(), (1,), (2, 3), 6] - - # Test log_prob - for shape, hybridize in itertools.product(shapes, [True, False]): - alpha = np.random.uniform(0.5, 1.5, shape) - beta = np.random.uniform(0.5, 1.5, shape) - samples = np.random.uniform(size=shape) - net = TestBeta("log_prob") - if hybridize: - net.hybridize() - mx_out = 
net(alpha, beta, samples).asnumpy() - np_out = ss.beta(alpha.asnumpy(), beta.asnumpy() - ).logpdf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test `mean`, `var` and `entropy` - for shape, hybridize in itertools.product(shapes, [True, False]): - for func in ['mean', 'variance', 'entropy']: - alpha = np.random.uniform(0.5, 1.5, shape) - beta = np.random.uniform(0.5, 1.5, shape) - net = TestBeta(func) - if hybridize: - net.hybridize() - mx_out = net(alpha, beta).asnumpy() - ss_beta = ss.beta(alpha.asnumpy(), beta.asnumpy()) - if func == 'mean': - np_out = ss_beta.mean() - elif func == 'variance': - np_out = ss_beta.var() - else: - np_out = ss_beta.entropy() - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - -@use_np -def test_gluon_fisher_snedecor_v1(): - class TestFisherSnedecor(HybridBlock): - def __init__(self, func): - super(TestFisherSnedecor, self).__init__() - self._func = func - - def hybrid_forward(self, F, df1, df2, *args): - beta_dist = mgp.FisherSnedecor(df1, df2, F, validate_args=True) - return _distribution_method_invoker(beta_dist, self._func, *args) - - shapes = [(), (1,), (2, 3), 6] - - # Test log_prob - for shape, hybridize in itertools.product(shapes, [True, False]): - df1 = np.random.uniform(0.5, 1.5, shape) - df2 = np.random.uniform(0.5, 1.5, shape) - samples = np.random.uniform(size=shape) - net = TestFisherSnedecor("log_prob") - if hybridize: - net.hybridize() - mx_out = net(df1, df2, samples).asnumpy() - np_out = ss.f(dfn=df1.asnumpy(), dfd=df2.asnumpy() - ).logpdf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test `mean` and `var` - for shape, hybridize in itertools.product(shapes, [True, False]): - for func in ['mean', 'variance']: - df1 = np.random.uniform(0.5, 1.5, shape) - df2 = np.random.uniform(4.0, 6.0, shape) - net = TestFisherSnedecor(func) - if hybridize: - net.hybridize() - mx_out = net(df1, df2).asnumpy() - ss_f = ss.f(dfn=df1.asnumpy(), dfd=df2.asnumpy()) - if func == 'mean': - np_out = ss_f.mean() - else: - np_out = ss_f.var() - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - -@use_np -def test_gluon_student_t_v1(): - class TestT(HybridBlock): - def __init__(self, func): - super(TestT, self).__init__() - self._func = func - - def hybrid_forward(self, F, df, loc, scale, *args): - t_dist = mgp.StudentT(df, loc, scale, F, validate_args=True) - return _distribution_method_invoker(t_dist, self._func, *args) - - shapes = [(), (1,), (2, 3), 6] - - # Test log_prob - for shape, hybridize in itertools.product(shapes, [True, False]): - loc = np.zeros(shape) - scale = np.random.uniform(0.5, 1.5, shape) - df = np.random.uniform(2, 4, shape) - samples = np.random.uniform(0, 4, size=shape) - net = TestT("log_prob") - if hybridize: - net.hybridize() - mx_out = net(df, loc, scale, samples).asnumpy() - np_out = ss.t(loc=0, scale=scale.asnumpy(), - df=df.asnumpy()).logpdf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test `mean`, `var` and `entropy` - for shape, hybridize in itertools.product(shapes, [False, True]): - for func in ['mean', 'variance', 'entropy']: - loc = np.zeros(shape) - scale = np.random.uniform(0.5, 1.5, shape) - df = np.random.uniform(3, 4, shape) - net = TestT(func) - if hybridize: - net.hybridize() - mx_out = net(df, loc, scale).asnumpy() - ss_f = ss.t(loc=0, scale=scale.asnumpy(), 
df=df.asnumpy()) - if func == 'mean': - np_out = ss_f.mean() - elif func == 'variance': - np_out = ss_f.var() - else: - np_out = ss_f.entropy() - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - -@use_np -def test_gluon_gumbel_v1(): - class TestGumbel(HybridBlock): - def __init__(self, func): - super(TestGumbel, self).__init__() - self._func = func - - def hybrid_forward(self, F, loc, scale, *args): - normal = mgp.Gumbel(loc, scale, F, validate_args=True) - return getattr(normal, self._func)(*args) - - shapes = [(), (1,), (2, 3), 6] - - # Test log_prob - for shape, hybridize in itertools.product(shapes, [True, False]): - loc = np.random.uniform(-1, 1, shape) - scale = np.random.uniform(0.5, 1.5, shape) - samples = np.random.normal(size=shape) - net = TestGumbel("log_prob") - if hybridize: - net.hybridize() - mx_out = net(loc, scale, samples).asnumpy() - np_out = ss.gumbel_r(loc=loc.asnumpy(), - scale=scale.asnumpy()).logpdf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test cdf - for shape, hybridize in itertools.product(shapes, [True, False]): - loc = np.random.uniform(-1, 1, shape) - scale = np.random.uniform(0.5, 1.5, shape) - samples = np.random.normal(size=shape) - net = TestGumbel("cdf") - if hybridize: - net.hybridize() - mx_out = net(loc, scale, samples).asnumpy() - np_out = ss.gumbel_r(loc.asnumpy(), - scale.asnumpy()).cdf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test icdf - for shape, hybridize in itertools.product(shapes, [True, False]): - loc = np.random.uniform(-1, 1, shape) - scale = np.random.uniform(0.5, 1.5, shape) - samples = np.random.uniform(size=shape) - net = TestGumbel("icdf") - if hybridize: - net.hybridize() - mx_out = net(loc, scale, samples).asnumpy() - np_out = ss.gumbel_r(loc.asnumpy(), - scale.asnumpy()).ppf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test entropy - for shape, hybridize in itertools.product(shapes, [True, False]): - loc = np.random.uniform(-1, 1, shape) - scale = np.random.uniform(0.5, 1.5, shape) - net = TestGumbel("entropy") - if hybridize: - net.hybridize() - mx_out = net(loc, scale).asnumpy() - np_out = ss.gumbel_r(loc.asnumpy(), - scale.asnumpy()).entropy() - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - -@use_np -def test_gluon_multinomial_v1(): - class TestMultinomial(HybridBlock): - def __init__(self, func, num_events, total_count, is_logit, batch_shape=None, sample_shape=None): - super(TestMultinomial, self).__init__() - self._num_events = num_events - self._total_count = total_count - self._is_logit = is_logit - self._func = func - self._batch_shape = batch_shape - self._sample_shape = sample_shape - - def hybrid_forward(self, F, params, *args): - multinomial = ( - mgp.Multinomial(self._num_events, logit=params, total_count=self._total_count, - validate_args=True) - if self._is_logit else - mgp.Multinomial(self._num_events, prob=params, total_count=self._total_count, - validate_args=True) - ) - if self._func == 'sample': - return multinomial.sample(self._batch_shape) - if self._func == 'sample_n': - return multinomial.sample_n(self._sample_shape) - return _distribution_method_invoker(multinomial, self._func, *args) - - def one_hot(a, num_classes): - return np.identity(num_classes)[a] - - event_shapes = [2, 5, 10] - batch_shapes = [None, (2, 3)] # , (4, 0, 5)] - 
sample_shapes = [None, (2,), (3, 4)] - - # Test sampling - for event_shape, batch_shape in itertools.product(event_shapes, batch_shapes): - for use_logit, hybridize in itertools.product([True, False], [True, False]): - prob = np.array(_np.random.dirichlet( - [1 / event_shape] * event_shape, size=batch_shape)) - param = prob - if use_logit: - param = np.log(param) - net = TestMultinomial("sample", event_shape, _np.random.randint(1, 5), - use_logit, batch_shape) - if hybridize: - net.hybridize() - mx_out = net(param).asnumpy() - desired_shape = batch_shape if batch_shape is not None else () - assert mx_out.shape == desired_shape + (event_shape,) - - # Test sample_n - for event_shape, batch_shape, sample_shape in itertools.product(event_shapes, batch_shapes, sample_shapes): - for use_logit, hybridize in itertools.product([True, False], [True, False]): - prob = np.array(_np.random.dirichlet( - [1 / event_shape] * event_shape, size=batch_shape)) - param = prob - if use_logit: - param = np.log(param) - net = TestMultinomial("sample_n", event_shape, _np.random.randint(1, 5), - use_logit, batch_shape, sample_shape) - if hybridize: - net.hybridize() - mx_out = net(param).asnumpy() - sample_shape = () if sample_shape is None else sample_shape - desired_shape = sample_shape + \ - (batch_shape if batch_shape is not None else ()) - assert mx_out.shape == desired_shape + (event_shape,) - - # Test log_prob - for event_shape, batch_shape, sample_shape in itertools.product(event_shapes, batch_shapes, sample_shapes): - for use_logit, hybridize in itertools.product([True, False], [False]): - prob = np.array(_np.random.dirichlet( - [1 / event_shape] * event_shape, size=batch_shape)) - eps = _np.finfo('float32').eps - prob = np.clip(prob, eps, 1 - eps) - param = prob - sample_shape = () if sample_shape is None else sample_shape - desired_shape = sample_shape + \ - (batch_shape if batch_shape is not None else ()) - samples = np.random.choice(event_shape, size=desired_shape) - samples = one_hot(samples, event_shape) - if use_logit: - param = np.log(param) - net = TestMultinomial("log_prob", event_shape, - _np.random.randint(1, 5), use_logit) - if hybridize: - net.hybridize() - mx_out = net(param, samples).asnumpy() - # Check shape - assert mx_out.shape == desired_shape - - -@use_np -def test_gluon_binomial_v1(): - class TestBinomial(HybridBlock): - def __init__(self, func, is_logit=False, n=1): - super(TestBinomial, self).__init__() - self._is_logit = is_logit - self._func = func - self._n = n - - def hybrid_forward(self, F, params, *args): - dist = mgp.Binomial(n=self._n, logit=params, validate_args=True) \ - if self._is_logit else \ - mgp.Binomial(n=self._n, prob=params, validate_args=True) - return _distribution_method_invoker(dist, self._func, *args) - - shapes = [(), (1,), (2, 3), 6] - # Test sampling - for shape, hybridize in itertools.product(shapes, [True, False]): - for use_logit in [True, False]: - n = _np.random.randint(5, 10) - prob = np.random.uniform(low=0.1, size=shape) - net = TestBinomial('sample', use_logit, n=float(n)) - param = prob - if use_logit: - param = prob_to_logit(param) - if hybridize: - net.hybridize() - mx_out = net(param).asnumpy() - desired_shape = (shape,) if isinstance(shape, int) else shape - assert mx_out.shape == desired_shape - - # Test sample_n - prefix_shape = (2, 3) - for shape in shapes: - n = _np.random.randint(5, 10) - prob = np.random.uniform(low=0.1, size=shape) - dist = mgp.Binomial(n=n, prob=prob) - samples = dist.sample_n(prefix_shape) - assert samples.shape == 
(prefix_shape + prob.shape) - - # Test log_prob - for shape, hybridize, use_logit in itertools.product(shapes, [True, False], [True, False]): - n = _np.random.randint(5, 10) - prob = np.random.uniform(low=0.1, size=shape) - sample = np.random.randint(0, n, size=shape).astype('float32') - param = prob - if use_logit: - param = prob_to_logit(param) - net = TestBinomial("log_prob", use_logit, n=float(n)) - if hybridize: - net.hybridize() - mx_out = net(param, sample).asnumpy() - np_out = ss.binom(n=n, p=prob.asnumpy()).logpmf(sample.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test mean and variance - for shape, hybridize in itertools.product(shapes, [True, False]): - for func in ['mean', 'variance']: - for use_logit in [True, False]: - n = _np.random.randint(5, 10) - prob = np.random.uniform(low=0.1, size=shape) - net = TestBinomial(func, use_logit, n=float(n)) - param = prob - if use_logit: - param = prob_to_logit(param) - if hybridize: - net.hybridize() - mx_out = net(param).asnumpy() - ss_binom = ss.binom(n=n, p=prob.asnumpy()) - if func == 'mean': - np_out = ss_binom.mean() - else: - np_out = ss_binom.var() - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - -@use_np -@pytest.mark.flaky -def test_gluon_bernoulli_v1(): - class TestBernoulli(HybridBlock): - def __init__(self, func, is_logit=False): - super(TestBernoulli, self).__init__() - self._is_logit = is_logit - self._func = func - - def hybrid_forward(self, F, params, *args): - bernoulli = mgp.Bernoulli(logit=params, validate_args=True) if self._is_logit else \ - mgp.Bernoulli(prob=params, validate_args=True) - return _distribution_method_invoker(bernoulli, self._func, *args) - - # Test log_prob - shapes = [(), (1,), (2, 3), 6] - for shape, hybridize, use_logit in itertools.product(shapes, [True, False], [True, False]): - prob = np.random.uniform(size=shape) - sample = npx.random.bernoulli(prob=0.5, size=shape) - param = prob - if use_logit: - param = prob_to_logit(param) - net = TestBernoulli("log_prob", use_logit) - if hybridize: - net.hybridize() - mx_out = net(param, sample).asnumpy() - np_out = _np.log(ss.bernoulli.pmf(sample.asnumpy(), prob.asnumpy())) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test variance - for shape, hybridize, use_logit in itertools.product(shapes, [True, False], [True, False]): - prob = np.random.uniform(size=shape) - sample = npx.random.bernoulli(prob=0.5, size=shape) - param = prob - if use_logit: - param = prob_to_logit(param) - net = TestBernoulli("variance", use_logit) - if hybridize: - net.hybridize() - mx_out = net(param).asnumpy() - np_out = ss.bernoulli(prob.asnumpy()).var() - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test entropy - for shape, hybridize, use_logit in itertools.product(shapes, [True, False], [True, False]): - prob = np.random.uniform(size=shape) - sample = npx.random.bernoulli(prob=0.5, size=shape) - param = prob - if use_logit: - param = prob_to_logit(param) - net = TestBernoulli("entropy", use_logit) - if hybridize: - net.hybridize() - mx_out = net(param).asnumpy() - np_out = ss.bernoulli(prob.asnumpy()).entropy() - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - -@use_np -def test_relaxed_bernoulli_v1(): - class TestRelaxedBernoulli(HybridBlock): - def __init__(self, func, is_logit=False): - super(TestRelaxedBernoulli, self).__init__() - self._is_logit = is_logit 
- self._func = func - - def hybrid_forward(self, F, params, *args): - relaxed_bernoulli = mgp.RelaxedBernoulli(T=1.0, logit=params, validate_args=True)\ - if self._is_logit else \ - mgp.RelaxedBernoulli(T=1.0, prob=params, validate_args=True) - if self._func == "sample": - return relaxed_bernoulli.sample() - return _distribution_method_invoker(relaxed_bernoulli, self._func, *args) - - def prob_to_logit(prob): - return np.log(prob) - np.log1p(-prob) - - shapes = [(), (1,), (2, 3), 6] - # Test sampling - for shape, hybridize, use_logit in itertools.product(shapes, [True, False], [True, False]): - prob = np.random.uniform(size=shape) - param = prob - if use_logit: - param = prob_to_logit(param) - param.attach_grad() - net = TestRelaxedBernoulli("sample", use_logit) - if hybridize: - net.hybridize() - with autograd.record(): - mx_out = net(param) - mx_out.backward() - desired_shape = (shape,) if isinstance(shape, int) else shape - assert param.grad.shape == desired_shape - - for shape, hybridize, use_logit in itertools.product(shapes, [True, False], [True, False]): - prob = np.random.uniform(size=shape) - sample = np.random.uniform(0.1, 0.9, size=shape) - param = prob - if use_logit: - param = prob_to_logit(param) - net = TestRelaxedBernoulli("log_prob", use_logit) - if hybridize: - net.hybridize() - mx_out = net(param, sample).asnumpy() - desired_shape = (shape,) if isinstance(shape, int) else shape - assert mx_out.shape == desired_shape - - -@use_np -def test_gluon_categorical_v1(): - class TestCategorical(HybridBlock): - def __init__(self, func, is_logit=False, batch_shape=None, num_events=None, sample_shape=None): - super(TestCategorical, self).__init__() - self._is_logit = is_logit - self._func = func - self._batch_shape = batch_shape - self._num_events = num_events - self._sample_shape = sample_shape - - def hybrid_forward(self, F, params, *args): - categorical = mgp.Categorical(self._num_events, logit=params, validate_args=True)\ - if self._is_logit else \ - mgp.Categorical(self._num_events, prob=params, - validate_args=True) - if self._func == "sample": - return categorical.sample(self._batch_shape) - if self._func == "sample_n": - return categorical.sample_n(self._sample_shape) - return _distribution_method_invoker(categorical, self._func, *args) - - event_shapes = [2, 5, 10] - batch_shapes = [None, (2, 3)] # , (4, 0, 5)] - sample_shapes = [(), (2,), (3, 4)] - - # Test sampling - for event_shape, batch_shape in itertools.product(event_shapes, batch_shapes): - for use_logit, hybridize in itertools.product([True, False], [True, False]): - prob = np.array(_np.random.dirichlet( - [1 / event_shape] * event_shape, size=batch_shape)) - param = prob.astype('float32') - if use_logit: - param = np.log(param) - net = TestCategorical("sample", use_logit, - batch_shape, event_shape) - if hybridize: - net.hybridize() - mx_out = net(param).asnumpy() - desired_shape = batch_shape if batch_shape is not None else () - assert mx_out.shape == desired_shape - - # Test sample_n - for event_shape, batch_shape, sample_shape in itertools.product(event_shapes, batch_shapes, sample_shapes): - for use_logit, hybridize in itertools.product([True, False], [True, False]): - prob = np.array(_np.random.dirichlet( - [1 / event_shape] * event_shape, size=batch_shape)) - param = prob.astype('float32') - if use_logit: - param = np.log(param) - net = TestCategorical("sample_n", - is_logit=use_logit, batch_shape=batch_shape, - num_events=event_shape, sample_shape=sample_shape - ) - if hybridize: - net.hybridize() - mx_out 
= net(param).asnumpy() - desired_shape = sample_shape + \ - (batch_shape if batch_shape is not None else ()) - assert mx_out.shape == desired_shape - - # Test log_prob - for event_shape, batch_shape, sample_shape in itertools.product(event_shapes, batch_shapes, sample_shapes): - for use_logit, hybridize in itertools.product([True, False], [True, False]): - prob = np.array(_np.random.dirichlet( - [1 / event_shape] * event_shape, size=batch_shape)) - eps = _np.finfo('float32').eps - prob = np.clip(prob, eps, 1 - eps) - param = prob.astype('float32') - desired_shape = sample_shape + \ - (batch_shape if batch_shape is not None else ()) - samples = np.random.choice(event_shape, size=desired_shape) - if use_logit: - param = np.log(param) - net = TestCategorical("log_prob", use_logit, - batch_shape, event_shape) - if hybridize: - net.hybridize() - mx_out = net(param, samples) - # Check shape - assert mx_out.shape == desired_shape - # Check value - log_pmf, indices = np.broadcast_arrays( - np.log(prob), np.expand_dims(samples, -1)) - if indices.ndim >= 1: - indices = indices[..., :1] - expect_log_prob = _np.take_along_axis( - log_pmf, indices.astype('int'), axis=-1).asnumpy() - assert_almost_equal(mx_out.asnumpy(), expect_log_prob.squeeze(), atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test enumerate_support - for event_shape, batch_shape in itertools.product(event_shapes, batch_shapes): - for use_logit, hybridize in itertools.product([True, False], [True, False]): - prob = np.array(_np.random.dirichlet( - [1 / event_shape] * event_shape, size=batch_shape)) - param = prob.astype('float32') - if use_logit: - param = np.log(param) - net = TestCategorical("enumerate_support", - use_logit, batch_shape, event_shape) - if hybridize: - net.hybridize() - mx_out = net(param).asnumpy() - desired_shape = (event_shape,) + \ - (batch_shape if batch_shape is not None else ()) - assert mx_out.shape == desired_shape - - -@use_np -def test_gluon_one_hot_categorical_v1(): - def one_hot(a, num_classes): - return np.identity(num_classes)[a] - - class TestOneHotCategorical(HybridBlock): - def __init__(self, func, is_logit=False, batch_shape=None, num_events=None): - super(TestOneHotCategorical, self).__init__() - self._is_logit = is_logit - self._func = func - self._batch_shape = batch_shape - self._num_events = num_events - - def hybrid_forward(self, F, params, *args): - categorical = mgp.OneHotCategorical(num_events=self._num_events, logit=params) \ - if self._is_logit else \ - mgp.OneHotCategorical(num_events=self._num_events, prob=params) - if self._func == "sample": - return categorical.sample(self._batch_shape) - return _distribution_method_invoker(categorical, self._func, *args) - - event_shapes = [2, 5, 10] - batch_shapes = [None, (2, 3)] # , (4, 0, 5)] - sample_shapes = [(), (2,), (3, 4)] - - # Test sampling - for event_shape, batch_shape in itertools.product(event_shapes, batch_shapes): - for use_logit, hybridize in itertools.product([True, False], [True, False]): - prob = np.array(_np.random.dirichlet( - [1 / event_shape] * event_shape, size=batch_shape)) - param = prob - if use_logit: - param = np.log(param) - net = TestOneHotCategorical( - "sample", use_logit, batch_shape, event_shape) - if hybridize: - net.hybridize() - mx_out = net(param).asnumpy() - desired_shape = batch_shape if batch_shape is not None else () - assert mx_out.shape == desired_shape + (event_shape,) - - # Test log_prob - for event_shape, batch_shape, sample_shape in itertools.product(event_shapes, batch_shapes, sample_shapes): - 
for use_logit, hybridize in itertools.product([True, False], [True, False]): - prob = np.array(_np.random.dirichlet( - [1 / event_shape] * event_shape, size=batch_shape)) - eps = _np.finfo('float32').eps - prob = np.clip(prob, eps, 1 - eps) - param = prob - desired_shape = sample_shape + \ - (batch_shape if batch_shape is not None else ()) - samples = np.random.choice(event_shape, size=desired_shape) - samples = one_hot(samples, event_shape) - if use_logit: - param = np.log(param) - net = TestOneHotCategorical( - "log_prob", use_logit, batch_shape, event_shape) - if hybridize: - net.hybridize() - mx_out = net(param, samples) - # Check shape - assert mx_out.shape == desired_shape - - # Test enumerate support - for event_shape, batch_shape in itertools.product(event_shapes, batch_shapes): - for use_logit, hybridize in itertools.product([True, False], [True, False]): - prob = np.array(_np.random.dirichlet( - [1 / event_shape] * event_shape, size=batch_shape)) - param = prob - if use_logit: - param = np.log(param) - net = TestOneHotCategorical( - "enumerate_support", use_logit, batch_shape, event_shape) - if hybridize: - net.hybridize() - mx_out = net(param).asnumpy() - desired_shape = batch_shape if batch_shape is not None else () - assert mx_out.shape == (event_shape,) + \ - desired_shape + (event_shape,) - - -@use_np -def test_relaxed_one_hot_categorical_v1(): - class TestRelaxedOneHotCategorical(HybridBlock): - def __init__(self, func, is_logit=False, batch_shape=None, num_events=None): - super(TestRelaxedOneHotCategorical, self).__init__() - self._is_logit = is_logit - self._func = func - self._batch_shape = batch_shape - self._num_events = num_events - - def hybrid_forward(self, F, params, *args): - categorical = mgp.RelaxedOneHotCategorical(T=1.0, num_events=self._num_events, logit=params) \ - if self._is_logit else \ - mgp.RelaxedOneHotCategorical( - T=1.0, num_events=self._num_events, prob=params) - if self._func == "sample": - return categorical.sample(self._batch_shape) - return _distribution_method_invoker(categorical, self._func, *args) - - event_shapes = [2, 5, 10] - batch_shapes = [None, (2, 3)] # , (4, 0, 5)] - sample_shapes = [(), (2,), (3, 4)] - - # Test sampling - for event_shape, batch_shape in itertools.product(event_shapes, batch_shapes): - for use_logit, hybridize in itertools.product([True, False], [True, False]): - prob = np.array(_np.random.dirichlet( - [1 / event_shape] * event_shape, size=batch_shape)) - prob = prob.astype('float32') - param = prob - if use_logit: - param = np.log(param) - param.attach_grad() - net = TestRelaxedOneHotCategorical( - "sample", use_logit, batch_shape, event_shape) - if hybridize: - net.hybridize() - with autograd.record(): - mx_out = net(param) - mx_out.backward() - desired_shape = batch_shape if batch_shape is not None else () - assert mx_out.shape == desired_shape + (event_shape,) - assert param.grad.shape == param.shape - - # Test log_prob - for event_shape, batch_shape, sample_shape in itertools.product(event_shapes, batch_shapes, sample_shapes): - for use_logit, hybridize in itertools.product([True, False], [False]): - prob = np.array(_np.random.dirichlet( - [1 / event_shape] * event_shape, size=batch_shape)) - eps = _np.finfo('float32').eps - prob = np.clip(prob, eps, 1 - eps) - param = prob - desired_shape = sample_shape + \ - (batch_shape if batch_shape is not None else ()) - # Samples from a Relaxed One-hot Categorical lie on a simplex. 
- samples = np.array(_np.random.dirichlet( - [1 / event_shape] * event_shape, size=desired_shape)) - if use_logit: - param = np.log(param) - net = TestRelaxedOneHotCategorical( - "log_prob", use_logit, batch_shape, event_shape) - if hybridize: - net.hybridize() - mx_out = net(param, samples) - # Check shape - assert mx_out.shape == desired_shape - - -@use_np -def test_gluon_mvn_v1(): - class TestMVN(HybridBlock): - def __init__(self, func, param_type): - super(TestMVN, self).__init__() - self._func = func - # cov, precision or scale_tril - self._param_type = param_type - - def hybrid_forward(self, F, loc, cov, *args): - mvn = mgp.MultivariateNormal(loc=loc, **{self._param_type: cov}, - validate_args=True) - return _distribution_method_invoker(mvn, self._func, *args) - - def _stable_inv(cov): - """ - Force the precision matrix to be symmetric. - """ - precision = np.linalg.inv(cov) - precision_t = np.swapaxes(precision, -1, -2) - return (precision + precision_t) / 2 - - event_shapes = [3, 5] - loc_shapes = [(), (2,), (4, 2)] - cov_shapes = [(), (2,), (4, 2)] - cov_func = { - 'cov': lambda s: s, - 'precision': lambda s: _stable_inv(s), - 'scale_tril': lambda s: np.linalg.cholesky(s) - } - - # Test sampling - for loc_shape, cov_shape, event_shape in itertools.product(loc_shapes, cov_shapes, event_shapes): - for cov_type in cov_func.keys(): - for hybridize in [False]: - loc = np.random.randn(*(loc_shape + (event_shape,))) - _s = np.random.randn(*(cov_shape + (event_shape, event_shape))) - loc.attach_grad() - _s.attach_grad() - # Full covariance matrix - sigma = np.matmul(_s, np.swapaxes( - _s, -1, -2)) + np.eye(event_shape) - cov_param = cov_func[cov_type](sigma) - net = TestMVN('sample', cov_type) - if hybridize: - net.hybridize() - with autograd.record(): - mx_out = net(loc, cov_param) - desired_shape = (loc + sigma[..., 0]).shape - assert mx_out.shape == desired_shape - mx_out.backward() - assert loc.grad.shape == loc.shape - assert _s.grad.shape == _s.shape - - # Test log_prob - for loc_shape, cov_shape, event_shape in itertools.product(loc_shapes, cov_shapes, event_shapes): - for cov_type in cov_func.keys(): - for hybridize in [True, False]: - loc = np.random.randn(*(loc_shape + (event_shape,))) - _s = np.random.randn(*(cov_shape + (event_shape, event_shape))) - samples = np.random.normal( - np.zeros_like(loc), np.ones_like(_s[..., 0])) - loc.attach_grad() - _s.attach_grad() - # Full covariance matrix - sigma = np.matmul(_s, np.swapaxes( - _s, -1, -2)) + np.eye(event_shape) - cov_param = cov_func[cov_type](sigma) - net = TestMVN('log_prob', cov_type) - if hybridize: - net.hybridize() - mx_out = net(loc, cov_param, samples) - assert mx_out.shape == samples.shape[:-1] - if mx_out.shape == (): - mx_out_t = mx_out.asnumpy() - else: - mx_out_t = mx_out.flatten()[0].asnumpy() - samples_t = samples.reshape(-1, event_shape).asnumpy()[0] - # Select the first element in the batch, because scipy does not support batching. 
- loc_t = loc.reshape(-1, event_shape)[0].asnumpy() - sigma_t = sigma.reshape(-1, event_shape, - event_shape)[0].asnumpy() - scipy_mvn = ss.multivariate_normal(loc_t, sigma_t) - ss_out = scipy_mvn.logpdf(samples_t) - assert_almost_equal(mx_out_t, ss_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test entropy - for loc_shape, cov_shape, event_shape in itertools.product(loc_shapes, cov_shapes, event_shapes): - for cov_type in cov_func.keys(): - for hybridize in [True, False]: - loc = np.random.randn(*(loc_shape + (event_shape,))) - _s = np.random.randn(*(cov_shape + (event_shape, event_shape))) - loc.attach_grad() - _s.attach_grad() - # Full covariance matrix - sigma = np.matmul(_s, np.swapaxes( - _s, -1, -2)) + np.eye(event_shape) - cov_param = cov_func[cov_type](sigma) - net = TestMVN('entropy', cov_type) - if hybridize: - net.hybridize() - mx_out = net(loc, cov_param) - assert mx_out.shape == sigma.shape[:-2] - if mx_out.shape == (): - mx_out_t = mx_out.asnumpy() - else: - mx_out_t = mx_out.flatten()[0].asnumpy() - # Select the first element in the batch, because scipy does not support batching. - loc_t = loc.reshape(-1, event_shape)[0].asnumpy() - sigma_t = sigma.reshape(-1, event_shape, - event_shape)[0].asnumpy() - scipy_mvn = ss.multivariate_normal(loc_t, sigma_t) - ss_out = scipy_mvn.entropy() - assert_almost_equal(mx_out_t, ss_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - -@use_np -def test_gluon_half_normal_v1(): - class TestHalfNormal(HybridBlock): - def __init__(self, func): - super(TestHalfNormal, self).__init__() - self._func = func - - def hybrid_forward(self, F, scale, *args): - half_normal = mgp.HalfNormal(scale, F, validate_args=True) - return getattr(half_normal, self._func)(*args) - - shapes = [(), (1,), (2, 3), 6] - - # Test sampling - for shape, hybridize in itertools.product(shapes, [True, False]): - scale = np.random.uniform(0.5, 1.5, shape) - net = TestHalfNormal("sample") - if hybridize: - net.hybridize() - mx_out = net(scale).asnumpy() - if isinstance(shape, Number): - shape = (shape,) - assert mx_out.shape == shape - - # Test log_prob - for shape, hybridize in itertools.product(shapes, [True, False]): - scale = np.random.uniform(0.5, 1.5, shape) - samples = np.abs(np.random.normal(size=shape)) - net = TestHalfNormal("log_prob") - if hybridize: - net.hybridize() - mx_out = net(scale, samples).asnumpy() - np_out = ss.halfnorm(0, scale.asnumpy()).logpdf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test cdf - for shape, hybridize in itertools.product(shapes, [True, False]): - scale = np.random.uniform(0.5, 1.5, shape) - samples = np.abs(np.random.normal(size=shape)) - net = TestHalfNormal("cdf") - if hybridize: - net.hybridize() - mx_out = net(scale, samples).asnumpy() - np_out = ss.halfnorm(0, scale.asnumpy()).cdf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test icdf - for shape, hybridize in itertools.product(shapes, [True, False]): - scale = np.random.uniform(0.5, 1.5, shape) - samples = np.random.uniform(size=shape) - net = TestHalfNormal("icdf") - if hybridize: - net.hybridize() - mx_out = net(scale, samples).asnumpy() - np_out = ss.halfnorm(0, scale.asnumpy()).ppf(samples.asnumpy()) - assert_almost_equal(mx_out, np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - -@use_np -def test_affine_transform_v1(): - r""" - Test the correctness of affine transformation by performing it - on a standard normal, since N(\mu, 
\sigma^2) = \mu + \sigma * N(0, 1) - """ - class TestAffineTransform(HybridBlock): - def __init__(self, func): - super(TestAffineTransform, self).__init__() - self._func = func - - def hybrid_forward(self, F, loc, scale, *args): - std_normal = mgp.Normal(F.np.zeros_like(loc), - F.np.ones_like(scale), F) - transforms = [mgp.AffineTransform(loc=0, scale=scale), - mgp.AffineTransform(loc=loc, scale=1)] - transformed_normal = mgp.TransformedDistribution( - std_normal, transforms) - if (len(args) == 0): - return getattr(transformed_normal, self._func) - return getattr(transformed_normal, self._func)(*args) - - shapes = [(1,), (2, 3), 6] - - # Test log_prob - for shape, hybridize in itertools.product(shapes, [True, False]): - loc = np.random.uniform(-1, 1, shape) - loc.attach_grad() - scale = np.random.uniform(0.5, 1.5, shape) - scale.attach_grad() - samples = np.random.normal(size=shape) - net = TestAffineTransform('log_prob') - if hybridize: - net.hybridize() - with autograd.record(): - mx_out = net(loc, scale, samples) - np_out = _np.log(ss.norm(loc.asnumpy(), - scale.asnumpy()).pdf(samples.asnumpy())) - assert_almost_equal(mx_out.asnumpy(), np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - mx_out.backward() - loc_expected_grad = ((samples - loc) / scale ** 2).asnumpy() - scale_expected_grad = (samples - loc) ** 2 * \ - np.power(scale, -3) - (1 / scale) - assert_almost_equal(loc.grad.asnumpy(), loc_expected_grad, atol=1e-4, - rtol=1e-3, use_broadcast=False) - assert_almost_equal(scale.grad.asnumpy(), scale_expected_grad, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - # Test sampling - for shape, hybridize in itertools.product(shapes, [True, False]): - loc = np.random.uniform(-1, 1, shape) - loc.attach_grad() - scale = np.random.uniform(0.5, 1.5, shape) - scale.attach_grad() - if not isinstance(shape, tuple): - shape = (shape,) - expected_shape = (4, 5) + shape - net = TestAffineTransform('sample') - mx_out = net(loc, scale, expected_shape).asnumpy() - assert mx_out.shape == expected_shape - - -@use_np -def test_compose_transform_v1(): - class TestComposeTransform(HybridBlock): - def __init__(self, func): - super(TestComposeTransform, self).__init__() - self._func = func - - def hybrid_forward(self, F, loc, scale, *args): - # Generate a log_normal distribution. 
- std_normal = mgp.Normal(F.np.zeros_like(loc), - F.np.ones_like(scale), F) - transforms = mgp.ComposeTransform([ - mgp.AffineTransform(loc=0, scale=scale), - mgp.AffineTransform(loc=loc, scale=1), - mgp.ExpTransform() - ]) - transformed_normal = mgp.TransformedDistribution( - std_normal, transforms) - if (len(args) == 0): - return getattr(transformed_normal, self._func) - return getattr(transformed_normal, self._func)(*args) - - shapes = [(1,), (2, 3), 6] - # Test log_prob - for shape, hybridize in itertools.product(shapes, [True, False]): - loc = np.random.uniform(-1, 1, shape) - loc.attach_grad() - scale = np.random.uniform(0.5, 1.5, shape) - scale.attach_grad() - samples = np.random.uniform(1, 2, size=shape) - net = TestComposeTransform('log_prob') - if hybridize: - net.hybridize() - with autograd.record(): - mx_out = net(loc, scale, samples) - np_out = ss.lognorm(s=scale.asnumpy(), scale=np.exp( - loc).asnumpy()).logpdf(samples.asnumpy()) - assert_almost_equal(mx_out.asnumpy(), np_out, atol=1e-4, - rtol=1e-3, use_broadcast=False) - - -@use_np -def test_cached_property_v1(): - x = np.random.normal() - x.attach_grad() - scale = 0.1 - - class Dummy(object): - def __init__(self, x): - super(Dummy, self).__init__() - self.x = x - - @mgp.cached_property - def y(self): - return scale * self.x + 1 - - with autograd.record(): - obj = Dummy(x) - obj.y.backward() - assert_almost_equal(x.grad.asnumpy(), scale * np.ones((1,))) - - class DummyBlock(HybridBlock): - def hybrid_forward(self, F, x): - obj = Dummy(x) - return obj.y - - x = np.random.normal() - x.attach_grad() - net = DummyBlock() - with autograd.record(): - y = net(x) - y.backward() - assert_almost_equal(x.grad.asnumpy(), scale * np.ones((1,))) - - x = np.random.normal() - x.attach_grad() - net.hybridize() - with autograd.record(): - y = net(x) - y.backward() - assert_almost_equal(x.grad.asnumpy(), scale * np.ones((1,))) - - -@use_np -def test_independent_v1(): - class TestIndependent(HybridBlock): - def __init__(self, event_dim, func): - super(TestIndependent, self).__init__() - self._event_dim = event_dim - self._func = func - - def hybrid_forward(self, F, logit, *args): - base_dist = mgp.Bernoulli(logit=logit) - reshaped_dist = mgp.Independent(base_dist, self._event_dim) - return getattr(reshaped_dist, self._func)(*args) - - event_shapes = [(1,), (4,), (2, 2)] - batch_shapes = [(2, 3), (2,)] - for (batch_shape, event_shape) in itertools.product(batch_shapes, event_shapes): - for hybridize in [False, True]: - for func in ['log_prob']: - full_shape = batch_shape + event_shape - logit = np.random.normal(0, 2, size=full_shape) - samples = np.round(np.random.uniform(size=full_shape)) - net = TestIndependent(len(event_shape), func) - if hybridize: - net.hybridize() - mx_out = net(logit, samples) - assert mx_out.shape == batch_shape - - -@use_np -def test_gluon_kl_v1(): - def _test_zero_kl(p, shape): - """Check if KL(p || p) = 0 - - Parameters - ---------- - p : Distribution - """ - mx_out = mgp.kl_divergence(p, p).asnumpy() - np_out = _np.zeros(shape) - assert_almost_equal(mx_out, np_out, atol=1e-3, - rtol=1e-2, use_broadcast=False) - - def _test_monte_carlo(p, q, M=50000): - r"""Check if KL(p || q) is approximately equal to - 1/M * \Sum_{i=1}^{M} log(p(x_i) / q(x_i)), x_i ~ p(x) - """ - kl = mgp.kl_divergence(p, q) - mc_approx = mgp.empirical_kl(p, q, M) - assert_almost_equal(mc_approx.asnumpy(), kl.asnumpy(), atol=1e-1, - rtol=1e-1, use_broadcast=False) - - def _dist_factory(dist, *param_funcs): - """Generate a distribution object with 
parameters of random value. - - Parameters - ---------- - dist : Type - A type of distribution. - param_funcs : List - A list of functions that generate valid parameters for `dist` - """ - params = [f() if callable(f) else f for f in param_funcs] - return dist(*params) - - # could cause longer runtime and potential flaky tests - monte_carlo_test = False - repeated_times = 50000 - shapes = [(), (1,), (2, 3), 6] - - # Test kl between same distributions - # uniform - for shape in shapes: - dist = mgp.Uniform - def low(): return np.random.uniform(0, 1, shape) - def high(): return np.random.uniform(1, 2, shape) - _test_zero_kl(_dist_factory(dist, low, high), shape) - - # normal, laplace, cauchy, gumbel - for dist in [mgp.Normal, mgp.Laplace, mgp.Cauchy, mgp.Gumbel]: - for shape in shapes: - def loc(): return np.random.uniform(-1, 1, shape) - def scale(): return np.random.uniform(0.5, 1.5, shape) - _test_zero_kl(_dist_factory(dist, loc, scale), shape) - if monte_carlo_test: - _test_monte_carlo(_dist_factory(dist, loc, scale), - _dist_factory(dist, loc, scale), - repeated_times) - - # poisson - for shape in shapes[1:]: - dist = mgp.Poisson - def rate(): return np.random.uniform(0.5, 1.5, shape) - _test_zero_kl(_dist_factory(dist, rate), shape) - if monte_carlo_test: - _test_monte_carlo(_dist_factory(dist, rate), - _dist_factory(dist, rate), - repeated_times) - - # exponential, geometric - for dist in [mgp.Exponential, mgp.Geometric]: - for shape in shapes: - def s(): return np.random.uniform(size=shape, low=1e-3) - _test_zero_kl(_dist_factory(dist, s), shape) - if monte_carlo_test: - _test_monte_carlo(_dist_factory(dist, s), - _dist_factory(dist, s), - repeated_times) - - # pareto - for shape in shapes: - dist = mgp.Pareto - def alpha(): return np.random.uniform(size=shape) - def scale(): return np.random.uniform(size=shape) - _test_zero_kl(_dist_factory(dist, alpha, scale), shape) - - for shape in shapes: - dist = mgp.HalfNormal - def scale(): return np.random.uniform(0.5, 1.5, shape) - _test_zero_kl(_dist_factory(dist, scale), shape) - if monte_carlo_test: - _test_monte_carlo(_dist_factory(dist, scale), - _dist_factory(dist, scale), - repeated_times) - - # gamma, beta - for dist in [mgp.Gamma, mgp.Beta]: - for shape in shapes: - def param1(): return np.random.uniform(0.5, 1.5, shape) - def param2(): return np.random.uniform(0.5, 1.5, shape) - _test_zero_kl(_dist_factory(dist, param1, param2), shape) - if monte_carlo_test: - _test_monte_carlo(_dist_factory(dist, param1, param2), - _dist_factory(dist, param1, param2), - 50000) - - # binomial - for shape in shapes: - n = _np.random.randint(5, 10) - prob = np.random.uniform(low=0.1, size=shape) - dist = mgp.Binomial(n=n, prob=prob) - _test_zero_kl(dist, shape) - - # bernoulli - for shape in shapes: - prob = np.random.uniform(size=shape) - dist = mgp.Bernoulli(prob=prob) - _test_zero_kl(dist, shape) - - event_shapes = [3, 5, 10] - loc_shapes = [(), (2,), (4, 2)] - cov_shapes = [(), (2,), (4, 2)] - for loc_shape, cov_shape, event_shape in itertools.product(loc_shapes, cov_shapes, event_shapes): - loc = np.random.randn(*(loc_shape + (event_shape,))) - _s = np.random.randn(*(cov_shape + (event_shape, event_shape))) - sigma = np.matmul(_s, np.swapaxes(_s, -1, -2)) + np.eye(event_shape) - dist = mgp.MultivariateNormal(loc, cov=sigma) - desired_shape = (loc + sigma[..., 0]).shape[:-1] - _test_zero_kl(dist, desired_shape) - - batch_shapes = loc_shapes - # dirichlet - for event_shape, batch_shape in itertools.product(event_shapes, batch_shapes): - 
desired_shape = (batch_shape if batch_shape is not None else ()) - dist = mgp.Dirichlet - def alpha(): return np.random.uniform( - 0.5, 1.5, size=(desired_shape + (event_shape,))) - _test_zero_kl(_dist_factory(dist, alpha), desired_shape) - if monte_carlo_test: - _test_monte_carlo(_dist_factory(dist, alpha), - _dist_factory(dist, alpha), - 50000) - - # categorical, One-hot categorical - for dist in [mgp.Categorical, mgp.OneHotCategorical]: - for event_shape, batch_shape in itertools.product(event_shapes, batch_shapes): - prob = (lambda: - np.array(_np.random.dirichlet([1 / event_shape] * event_shape, size=batch_shape))) - _test_zero_kl(_dist_factory(dist, event_shape, prob), batch_shape) - if monte_carlo_test: - _test_monte_carlo(_dist_factory(dist, event_shape, prob), - _dist_factory(dist, event_shape, prob), - repeated_times) - - # Test kl between different distributions - # KL(Uniform || ...) - for shape in shapes: - rhs_dists = [ - mgp.Normal(np.random.uniform(-1, 1, shape), - np.random.uniform(0.5, 1.5, shape)), - mgp.Gumbel(np.random.uniform(-1, 1, shape), - np.random.uniform(0.5, 1.5, shape)), - ] - for rhs_dist in rhs_dists: - low = np.random.uniform(-1, 1, shape) - high = low + np.random.uniform(0.5, 1.5, shape) - lhs_dist = mgp.Uniform(low, high) - kl = mgp.kl_divergence(lhs_dist, rhs_dist) - assert kl.shape == low.shape - if monte_carlo_test: - _test_monte_carlo(lhs_dist, rhs_dist, repeated_times) - - # KL(Exponential || ...) - for shape in shapes: - rhs_dists = [ - mgp.Normal(np.random.uniform(-1, 1, shape), - np.random.uniform(0.5, 1.5, shape)), - mgp.Gumbel(np.random.uniform(-1, 1, shape), - np.random.uniform(0.5, 1.5, shape)), - mgp.Gamma(np.random.uniform(0.5, 1.5, shape), - np.random.uniform(0.5, 1.5, shape)) - ] - for rhs_dist in rhs_dists: - s = np.random.uniform(size=shape) - lhs_dist = mgp.Exponential(s) - kl = mgp.kl_divergence(lhs_dist, rhs_dist) - assert kl.shape == s.shape - if monte_carlo_test: - _test_monte_carlo(lhs_dist, rhs_dist, repeated_times) - - -@pytest.mark.garbage_expected -@use_np -def test_gluon_stochastic_block_v1(): - class dummyBlock(StochasticBlock): - """In this test case, we generate samples from a Gaussian parameterized - by `loc` and `scale` and accumulate the KL-divergence between it and - its prior and the l2 norm of `loc` into the block's loss storage.""" - @StochasticBlock.collectLoss - def hybrid_forward(self, F, loc, scale): - qz = mgp.Normal(loc, scale) - # prior - pz = mgp.Normal(F.np.zeros_like(loc), F.np.ones_like(scale)) - self.add_loss(mgp.kl_divergence(qz, pz)) - self.add_loss((loc ** 2).sum(1)) - return qz.sample() - - shape = (4, 4) - for hybridize in [True, False]: - net = dummyBlock() - if hybridize: - net.hybridize() - loc = np.random.randn(*shape) - scale = np.random.rand(*shape) - mx_out = net(loc, scale).asnumpy() - kl = net.losses[0].asnumpy() - l2_norm = net.losses[1].asnumpy() - assert mx_out.shape == loc.shape - assert kl.shape == loc.shape - assert l2_norm.shape == shape[:-1] - - -@use_np -def test_gluon_stochastic_block_exception_v1(): - class problemBlock(StochasticBlock): - def hybrid_forward(self, F, loc, scale): - qz = mgp.Normal(loc, scale) - # prior - pz = mgp.Normal(F.np.zeros_like(loc), F.np.ones_like(scale)) - self.add_loss(mgp.kl_divergence(qz, pz)) - self.add_loss((loc ** 2).sum(1)) - return qz.sample() - - shape = (4, 4) - for hybridize in [True, False]: - net = problemBlock() - if hybridize: - net.hybridize() - loc = np.random.randn(*shape) - scale = np.random.rand(*shape) - with 
pytest.raises(ValueError): - mx_out = net(loc, scale).asnumpy() - - -@pytest.mark.garbage_expected -@use_np -def test_gluon_stochastic_sequential_v1(): - class normalBlock(HybridBlock): - def hybrid_forward(self, F, x): - return (x + 1) - - class stochasticBlock(StochasticBlock): - @StochasticBlock.collectLoss - def hybrid_forward(self, F, x): - self.add_loss(x ** 2) - self.add_loss(x - 1) - return (x + 1) - - class problemBlock(StochasticBlock): - def hybrid_forward(self, F, x): - self.add_loss(x ** 2) - self.add_loss(x - 1) - return (x + 1) - - shape = (4, 4) - for hybridize in [True, False]: - initial_value = np.ones(shape) - net = StochasticSequential() - net.add(stochasticBlock()) - net.add(normalBlock()) - net.add(stochasticBlock()) - net.add(normalBlock()) - if hybridize: - net.hybridize() - mx_out = net(initial_value).asnumpy() - assert_almost_equal(mx_out, _np.ones(shape) * 5) - accumulated_loss = net.losses - assert len(accumulated_loss) == 2 - assert_almost_equal(accumulated_loss[0][0].asnumpy(), _np.ones(shape)) - assert_almost_equal( - accumulated_loss[0][1].asnumpy(), _np.ones(shape) - 1) - assert_almost_equal( - accumulated_loss[1][0].asnumpy(), _np.ones(shape) * 9) - assert_almost_equal( - accumulated_loss[1][1].asnumpy(), _np.ones(shape) + 1) - - for hybridize in [True, False]: - initial_value = np.ones(shape) - net = StochasticSequential() - net.add(stochasticBlock()) - net.add(normalBlock()) - net.add(problemBlock()) - net.add(normalBlock()) - if hybridize: - net.hybridize() - with pytest.raises(ValueError): - mx_out = net(initial_value).asnumpy() - - -@use_np -def test_gluon_constraint_v1(): - class TestConstraint(HybridBlock): - def __init__(self, constraint_type): - super(TestConstraint, self).__init__() - self._constraint_type = getattr(mgp.constraint, constraint_type) - - def hybrid_forward(self, F, *params): - value = params[0] - constraint_param = params[1:] - if len(constraint_param) == 0: - constraint = self._constraint_type() - else: - constraint = self._constraint_type(*constraint_param) - return constraint.check(value) - - _s = np.random.randn(5, 10, 10) - psd_matrix = np.matmul(_s, np.swapaxes(_s, -1, -2)) + np.eye(_s.shape[-1]) - - constraints_zoo = [ - # (constraint_type, constraint_param, test_samples) - ('Real', (), [np.random.randn(2, 2)]), - ('Boolean', (), [np.random.randint(0, 20, size=(2, 2)) % 2 == 0]), - ('Interval', [np.zeros((2, 2)), np.ones( - (2, 2))], [np.random.rand(2, 2)]), - ('OpenInterval', [np.zeros((2, 2)), np.ones( - (2, 2))], [np.random.rand(2, 2)]), - ('HalfOpenInterval', [np.zeros((2, 2)), - np.ones((2, 2))], [np.random.rand(2, 2)]), - ('IntegerInterval', [np.zeros((2, 2)), np.ones((2, 2)) * 10], - [np.random.randint(0, 10, size=(2, 2)).astype('float32')]), - ('IntegerOpenInterval', [np.zeros((2, 2)), np.ones((2, 2)) * 10], - [np.random.randint(1, 9, size=(2, 2)).astype('float32')]), - ('IntegerHalfOpenInterval', [np.zeros((2, 2)), np.ones((2, 2)) * 10], - [np.random.randint(1, 9, size=(2, 2)).astype('float32')]), - ('GreaterThan', [np.zeros((2, 2))], [np.random.rand(2, 2)]), - ('GreaterThanEq', [np.zeros((2, 2))], [np.random.rand(2, 2)]), - ('LessThan', [np.ones((2, 2))], [np.random.rand(2, 2)]), - ('LessThanEq', [np.ones((2, 2))], [np.random.rand(2, 2)]), - ('IntegerGreaterThan', [np.zeros((2, 2))], - [np.random.randint(1, 10, size=(2, 2)).astype('float32')]), - ('IntegerGreaterThanEq', [np.zeros((2, 2))], - [np.random.randint(0, 10, size=(2, 2)).astype('float32')]), - ('IntegerLessThan', [np.ones((2, 2)) * 10], - 
[np.random.randint(0, 9, size=(2, 2)).astype('float32')]), - ('IntegerLessThanEq', [np.ones((2, 2)) * 10], - [np.random.randint(0, 10, size=(2, 2)).astype('float32')]), - ('Positive', (), [np.random.rand(2, 2)]), - ('NonNegative', (), [np.random.rand(2, 2)]), - ('PositiveInteger', (), [np.random.randint( - 1, 5, size=(2, 2)).astype('float32')]), - ('NonNegativeInteger', (), [np.random.randint( - 0, 5, size=(2, 2)).astype('float32')]), - ('Simplex', (), [npx.softmax(np.random.randn(4, 4), axis=-1)]), - ('LowerTriangular', (), [np.tril(np.random.randn(5, 3, 3))]), - ('LowerCholesky', (), [np.linalg.cholesky(psd_matrix)]), - ('PositiveDefinite', (), [psd_matrix]), - ] - - for (constraint_type, constraint_arg, test_samples) in constraints_zoo: - for hybridize in [True, False]: - net = TestConstraint(constraint_type) - if hybridize: - net.hybridize() - for test_sample in test_samples: - mx_out = net(test_sample, *constraint_arg).asnumpy() - assert_almost_equal(mx_out, test_sample.asnumpy()) - - -@use_np -def test_gluon_domain_map_v1(): - class TestDomainMap(HybridBlock): - def __init__(self, constraint_type, bijective): - super(TestDomainMap, self).__init__() - self._constraint_type = getattr(mgp.constraint, constraint_type) - - def hybrid_forward(self, F, *params): - value = params[0] - constraint_param = params[1:] - if len(constraint_param) == 0: - constraint = self._constraint_type() - else: - constraint = self._constraint_type(*constraint_param) - if bijective: - bijector = mgp.biject_to(constraint) - bijector.F = F - value = bijector(value) - else: - transformation = mgp.transform_to(constraint) - transformation.F = F - value = transformation(value) - return (value, constraint.check(value)) - - constraints_zoo = [ - # (constraint_type, constraint_param) - ('Positive', ()), - ('GreaterThan', [np.random.randn(2, 2)]), - ('GreaterThanEq', [np.random.randn(2, 2)]), - ('LessThan', [np.random.randn(2, 2)]), - ('Interval', [np.random.uniform(0, 1, (2, 2)), - np.random.uniform(2, 3, (2, 2))]), - ('HalfOpenInterval', [np.random.uniform( - 0, 1, (2, 2)), np.random.uniform(2, 3, (2, 2))]) - ] - - test_sample = np.random.randn(2, 2) - - for (constraint_type, constraint_arg) in constraints_zoo: - for bijective in [True, False]: - for hybridize in [True, False]: - net = TestDomainMap(constraint_type, bijective) - if hybridize: - net.hybridize() - constrained_out, constraint_status = net( - test_sample, *constraint_arg) - assert_almost_equal(constrained_out.asnumpy(), - constraint_status.asnumpy()) diff --git a/tests/python/unittest/test_gluon_probability_v2.py b/tests/python/unittest/test_gluon_probability_v2.py index 6d3e31412de0..33f8e27e5cf9 100644 --- a/tests/python/unittest/test_gluon_probability_v2.py +++ b/tests/python/unittest/test_gluon_probability_v2.py @@ -56,21 +56,6 @@ def _distribution_method_invoker(dist, func, *args): return getattr(dist, func)(*args) -def test_mgp_getF(): - # Test getF - getF = mgp.utils.getF - nd = mx.nd - sym = mx.sym - assert getF(nd.ones((2, 2)), nd.ones((2, 2))) == nd - assert getF(sym.ones((2, 2)), sym.ones((2, 2))) == sym - assert getF(1.0, 2.0) == nd - - # Test exception - with pytest.raises(TypeError): - getF(nd.ones((2, 2)), sym.ones((2, 2))) - getF(sym.ones((2, 2)), nd.ones((2, 2))) - - @use_np def test_gluon_uniform(): class TestUniform(HybridBlock): diff --git a/tests/python/unittest/test_gluon_rnn.py b/tests/python/unittest/test_gluon_rnn.py index 49921413ad5f..5a2661dddb54 100644 --- a/tests/python/unittest/test_gluon_rnn.py +++ 
b/tests/python/unittest/test_gluon_rnn.py @@ -16,10 +16,9 @@ # under the License. import mxnet as mx -from mxnet import gluon, nd -import numpy as np +from mxnet import gluon, np +import numpy as _np import copy -from itertools import product from functools import partial from numpy.testing import assert_allclose import pytest @@ -32,47 +31,43 @@ def check_rnn_states(fused_states, stack_states, num_layers, bidirectional=False assert len(stack_states) / len(fused_states) == num_layers * directions fused_states = [state.asnumpy() for state in fused_states] - stack_states = [np.expand_dims(state.asnumpy(), axis=0) for state in stack_states] + stack_states = [_np.expand_dims(state.asnumpy(), axis=0) for state in stack_states] if is_lstm: stack_states_h = stack_states[0::2] stack_states_c = stack_states[1::2] - stack_states = [np.concatenate(stack_states_h, axis=0), np.concatenate(stack_states_c, axis=0)] + stack_states = [_np.concatenate(stack_states_h, axis=0), _np.concatenate(stack_states_c, axis=0)] else: - stack_states = [np.concatenate(stack_states, axis=0)] + stack_states = [_np.concatenate(stack_states, axis=0)] for f, s in zip(fused_states, stack_states): assert f.shape == s.shape assert_almost_equal(f, s, atol=1e-4, rtol=1e-4) +@mx.util.use_np def test_rnn(): cell = gluon.rnn.RNNCell(100) - inputs = [mx.sym.Variable('t%d_data'%i) for i in range(3)] + inputs = [mx.np.ones((10,50)) for i in range(3)] + cell.infer_shape(0, inputs[0], False) + cell.initialize() outputs, _ = cell.unroll(3, inputs) - outputs = mx.sym.Group(outputs) assert sorted(cell.collect_params().keys()) == ['h2h_bias', 'h2h_weight', 'i2h_bias', 'i2h_weight'] - assert outputs.list_outputs() == [ - 'rnncell_t0_out_output', 'rnncell_t1_out_output', - 'rnncell_t2_out_output' - ] - args, outs, auxs = outputs.infer_shape(t0_data=(10,50), t1_data=(10,50), t2_data=(10,50)) + outs = [o.shape for o in outputs] assert outs == [(10, 100), (10, 100), (10, 100)] +@mx.util.use_np def test_lstm(): cell = gluon.rnn.LSTMCell(100) - inputs = [mx.sym.Variable('t%d_data'%i) for i in range(3)] + inputs = [mx.np.ones((10,50)) for i in range(3)] + cell.infer_shape(0, inputs[0], False) + cell.initialize() outputs, _ = cell.unroll(3, inputs) - outputs = mx.sym.Group(outputs) assert sorted(cell.collect_params().keys()) == ['h2h_bias', 'h2h_weight', 'i2h_bias', 'i2h_weight'] - assert outputs.list_outputs() == [ - 'lstmcell_t0_out_output', 'lstmcell_t1_out_output', - 'lstmcell_t2_out_output' - ] - args, outs, auxs = outputs.infer_shape(t0_data=(10,50), t1_data=(10,50), t2_data=(10,50)) + outs = [o.shape for o in outputs] assert outs == [(10, 100), (10, 100), (10, 100)] @@ -83,7 +78,7 @@ def test_lstmp(): rtol, atol = 1e-4, 1e-4 batch_size, seq_len = 5, 3 input_size = 128 - lstm_input = mx.nd.uniform(shape=(seq_len, batch_size, input_size)) + lstm_input = mx.np.random.uniform(size=(seq_len, batch_size, input_size)) # ==== Unidirectional Layer ==== for num_layers in [1, 3]: @@ -95,16 +90,17 @@ def test_lstmp(): stack_layer.add(gluon.rnn.LSTMPCell(hidden_size, projection_size=projection_size)) fused_layer.initialize() - stack_layer.initialize() fused_begin_state = fused_layer.begin_state(batch_size) stack_begin_state = stack_layer.begin_state(batch_size=batch_size) fused_layer.infer_shape(lstm_input, fused_begin_state) + stack_layer.infer_shape(0, lstm_input, False) + stack_layer.initialize() fused_layer_params = fused_layer.collect_params() stack_layer_params = stack_layer.collect_params() for name, value in fused_layer_params.items(): - w = 
mx.nd.random.uniform(shape=value.shape) + w = mx.np.random.uniform(size=value.shape) value.set_data(w.copy()) stack_layer_params[name[1:].replace('_', '.', 1)].set_data(w.copy()) @@ -134,11 +130,13 @@ def test_lstmp(): fused_begin_state = fused_layer.begin_state(batch_size) stack_begin_state = stack_layer.begin_state(batch_size=batch_size) fused_layer.infer_shape(lstm_input, fused_begin_state) + stack_layer.infer_shape(0, lstm_input, False) + stack_layer.initialize() fused_layer_params = fused_layer.collect_params() stack_layer_params = stack_layer.collect_params() for name, value in fused_layer_params.items(): - w = mx.nd.random.uniform(shape=value.shape) + w = mx.np.random.uniform(size=value.shape) value.set_data(w.copy()) cur = name.split("_")[0] stack_layer_params["{}.{}_cell.{}".format(cur[1:], name[0], name[len(cur)+1:])].set_data(w.copy()) @@ -152,6 +150,7 @@ def test_lstmp(): check_rnn_states(fused_states, stack_states, num_layers, True) +@mx.util.use_np @assert_raises_cudnn_not_satisfied(min_version='5.1.10') def test_lstm_cpu_inference(): # should behave the same as lstm cell @@ -159,7 +158,7 @@ def test_lstm_cpu_inference(): [0.72045636, 0.72045636, 0.95215213, 0.95215213]], [[0.95215213, 0.95215213, 0.72045636, 0.72045636], [0.95215213, 0.95215213, 0.72045636, 0.72045636]]]) - x = mx.nd.ones(shape=(2, 2, 2)) + x = mx.np.ones(shape=(2, 2, 2)) model = mx.gluon.rnn.LSTM(2, num_layers=6, bidirectional=True) model.initialize(mx.init.One()) @@ -168,79 +167,58 @@ def test_lstm_cpu_inference(): rtol=1e-3, atol=1e-5) +@mx.util.use_np def test_gru(): cell = gluon.rnn.GRUCell(100, activation='relu', recurrent_activation='tanh') - inputs = [mx.sym.Variable('t%d_data'%i) for i in range(3)] + inputs = [mx.np.ones((10,50)) for i in range(3)] + cell.infer_shape(0, inputs[0], False) + cell.initialize() outputs, _ = cell.unroll(3, inputs) - outputs = mx.sym.Group(outputs) assert sorted(cell.collect_params().keys()) == ['h2h_bias', 'h2h_weight', 'i2h_bias', 'i2h_weight'] - assert outputs.list_outputs() == [ - 'grucell_t0_out_output', 'grucell_t1_out_output', - 'grucell_t2_out_output' - ] - args, outs, auxs = outputs.infer_shape(t0_data=(10,50), t1_data=(10,50), t2_data=(10,50)) + outs = [o.shape for o in outputs] assert outs == [(10, 100), (10, 100), (10, 100)] +@mx.util.use_np @pytest.mark.serial def test_residual(): cell = gluon.rnn.ResidualCell(gluon.rnn.GRUCell(50)) - inputs = [mx.sym.Variable('t%d_data'%i) for i in range(2)] + inputs = [mx.np.ones((10,50)) for i in range(2)] + cell.infer_shape(0, inputs[0], False) + cell.initialize() outputs, _ = cell.unroll(2, inputs) - outputs = mx.sym.Group(outputs) params = cell.collect_params() assert sorted(params.keys()) == \ ['base_cell.h2h_bias', 'base_cell.h2h_weight', 'base_cell.i2h_bias', 'base_cell.i2h_weight'] - args, outs, auxs = outputs.infer_shape(t0_data=(10, 50), t1_data=(10, 50)) + outs = [o.shape for o in outputs] assert outs == [(10, 50), (10, 50)] - outputs = outputs.eval(**{'t0_data': mx.nd.ones((10, 50)), - 't1_data': mx.nd.ones((10, 50)), - cell.base_cell.i2h_weight.var().name: mx.nd.zeros((150, 50)), - cell.base_cell.i2h_bias.var().name: mx.nd.zeros((150, )), - cell.base_cell.h2h_weight.var().name: mx.nd.zeros((150, 50)), - cell.base_cell.h2h_bias.var().name: mx.nd.zeros((150, ))}) - expected_outputs = np.ones((10, 50)) - assert np.array_equal(outputs[0].asnumpy(), expected_outputs) - assert np.array_equal(outputs[1].asnumpy(), expected_outputs) +@mx.util.use_np @pytest.mark.serial def test_residual_bidirectional(): cell = 
gluon.rnn.ResidualCell( gluon.rnn.BidirectionalCell( gluon.rnn.GRUCell(25), gluon.rnn.GRUCell(25))) - inputs = [mx.sym.Variable('rnn_t%d_data'%i) for i in range(2)] + inputs = [mx.np.ones((10,50)) for i in range(2)] + cell.infer_shape(0, inputs[0], True) + cell.initialize() outputs, _ = cell.unroll(2, inputs, merge_outputs=False) - outputs = mx.sym.Group(outputs) params = cell.collect_params() assert sorted(params.keys()) == \ ['base_cell.l_cell.h2h_bias', 'base_cell.l_cell.h2h_weight', 'base_cell.l_cell.i2h_bias', 'base_cell.l_cell.i2h_weight', 'base_cell.r_cell.h2h_bias', 'base_cell.r_cell.h2h_weight', 'base_cell.r_cell.i2h_bias', 'base_cell.r_cell.i2h_weight'] - # assert outputs.list_outputs() == \ - # ['bi_t0_plus_residual_output', 'bi_t1_plus_residual_output'] - args, outs, auxs = outputs.infer_shape(rnn_t0_data=(10, 50), rnn_t1_data=(10, 50)) + outs = [o.shape for o in outputs] assert outs == [(10, 50), (10, 50)] - outputs = outputs.eval(**{'rnn_t0_data':mx.nd.ones((10, 50))+5, - 'rnn_t1_data':mx.nd.ones((10, 50))+5, - cell.base_cell.l_cell.i2h_weight.var().name:mx.nd.zeros((75, 50)), - cell.base_cell.l_cell.i2h_bias.var().name:mx.nd.zeros((75,)), - cell.base_cell.l_cell.h2h_weight.var().name:mx.nd.zeros((75, 25)), - cell.base_cell.l_cell.h2h_bias.var().name:mx.nd.zeros((75,)), - cell.base_cell.r_cell.i2h_weight.var().name:mx.nd.zeros((75, 50)), - cell.base_cell.r_cell.i2h_bias.var().name:mx.nd.zeros((75,)), - cell.base_cell.r_cell.h2h_weight.var().name:mx.nd.zeros((75, 25)), - cell.base_cell.r_cell.h2h_bias.var().name:mx.nd.zeros((75,))}) - expected_outputs = np.ones((10, 50))+5 - assert np.array_equal(outputs[0].asnumpy(), expected_outputs) - assert np.array_equal(outputs[1].asnumpy(), expected_outputs) +@mx.util.use_np def test_stack(): cell = gluon.rnn.SequentialRNNCell() for i in range(5): @@ -248,9 +226,10 @@ def test_stack(): cell.add(gluon.rnn.ResidualCell(gluon.rnn.LSTMCell(100))) else: cell.add(gluon.rnn.LSTMCell(100)) - inputs = [mx.sym.Variable('t%d_data'%i) for i in range(3)] + inputs = [mx.np.ones((10,50)) for i in range(3)] + cell.infer_shape(0, inputs[0], False) + cell.initialize() outputs, _ = cell.unroll(3, inputs) - outputs = mx.sym.Group(outputs) keys = sorted(cell.collect_params().keys()) for i in range(5): if i==1: @@ -263,12 +242,12 @@ def test_stack(): assert '1.base_cell.h2h_bias' in keys assert '1.base_cell.i2h_weight' in keys assert '1.base_cell.i2h_bias' in keys - assert outputs.list_outputs() == ['lstmcell_t0_out_output', 'lstmcell_t1_out_output', 'lstmcell_t2_out_output'] - args, outs, auxs = outputs.infer_shape(t0_data=(10,50), t1_data=(10,50), t2_data=(10,50)) + outs = [o.shape for o in outputs] assert outs == [(10, 100), (10, 100), (10, 100)] +@mx.util.use_np @pytest.mark.serial def test_hybridstack(): cell = gluon.rnn.HybridSequentialRNNCell() @@ -277,9 +256,10 @@ def test_hybridstack(): cell.add(gluon.rnn.ResidualCell(gluon.rnn.LSTMCell(100))) else: cell.add(gluon.rnn.LSTMCell(100)) - inputs = [mx.sym.Variable('t%d_data'%i) for i in range(3)] + inputs = [mx.np.ones((10,50)) for i in range(3)] + cell.infer_shape(0, inputs[0], False) + cell.initialize() outputs, _ = cell.unroll(3, inputs) - outputs = mx.sym.Group(outputs) keys = sorted(cell.collect_params().keys()) for i in range(5): if i==1: @@ -292,9 +272,8 @@ def test_hybridstack(): assert '1.base_cell.h2h_bias' in keys assert '1.base_cell.i2h_weight' in keys assert '1.base_cell.i2h_bias' in keys - assert outputs.list_outputs() == ['lstmcell_t0_out_output', 'lstmcell_t1_out_output', 
'lstmcell_t2_out_output'] - args, outs, auxs = outputs.infer_shape(t0_data=(10,50), t1_data=(10,50), t2_data=(10,50)) + outs = [o.shape for o in outputs] assert outs == [(10, 100), (10, 100), (10, 100)] # Test HybridSequentialRNNCell nested in nn.HybridBlock, SequentialRNNCell will fail in this case @@ -312,30 +291,36 @@ def __init__(self): self.rnncell = gluon.rnn.BidirectionalCell(cell0, cell1) - def hybrid_forward(self, F, x): + def forward(self, x): return self.rnncell.unroll(3, x, layout="NTC", merge_outputs=True) + + def infer_shape(self, x, *args): + self.rnncell.infer_shape(0, x, True) - x = mx.nd.random.uniform(shape=(10, 3, 100)) + x = mx.np.random.uniform(size=(10, 3, 100)) net = BidirectionalOfSequential() + net.infer_shape(x) net.initialize() outs, _ = net(x) assert outs.shape == (10, 3, 200) +@mx.util.use_np def test_bidirectional(): cell = gluon.rnn.BidirectionalCell( gluon.rnn.LSTMCell(100), gluon.rnn.LSTMCell(100)) - inputs = [mx.sym.Variable('t%d_data'%i) for i in range(3)] + inputs = [mx.np.ones((10,50)) for i in range(3)] + cell.infer_shape(0, inputs[0], False) + cell.initialize() outputs, _ = cell.unroll(3, inputs) - outputs = mx.sym.Group(outputs) - assert outputs.list_outputs() == ['t0_output', 't1_output', 't2_output'] - args, outs, auxs = outputs.infer_shape(t0_data=(10,50), t1_data=(10,50), t2_data=(10,50)) + outs = [o.shape for o in outputs] assert outs == [(10, 200), (10, 200), (10, 200)] +@mx.util.use_np @assert_raises_cudnn_not_satisfied(min_version='5.1.10') @pytest.mark.serial def test_layer_bidirectional(): @@ -347,19 +332,19 @@ def __init__(self, size, **kwargs): def forward(self, inpt): fwd = self._lstm_fwd(inpt) - bwd_inpt = nd.flip(inpt, 0) + bwd_inpt = np.flip(inpt, 0) bwd = self._lstm_bwd(bwd_inpt) - bwd = nd.flip(bwd, 0) - return nd.concat(fwd, bwd, dim=2) + bwd = np.flip(bwd, 0) + return np.concatenate([fwd, bwd], axis=2) size = 7 in_size = 5 weights = {} for d in ['l', 'r']: - weights['{}0_i2h_weight'.format(d)] = mx.random.uniform(shape=(size*4, in_size)) - weights['{}0_h2h_weight'.format(d)] = mx.random.uniform(shape=(size*4, size)) - weights['{}0_i2h_bias'.format(d)] = mx.random.uniform(shape=(size*4,)) - weights['{}0_h2h_bias'.format(d)] = mx.random.uniform(shape=(size*4,)) + weights['{}0_i2h_weight'.format(d)] = mx.np.random.uniform(size=(size*4, in_size)) + weights['{}0_h2h_weight'.format(d)] = mx.np.random.uniform(size=(size*4, size)) + weights['{}0_i2h_bias'.format(d)] = mx.np.random.uniform(size=(size*4,)) + weights['{}0_h2h_bias'.format(d)] = mx.np.random.uniform(size=(size*4,)) net = gluon.rnn.LSTM(size, bidirectional=True) ref_net = RefBiLSTM(size) @@ -371,19 +356,19 @@ def forward(self, inpt): net_params[k].set_data(weights[k]) ref_net_params[k.replace('l0', '_lstm_fwd.l0').replace('r0', '_lstm_bwd.l0')].set_data(weights[k]) - data = mx.random.uniform(shape=(11, 10, in_size)) + data = mx.np.random.uniform(size=(11, 10, in_size)) assert_allclose(net(data).asnumpy(), ref_net(data).asnumpy(), rtol=1e-04, atol=1e-02) - def test_zoneout(): cell = gluon.rnn.ZoneoutCell(gluon.rnn.RNNCell(100), zoneout_outputs=0.5, zoneout_states=0.5) - inputs = [mx.sym.Variable('rnn_t%d_data'%i) for i in range(3)] + inputs = [mx.np.ones((10,50)) for i in range(3)] + cell.infer_shape(0, inputs[0], False) + cell.initialize() outputs, _ = cell.unroll(3, inputs) - outputs = mx.sym.Group(outputs) - args, outs, auxs = outputs.infer_shape(rnn_t0_data=(10,50), rnn_t1_data=(10,50), rnn_t2_data=(10,50)) + outs = [o.shape for o in outputs] assert outs == [(10, 100), 
(10, 100), (10, 100)] @@ -395,8 +380,9 @@ def test_unroll_layout(): cell.add(gluon.rnn.ResidualCell(gluon.rnn.LSTMCell(100))) else: cell.add(gluon.rnn.LSTMCell(100)) + inputs = [mx.np.random.uniform(size=(10,50)) for _ in range(3)] + cell.infer_shape(0, inputs[0], False) cell.initialize() - inputs = [mx.nd.random.uniform(shape=(10,50)) for _ in range(3)] outputs, _ = cell.unroll(3, inputs, layout='TNC') assert outputs[0].shape == (10, 100) assert outputs[1].shape == (10, 100) @@ -411,10 +397,10 @@ def test_unroll_layout(): def check_rnn_forward_backward(layer, merged_inputs, hybridize, merge_outputs, deterministic): input_size = 5 if merged_inputs: - inputs = mx.nd.ones((8, 3, 5)) + inputs = mx.np.ones((8, 3, 5)) inputs.attach_grad() else: - inputs = [mx.nd.ones((8, 5)) for _ in range(3)] + inputs = [mx.np.ones((8, 5)) for _ in range(3)] for x in inputs: x.attach_grad() @@ -432,12 +418,12 @@ def check_rnn_forward_backward(layer, merged_inputs, hybridize, merge_outputs, d if merge_outputs: np_out = out.asnumpy() else: - np_out = np.stack([x.asnumpy() for x in out], axis=1) + np_out = _np.stack([x.asnumpy() for x in out], axis=1) if merged_inputs: np_dx = inputs.grad.asnumpy() else: - np_dx = np.stack([x.grad.asnumpy() for x in inputs], axis=1) + np_dx = _np.stack([x.grad.asnumpy() for x in inputs], axis=1) with mx.autograd.record(): out = layer.unroll(3, inputs, merge_outputs=not merge_outputs)[0] @@ -446,13 +432,13 @@ def check_rnn_forward_backward(layer, merged_inputs, hybridize, merge_outputs, d if merged_inputs: input_grads = inputs.grad.asnumpy() else: - input_grads = np.stack([x.grad.asnumpy() for x in inputs], axis=1) + input_grads = _np.stack([x.grad.asnumpy() for x in inputs], axis=1) if deterministic: if not merge_outputs: ref_np_out = out.asnumpy() else: - ref_np_out = np.stack([x.asnumpy() for x in out], axis=1) + ref_np_out = _np.stack([x.asnumpy() for x in out], axis=1) mx.test_utils.assert_almost_equal(np_out, ref_np_out, rtol=1e-3, atol=1e-5) mx.test_utils.assert_almost_equal(np_dx, input_grads, rtol=1e-3, atol=1e-5) @@ -495,38 +481,49 @@ def test_sequential_rnn_cells(seq_rnn_type, determinism, merged_inputs, hybridiz check_rnn_forward_backward(net, merged_inputs, hybridize, merge_outputs, determinism) +@mx.util.use_np def test_rnn_cells_export_import(): class RNNLayer(gluon.HybridBlock): def __init__(self): super(RNNLayer, self).__init__() self.cell = gluon.rnn.RNNCell(hidden_size=1) - def hybrid_forward(self, F, seq): + def forward(self, seq): outputs, state = self.cell.unroll(inputs=seq, length=2, merge_outputs=True) return outputs + def infer_shape(self, input): + self.cell.infer_shape(0, input, False) + class LSTMLayer(gluon.HybridBlock): def __init__(self): super(LSTMLayer, self).__init__() self.cell = gluon.rnn.LSTMCell(hidden_size=1) - def hybrid_forward(self, F, seq): + def forward(self, seq): outputs, state = self.cell.unroll(inputs=seq, length=2, merge_outputs=True) return outputs + def infer_shape(self, input): + self.cell.infer_shape(0, input, False) + class GRULayer(gluon.HybridBlock): def __init__(self): super(GRULayer, self).__init__() self.cell = gluon.rnn.GRUCell(hidden_size=1) - def hybrid_forward(self, F, seq): + def forward(self, seq): outputs, state = self.cell.unroll(inputs=seq, length=2, merge_outputs=True) return outputs + + def infer_shape(self, input): + self.cell.infer_shape(0, input, False) for hybrid in [RNNLayer(), LSTMLayer(), GRULayer()]: + input = mx.np.ones(shape=(1, 2, 1), ctx=mx.context.current_context()) + hybrid.infer_shape(input) 
hybrid.initialize() hybrid.hybridize() - input = mx.nd.ones(shape=(1, 2, 1)) output1 = hybrid(input) hybrid.export(path="./model", epoch=0) symbol = mx.gluon.SymbolBlock.imports( @@ -557,7 +554,7 @@ def check_rnn_layer_forward(layer, inputs, states=None, run_only=False, ctx=mx.c assert isinstance(out, (list, tuple)) and len(out) == 2 out = out[0] else: - assert isinstance(out, mx.nd.NDArray) + assert isinstance(out, mx.np.ndarray) out.backward() np_out = out.asnumpy() @@ -572,7 +569,7 @@ def check_rnn_layer_forward(layer, inputs, states=None, run_only=False, ctx=mx.c out = out[0] else: out = layer(inputs) - assert isinstance(out, mx.nd.NDArray) + assert isinstance(out, mx.np.ndarray) out.backward() if states is not None: @@ -585,30 +582,30 @@ def check_rnn_layer_forward(layer, inputs, states=None, run_only=False, ctx=mx.c mx.test_utils.assert_almost_equal(np_dx, inputs.grad.asnumpy(), rtol=1e-3, atol=1e-5) - +@mx.util.use_np def run_rnn_layers(dtype, dtype2, ctx=mx.cpu()): - check_rnn_layer_forward(gluon.rnn.RNN(10, 2, dtype=dtype), mx.nd.ones((8, 3, 20), dtype=dtype), ctx=ctx) - check_rnn_layer_forward(gluon.rnn.RNN(10, 2, dtype=dtype, bidirectional=True), mx.nd.ones((8, 3, 20), dtype=dtype), mx.nd.ones((4, 3, 10), dtype=dtype), ctx=ctx) - check_rnn_layer_forward(gluon.rnn.LSTM(10, 2,dtype=dtype), mx.nd.ones((8, 3, 20), dtype=dtype), ctx=ctx) - check_rnn_layer_forward(gluon.rnn.LSTM(10, 2,dtype=dtype, bidirectional=True), mx.nd.ones((8, 3, 20), dtype=dtype), [mx.nd.ones((4, 3, 10), dtype=dtype), mx.nd.ones((4, 3, 10), dtype=dtype)],ctx=ctx) - check_rnn_layer_forward(gluon.rnn.GRU(10, 2, dtype=dtype, ), mx.nd.ones((8, 3, 20), dtype=dtype),ctx=ctx) - check_rnn_layer_forward(gluon.rnn.GRU(10, 2, dtype=dtype, bidirectional=True), mx.nd.ones((8, 3, 20), dtype=dtype), mx.nd.ones((4, 3, 10), dtype=dtype),ctx=ctx) + check_rnn_layer_forward(gluon.rnn.RNN(10, 2, dtype=dtype), mx.np.ones((8, 3, 20), dtype=dtype), ctx=ctx) + check_rnn_layer_forward(gluon.rnn.RNN(10, 2, dtype=dtype, bidirectional=True), mx.np.ones((8, 3, 20), dtype=dtype), mx.np.ones((4, 3, 10), dtype=dtype), ctx=ctx) + check_rnn_layer_forward(gluon.rnn.LSTM(10, 2,dtype=dtype), mx.np.ones((8, 3, 20), dtype=dtype), ctx=ctx) + check_rnn_layer_forward(gluon.rnn.LSTM(10, 2,dtype=dtype, bidirectional=True), mx.np.ones((8, 3, 20), dtype=dtype), [mx.np.ones((4, 3, 10), dtype=dtype), mx.np.ones((4, 3, 10), dtype=dtype)],ctx=ctx) + check_rnn_layer_forward(gluon.rnn.GRU(10, 2, dtype=dtype, ), mx.np.ones((8, 3, 20), dtype=dtype),ctx=ctx) + check_rnn_layer_forward(gluon.rnn.GRU(10, 2, dtype=dtype, bidirectional=True), mx.np.ones((8, 3, 20), dtype=dtype), mx.np.ones((4, 3, 10), dtype=dtype),ctx=ctx) - check_rnn_layer_forward(gluon.rnn.RNN(10, 2, dtype=dtype, dropout=0.5), mx.nd.ones((8, 3, 20), dtype=dtype), + check_rnn_layer_forward(gluon.rnn.RNN(10, 2, dtype=dtype, dropout=0.5), mx.np.ones((8, 3, 20), dtype=dtype), run_only=True, ctx=ctx) check_rnn_layer_forward(gluon.rnn.RNN(10, 2, bidirectional=True, dropout=0.5, dtype=dtype), - mx.nd.ones((8, 3, 20), dtype=dtype), mx.nd.ones((4, 3, 10), dtype=dtype), run_only=True, ctx=ctx) - check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, dropout=0.5, dtype=dtype), mx.nd.ones((8, 3, 20), dtype=dtype), + mx.np.ones((8, 3, 20), dtype=dtype), mx.np.ones((4, 3, 10), dtype=dtype), run_only=True, ctx=ctx) + check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, dropout=0.5, dtype=dtype), mx.np.ones((8, 3, 20), dtype=dtype), run_only=True, ctx=ctx) check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, bidirectional=True, dropout=0.5, 
dtype=dtype), - mx.nd.ones((8, 3, 20), dtype=dtype), - [mx.nd.ones((4, 3, 10), dtype=dtype), mx.nd.ones((4, 3, 10), dtype=dtype)], run_only=True, ctx=ctx) - check_rnn_layer_forward(gluon.rnn.GRU(10, 2, dropout=0.5, dtype=dtype), mx.nd.ones((8, 3, 20), dtype=dtype), + mx.np.ones((8, 3, 20), dtype=dtype), + [mx.np.ones((4, 3, 10), dtype=dtype), mx.np.ones((4, 3, 10), dtype=dtype)], run_only=True, ctx=ctx) + check_rnn_layer_forward(gluon.rnn.GRU(10, 2, dropout=0.5, dtype=dtype), mx.np.ones((8, 3, 20), dtype=dtype), run_only=True, ctx=ctx) check_rnn_layer_forward(gluon.rnn.GRU(10, 2, bidirectional=True, dropout=0.5, dtype=dtype), - mx.nd.ones((8, 3, 20), dtype=dtype), mx.nd.ones((4, 3, 10), dtype=dtype), run_only=True, ctx=ctx) + mx.np.ones((8, 3, 20), dtype=dtype), mx.np.ones((4, 3, 10), dtype=dtype), run_only=True, ctx=ctx) net = gluon.nn.Sequential() net.add(gluon.rnn.LSTM(10, bidirectional=True, dtype=dtype2)) @@ -618,7 +615,7 @@ def run_rnn_layers(dtype, dtype2, ctx=mx.cpu()): net.initialize(ctx=ctx) net.cast(dtype) with mx.autograd.record(): - out = net(mx.nd.ones((2, 3, 10), dtype=dtype, ctx=ctx)) + out = net(mx.np.ones((2, 3, 10), dtype=dtype, ctx=ctx)) out.backward() out = out.asnumpy() @@ -631,7 +628,7 @@ def run_rnn_layers(dtype, dtype2, ctx=mx.cpu()): net2.initialize(ctx=ctx) net2.cast(dtype) with mx.autograd.record(): - out = net2(mx.nd.ones((2, 3, 10), dtype=dtype, ctx=ctx)) + out = net2(mx.np.ones((2, 3, 10), dtype=dtype, ctx=ctx)) out.backward() out = out.asnumpy() @@ -644,7 +641,7 @@ def run_rnn_layers(dtype, dtype2, ctx=mx.cpu()): net3.initialize(ctx=ctx) net3.cast(dtype2) with mx.autograd.record(): - out = net3(mx.nd.ones((2, 3, 10), dtype=dtype2, ctx=ctx)) + out = net3(mx.np.ones((2, 3, 10), dtype=dtype2, ctx=ctx)) out.backward() out = out.asnumpy() @@ -660,18 +657,20 @@ def test_rnn_layers_fp16(): def check_rnn_consistency(fused_layer, stack_layer, loss, input_size, hidden_size, bidirectional=False, rtol=1e-2, atol=1e-4): - x = nd.random.normal(shape=(1, 5, input_size)) + x = mx.np.random.normal(size=(1, 5, input_size)) fused_begin_state = fused_layer.begin_state(1) stack_states = stack_layer.begin_state(batch_size=1) fused_layer.infer_shape(x, fused_begin_state) fused_layer_params = fused_layer.collect_params() + stack_layer.infer_shape(0, x, False) + stack_layer.initialize() stack_layer_params = stack_layer.collect_params() for name, value in fused_layer_params.items(): if 'weight' in name: - w = mx.nd.zeros(shape=value.shape) + w = mx.np.zeros(shape=value.shape) else: - w = mx.nd.random.normal(shape=value.shape) + w = mx.np.random.normal(size=value.shape) value.set_data(w.copy()) cur = name.split('_')[0] num = cur[1:] @@ -680,13 +679,13 @@ def check_rnn_consistency(fused_layer, stack_layer, loss, input_size, hidden_siz fx = x.copy() sx = x.copy() - y = nd.random.uniform(shape=(1, 5, hidden_size * 2 if bidirectional else hidden_size)) - + y = mx.np.random.uniform(size=(1, 5, hidden_size * 2 if bidirectional else hidden_size)) fx.attach_grad() with mx.autograd.record(): fused_out, fused_states = fused_layer(fx, fused_begin_state) l = loss(fused_out, y).mean() l.backward() + mx.npx.waitall() fused_grads = dict([(name, p.grad()) for name, p in fused_layer.collect_params().items()]) fused_input_grad = fx.grad.asnumpy() @@ -695,6 +694,7 @@ def check_rnn_consistency(fused_layer, stack_layer, loss, input_size, hidden_siz stack_out, stack_states = stack_layer.unroll(5, sx, begin_state=stack_states, merge_outputs=True) l = loss(stack_out, y).mean() l.backward() + mx.npx.waitall() 
stack_grads = dict([(name, p.grad()) for name, p in stack_layer.collect_params().items()]) stack_input_grad = sx.grad.asnumpy() @@ -758,48 +758,17 @@ def check_rnn_bidir_layer_gradients(mode, input_size, hidden_size, num_layers, l check_rnn_consistency(fused_layer, stack_layer, loss, input_size, hidden_size, bidirectional=True) +@mx.util.use_np +@pytest.mark.parametrize('input_size', [8]) +@pytest.mark.parametrize('hidden_size', [8, 16]) +@pytest.mark.parametrize('num_layers', [1, 2, 3, 4]) +@pytest.mark.parametrize('func', [check_rnn_unidir_layer_gradients, + check_rnn_bidir_layer_gradients]) +@pytest.mark.parametrize('mode', ['lstm', 'gru', 'rnn_relu', 'rnn_tanh']) @assert_raises_cudnn_not_satisfied(min_version='5.1.10') -def test_fused_lstm_layer(): - input_sizes = [8] - hidden_sizes = [8, 16] - num_layers = [1, 2, 3, 4] - for input_size, hidden_size, num_layers in product(input_sizes, hidden_sizes, num_layers): - loss = mx.gluon.loss.L2Loss() - check_rnn_unidir_layer_gradients('lstm', input_size, hidden_size, num_layers, loss) - check_rnn_bidir_layer_gradients('lstm', input_size, hidden_size, num_layers, loss) - - -@assert_raises_cudnn_not_satisfied(min_version='5.1.10') -def test_fused_gru_layer(): - input_sizes = [8] - hidden_sizes = [8, 16] - num_layers = [1, 2, 3, 4] - for input_size, hidden_size, num_layers in product(input_sizes, hidden_sizes, num_layers): - loss = mx.gluon.loss.L2Loss() - check_rnn_unidir_layer_gradients('gru', input_size, hidden_size, num_layers, loss) - check_rnn_bidir_layer_gradients('gru', input_size, hidden_size, num_layers, loss) - - -@assert_raises_cudnn_not_satisfied(min_version='5.1.10') -def test_fused_rnnrelu_layer(): - input_sizes = [8] - hidden_sizes = [8, 16] - num_layers = [1, 2, 3, 4] - for input_size, hidden_size, num_layers in product(input_sizes, hidden_sizes, num_layers): - loss = mx.gluon.loss.L2Loss() - check_rnn_unidir_layer_gradients('rnn_relu', input_size, hidden_size, num_layers, loss) - check_rnn_bidir_layer_gradients('rnn_relu', input_size, hidden_size, num_layers, loss) - - -@assert_raises_cudnn_not_satisfied(min_version='5.1.10') -def test_fused_rnntanh_layer(): - input_sizes = [8] - hidden_sizes = [8, 16] - num_layers = [1, 2, 3, 4] - for input_size, hidden_size, num_layers in product(input_sizes, hidden_sizes, num_layers): - loss = mx.gluon.loss.L2Loss() - check_rnn_unidir_layer_gradients('rnn_tanh', input_size, hidden_size, num_layers, loss) - check_rnn_bidir_layer_gradients('rnn_tanh', input_size, hidden_size, num_layers, loss) +def test_fused_layer(input_size, hidden_size, num_layers, func, mode): + loss = mx.gluon.loss.L2Loss() + func(mode, input_size, hidden_size, num_layers, loss) @pytest.mark.serial @@ -819,13 +788,19 @@ def test_rnn_unroll_variant_length(): batch_size = 4 max_length = 10 valid_length = [3, 10, 5, 6] - valid_length_nd = mx.nd.array(valid_length) + valid_length_nd = mx.np.array(valid_length) for cell in cell_list: + # Test for NTC layout + print(type(cell)) + data_nd = mx.np.random.normal(0, 1, size=(batch_size, max_length, 20)) + if isinstance(cell, (gluon.rnn.HybridSequentialRNNCell, gluon.rnn.SequentialRNNCell)): + cell.infer_shape(0, data_nd, False) + elif isinstance(cell, gluon.rnn.BidirectionalCell): + cell.infer_shape(0, data_nd, True) + else: + cell.infer_shape(0, data_nd, False) cell.initialize() cell.hybridize() - print(cell.collect_params()) - # Test for NTC layout - data_nd = mx.nd.random.normal(0, 1, shape=(batch_size, max_length, 20)) outs, states = cell.unroll(length=max_length, 
inputs=data_nd, valid_length=valid_length_nd, merge_outputs=True, @@ -846,7 +821,7 @@ def test_rnn_unroll_variant_length(): atol=1E-4, rtol=1E-4) # Test for TNC layout - data_nd = mx.nd.random.normal(0, 1, shape=(max_length, batch_size, 20)) + data_nd = mx.np.random.normal(0, 1, size=(max_length, batch_size, 20)) outs, states = cell.unroll(length=max_length, inputs=data_nd, valid_length=valid_length_nd, layout='TNC') @@ -874,7 +849,7 @@ def test_cell_fill_shape(): def test_layer_fill_shape(): layer = gluon.rnn.LSTM(10) layer.hybridize() - check_rnn_layer_forward(layer, mx.nd.ones((3, 2, 7))) + check_rnn_layer_forward(layer, mx.np.ones((3, 2, 7))) print(layer) assert layer.l0_i2h_weight.shape[1] == 7, layer.l0_i2h_weight.shape[1] @@ -890,17 +865,21 @@ def __init__(self, rnn_size, time_step, **kwargs): gluon.rnn.LSTMCell(rnn_size), gluon.rnn.LSTMCell(rnn_size)) - def hybrid_forward(self, F, inputs, valid_len): + def forward(self, inputs, valid_len): outputs, states = self.bi_lstm.unroll(self.time_step, inputs, valid_length=valid_len, layout='NTC', merge_outputs=True) return outputs, states + + def infer_shape(self, x, *args): + self.bi_lstm.infer_shape(0, x, True) rnn_size = 100 net = BiLSTM(rnn_size, length) + inputs_data = mx.np.random.uniform(size=(10, length, 50)) + net.infer_shape(inputs_data) net.initialize() net.hybridize() - inputs_data = mx.nd.random.uniform(shape=(10, length, 50)) - valid_len = mx.nd.array([length]*10) + valid_len = mx.np.array([length]*10) outputs, _ = net(inputs_data, valid_len) assert outputs.shape == (10, length, 200) @@ -908,44 +887,44 @@ def hybrid_forward(self, F, inputs, valid_len): _check_bidirectional_unroll_valid_length(3) -def check_rnn_cell(cell, in_shape=(10, 50), out_shape=(10, 100), begin_state=None): - inputs = [mx.sym.Variable('rnn_t%d_data'%i) for i in range(3)] - outputs, _ = cell.unroll(3, inputs, begin_state=begin_state) - outputs = mx.sym.Group(outputs) - assert sorted(cell.collect_params().keys()) == ['h2h_bias', 'h2h_weight', - 'i2h_bias', 'i2h_weight'] - assert outputs.list_outputs() == [type(cell).__name__.lower() + name for name in ['_t0_out_output', '_t1_out_output', '_t2_out_output']] - - args, outs, auxs = outputs.infer_shape(rnn_t0_data=in_shape, - rnn_t1_data=in_shape, - rnn_t2_data=in_shape) - assert outs == [out_shape] * 3 - - def check_rnn_forward(layer, inputs): inputs.attach_grad() layer.initialize() with mx.autograd.record(): layer.unroll(3, inputs, merge_outputs=True)[0].backward() mx.autograd.backward(layer.unroll(3, inputs, merge_outputs=False)[0]) - mx.nd.waitall() + mx.npx.waitall() def test_rnn_cells(): check_rnn_forward(gluon.rnn.Conv1DLSTMCell((5, 7), 10, (3,), (3,)), - mx.nd.ones((8, 3, 5, 7))) + mx.np.ones((8, 3, 5, 7))) check_rnn_forward(gluon.rnn.Conv1DRNNCell((5, 7), 10, (3,), (3,)), - mx.nd.ones((8, 3, 5, 7))) + mx.np.ones((8, 3, 5, 7))) check_rnn_forward(gluon.rnn.Conv1DGRUCell((5, 7), 10, (3,), (3,)), - mx.nd.ones((8, 3, 5, 7))) + mx.np.ones((8, 3, 5, 7))) net = mx.gluon.rnn.SequentialRNNCell() net.add(gluon.rnn.Conv1DLSTMCell((5, 7), 10, (3,), (3,))) net.add(gluon.rnn.Conv1DRNNCell((10, 5), 11, (3,), (3,))) net.add(gluon.rnn.Conv1DGRUCell((11, 3), 12, (3,), (3,))) - check_rnn_forward(net, mx.nd.ones((8, 3, 5, 7))) + check_rnn_forward(net, mx.np.ones((8, 3, 5, 7))) +@mx.util.use_np +def check_rnn_cell(cell, in_shape=(10, 50), out_shape=(10, 100), begin_state=None): + inputs = [mx.np.ones(shape=in_shape) for i in range(3)] + cell.infer_shape(0, inputs[0], False) + cell.initialize() + outputs, _ = cell.unroll(3, 
inputs, begin_state=begin_state) + assert sorted(cell.collect_params().keys()) == ['h2h_bias', 'h2h_weight', + 'i2h_bias', 'i2h_weight'] + + outs = [o.shape for o in outputs] + assert outs == [out_shape] * 3 + + +@mx.util.use_np def test_convrnn(): cell = gluon.rnn.Conv1DRNNCell((10, 50), 100, 3, 3) check_rnn_cell(cell, in_shape=(1, 10, 50), out_shape=(1, 100, 48)) @@ -957,6 +936,7 @@ def test_convrnn(): check_rnn_cell(cell, in_shape=(1, 10, 20, 30, 50), out_shape=(1, 100, 18, 28, 48)) +@mx.util.use_np def test_convlstm(): cell = gluon.rnn.Conv1DLSTMCell((10, 50), 100, 3, 3) check_rnn_cell(cell, in_shape=(1, 10, 50), out_shape=(1, 100, 48)) @@ -968,6 +948,7 @@ def test_convlstm(): check_rnn_cell(cell, in_shape=(1, 10, 20, 30, 50), out_shape=(1, 100, 18, 28, 48)) +@mx.util.use_np def test_convgru(): cell = gluon.rnn.Conv1DGRUCell((10, 50), 100, 3, 3) check_rnn_cell(cell, in_shape=(1, 10, 50), out_shape=(1, 100, 48)) @@ -979,55 +960,62 @@ def test_convgru(): check_rnn_cell(cell, in_shape=(1, 10, 20, 30, 50), out_shape=(1, 100, 18, 28, 48)) +@mx.util.use_np def test_conv_fill_shape(): - cell = gluon.rnn.Conv1DLSTMCell((0, 7), 10, (3,), (3,)) + input = mx.np.ones((8, 3, 5, 7)) + cell = gluon.rnn.Conv1DLSTMCell((-1, 7), 10, (3,), (3,)) + cell.infer_shape(0, input, False) cell.hybridize() - check_rnn_forward(cell, mx.nd.ones((8, 3, 5, 7))) + check_rnn_forward(cell, input) assert cell.i2h_weight.shape[1] == 5, cell.i2h_weight.shape[1] +@mx.util.use_np def test_lstmp(): nhid = 100 nproj = 64 cell = gluon.rnn.LSTMPCell(nhid, nproj) - inputs = [mx.sym.Variable('rnn_t%d_data'%i) for i in range(3)] + inputs = [mx.np.ones(shape=(10,50)) for i in range(3)] + cell.infer_shape(0, inputs[0], False) + cell.initialize() outputs, _ = cell.unroll(3, inputs) - outputs = mx.sym.Group(outputs) expected_params = ['h2h_bias', 'h2h_weight', 'h2r_weight', 'i2h_bias', 'i2h_weight'] - expected_outputs = [type(cell).__name__.lower() + name for name in ['_t0_out_output', '_t1_out_output', '_t2_out_output']] assert sorted(cell.collect_params().keys()) == expected_params - assert outputs.list_outputs() == expected_outputs, outputs.list_outputs() - args, outs, auxs = outputs.infer_shape(rnn_t0_data=(10,50), rnn_t1_data=(10,50), rnn_t2_data=(10,50)) - assert outs == [(10, nproj), (10, nproj), (10, nproj)] + outs = [o.shape for o in outputs] + assert outs == [(10, nproj)] * 3 +@mx.util.use_np def test_vardrop(): def check_vardrop(drop_inputs, drop_states, drop_outputs): cell = gluon.rnn.VariationalDropoutCell(mx.gluon.rnn.RNNCell(100), drop_outputs=drop_outputs, drop_states=drop_states, drop_inputs=drop_inputs) + + input_data = mx.np.random.uniform(size=(10, 3, 50), ctx=mx.context.current_context()) + cell.infer_shape(0, input_data, False) cell.initialize(init='xavier') - input_data = mx.nd.random_uniform(shape=(10, 3, 50), ctx=mx.context.current_context()) with mx.autograd.record(): outputs1, _ = cell.unroll(3, input_data, merge_outputs=True) - mx.nd.waitall() + mx.npx.waitall() outputs2, _ = cell.unroll(3, input_data, merge_outputs=True) assert not almost_equal(outputs1.asnumpy(), outputs2.asnumpy()) - inputs = [mx.sym.Variable('rnn_t%d_data'%i) for i in range(3)] + inputs = [mx.np.ones(shape=(10,50)) for i in range(3)] + cell.infer_shape(0, inputs[0], False) + cell.initialize() outputs, _ = cell.unroll(3, inputs, merge_outputs=False) - outputs = mx.sym.Group(outputs) - args, outs, auxs = outputs.infer_shape(rnn_t0_data=(10,50), rnn_t1_data=(10,50), rnn_t2_data=(10,50)) + outs = [o.shape for o in outputs] assert outs == 
[(10, 100), (10, 100), (10, 100)] cell.reset() cell.hybridize() with mx.autograd.record(): outputs3, _ = cell.unroll(3, input_data, merge_outputs=True) - mx.nd.waitall() + mx.npx.waitall() outputs4, _ = cell.unroll(3, input_data, merge_outputs=True) assert not almost_equal(outputs3.asnumpy(), outputs4.asnumpy()) assert not almost_equal(outputs1.asnumpy(), outputs3.asnumpy()) @@ -1036,6 +1024,7 @@ def check_vardrop(drop_inputs, drop_states, drop_outputs): check_vardrop(0.5, 0, 0.5) +@mx.util.use_np @pytest.mark.parametrize('cell_type,num_states', [ (gluon.rnn.RNNCell, 1), (gluon.rnn.LSTMCell, 2), @@ -1049,33 +1038,40 @@ def __init__(self, cell_type, hidden_size, layout): self.cell = cell_type(hidden_size) self.layout = layout - def hybrid_forward(self, F, inputs, states, valid_length): + def forward(self, inputs, states, valid_length): if isinstance(valid_length, list) and len(valid_length) == 0: valid_length = None return gluon.rnn.rnn_cell.dynamic_unroll(self.cell, inputs, states, valid_length=valid_length, layout=self.layout) + + def infer_shape(self, x, *args): + self.cell.infer_shape(0, x, False) + batch_size = 20 input_size = 50 hidden_size = 30 seq_len = 10 ctx = default_context() if layout == 'TNC': - rnn_data = mx.nd.normal(loc=0, scale=1, shape=(seq_len, batch_size, input_size), ctx=ctx) + rnn_data = mx.np.random.normal(loc=0, scale=1, size=(seq_len, batch_size, input_size), ctx=ctx) elif layout == 'NTC': - rnn_data = mx.nd.normal(loc=0, scale=1, shape=(batch_size, seq_len, input_size), ctx=ctx) + rnn_data = mx.np.random.normal(loc=0, scale=1, size=(batch_size, seq_len, input_size), ctx=ctx) else: print("Wrong layout") return - valid_length = mx.nd.round(mx.nd.random.uniform(low=1, high=10, shape=(batch_size), ctx=ctx)) + valid_length = mx.np.round(mx.np.random.uniform(low=1, high=10, size=(batch_size), ctx=ctx)) state_shape = (batch_size, hidden_size) - states = [mx.nd.normal(loc=0, scale=1, shape=state_shape, ctx=ctx) for i in range(num_states)] + states = [mx.np.random.normal(loc=0, scale=1, size=state_shape, ctx=ctx) for i in range(num_states)] cell = cell_type(hidden_size) - cell.initialize(ctx=default_context()) if layout == 'TNC': + cell.infer_shape(0, rnn_data[0], False) + cell.initialize(ctx=default_context()) cell(rnn_data[0], states) else: + cell.infer_shape(0, rnn_data[:,0,:], False) + cell.initialize(ctx=default_context()) cell(rnn_data[:,0,:], states) params1 = cell.collect_params() orig_params1 = copy.deepcopy(params1) @@ -1099,6 +1095,7 @@ def hybrid_forward(self, F, inputs, states, valid_length): valid_length = [] for config in configs: layer = RNNLayer(cell_type, hidden_size, layout) + layer.infer_shape(rnn_data) layer.initialize(ctx=default_context()) config(layer) res2, states2 = layer(rnn_data, states, valid_length) @@ -1119,4 +1116,6 @@ def hybrid_forward(self, F, inputs, states, valid_length): for key, val in params1.items(): weight1 = val.data() weight2 = params2['cell.' + key].data() - assert_almost_equal(weight1, weight2, rtol=0.001, atol=0.0001) + # Subgraph created from npx.foreach in deferred compute is + # little bit different from the legacy foreach operator. 
+ assert_almost_equal(weight1, weight2, rtol=0.1, atol=0.1) diff --git a/tests/python/unittest/test_gluon_save.py b/tests/python/unittest/test_gluon_save.py index c17df63dc64f..47796874e7e1 100644 --- a/tests/python/unittest/test_gluon_save.py +++ b/tests/python/unittest/test_gluon_save.py @@ -17,6 +17,7 @@ import mxnet as mx +@mx.util.use_np def test_save(): class MyBlock(mx.gluon.Block): def __init__(self, **kwargs): @@ -45,7 +46,7 @@ def createNet(): net1.initialize() # hybridize (the hybridizeable blocks, ie. the Dense layers) net1.hybridize() - x = mx.nd.zeros((1,10)) + x = mx.np.zeros((1,10)) out1 = net1(x) # save hybridized model @@ -55,6 +56,7 @@ def createNet(): net2 = createNet() # reload hybridized model net2.load('MyModel') + net2.hybridize() # run inference again out2 = net2(x) mx.test_utils.assert_almost_equal(out1.asnumpy(), out2.asnumpy()) diff --git a/tests/python/unittest/test_gluon_trainer.py b/tests/python/unittest/test_gluon_trainer.py index 4016da7283cf..e454be848ec1 100644 --- a/tests/python/unittest/test_gluon_trainer.py +++ b/tests/python/unittest/test_gluon_trainer.py @@ -26,6 +26,8 @@ from copy import deepcopy import pytest +mx.npx.reset_np() + def dict_equ(a, b): assert set(a) == set(b) for k in a: @@ -148,6 +150,8 @@ def test_trainer_save_load(): assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.2 os.putenv('MXNET_UPDATE_ON_KVSTORE', previous_update_on_kvstore) +@mx.util.use_np +@pytest.mark.skip(reason='Currently, sparse feature is not supported in Gluon2.0') def test_trainer_sparse_save_load(): x = gluon.Parameter('x', shape=(10, 1), lr_mult=1.0, stype='row_sparse', grad_stype='row_sparse') @@ -166,48 +170,6 @@ def test_trainer_sparse_save_load(): # check if parameter dict is correctly associated with optimizer after load_state assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.2 -def test_trainer_multi_layer_init(): - class Net(gluon.Block): - def __init__(self, **kwargs): - super(Net, self).__init__(**kwargs) - # sparse param - self.embed_weight = gluon.Parameter('embed_weight', stype='row_sparse', - shape=(4,3), grad_stype='row_sparse') - # dense param from a hybrid block - self.dense0 = nn.Dense(2) - - def forward(self, x): - embed_weight = self.embed_weight.row_sparse_data(x) - embed = mx.nd.Embedding(data=x, weight=embed_weight, - input_dim=4, output_dim=3, sparse_grad=True) - return self.dense0(embed) - - def check_init(ctxes): - net = Net() - net.initialize(mx.init.One(), ctx=ctxes) - trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 1}) - data = mx.nd.array([[0,2], [1,2]]) - xs = gluon.utils.split_and_load(data, ctxes) - ys = [] - with mx.autograd.record(): - for x in xs: - y = net(x) - ys.append(y) - for y in ys: - y.backward() - trainer.step(1) - # all parameters should be initialized - assert not trainer._params_to_init - all_rows = mx.nd.arange(0, 4, ctx=mx.cpu(1)) - # check the updated weights - weight = net.embed_weight.row_sparse_data(all_rows).asnumpy() - assert (weight[0] == -1).all() - assert (weight[1] == -1).all() - assert (weight[2] == -3).all() - assert (weight[3] == 1).all() - - check_init([mx.cpu(1), mx.cpu(2)]) - check_init([mx.cpu(1)]) @xfail_when_nonstandard_decimal_separator def test_trainer_reset_kv(): @@ -345,11 +307,12 @@ def test_trainer_allreduce_hybridsequential(): trainer = mx.gluon.Trainer(net.collect_params(), 'sgd', update_on_kvstore=False) for ctx in contexts: with mx.autograd.record(): - out = net(mx.nd.ones((1, 1), ctx=ctx)) + out = net(mx.np.ones((1, 1), ctx=ctx)) out.backward() 
trainer.allreduce_grads() +@mx.util.use_np def test_trainer_share_parameters(): class Net(gluon.Block): def __init__(self, **kwargs): @@ -369,7 +332,7 @@ def forward(self, x): ctxes = [mx.cpu(0), mx.cpu(1)] net.initialize(mx.init.One(), ctx=ctxes) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 1}) - data = mx.nd.array([[1, 1], [1, 1]]) + data = mx.np.array([[1, 1], [1, 1]]) xs = gluon.utils.split_and_load(data, ctxes) ys = [] with mx.autograd.record(): diff --git a/tests/python/unittest/test_higher_order_grad.py b/tests/python/unittest/test_higher_order_grad.py index ccdd650d604c..611c89702e04 100644 --- a/tests/python/unittest/test_higher_order_grad.py +++ b/tests/python/unittest/test_higher_order_grad.py @@ -602,11 +602,11 @@ def test_dense_backward_flatten(): net.initialize(mxnet.initializer.Constant(.5)) x.attach_grad() with autograd.record(): - y = net.forward(x) + y = net.forward(x.as_np_ndarray()).as_nd_ndarray() o_y = arange_shape_like(y) # head gradient of y params = [p.data() for p in net.collect_params().values()] - w = params[0] - b = params[1] + w = params[0].as_nd_ndarray() + b = params[1].as_nd_ndarray() print("Checking y ({}) = x({}) * w^T({}) + b({})".format(y.shape, x.shape, w.shape, b.shape)) x_grad = autograd.grad(heads=y, variables=x, head_grads=o_y, create_graph=True, retain_graph=True)[0] @@ -645,11 +645,11 @@ def test_dense_backward_no_flatten(): net.initialize(mxnet.initializer.Constant(.5)) x.attach_grad() with autograd.record(): - y = net.forward(x) + y = net.forward(x.as_np_ndarray()).as_nd_ndarray() o_y = arange_shape_like(y) # head gradient of y params = [p.data() for p in net.collect_params().values()] - w = params[0] - b = params[1] + w = params[0].as_nd_ndarray() + b = params[1].as_nd_ndarray() print("Checking y ({}) = x({}) * w^T({}) + b({})".format(y.shape, x.shape, w.shape, b.shape)) x_grad = autograd.grad(heads=y, variables=x, head_grads=o_y, create_graph=True, retain_graph=True)[0] diff --git a/tests/python/unittest/test_image.py b/tests/python/unittest/test_image.py index 2c61867d9c8f..775721980050 100644 --- a/tests/python/unittest/test_image.py +++ b/tests/python/unittest/test_image.py @@ -26,6 +26,7 @@ import unittest import pytest +mx.npx.reset_np() def _get_data(url, dirname): import os, tarfile diff --git a/tests/python/unittest/test_infer_shape.py b/tests/python/unittest/test_infer_shape.py index d6a8a432da4a..e67e1fc1a3bc 100644 --- a/tests/python/unittest/test_infer_shape.py +++ b/tests/python/unittest/test_infer_shape.py @@ -20,6 +20,8 @@ from common import models import pytest +mx.npx.reset_np() + def test_mlp2_infer_shape(): # Build MLP out = models.mlp2() diff --git a/tests/python/unittest/test_loss.py b/tests/python/unittest/test_loss.py index c2b15fdb783f..103ccc663d03 100644 --- a/tests/python/unittest/test_loss.py +++ b/tests/python/unittest/test_loss.py @@ -16,91 +16,98 @@ # under the License. 
import mxnet as mx -import numpy as np +import numpy as onp from mxnet import gluon, autograd from mxnet.test_utils import assert_almost_equal, default_context +from numpy.core.fromnumeric import size from common import xfail_when_nonstandard_decimal_separator import unittest +@mx.util.use_np @xfail_when_nonstandard_decimal_separator def test_loss_ndarray(): - output = mx.nd.array([1, 2, 3, 4]) - label = mx.nd.array([1, 3, 5, 7]) - weighting = mx.nd.array([0.5, 1, 0.5, 1]) + output = mx.np.array([1, 2, 3, 4]) + label = mx.np.array([1, 3, 5, 7]) + weighting = mx.np.array([0.5, 1, 0.5, 1]) loss = gluon.loss.L1Loss() - assert mx.nd.sum(loss(output, label)).asscalar() == 6. + assert mx.np.sum(loss(output, label)).item() == 6. loss = gluon.loss.L1Loss(weight=0.5) - assert mx.nd.sum(loss(output, label)).asscalar() == 3. + assert mx.np.sum(loss(output, label)).item() == 3. loss = gluon.loss.L1Loss() - assert mx.nd.sum(loss(output, label, weighting)).asscalar() == 5. + assert mx.np.sum(loss(output, label, weighting)).item() == 5. loss = gluon.loss.L2Loss() - assert mx.nd.sum(loss(output, label)).asscalar() == 7. + assert mx.np.sum(loss(output, label)).item() == 7. loss = gluon.loss.L2Loss(weight=0.25) - assert mx.nd.sum(loss(output, label)).asscalar() == 1.75 + assert mx.np.sum(loss(output, label)).item() == 1.75 loss = gluon.loss.L2Loss() - assert mx.nd.sum(loss(output, label, weighting)).asscalar() == 6 + assert mx.np.sum(loss(output, label, weighting)).item() == 6 - output = mx.nd.array([[0, 2], [1, 4]]) - label = mx.nd.array([0, 1]) - weighting = mx.nd.array([[0.5], [1.0]]) + output = mx.np.array([[0, 2], [1, 4]]) + label = mx.np.array([0, 1]) + weighting = mx.np.array([[0.5], [1.0]]) loss = gluon.loss.SoftmaxCrossEntropyLoss() L = loss(output, label).asnumpy() - assert_almost_equal(L, np.array([ 2.12692809, 0.04858733]), rtol=1e-3, atol=1e-4) + assert_almost_equal(L, onp.array([ 2.12692809, 0.04858733]), rtol=1e-3, atol=1e-4) L = loss(output, label, weighting).asnumpy() - assert_almost_equal(L, np.array([ 1.06346405, 0.04858733]), rtol=1e-3, atol=1e-4) + assert_almost_equal(L, onp.array([ 1.06346405, 0.04858733]), rtol=1e-3, atol=1e-4) +@mx.util.use_np def test_bce_equal_ce2(): N = 100 loss1 = gluon.loss.SigmoidBCELoss(from_sigmoid=True) loss2 = gluon.loss.SoftmaxCELoss(from_logits=True) - out1 = mx.random.uniform(0.1, 0.9, shape=(N, 1)) - out2 = mx.nd.log(mx.nd.concat(1-out1, out1, dim=1) + 1e-8) - label = mx.nd.round(mx.random.uniform(0, 1, shape=(N, 1))) + out1 = mx.np.random.uniform(0.1, 0.9, size=(N, 1)) + out2 = mx.np.log(mx.np.concatenate([1-out1, out1], axis=1) + 1e-8) + label = mx.np.round(mx.np.random.uniform(0, 1, size=(N, 1))) assert_almost_equal(loss1(out1, label).asnumpy(), loss2(out2, label).asnumpy()) + +@mx.util.use_np def test_logistic_loss_equal_bce(): N = 100 loss_binary = gluon.loss.LogisticLoss(label_format='binary') loss_signed = gluon.loss.LogisticLoss(label_format='signed') loss_bce = gluon.loss.SigmoidBCELoss(from_sigmoid=False) - data = mx.random.uniform(-10, 10, shape=(N, 1)) - label = mx.nd.round(mx.random.uniform(0, 1, shape=(N, 1))) + data = mx.np.random.uniform(-10, 10, size=(N, 1)) + label = mx.np.round(mx.np.random.uniform(0, 1, size=(N, 1))) assert_almost_equal(loss_binary(data, label), loss_bce(data, label), atol=1e-6) assert_almost_equal(loss_signed(data, 2 * label - 1), loss_bce(data, label), atol=1e-6) +@mx.util.use_np def test_ctc_loss(): loss = gluon.loss.CTCLoss() - l = loss(mx.nd.ones((2,20,4)), mx.nd.array([[1,0,-1,-1],[2,1,1,-1]])) - 
assert_almost_equal(l, np.array([18.82820702, 16.50581741])) + l = loss(mx.np.ones((2,20,4)), mx.np.array([[1,0,-1,-1],[2,1,1,-1]])) + assert_almost_equal(l, onp.array([18.82820702, 16.50581741])) loss = gluon.loss.CTCLoss(layout='TNC') - l = loss(mx.nd.ones((20,2,4)), mx.nd.array([[1,0,-1,-1],[2,1,1,-1]])) - assert_almost_equal(l, np.array([18.82820702, 16.50581741])) + l = loss(mx.np.ones((20,2,4)), mx.np.array([[1,0,-1,-1],[2,1,1,-1]])) + assert_almost_equal(l, onp.array([18.82820702, 16.50581741])) loss = gluon.loss.CTCLoss(layout='TNC', label_layout='TN') - l = loss(mx.nd.ones((20,2,4)), mx.nd.array([[1,0,-1,-1],[2,1,1,-1]]).T) - assert_almost_equal(l, np.array([18.82820702, 16.50581741])) + l = loss(mx.np.ones((20,2,4)), mx.np.array([[1,0,-1,-1],[2,1,1,-1]]).T) + assert_almost_equal(l, onp.array([18.82820702, 16.50581741])) loss = gluon.loss.CTCLoss() - l = loss(mx.nd.ones((2,20,4)), mx.nd.array([[2,1,2,2],[3,2,2,2]]), None, mx.nd.array([2,3])) - assert_almost_equal(l, np.array([18.82820702, 16.50581741])) + l = loss(mx.np.ones((2,20,4)), mx.np.array([[2,1,2,2],[3,2,2,2]]), None, mx.np.array([2,3])) + assert_almost_equal(l, onp.array([18.82820702, 16.50581741])) loss = gluon.loss.CTCLoss() - l = loss(mx.nd.ones((2,25,4)), mx.nd.array([[2,1,-1,-1],[3,2,2,-1]]), mx.nd.array([20,20])) - assert_almost_equal(l, np.array([18.82820702, 16.50581741])) + l = loss(mx.np.ones((2,25,4)), mx.np.array([[2,1,-1,-1],[3,2,2,-1]]), mx.np.array([20,20])) + assert_almost_equal(l, onp.array([18.82820702, 16.50581741])) loss = gluon.loss.CTCLoss() - l = loss(mx.nd.ones((2,25,4)), mx.nd.array([[2,1,3,3],[3,2,2,3]]), mx.nd.array([20,20]), mx.nd.array([2,3])) - assert_almost_equal(l, np.array([18.82820702, 16.50581741])) + l = loss(mx.np.ones((2,25,4)), mx.np.array([[2,1,3,3],[3,2,2,3]]), mx.np.array([20,20]), mx.np.array([2,3])) + assert_almost_equal(l, onp.array([18.82820702, 16.50581741])) +@mx.util.use_np @xfail_when_nonstandard_decimal_separator def test_sdml_loss(): @@ -109,8 +116,8 @@ def test_sdml_loss(): EPOCHS = 20 # Generate randomized data and 'positive' samples - data = mx.random.uniform(-1, 1, shape=(N, DIM)) - pos = data + mx.random.uniform(-0.1, 0.1, shape=(N, DIM)) # correlated paired data + data = mx.np.random.uniform(-1, 1, size=(N, DIM)) + pos = data + mx.np.random.uniform(-0.1, 0.1, size=(N, DIM)) # correlated paired data data_iter = mx.io.NDArrayIter({'data' : data, 'pos' : pos}, batch_size=N) # Init model and trainer @@ -134,60 +141,64 @@ def test_sdml_loss(): avg_loss = loss.sum()/len(loss) assert(avg_loss < 0.05) +@mx.util.use_np def test_cosine_loss(): #Generating samples - input1 = mx.nd.random.randn(3, 2) - input2 = mx.nd.random.randn(3, 2) - label = mx.nd.sign(mx.nd.random.randn(input1.shape[0])) + input1 = mx.np.random.randn(3, 2) + input2 = mx.np.random.randn(3, 2) + label = mx.np.sign(mx.np.random.randn(input1.shape[0])) #Calculating loss from cosine embedding loss function in Gluon Loss = gluon.loss.CosineEmbeddingLoss() loss = Loss(input1, input2, label) # Calculating the loss Numpy way - numerator = mx.nd.sum(input1 * input2, keepdims=True, axis=1) - denominator = mx.nd.sqrt(mx.nd.sum(input1**2, axis=1, keepdims=True)) \ - * mx.nd.sqrt(mx.nd.sum(input2**2, axis=1, keepdims=True)) - numpy_loss = mx.nd.where(label == 1, 1-numerator/denominator, \ + numerator = mx.np.sum(input1 * input2, keepdims=True, axis=1) + denominator = mx.np.sqrt(mx.np.sum(input1**2, axis=1, keepdims=True)) \ + * mx.np.sqrt(mx.np.sum(input2**2, axis=1, keepdims=True)) + numerator = 
numerator.as_nd_ndarray() + denominator = denominator.as_nd_ndarray() + numpy_loss = mx.nd.where(label.as_nd_ndarray() == 1, 1-numerator/denominator, \ mx.nd.broadcast_maximum(mx.nd.array([0]), numerator/denominator, axis=1)).reshape((-1,)) assert_almost_equal(loss.asnumpy(), numpy_loss.asnumpy(), rtol=1e-3, atol=1e-5) +@mx.util.use_np @xfail_when_nonstandard_decimal_separator def test_poisson_nllloss(): shape=(3, 4) not_axis0 = tuple(range(1, len(shape))) - pred = mx.nd.random.normal(shape=shape) - min_pred = mx.nd.min(pred) + pred = mx.np.random.normal(size=shape) + min_pred = mx.np.min(pred) #This is necessary to ensure only positive random values are generated for prediction, # to avoid ivalid log calculation - pred[:] = pred + mx.nd.abs(min_pred) - target = mx.nd.random.normal(shape=shape) - min_target = mx.nd.min(target) + pred[:] = pred + mx.np.abs(min_pred) + target = mx.np.random.normal(size=shape) + min_target = mx.np.min(target) #This is necessary to ensure only positive random values are generated for prediction, # to avoid ivalid log calculation - target[:] += mx.nd.abs(min_target) + target[:] += mx.np.abs(min_target) Loss = gluon.loss.PoissonNLLLoss(from_logits=True) Loss_no_logits = gluon.loss.PoissonNLLLoss(from_logits=False) #Calculating by brute formula for default value of from_logits = True # 1) Testing for flag logits = True - brute_loss = np.mean(np.exp(pred.asnumpy()) - target.asnumpy() * pred.asnumpy(), axis=1) + brute_loss = onp.mean(onp.exp(pred.asnumpy()) - target.asnumpy() * pred.asnumpy(), axis=1) loss_withlogits = Loss(pred, target) assert_almost_equal(brute_loss, loss_withlogits) #2) Testing for flag logits = False loss_no_logits = Loss_no_logits(pred, target) - np_loss_no_logits = np.mean(pred.asnumpy() - target.asnumpy() * np.log(pred.asnumpy() + 1e-08), + np_loss_no_logits = onp.mean(pred.asnumpy() - target.asnumpy() * onp.log(pred.asnumpy() + 1e-08), axis=1) assert_almost_equal(np_loss_no_logits, loss_no_logits.asnumpy()) #3) Testing for Sterling approximation shape=(2, 3) - np_pred = np.random.uniform(1, 5, shape) - np_target = np.random.uniform(1, 5, shape) - np_compute_full = np.mean((np_pred - np_target * np.log(np_pred + 1e-08)) + ((np_target * np.log(np_target)-\ - np_target + 0.5 * np.log(2 * np_target * np.pi))*(np_target > 1)), axis=1) + np_pred = onp.random.uniform(1, 5, shape) + np_target = onp.random.uniform(1, 5, shape) + np_compute_full = onp.mean((np_pred - np_target * onp.log(np_pred + 1e-08)) + ((np_target * onp.log(np_target)-\ + np_target + 0.5 * onp.log(2 * np_target * onp.pi))*(np_target > 1)), axis=1) Loss_compute_full = gluon.loss.PoissonNLLLoss(from_logits=False, compute_full=True) - loss_compute_full = Loss_compute_full(mx.nd.array(np_pred), mx.nd.array(np_target)) + loss_compute_full = Loss_compute_full(mx.np.array(np_pred), mx.np.array(np_target)) assert_almost_equal(np_compute_full, loss_compute_full) diff --git a/tests/python/unittest/test_metric.py b/tests/python/unittest/test_metric.py index c770dfb67375..b5161f3b6a91 100644 --- a/tests/python/unittest/test_metric.py +++ b/tests/python/unittest/test_metric.py @@ -46,32 +46,32 @@ def test_metrics(): def test_ce(): metric = mx.gluon.metric.create('ce') - pred = mx.nd.array([[0.2, 0.3, 0.5], [0.6, 0.1, 0.3]]) - label = mx.nd.array([2, 1]) + pred = mx.np.array([[0.2, 0.3, 0.5], [0.6, 0.1, 0.3]]) + label = mx.np.array([2, 1]) metric.update([label], [pred]) _, loss = metric.get() - expected_loss = -(np.log(pred[0][2].asscalar()) + np.log(pred[1][1].asscalar())) / 2 - assert loss 
== expected_loss + expected_loss = -(np.log(pred[0][2].item()) + np.log(pred[1][1].item())) / 2 + np.testing.assert_almost_equal(loss, expected_loss) metric = mx.gluon.metric.create('ce', from_logits=True) - pred = mx.nd.log(pred) + pred = mx.np.log(pred) metric.update([label], [pred]) _, loss = metric.get() np.testing.assert_almost_equal(loss, expected_loss) def test_acc(): - pred = mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]]) - label = mx.nd.array([0, 1, 1]) + pred = mx.np.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]]) + label = mx.np.array([0, 1, 1]) metric = mx.gluon.metric.create('acc') metric.update([label], [pred]) _, acc = metric.get() - expected_acc = (np.argmax(pred, axis=1) == label).sum().asscalar() / label.size + expected_acc = (np.argmax(pred, axis=1) == label).sum().item() / label.size np.testing.assert_almost_equal(acc, expected_acc) def test_acc_2d_label(): # label maybe provided in 2d arrays in custom data iterator - pred = mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6], [0.8, 0.2], [0.3, 0.5], [0.6, 0.4]]) - label = mx.nd.array([[0, 1, 1], [1, 0, 1]]) + pred = mx.np.array([[0.3, 0.7], [0, 1.], [0.4, 0.6], [0.8, 0.2], [0.3, 0.5], [0.6, 0.4]]) + label = mx.np.array([[0, 1, 1], [1, 0, 1]]) metric = mx.gluon.metric.create('acc') metric.update([label], [pred]) _, acc = metric.get() @@ -80,7 +80,7 @@ def test_acc_2d_label(): np.testing.assert_almost_equal(acc, expected_acc) def test_loss_update(): - pred = mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]]) + pred = mx.np.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]]) metric1 = mx.gluon.metric.create('loss') metric2 = mx.gluon.metric.create('loss') metric1.update(None, [pred]) @@ -98,9 +98,9 @@ def test_binary_f1(): assert np.isnan(microF1.get()[1]) # check divide by zero - pred = mx.nd.array([[0.9, 0.1], + pred = mx.np.array([[0.9, 0.1], [0.8, 0.2]]) - label = mx.nd.array([0, 0]) + label = mx.np.array([0, 0]) macroF1.update([label], [pred]) microF1.update([label], [pred]) assert macroF1.get()[1] == 0.0 @@ -108,16 +108,16 @@ def test_binary_f1(): macroF1.reset() microF1.reset() - pred11 = mx.nd.array([[0.1, 0.9], + pred11 = mx.np.array([[0.1, 0.9], [0.5, 0.5]]) - label11 = mx.nd.array([1, 0]) - pred12 = mx.nd.array([[0.85, 0.15], + label11 = mx.np.array([1, 0]) + pred12 = mx.np.array([[0.85, 0.15], [1.0, 0.0]]) - label12 = mx.nd.array([1, 0]) - pred21 = mx.nd.array([[0.6, 0.4]]) - label21 = mx.nd.array([0]) - pred22 = mx.nd.array([[0.2, 0.8]]) - label22 = mx.nd.array([1]) + label12 = mx.np.array([1, 0]) + pred21 = mx.np.array([[0.6, 0.4]]) + label21 = mx.np.array([0]) + pred22 = mx.np.array([[0.2, 0.8]]) + label22 = mx.np.array([1]) microF1.update([label11, label12], [pred11, pred12]) macroF1.update([label11, label12], [pred11, pred12]) @@ -145,9 +145,9 @@ def test_multiclass_f1(): assert np.isnan(microF1.get()[1]) # check one class is zero - pred = mx.nd.array([[0.9, 0.1], + pred = mx.np.array([[0.9, 0.1], [0.8, 0.2]]) - label = mx.nd.array([0, 0]) + label = mx.np.array([0, 0]) macroF1.update([label], [pred]) microF1.update([label], [pred]) assert macroF1.get()[1] == 0.5 # one class is 1.0, the other is 0. 
(divided by 0) @@ -156,10 +156,10 @@ def test_multiclass_f1(): microF1.reset() # test case from sklearn, here pred is probabilistic distributions instead of predicted labels - pred11 = mx.nd.array([[1, 0, 0], [0, 1, 0]]) - label11 = mx.nd.array([0, 2]) - pred12 = mx.nd.array([[0, 0, 1], [1, 0, 0], [0, 1, 0], [0, 0, 1]]) - label12 = mx.nd.array([1, 0, 0, 1]) + pred11 = mx.np.array([[1, 0, 0], [0, 1, 0]]) + label11 = mx.np.array([0, 2]) + pred12 = mx.np.array([[0, 0, 1], [1, 0, 0], [0, 1, 0], [0, 0, 1]]) + label12 = mx.np.array([1, 0, 0, 1]) microF1.update([label11, label12], [pred11, pred12]) macroF1.update([label11, label12], [pred11, pred12]) @@ -183,9 +183,9 @@ def test_multilabel_f1(): assert np.isnan(microF1.get()[1]) # check one class is zero - pred = mx.nd.array([[0.9, 0.1], + pred = mx.np.array([[0.9, 0.1], [0.8, 0.2]]) - label = mx.nd.array([[1, 1], [1, 1]]) + label = mx.np.array([[1, 1], [1, 1]]) macroF1.update([label], [pred]) microF1.update([label], [pred]) assert macroF1.get()[1] == 0.5 # one class is 1.0, the other is 0. (divided by 0) @@ -193,10 +193,10 @@ def test_multilabel_f1(): macroF1.reset() microF1.reset() - pred11 = mx.nd.array([[0.9, 0.4, 0.3], [0.2, 0.7, 0.8]]) - label11 = mx.nd.array([[1, 0, 1], [0, 0, 1]]) - pred12 = mx.nd.array([[0.6, 0.6, 0.7]]) - label12 = mx.nd.array([[0, 1, 1]]) + pred11 = mx.np.array([[0.9, 0.4, 0.3], [0.2, 0.7, 0.8]]) + label11 = mx.np.array([[1, 0, 1], [0, 0, 1]]) + pred12 = mx.np.array([[0.6, 0.6, 0.7]]) + label12 = mx.np.array([[0, 1, 1]]) microF1.update([label11, label12], [pred11, pred12]) macroF1.update([label11, label12], [pred11, pred12]) @@ -217,23 +217,23 @@ def test_mcc(): assert np.isnan(microMCC.get()[1]) # check divide by zero - pred = mx.nd.array([[0.9, 0.1], + pred = mx.np.array([[0.9, 0.1], [0.8, 0.2]]) - label = mx.nd.array([0, 0]) + label = mx.np.array([0, 0]) microMCC.update([label], [pred]) assert microMCC.get()[1] == 0.0 microMCC.reset() - pred11 = mx.nd.array([[0.1, 0.9], + pred11 = mx.np.array([[0.1, 0.9], [0.5, 0.5]]) - label11 = mx.nd.array([1, 0]) - pred12 = mx.nd.array([[0.85, 0.15], + label11 = mx.np.array([1, 0]) + pred12 = mx.np.array([[0.85, 0.15], [1.0, 0.0]]) - label12 = mx.nd.array([1, 0]) - pred21 = mx.nd.array([[0.6, 0.4]]) - label21 = mx.nd.array([0]) - pred22 = mx.nd.array([[0.2, 0.8]]) - label22 = mx.nd.array([1]) + label12 = mx.np.array([1, 0]) + pred21 = mx.np.array([[0.6, 0.4]]) + label21 = mx.np.array([0]) + pred22 = mx.np.array([[0.2, 0.8]]) + label22 = mx.np.array([1]) microMCC.update([label11, label12], [pred11, pred12]) assert microMCC.num_inst == 4 tp1 = 1; fp1 = 0; fn1 = 1; tn1=2 @@ -249,8 +249,8 @@ def test_mcc(): np.testing.assert_almost_equal(microMCC.get()[1], mccT) def test_perplexity(): - pred = mx.nd.array([[0.8, 0.2], [0.2, 0.8], [0, 1.]]) - label = mx.nd.array([0, 1, 1]) + pred = mx.np.array([[0.8, 0.2], [0.2, 0.8], [0, 1.]]) + label = mx.np.array([0, 1, 1]) p = pred.asnumpy()[np.arange(label.size), label.asnumpy().astype('int32')] perplexity_expected = np.exp(-np.log(p).sum()/label.size) metric = mx.gluon.metric.create('perplexity', axis=-1) @@ -259,8 +259,8 @@ def test_perplexity(): np.testing.assert_almost_equal(perplexity, perplexity_expected) def test_pearsonr(): - pred1 = mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]]) - label1 = mx.nd.array([[1, 0], [0, 1], [0, 1]]) + pred1 = mx.np.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]]) + label1 = mx.np.array([[1, 0], [0, 1], [0, 1]]) pearsonr_expected_np = np.corrcoef(pred1.asnumpy().ravel(), label1.asnumpy().ravel())[0, 1] 
pearsonr_expected_scipy, _ = pearsonr(pred1.asnumpy().ravel(), label1.asnumpy().ravel()) micro_pr = mx.gluon.metric.create('pearsonr') @@ -272,11 +272,11 @@ def test_pearsonr(): np.testing.assert_almost_equal(micro_pr.get()[1], pearsonr_expected_np) np.testing.assert_almost_equal(micro_pr.get()[1], pearsonr_expected_scipy) - pred2 = mx.nd.array([[1, 2], [3, 2], [4, 6]]) - label2 = mx.nd.array([[1, 0], [0, 1], [0, 1]]) + pred2 = mx.np.array([[1, 2], [3, 2], [4, 6]]) + label2 = mx.np.array([[1, 0], [0, 1], [0, 1]]) # Note that pred12 = pred1 + pred2; label12 = label1 + label2 - pred12 = mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6],[1, 2], [3, 2], [4, 6]]) - label12 = mx.nd.array([[1, 0], [0, 1], [0, 1], [1, 0], [0, 1], [0, 1]]) + pred12 = mx.np.array([[0.3, 0.7], [0, 1.], [0.4, 0.6],[1, 2], [3, 2], [4, 6]]) + label12 = mx.np.array([[1, 0], [0, 1], [0, 1], [1, 0], [0, 1], [0, 1]]) pearsonr_expected_np = np.corrcoef(pred12.asnumpy().ravel(), label12.asnumpy().ravel())[0, 1] pearsonr_expected_scipy, _ = pearsonr(pred12.asnumpy().ravel(), label12.asnumpy().ravel()) @@ -295,7 +295,7 @@ def cm_batch(cm): for j in range(n): labels += [ i ] * cm[i][j] preds += [ ident[j] ] * cm[i][j] - return ([ mx.nd.array(labels, dtype='int32') ], [ mx.nd.array(preds) ]) + return ([ mx.np.array(labels, dtype='int32') ], [ mx.np.array(preds) ]) def test_pcc(): labels, preds = cm_batch([ @@ -374,7 +374,7 @@ def test_pcc(): @xfail_when_nonstandard_decimal_separator def test_single_array_input(): - pred = mx.nd.array([[1,2,3,4]]) + pred = mx.np.array([[1,2,3,4]]) label = pred + 0.1 mse = mx.gluon.metric.create('mse') diff --git a/tests/python/unittest/test_numpy_gluon.py b/tests/python/unittest/test_numpy_gluon.py index bcd018157791..6c1fa670317f 100644 --- a/tests/python/unittest/test_numpy_gluon.py +++ b/tests/python/unittest/test_numpy_gluon.py @@ -23,9 +23,10 @@ from uuid import uuid4 import numpy as _np import mxnet as mx -from mxnet import gluon, autograd, np +from mxnet import gluon, autograd, np, npx from mxnet.test_utils import use_np, assert_almost_equal, check_gluon_hybridize_consistency, assert_allclose from mxnet.gluon import nn +from mxnet.base import MXNetError import random import pytest @@ -43,29 +44,20 @@ def check_block_params(x, TestBlock, hybridize, expected_type, initializer): for k, v in params.items(): assert type(v.data()) is expected_type + @use_np class TestBlock1(gluon.HybridBlock): def __init__(self): super(TestBlock1, self).__init__() self.w = gluon.Parameter('w', shape=(K, N), allow_deferred_init=True) - def hybrid_forward(self, F, x, w): - return F.dot(x, w) - - @use_np - class TestBlock2(gluon.HybridBlock): - def __init__(self): - super(TestBlock2, self).__init__() - self.w = gluon.Parameter('w', shape=(K, N), allow_deferred_init=True) - - def hybrid_forward(self, F, x, w): - return F.np.dot(x, w) + def forward(self, x): + ctx = x.ctx + return np.dot(x, self.w.data(ctx)) - x = mx.nd.random.uniform(shape=(M, K)) + x = mx.np.random.uniform(size=(M, K)) for initializer in [mx.initializer.Uniform, mx.initializer.Normal]: - check_block_params(x, TestBlock1, False, mx.nd.NDArray, initializer) - check_block_params(x, TestBlock1, True, mx.nd.NDArray, initializer) - check_block_params(x.as_np_ndarray(), TestBlock2, False, np.ndarray, initializer) - check_block_params(x.as_np_ndarray(), TestBlock2, True, np.ndarray, initializer) + check_block_params(x, TestBlock1, False, mx.np.ndarray, initializer) + check_block_params(x, TestBlock1, True, mx.np.ndarray, initializer) @use_np @@ -78,15 +70,20 @@ 
def __init__(self, num_input_dim=0, num_hidden_dim=100, num_output_dim=10): self.w2 = gluon.Parameter('w2', shape=(num_hidden_dim, num_output_dim), allow_deferred_init=True) - def hybrid_forward(self, F, x, w1, w2): - h = x.dot(w1) # equivalent to F.np.dot(x, w1) - h_relu = F.npx.relu(h) # equivalent to F.relu(h) but generating np.ndarray - y_pred = h_relu.dot(w2) # equivalent to F.np.dot(h_relu, w2) + def forward(self, x): + ctx = x.ctx + h = x.dot(self.w1.data(ctx)) # equivalent to np.dot(x, w1) + h_relu = npx.relu(h) # equivalent to npx.relu(h) but generating np.ndarray + y_pred = h_relu.dot(self.w2.data(ctx)) # equivalent to np.dot(h_relu, w2) return y_pred + + def infer_shape(self, x, *args): + pre_shape = self.w1.shape + self.w1.shape = (x.shape[x.ndim-1], pre_shape[1]) class TotalLoss(gluon.HybridBlock): - def hybrid_forward(self, F, pred, label): - return ((pred - label) ** 2).sum() # equivalent to F.np.sum(F.np.square(pred - label)) + def forward(self, pred, label): + return ((pred - label) ** 2).sum() # equivalent to np.sum(np.square(pred - label)) regressor = LinearRegression() regressor.initialize(mx.init.Uniform()) @@ -162,8 +159,9 @@ def __init__(self): super(Foo, self).__init__() self.weight = gluon.Constant(const_arr) - def hybrid_forward(self, F, x, weight): - return x + weight.astype(np.float32) + def forward(self, x): + ctx = x.ctx + return x + self.weight.data(ctx).astype(np.float32) x = np.random.uniform(size=const_arr.shape, dtype=const_arr.dtype) for hybridize in [False, True]: @@ -297,12 +295,12 @@ def random_slice_index(shape): cache.add((hashable_index(index1), hashable_index(index2))) # Test basic slicing on a single symbol class TestSlicingSingleSymbol1(gluon.HybridBlock): - def hybrid_forward(self, F, x, y): + def forward(self, x, y): return x[()][index1] + y[()][index1] # Test basic slicing on a single symbol class TestSlicingSingleSymbol2(gluon.HybridBlock): - def hybrid_forward(self, F, x, y): + def forward(self, x, y): return (x[()][index1] + y[()][index1])[index2] check_gluon_hybridize_consistency(TestSlicingSingleSymbol1, [x, y], @@ -312,10 +310,10 @@ def hybrid_forward(self, F, x, y): (a[()][index1] + b[()][index1])[index2]) # Test for split/hsplit/vsplit class TestSlicingWithSplit(gluon.HybridBlock): - def hybrid_forward(self, F, x): - x = F.np.split(x, shape[2], axis=2) + def forward(self, x): + x = mx.np.split(x, shape[2], axis=2) x = x[1:-1] - x = F.np.concatenate(x, axis=2) + x = mx.np.concatenate(x, axis=2) return x class TestSlicingWithSplit2(gluon.HybridBlock): @@ -323,23 +321,23 @@ def __init__(self): super(TestSlicingWithSplit2, self).__init__() self.layer = gluon.nn.Dense(16, flatten=False) - def hybrid_forward(self, F, x, y): - x = F.np.split(x, 1) + def forward(self, x, y): + x = mx.np.split(x, 1) x = x[0] return self.layer(x[:, -1, :] + y[:, -1, :]) class TestSlicingWithHSplit(gluon.HybridBlock): - def hybrid_forward(self, F, x): - x = F.np.hsplit(x, shape[1]) + def forward(self, x): + x = mx.np.hsplit(x, shape[1]) x = x[1:-1] - x = F.np.concatenate(x, axis=1) + x = mx.np.concatenate(x, axis=1) return x class TestSlicingWithVSplit(gluon.HybridBlock): - def hybrid_forward(self, F, x): - x = F.np.vsplit(x, shape[0]) + def forward(self, x): + x = mx.np.vsplit(x, shape[0]) x = x[1:-1] - x = F.np.concatenate(x, axis=0) + x = mx.np.concatenate(x, axis=0) return x if len(shape) > 2 and shape[2] > 2: @@ -363,7 +361,7 @@ def hybrid_forward(self, F, x): ((3,), -1), ((3,), 0)]: class IntegerIndexing(gluon.HybridBlock): - def hybrid_forward(self, F, 
x): + def forward(self, x): return x[idx] check_gluon_hybridize_consistency(IntegerIndexing, [mx.np.ones(data_shape)], @@ -377,8 +375,8 @@ def __init__(self): super(Case1, self).__init__() self.layer = gluon.nn.Dense(64, flatten=False) - def hybrid_forward(self, F, x, y): - x = F.np.split(x, 1) + def forward(self, x, y): + x = mx.np.split(x, 1) x = x[0] return self.layer(x[:, -1, :] + y[:, -1, :]) check_gluon_save_load(Case1, [mx.np.random.normal(0, 1, (10, 5, 8, 6)), @@ -390,8 +388,8 @@ def __init__(self): self.layer1 = gluon.nn.Dense(64, flatten=False) self.layer2 = gluon.nn.Dense(64, flatten=False) - def hybrid_forward(self, F, x, y): - x = F.np.split(x, 1) + def forward(self, x, y): + x = mx.np.split(x, 1) x = x[0] return self.layer1(x[:, -1, :]) + self.layer2(y[:, -1, :]) check_gluon_save_load(Case2, [mx.np.random.normal(0, 1, (10, 5, 8)), @@ -403,8 +401,8 @@ class Foo(gluon.HybridBlock): def __init__(self): super(Foo, self).__init__() - def hybrid_forward(self, F, valid_length): - mask = ((F.np.ones((10,)) / 2) < valid_length) + def forward(self, valid_length): + mask = ((np.ones((10,)) / 2) < valid_length) return mask valid_length = mx.np.random.uniform(size=(10,)) @@ -424,20 +422,20 @@ class TestBlock(gluon.HybridBlock): def __init__(self): super(TestBlock, self).__init__() self.d = mx.gluon.nn.Dense(1) - def hybrid_forward(self, F, a, b, *args): - res = self.d.hybrid_forward(F, a, b) + def forward(self, a): + res = self.d(a) return res a = mx.np.random.uniform(low=-1, high=1, size=(1,1)) - b = mx.np.random.uniform(low=-1, high=1, size=(1,1)) net = TestBlock() net.initialize() net.hybridize() - out = net(a, b) + out = net(a) + b = net.collect_params().pop('d.weight').data() net.optimize_for(a, b, backend="MKLDNN") - out2 = net(a, b) + out2 = net(a) @use_np diff --git a/tests/python/unittest/test_numpy_loss.py b/tests/python/unittest/test_numpy_loss.py index 26b0bc7dfced..d452566f9c26 100644 --- a/tests/python/unittest/test_numpy_loss.py +++ b/tests/python/unittest/test_numpy_loss.py @@ -16,7 +16,7 @@ # under the License. 
import mxnet as mx -import numpy as np +import numpy as onp from mxnet import gluon, autograd from mxnet.test_utils import assert_almost_equal, default_context, use_np from common import xfail_when_nonstandard_decimal_separator @@ -117,13 +117,13 @@ def test_loss_np_ndarray(hybridize): if hybridize: loss.hybridize() L = loss(output, label).asnumpy() - assert_almost_equal(L, np.array([ 2.12692809, 0.04858733]), rtol=1e-3, atol=1e-4) + assert_almost_equal(L, onp.array([ 2.12692809, 0.04858733]), rtol=1e-3, atol=1e-4) loss = gluon.loss.SoftmaxCrossEntropyLoss() if hybridize: loss.hybridize() L = loss(output, label, weighting).asnumpy() - assert_almost_equal(L, np.array([ 1.06346405, 0.04858733]), rtol=1e-3, atol=1e-4) + assert_almost_equal(L, onp.array([ 1.06346405, 0.04858733]), rtol=1e-3, atol=1e-4) @use_np @@ -168,37 +168,37 @@ def test_ctc_loss(hybridize): if hybridize: loss.hybridize() l = loss(mx.np.ones((2,20,4)), mx.np.array([[1,0,-1,-1],[2,1,1,-1]])) - assert_almost_equal(l, np.array([18.82820702, 16.50581741])) + assert_almost_equal(l, onp.array([18.82820702, 16.50581741])) loss = gluon.loss.CTCLoss(layout='TNC') if hybridize: loss.hybridize() l = loss(mx.np.ones((20,2,4)), mx.np.array([[1,0,-1,-1],[2,1,1,-1]])) - assert_almost_equal(l, np.array([18.82820702, 16.50581741])) + assert_almost_equal(l, onp.array([18.82820702, 16.50581741])) loss = gluon.loss.CTCLoss(layout='TNC', label_layout='TN') if hybridize: loss.hybridize() l = loss(mx.np.ones((20,2,4)), mx.np.array([[1,0,-1,-1],[2,1,1,-1]]).T) - assert_almost_equal(l, np.array([18.82820702, 16.50581741])) + assert_almost_equal(l, onp.array([18.82820702, 16.50581741])) loss = gluon.loss.CTCLoss() if hybridize: loss.hybridize() l = loss(mx.np.ones((2,20,4)), mx.np.array([[2,1,2,2],[3,2,2,2]]), None, mx.np.array([2,3])) - assert_almost_equal(l, np.array([18.82820702, 16.50581741])) + assert_almost_equal(l, onp.array([18.82820702, 16.50581741])) loss = gluon.loss.CTCLoss() if hybridize: loss.hybridize() l = loss(mx.np.ones((2,25,4)), mx.np.array([[2,1,-1,-1],[3,2,2,-1]]), mx.np.array([20,20])) - assert_almost_equal(l, np.array([18.82820702, 16.50581741])) + assert_almost_equal(l, onp.array([18.82820702, 16.50581741])) loss = gluon.loss.CTCLoss() if hybridize: loss.hybridize() l = loss(mx.np.ones((2,25,4)), mx.np.array([[2,1,3,3],[3,2,2,3]]), mx.np.array([20,20]), mx.np.array([2,3])) - assert_almost_equal(l, np.array([18.82820702, 16.50581741])) + assert_almost_equal(l, onp.array([18.82820702, 16.50581741])) @xfail_when_nonstandard_decimal_separator @@ -300,8 +300,8 @@ def test_poisson_nllloss(hybridize): shape=(2, 3) np_pred = mx.np.random.uniform(1, 5, shape) np_target = mx.np.random.uniform(1, 5, shape) - np_compute_full = mx.np.mean((np_pred - np_target * mx.np.log(np_pred + 1e-08)) + ((np_target * np.log(np_target)-\ - np_target + 0.5 * np.log(2 * np_target * np.pi))*(np_target > 1)), axis=1) + np_compute_full = mx.np.mean((np_pred - np_target * mx.np.log(np_pred + 1e-08)) + ((np_target * onp.log(np_target)-\ + np_target + 0.5 * onp.log(2 * np_target * onp.pi))*(np_target > 1)), axis=1) Loss_compute_full = gluon.loss.PoissonNLLLoss(from_logits=False, compute_full=True) if hybridize: Loss_compute_full.hybridize() diff --git a/tests/python/unittest/test_numpy_ndarray.py b/tests/python/unittest/test_numpy_ndarray.py index fa189f87f4ef..9f2c67b7db29 100644 --- a/tests/python/unittest/test_numpy_ndarray.py +++ b/tests/python/unittest/test_numpy_ndarray.py @@ -113,12 +113,12 @@ def __init__(self, shape, dtype=None): self._shape = shape 
self._dtype = dtype - def hybrid_forward(self, F, x, *args, **kwargs): - return x + F.np.zeros(shape, dtype) + def forward(self, x, *args, **kwargs): + return x + np.zeros(shape, dtype) class TestZerosOutputType(HybridBlock): - def hybrid_forward(self, F, x, *args, **kwargs): - return x, F.np.zeros(shape=()) + def forward(self, x, *args, **kwargs): + return x, np.zeros(shape=()) # test np.zeros in imperative def check_zero_array_creation(shape, dtype): @@ -166,12 +166,12 @@ def __init__(self, shape, dtype=None): self._shape = shape self._dtype = dtype - def hybrid_forward(self, F, x, *args, **kwargs): - return x * F.np.ones(shape, dtype) + def forward(self, x, *args, **kwargs): + return x * np.ones(shape, dtype) class TestOnesOutputType(HybridBlock): - def hybrid_forward(self, F, x, *args, **kwargs): - return x, F.np.ones(shape=()) + def forward(self, x, *args, **kwargs): + return x, np.ones(shape=()) # test np.ones in imperative def check_ones_array_creation(shape, dtype): @@ -219,12 +219,12 @@ def __init__(self, shape, dtype=None): self._n = n self._dtype = dtype - def hybrid_forward(self, F, x): - return x * F.np.identity(self._n, self._dtype) + def forward(self, x): + return x * np.identity(self._n, self._dtype) class TestIdentityOutputType(HybridBlock): - def hybrid_forward(self, F, x): - return x, F.np.identity(0) + def forward(self, x): + return x, np.identity(0) def check_identity_array_creation(shape, dtype): np_out = _np.identity(n=n, dtype=dtype) @@ -345,7 +345,7 @@ def __init__(self, op, scalar=None, reverse=False): self._scalar = scalar self._reverse = reverse # if false, scalar is the right operand. - def hybrid_forward(self, F, x, *args): + def forward(self, x, *args): if self._op == '+': if self._scalar is not None: return x + self._scalar if not self._reverse else self._scalar + x @@ -514,28 +514,23 @@ def check_binary_op_result(shape1, shape2, op, dtype=None): def test_np_hybrid_block_multiple_outputs(): @use_np class TestAllNumpyOutputs(HybridBlock): - def hybrid_forward(self, F, x, *args, **kwargs): - return F.np.add(x, x), F.np.multiply(x, x) - - class TestAllClassicOutputs(HybridBlock): - def hybrid_forward(self, F, x, *args, **kwargs): - return x.as_nd_ndarray() + x.as_nd_ndarray(), x.as_nd_ndarray() * x.as_nd_ndarray() + def forward(self, x, *args, **kwargs): + return np.add(x, x), np.multiply(x, x) data_np = np.ones((2, 3)) - for block, expected_out_type in [(TestAllClassicOutputs, mx.nd.NDArray), - (TestAllNumpyOutputs, np.ndarray)]: - net = block() - for hybridize in [True, False]: - if hybridize: - net.hybridize() - out1, out2 = net(data_np) - assert type(out1) is expected_out_type - assert type(out2) is expected_out_type + block, expected_out_type = TestAllNumpyOutputs, np.ndarray + net = block() + for hybridize in [True, False]: + if hybridize: + net.hybridize(active=hybridize) + out1, out2 = net(data_np) + assert type(out1) is expected_out_type + assert type(out2) is expected_out_type @use_np class TestMixedTypeOutputsFailure(HybridBlock): - def hybrid_forward(self, F, x, *args, **kwargs): - return x.as_nd_ndarray() + x.as_nd_ndarray(), F.np.multiply(x, x) + def forward(self, x, *args, **kwargs): + return x.as_nd_ndarray() + x.as_nd_ndarray(), np.multiply(x, x) net = TestMixedTypeOutputsFailure() assert_exception(net, TypeError, data_np) @@ -560,7 +555,7 @@ def __init__(self, dtype, copy): self._dtype = dtype self._copy = copy - def hybrid_forward(self, F, x): + def forward(self, x): return x.astype(dtype=self._dtype, copy=self._copy) def 
check_astype_equal(itype, otype, copy, expect_zero_copy=False, hybridize=False): diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 9f027846c31e..572735f84e2b 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -26,6 +26,7 @@ import scipy.stats as ss import scipy.special as scipy_special import pytest +import mxnet.ndarray.numpy._internal as _npi from mxnet import np, npx from mxnet.gluon import HybridBlock from mxnet.base import MXNetError @@ -70,8 +71,8 @@ def __init__(self, axes): super(TestTensordot, self).__init__() self._axes = axes - def hybrid_forward(self, F, a, b): - return F.np.tensordot(a, b, self._axes) + def forward(self, a, b): + return np.tensordot(a, b, self._axes) def tensordot_backward(out_grad, a, b, axes=2): if (a.ndim < 1) or (b.ndim < 1): @@ -263,8 +264,8 @@ class TestVdot(HybridBlock): def __init__(self): super(TestVdot, self).__init__() - def hybrid_forward(self, F, a, b): - return F.np.vdot(a, b) + def forward(self, a, b): + return np.vdot(a, b) def vdot_backward(a, b): return [b, a] @@ -314,8 +315,8 @@ class TestInner(HybridBlock): def __init__(self): super(TestInner, self).__init__() - def hybrid_forward(self, F, a, b): - return F.np.inner(a, b) + def forward(self, a, b): + return np.inner(a, b) def inner_backward(a, b): a_axes_summed = [a.ndim - 1] @@ -406,8 +407,8 @@ class TestOuter(HybridBlock): def __init__(self): super(TestOuter, self).__init__() - def hybrid_forward(self, F, a, b): - return F.np.outer(a, b) + def forward(self, a, b): + return np.outer(a, b) test_outer = TestOuter() if hybridize: @@ -461,8 +462,8 @@ class TestMatmul(HybridBlock): def __init__(self): super(TestMatmul, self).__init__() - def hybrid_forward(self, F, a, b): - return F.np.matmul(a, b) + def forward(self, a, b): + return np.matmul(a, b) def matmul_backward(a, b): def ShapeInfer(mat_a, mat_b): @@ -612,8 +613,8 @@ class TestKron(HybridBlock): def __init__(self): super(TestKron, self).__init__() - def hybrid_forward(self, F, a, b): - return F.np.kron(a, b) + def forward(self, a, b): + return np.kron(a, b) test_kron = TestKron() if hybridize: @@ -671,8 +672,8 @@ def __init__(self, axis=None, dtype=None, keepdims=False): self._dtype = dtype self._keepdims = keepdims - def hybrid_forward(self, F, a, *args, **kwargs): - return F.np.sum(a, axis=self._axis, dtype=self._dtype, keepdims=self._keepdims) + def forward(self, a, *args, **kwargs): + return np.sum(a, axis=self._axis, dtype=self._dtype, keepdims=self._keepdims) class TestSumConv(HybridBlock): def __init__(self, axis=None, dtype=None, keepdims=False): @@ -681,7 +682,7 @@ def __init__(self, axis=None, dtype=None, keepdims=False): self._dtype = dtype self._keepdims = keepdims - def hybrid_forward(self, F, a, *args, **kwargs): + def forward(self, a, *args, **kwargs): return a.sum(axis=self._axis, dtype=self._dtype, keepdims=self._keepdims) def is_int(dtype): @@ -764,8 +765,8 @@ def __init__(self, axis=None, keepdims=False) : self._axis = axis self._keepdims = keepdims - def hybrid_forward(self, F, a): - return getattr(F.np, bool_agg)(a, axis=self._axis, keepdims=self._keepdims) + def forward(self, a): + return getattr(np, bool_agg)(a, axis=self._axis, keepdims=self._keepdims) ndim = len(shape) samples = random.randint(0, ndim) @@ -797,7 +798,7 @@ def __init__(self, axis=None, keepdims=False): self._axis = axis self._keepdims = keepdims - def hybrid_forward(self, F, a, *args, **kwargs): + def forward(self, a, *args, **kwargs): return 
getattr(a, func)(axis=self._axis, keepdims=self._keepdims) def is_int(dtype): @@ -901,8 +902,8 @@ def __init__(self, axis=None, returned=False): self._axis = axis self._returned = returned - def hybrid_forward(self, F, a, weights): - return F.np.average(a, weights=weights, axis=self._axis, returned=self._returned) + def forward(self, a, weights): + return np.average(a, weights=weights, axis=self._axis, returned=self._returned) def avg_backward(a, w, avg, axes, init_a_grad=None, init_w_grad=None): # avg = sum(a * w) / sum(w) @@ -997,7 +998,7 @@ def __init__(self, axis=None, dtype=None, keepdims=False): self._dtype = dtype self._keepdims = keepdims - def hybrid_forward(self, F, a, *args, **kwargs): + def forward(self, a, *args, **kwargs): return a.mean(axis=self._axis, dtype=self._dtype, keepdims=self._keepdims) def is_int(dtype): @@ -1093,7 +1094,7 @@ def __init__(self, name, axis=None, dtype=None, keepdims=False, ddof=0): self._keepdims = keepdims self._ddof = ddof - def hybrid_forward(self, F, a, *args, **kwargs): + def forward(self, a, *args, **kwargs): return getattr(a, self._moment_name)(axis=self._axis, dtype=self._dtype, keepdims=self._keepdims, ddof=self._ddof) @@ -1209,11 +1210,11 @@ def __init__(self, start, stop, num=50, endpoint=None, retstep=False, dtype=None self._retstep = retstep self._dtype = dtype - def hybrid_forward(self, F, x): + def forward(self, x): if self._retstep: raise ValueError("linspace didn't support retstep = True inside HybridBlock") else: - return x + F.np.linspace(self._start, self._stop, self._num, \ + return x + np.linspace(self._start, self._stop, self._num, \ self._endpoint, self._retstep, self._dtype) x = np.zeros(shape=(), dtype=dtype) @@ -1269,8 +1270,8 @@ def __init__(self, start, stop, num=50, endpoint=None, base=50.0, dtype=None, ax self._dtype = dtype self.axis = axis - def hybrid_forward(self, F, x): - return x + F.np.logspace(self._start, self._stop, self._num, self._endpoint, self._base, self._dtype, self.axis) + def forward(self, x): + return x + np.logspace(self._start, self._stop, self._num, self._endpoint, self._base, self._dtype, self.axis) x = np.zeros(shape=(), dtype=dtype) net = TestLogspace(*config, endpoint=endpoint, base=base, dtype=dtype) @@ -1315,8 +1316,8 @@ def __init__(self, begin, end, step): self._end = end self._step = step - def hybrid_forward(self, F, a): - return F.npx.slice(a, begin=self._begin, end=self._end, step=self._step) + def forward(self, a): + return npx.slice(a, begin=self._begin, end=self._end, step=self._step) shape = (8, 16, 9, 9) np_array = _np.arange(_np.prod(shape), dtype='int32').reshape(shape) @@ -1349,8 +1350,8 @@ class TestIndexAdd(HybridBlock): def __init__(self): super(TestIndexAdd, self).__init__() - def hybrid_forward(self, F, a, ind, val): - return F.npx.index_add(a, ind, val) + def forward(self, a, ind, val): + return npx.index_add(a, ind, val) def index_add_forward(a, ind, val, ind_ndim, ind_num): if val.dtype != a.dtype: @@ -1502,8 +1503,8 @@ class TestIndexUpdate(HybridBlock): def __init__(self): super(TestIndexUpdate, self).__init__() - def hybrid_forward(self, F, a, ind, val): - return F.npx.index_update(a, ind, val) + def forward(self, a, ind, val): + return npx.index_update(a, ind, val) def check_index_update_forward(mx_ret, a, ind, val, ind_ndim, ind_num, eps): if val.dtype != a.dtype: @@ -1670,8 +1671,8 @@ def __init__(self, transpose_a, transpose_b): self._transpose_a = transpose_a self._transpose_b = transpose_b - def hybrid_forward(self, F, lhs, rhs): - return F.npx.batch_dot(lhs, 
rhs, + def forward(self, lhs, rhs): + return npx.batch_dot(lhs, rhs, transpose_a=self._transpose_a, transpose_b=self._transpose_b) @@ -1792,9 +1793,9 @@ def __init__(self, eps=1e-5, fix_gamma=False, momentum=0.9, **kwargs): self.fix_gamma = fix_gamma self.momentum = momentum self.kwargs = kwargs - def hybrid_forward(self, F, data, bn_gamma, bn_beta, + def forward(self, data, bn_gamma, bn_beta, bn_running_mean, bn_running_var): - op = F.npx.batch_norm + op = npx.batch_norm output = op(data, bn_gamma, bn_beta, bn_running_mean, bn_running_var, momentum=self.momentum, eps=self.eps, @@ -1950,16 +1951,17 @@ def __init__(self, axis): super(TestSoftmax, self).__init__() self._axis = axis - def hybrid_forward(self, F, a): - return F.npx.softmax(a, axis=axis) + def forward(self, a): + return npx.softmax(a, axis=axis) class TestLogSoftmax(HybridBlock): def __init__(self, axis): super(TestLogSoftmax, self).__init__() self._axis = axis - def hybrid_forward(self, F, a): - return F.npx.log_softmax(a, axis=axis) + def forward(self, a): + return npx.log_softmax(a, axis=axis) + #(operator, function) tuples tested_ops = [(TestSoftmax, np_softmax), @@ -2013,16 +2015,16 @@ def __init__(self, axis): super(TestMaskedSoftmax, self).__init__() self._axis = axis - def hybrid_forward(self, F, a, mask): - return F.npx.masked_softmax(a, mask, axis=self._axis) + def forward(self, a, mask): + return npx.masked_softmax(a, mask, axis=self._axis) class TestMaskedLogSoftmax(HybridBlock): def __init__(self, axis): super(TestMaskedLogSoftmax, self).__init__() self._axis = axis - def hybrid_forward(self, F, a, mask): - return F.npx.masked_log_softmax(a, mask, axis=self._axis) + def forward(self, a, mask): + return npx.masked_log_softmax(a, mask, axis=self._axis) #(operator, function) tuples tested_ops = [(TestMaskedSoftmax, np_masked_softmax), @@ -2056,16 +2058,16 @@ def __init__(self, val, start_axis): self._val = val self._start_axis = start_axis - def hybrid_forward(self, F, a, mask): - return F.np._internal.boolean_mask_assign_scalar(a, mask, self._val, start_axis=self._start_axis, out=a) + def forward(self, a, mask): + return _npi.boolean_mask_assign_scalar(a, mask, self._val, start_axis=self._start_axis, out=a) class TestBooleanAssignTensor(HybridBlock): def __init__(self, start_axis): super(TestBooleanAssignTensor, self).__init__() self._start_axis = start_axis - def hybrid_forward(self, F, a, mask, value): - return F.np._internal.boolean_mask_assign_tensor(a, mask, value, start_axis=self._start_axis, out=a) + def forward(self, a, mask, value): + return _npi.boolean_mask_assign_tensor(a, mask, value, start_axis=self._start_axis, out=a) configs = [ ((3, 4), (3, 4), 0), @@ -2142,8 +2144,8 @@ def __init__(self, newshape): super(TestReshape, self).__init__() self._newshape = newshape - def hybrid_forward(self, F, a): - return F.np.reshape(a, self._newshape) + def forward(self, a): + return np.reshape(a, self._newshape) shape_pairs = [((2, 6), (6, 2)), ((2, 6), (3, 4)), ((1, 0), (0,)), ((0, 0), (0,)), ((), (1, 1, 1))] for hybridize in [True, False]: @@ -2175,8 +2177,8 @@ def __init__(self, axis): super(TestArgsort, self).__init__() self._axis = axis - def hybrid_forward(self, F, x): - return F.np.argsort(x, axis=self._axis) + def forward(self, x): + return np.argsort(x, axis=self._axis) shapes = [ (), @@ -2232,8 +2234,8 @@ def __init__(self, axis, kind): self._axis = axis self._kind = kind - def hybrid_forward(self, F, x, *args, **kwargs): - return F.np.sort(x, self._axis, self._kind) + def forward(self, x, *args, **kwargs): 
+ return np.sort(x, self._axis, self._kind) a = np.random.uniform(low=0, high=100, size=shape, dtype='float64').astype(dtype) axis_list = list(range(len(shape))) @@ -2269,8 +2271,8 @@ def __init__(self, axis): super(TestSqueeze, self).__init__() self._axis = axis - def hybrid_forward(self, F, x): - return F.np.squeeze(x, self._axis) + def forward(self, x): + return np.squeeze(x, self._axis) for shape, axis in config: data_np = _np.random.uniform(size=shape) @@ -2303,8 +2305,8 @@ def __init__(self, N, M=None, k=0, dtype=None): self._k = k self._dtype = dtype - def hybrid_forward(self, F, x): - return x + F.np.tri(self._N, self._M, self._k, self._dtype) + def forward(self, x): + return x + np.tri(self._N, self._M, self._k, self._dtype) dtypes = ['float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', None] hybrids = [False, True] @@ -2335,8 +2337,8 @@ def __init__(self, axis=None, dtype=None, keepdims=False): self._dtype = dtype self._keepdims = keepdims - def hybrid_forward(self, F, a, *args, **kwargs): - return F.np.prod(a, axis=self._axis, dtype=self._dtype, keepdims=self._keepdims) + def forward(self, a, *args, **kwargs): + return np.prod(a, axis=self._axis, dtype=self._dtype, keepdims=self._keepdims) in_data_dim = random.choice([3, 4]) shape = rand_shape_nd(in_data_dim, dim=3) @@ -2380,7 +2382,7 @@ def hybrid_forward(self, F, a, *args, **kwargs): @use_np def test_np_flatten(): class TestFlatten(HybridBlock): - def hybrid_forward(self, F, x): + def forward(self, x): return x.flatten() shapes = [(), (2, 0, 1), (3, 4, 5), 6, (0,), (0, 0, 0)] @@ -2417,8 +2419,8 @@ def __init__(self, dst_shape): super(TestBroadcastTo, self).__init__() self._dst_shape = dst_shape - def hybrid_forward(self, F, x): - return F.np.broadcast_to(x, self._dst_shape) + def forward(self, x): + return np.broadcast_to(x, self._dst_shape) class TestScalarBroadcastTo(HybridBlock): def __init__(self, scalar, dst_shape): @@ -2426,8 +2428,8 @@ def __init__(self, scalar, dst_shape): self._scalar = scalar self._dst_shape = dst_shape - def hybrid_forward(self, F, x): - return F.np.broadcast_to(self._scalar, self._dst_shape) + def forward(self, x): + return np.broadcast_to(self._scalar, self._dst_shape) test_broadcast_to = TestBroadcastTo(dst_shape) if hybridize: @@ -2468,8 +2470,8 @@ def __init__(self, dst_shape): super(TestBroadcastTo, self).__init__() self._dst_shape = dst_shape - def hybrid_forward(self, F, x): - return F.np.broadcast_to(x, self._dst_shape) + def forward(self, x): + return np.broadcast_to(x, self._dst_shape) class TestScalarBroadcastTo(HybridBlock): def __init__(self, scalar, dst_shape): @@ -2477,8 +2479,8 @@ def __init__(self, scalar, dst_shape): self._scalar = scalar self._dst_shape = dst_shape - def hybrid_forward(self, F, x): - return F.np.broadcast_to(self._scalar, self._dst_shape) + def forward(self, x): + return np.broadcast_to(self._scalar, self._dst_shape) test_broadcast_to = TestBroadcastTo(npx_dst_shape) if hybridize: @@ -2530,8 +2532,8 @@ def __init__(self, axes=None): super(TestTranspose, self).__init__() self.axes = axes - def hybrid_forward(self, F, a): - return F.np.transpose(a, self.axes) + def forward(self, a): + return np.transpose(a, self.axes) for axes in axes_workload: test_trans = TestTranspose(axes) @@ -2623,8 +2625,8 @@ def __init__(self, reps): super(TestTile, self).__init__() self._reps = reps - def hybrid_forward(self, F, x): - return F.np.tile(x, reps=self._reps) + def forward(self, x): + return np.tile(x, reps=self._reps) for shape, reps in config: data_np = 
_np.random.randint(low=0, high=1000, size=shape) @@ -2672,8 +2674,8 @@ def __init__(self, k): super(TestTril, self).__init__() self._k = k - def hybrid_forward(self, F, x): - return F.np.tril(x, k=self._k) + def forward(self, x): + return np.tril(x, k=self._k) for prefix in [1, -1]: for shape, k in config: @@ -2732,8 +2734,8 @@ def __init__(self, k): super(TestTriu, self).__init__() self._k = k - def hybrid_forward(self, F, x): - return F.np.triu(x, k=self._k) + def forward(self, x): + return np.triu(x, k=self._k) for prefix in [1, -1]: for shape, k in config: @@ -2769,8 +2771,8 @@ def __init__(self, func): super(TestUnary, self).__init__() self._func = func - def hybrid_forward(self, F, a, *args, **kwargs): - return getattr(F.np, self._func)(a) + def forward(self, a, *args, **kwargs): + return getattr(np, self._func)(a) np_func = getattr(_np, func) np_test_data = _np.random.uniform(low, high, shape).astype(_np.float32) @@ -2829,7 +2831,7 @@ def hybrid_forward(self, F, a, *args, **kwargs): @use_np def test_negation(): class TestNegation(HybridBlock): - def hybrid_forward(self, F, a): + def forward(self, a): return -a mx_func = TestNegation() for dtype in [_np.int8, _np.int32, _np.float16, _np.float32, _np.float64]: @@ -2883,8 +2885,8 @@ def __init__(self, func): super(TestMixedUnary, self).__init__() self._func = func - def hybrid_forward(self, F, a, *args, **kwargs): - return getattr(F.np, self._func)(a) + def forward(self, a, *args, **kwargs): + return getattr(np, self._func)(a) import math @@ -2949,8 +2951,8 @@ def __init__(self, func): super(TestUnary, self).__init__() self._func = func - def hybrid_forward(self, F, a, *args, **kwargs): - return getattr(F.np, self._func)(a) + def forward(self, a, *args, **kwargs): + return getattr(np, self._func)(a) np_func = getattr(_np, func) mx_func = TestUnary(func) @@ -2994,8 +2996,8 @@ def __init__(self, func): super(TestBinary, self).__init__() self._func = func - def hybrid_forward(self, F, a, b, *args, **kwargs): - return getattr(F.np, self._func)(a, b) + def forward(self, a, b, *args, **kwargs): + return getattr(np, self._func)(a, b) np_func = getattr(_np, func) mx_func = TestBinary(func) @@ -3139,8 +3141,8 @@ def __init__(self, func): super(TestMixedBinary, self).__init__() self._func = func - def hybrid_forward(self, F, a, b, *args, **kwargs): - return getattr(F.np, self._func)(a, b) + def forward(self, a, b, *args, **kwargs): + return getattr(np, self._func)(a, b) if (func in ['multiply', 'mod', 'equal', 'not_equal', 'greater', 'greater_equal', 'less', 'less_equal']) and \ @@ -3310,8 +3312,8 @@ def __init__(self, func, scalar): self._func = func self._scalar = scalar - def hybrid_forward(self, F, a, *args, **kwargs): - return getattr(F.np, self._func)(a, self._scalar) + def forward(self, a, *args, **kwargs): + return getattr(np, self._func)(a, self._scalar) np_test_x1 = _np.random.uniform(low, high, lshape).astype(ltype) np_test_x2 = int(_np.random.uniform(low, high)) if scalar_is_int else _np.random.uniform(low, high) @@ -3368,8 +3370,8 @@ def __init__(self, func): super(TestBooleanBinary, self).__init__() self._func = func - def hybrid_forward(self, F, a, b, *args, **kwargs): - return getattr(F.np, self._func)(a, b) + def forward(self, a, b, *args, **kwargs): + return getattr(np, self._func)(a, b) np_x1 = mx_x1.asnumpy() np_x2 = mx_x2.asnumpy() @@ -3424,8 +3426,8 @@ class TestReLU(HybridBlock): def __init__(self): super(TestReLU, self).__init__() - def hybrid_forward(self, F, a): - return F.npx.relu(a) + def forward(self, a): + return 
npx.relu(a) shapes = [(), (2, 3, 4), (2, 0, 3), (1, 0, 0)] for hybridize in [True, False]: @@ -3483,8 +3485,8 @@ class TestSigmoid(HybridBlock): def __init__(self): super(TestSigmoid, self).__init__() - def hybrid_forward(self, F, a): - return F.npx.sigmoid(a) + def forward(self, a): + return npx.sigmoid(a) shapes = [(), (2, 3, 4), (2, 0, 3), (1, 0, 0)] for hybridize in [True, False]: @@ -3515,13 +3517,13 @@ def __init__(self, n): super(TestAtleastND, self).__init__() self._n = n - def hybrid_forward(self, F, *arys): + def forward(self, *arys): if self._n == 1: - return F.np.atleast_1d(*arys) + return np.atleast_1d(*arys) elif self._n == 2: - return F.np.atleast_2d(*arys) + return np.atleast_2d(*arys) elif self._n == 3: - return F.np.atleast_3d(*arys) + return np.atleast_3d(*arys) tensor_shapes = [ ((), (2,), (3, 4, 5)), @@ -3604,8 +3606,8 @@ def __init__(self, start, stop=None, step=None, dtype=None): self._step = step self._dtype = dtype - def hybrid_forward(self, F, x): - return x + F.np.arange(self._start, self._stop, self._step, dtype=self._dtype) + def forward(self, x): + return x + np.arange(self._start, self._stop, self._step, dtype=self._dtype) for dtype in dtypes: x = np.zeros(shape=(), dtype=dtype) @@ -3631,8 +3633,8 @@ def __init__(self, obj, axis=None): self._obj = obj self._axis = axis - def hybrid_forward(self, F, a, b): - return F.np.insert(a, self._obj, b, axis=self._axis) + def forward(self, a, b): + return np.insert(a, self._obj, b, axis=self._axis) def GetSize(tp): res = 1 @@ -3743,8 +3745,8 @@ def __init__(self, indices_or_sections, axis=None): self._axis = axis self._indices_or_sections = indices_or_sections - def hybrid_forward(self, F, a, *args, **kwargs): - return F.np.split(a, indices_or_sections=self._indices_or_sections, + def forward(self, a, *args, **kwargs): + return np.split(a, indices_or_sections=self._indices_or_sections, axis=self._axis) def get_indices(axis_size): @@ -3795,8 +3797,8 @@ def __init__(self, indices_or_sections, axis=None): self._axis = axis self._indices_or_sections = indices_or_sections - def hybrid_forward(self, F, a, *args, **kwargs): - return F.np.array_split(a, indices_or_sections=self._indices_or_sections, + def forward(self, a, *args, **kwargs): + return np.array_split(a, indices_or_sections=self._indices_or_sections, axis=self._axis) def get_indices(axis_size): @@ -3850,8 +3852,8 @@ def __init__(self, indices_or_sections): super(TestVsplit, self).__init__() self._indices_or_sections = indices_or_sections - def hybrid_forward(self, F, a, *args, **kwargs): - return F.np.vsplit(a, indices_or_sections=self._indices_or_sections) + def forward(self, a, *args, **kwargs): + return np.vsplit(a, indices_or_sections=self._indices_or_sections) def get_indices(axis_size): if axis_size is 0: @@ -3904,8 +3906,8 @@ def __init__(self, axis=None): super(TestConcat, self).__init__() self._axis = axis - def hybrid_forward(self, F, a, *args): - return F.np.concatenate([a] + list(args), axis=self._axis) + def forward(self, a, *args): + return np.concatenate([a] + list(args), axis=self._axis) def get_new_shape(shape, axis): shape_lst = list(shape) @@ -3972,8 +3974,8 @@ def __init__(self, axis=None): super(TestAppend, self).__init__() self._axis = axis - def hybrid_forward(self, F, a, b): - return F.np.append(a, b, axis=self._axis) + def forward(self, a, b): + return np.append(a, b, axis=self._axis) def get_new_shape(shape, axis): shape_lst = list(shape) @@ -4027,8 +4029,8 @@ def __init__(self, axis=None): super(TestStack, self).__init__() self._axis = 
axis - def hybrid_forward(self, F, a, *args): - return F.np.stack([a] + list(args), axis=self._axis) + def forward(self, a, *args): + return np.stack([a] + list(args), axis=self._axis) a, b, c, d = mx.sym.Variable("a"), mx.sym.Variable("b"), mx.sym.Variable("c"), mx.sym.Variable("d") ret = mx.sym.np.stack([a.as_np_ndarray(), b.as_np_ndarray(), c.as_np_ndarray(), d.as_np_ndarray()]) @@ -4075,8 +4077,8 @@ class TestHStack(HybridBlock): def __init__(self): super(TestHStack, self).__init__() - def hybrid_forward(self, F, a, *args): - return F.np.hstack([a] + list(args)) + def forward(self, a, *args): + return np.hstack([a] + list(args)) def get_new_shape(shape): if len(shape) == 0: @@ -4138,8 +4140,8 @@ class TestDStack(HybridBlock): def __init__(self): super(TestDStack, self).__init__() - def hybrid_forward(self, F, a, *args): - return F.np.dstack([a] + list(args)) + def forward(self, a, *args): + return np.dstack([a] + list(args)) def get_new_shape(shape): if len(shape) < 3: @@ -4198,8 +4200,8 @@ class TestRavel(HybridBlock): def __init__(self): super(TestRavel, self).__init__() - def hybrid_forward(self, F, a): - return F.np.ravel(a) + def forward(self, a): + return np.ravel(a) types = ['float64', 'float32', 'float16', 'int64', 'int32', 'int8'] for oneType in types: @@ -4284,8 +4286,8 @@ def __init__(self, axis1, axis2): self._axis1 = axis1 self._axis2 = axis2 - def hybrid_forward(self, F, x): - return F.np.swapaxes(x, self._axis1, self._axis2) + def forward(self, x): + return np.swapaxes(x, self._axis1, self._axis2) for shape, axis1, axis2 in config: data_np = _np.random.uniform(size=shape) @@ -4311,8 +4313,8 @@ def __init__(self, obj, axis=None): self._obj = obj self._axis = axis - def hybrid_forward(self, F, a): - return F.np.delete(a, self._obj, axis=self._axis) + def forward(self, a): + return np.delete(a, self._obj, axis=self._axis) def GetSize(shp): if len(shp) == 0: @@ -4406,7 +4408,7 @@ def __init__(self, op_name, axis=None): self._op_name = op_name self._axis = axis - def hybrid_forward(self, F, x): + def forward(self, x): return getattr(x, self._op_name)(self._axis) for op_name in ops: @@ -4487,7 +4489,7 @@ def __init__(self, a_min=None, a_max=None): self._a_min = a_min self._a_max = a_max - def hybrid_forward(self, F, x): + def forward(self, x): return x.clip(self._a_min, self._a_max) # Test scalar case @@ -4576,15 +4578,15 @@ class TestConstraintViolatedCheck(HybridBlock): def __init__(self): super(TestConstraintViolatedCheck, self).__init__() - def hybrid_forward(self, F, boolean_tensor): - return F.npx.constraint_check(boolean_tensor, msg) + def forward(self, boolean_tensor): + return npx.constraint_check(boolean_tensor, msg) class TestConstraintNotViolatedCheck(HybridBlock): def __init__(self): super(TestConstraintNotViolatedCheck, self).__init__() - def hybrid_forward(self, F, input, boolean_tensor): - return input * F.npx.constraint_check(boolean_tensor, msg) + def forward(self, input, boolean_tensor): + return input * npx.constraint_check(boolean_tensor, msg) def raiseFunc(block): def executor(boolean_tensor): @@ -4617,8 +4619,8 @@ def __init__(self, func): super(TestUnary, self).__init__() self._func = func - def hybrid_forward(self, F, a, *args, **kwargs): - return getattr(F.npx, self._func)(a) + def forward(self, a, *args, **kwargs): + return getattr(npx, self._func)(a) np_func = getattr(scipy_special, func) mx_func = TestUnary(func) @@ -4670,8 +4672,8 @@ def __init__(self, shape, op_name): super(TestRandomGrad, self).__init__() self._shape = shape self._dist_name = 
op_name - def hybrid_forward(self, F, loc, scale): - op = getattr(F.np.random, self._dist_name, None) + def forward(self, loc, scale): + op = getattr(np.random, self._dist_name, None) assert op is not None return op(loc=loc, scale=scale, size=self._shape) @@ -4720,8 +4722,8 @@ def __init__(self, shape): super(TestLognormalGrad, self).__init__() self._shape = shape - def hybrid_forward(self, F, mean, sigma): - return F.np.random.lognormal(mean, sigma, self._shape) + def forward(self, mean, sigma): + return np.random.lognormal(mean, sigma, self._shape) param_shape = [ [(3, 2), (3, 2)], @@ -4778,8 +4780,8 @@ def __init__(self, shape, op_name): self._shape = shape self._op_name = op_name - def hybrid_forward(self, F, param1, param2): - op = getattr(F.npx.random, self._op_name, None) + def forward(self, param1, param2): + op = getattr(npx.random, self._op_name, None) assert op is not None return op(param1, param2, batch_shape=self._shape) @@ -4826,8 +4828,8 @@ def __init__(self, shape, op_name, param=None): # In case parameters are not optional self._param = param - def hybrid_forward(self, F, x): - op = getattr(F.np.random, self._op_name, None) + def forward(self, x): + op = getattr(np.random, self._op_name, None) assert op is not None if self._param is not None: return x + op(self._param, size=self._shape) @@ -4886,8 +4888,8 @@ def __init__(self, size, beta): self._size = size self._beta = beta - def hybrid_forward(self, F, a): - return F.np.random.gamma(a, self._beta, size=self._size) + def forward(self, a): + return np.random.gamma(a, self._beta, size=self._size) for hybridize in [True, False]: param = np.ones(shape) * a @@ -4924,8 +4926,8 @@ def __init__(self, size=None, dtype=None, ctx=None): self._dtype = dtype self._ctx = ctx - def hybrid_forward(self, F, a, b): - return F.np.random.beta(a, b, size=self._size, dtype=self._dtype, ctx=self._ctx) + def forward(self, a, b): + return np.random.beta(a, b, size=self._size, dtype=self._dtype, ctx=self._ctx) def _test_random_beta_range(output): bigger_than_zero = _np.all(output > 0) @@ -4966,8 +4968,8 @@ def __init__(self, size=None): super(TestRandomF, self).__init__() self._size = size - def hybrid_forward(self, F, dfnum, dfden): - return F.np.random.f(dfnum, dfden, size=self._size) + def forward(self, dfnum, dfden): + return np.random.f(dfnum, dfden, size=self._size) shape_list = [(), (1,), (2, 3), (4, 0, 5), 6, (7, 8), None] hybridize_list = [False, True] @@ -4999,8 +5001,8 @@ def __init__(self, size=None, dtype=None, ctx=None): self._dtype = dtype self._ctx = ctx - def hybrid_forward(self, F, df): - return F.np.random.chisquare(df, size=self._size, dtype=self._dtype, ctx=self._ctx) + def forward(self, df): + return np.random.chisquare(df, size=self._size, dtype=self._dtype, ctx=self._ctx) shape_list = [(), (1,), (2, 3), (4, 0, 5), 6, (7, 8), None] @@ -5031,8 +5033,8 @@ def __init__(self, shape): super(TestRayleigh, self).__init__() self._shape = shape - def hybrid_forward(self, F, scale): - return F.np.random.rayleigh(scale, self._shape) + def forward(self, scale): + return np.random.rayleigh(scale, self._shape) shapes = [(2, 3), (4, 0, 5), (7, 8)] for hybridize in [False, True]: @@ -5068,8 +5070,8 @@ def __init__(self, shape): super(TestRandomExp, self).__init__() self._shape = shape - def hybrid_forward(self, F, scale): - return F.np.random.exponential(scale, self._shape) + def forward(self, scale): + return np.random.exponential(scale, self._shape) output_shapes = [ (3, 2), @@ -5127,8 +5129,8 @@ def __init__(self, shape, op_name): 
self._shape = shape self._op_name = op_name - def hybrid_forward(self, F, a): - op = getattr(F.np.random, self._op_name, None) + def forward(self, a): + op = getattr(np.random, self._op_name, None) assert op is not None return op(a, size=self._shape) @@ -5171,8 +5173,8 @@ def __init__(self, shape): super(TestRandomW, self).__init__() self._shape = shape - def hybrid_forward(self, F, a): - return F.np.random.weibull(a, self._shape) + def forward(self, a): + return np.random.weibull(a, self._shape) output_shapes = [ (3, 2), @@ -5203,8 +5205,8 @@ def __init__(self, shape): super(TestRandomP, self).__init__() self._shape = shape - def hybrid_forward(self, F, a): - return F.np.random.pareto(a, self._shape) + def forward(self, a): + return np.random.pareto(a, self._shape) output_shapes = [ (3, 2), @@ -5258,8 +5260,8 @@ def __init__(self, size=None): super(TestMultivariateNormal, self).__init__() self.size = size - def hybrid_forward(self, F, mean, cov): - return F.np.random.multivariate_normal(mean, cov, self.size) + def forward(self, mean, cov): + return np.random.multivariate_normal(mean, cov, self.size) hybridize_list = [True, False] dtypes = ['float16', 'float32', 'float64'] @@ -5303,10 +5305,10 @@ def __init__(self, size=None): super(TestNumpyCategorical, self).__init__() self.size = size - def hybrid_forward(self, F, prob): + def forward(self, prob): if self.size is None: - return F.npx.random.categorical(prob) - return F.npx.random.categorical(prob, shape=self.size) + return npx.random.categorical(prob) + return npx.random.categorical(prob, shape=self.size) batch_sizes = [(2,), (2, 3)] event_shapes = [None, (10,), (10, 12)] @@ -5343,7 +5345,7 @@ def __init__(self, axis=None, dtype=None): self._axis = axis self._dtype = dtype - def hybrid_forward(self, F, a): + def forward(self, a): return a.cumsum(axis=self._axis, dtype=self._dtype) shapes = [(2, 3, 4), (2, 0, 3), ()] @@ -5408,8 +5410,8 @@ def __init__(self, sample_size, replace): self.sample_size = sample_size self.replace = replace - def hybrid_forward(self, F, a): - return F.np.random.choice(a=a, size=self.sample_size, replace=self.replace, p=None) + def forward(self, a): + return np.random.choice(a=a, size=self.sample_size, replace=self.replace, p=None) class TestWeightedChoice(HybridBlock): def __init__(self, sample_size, replace): @@ -5417,9 +5419,9 @@ def __init__(self, sample_size, replace): self.sample_size = sample_size self.replace = replace - def hybrid_forward(self, F, a, p): - op = getattr(F.np.random, "choice", None) - return F.np.random.choice(a, self.sample_size, self.replace, p) + def forward(self, a, p): + op = getattr(np.random, "choice", None) + return np.random.choice(a, self.sample_size, self.replace, p) def test_sample_with_replacement(sampler, num_classes, shape, weight=None): samples = sampler(num_classes, shape, replace=True, p=weight).asnumpy() @@ -5545,8 +5547,8 @@ def __init__(self, N, M=None, k=0, dtype=None): self._k = k self._dtype = dtype - def hybrid_forward(self, F, x): - return x + F.np.eye(self._N, self._M, self._k, dtype=self._dtype) + def forward(self, x): + return x + np.eye(self._N, self._M, self._k, dtype=self._dtype) for dtype in dtypes: x = np.zeros(shape=(), dtype=dtype) @@ -5592,8 +5594,8 @@ def __init__(self, dimensions=None, dtype=None): self._dimensions = dimensions self._dtype = dtype - def hybrid_forward(self, F, x): - return x + F.np.indices(dimensions=self._dimensions, dtype=self._dtype) + def forward(self, x): + return x + np.indices(dimensions=self._dimensions, dtype=self._dtype) for 
dtype in dtypes: for shape in shapes: @@ -5627,7 +5629,7 @@ def __init__(self, repeats, axis=None): self._repeats = repeats self._axis = axis - def hybrid_forward(self, F, x): + def forward(self, x): return x.repeat(self._repeats, self._axis) for shape, repeats, axis in config: @@ -5654,8 +5656,8 @@ def __init__(self, ord=None, axis=None, keepdims=False): self._axis = axis self._keepdims = keepdims - def hybrid_forward(self, F, x): - return F.np.linalg.norm(x, ord=self._ord, axis=self._axis, keepdims=self._keepdims) + def forward(self, x): + return np.linalg.norm(x, ord=self._ord, axis=self._axis, keepdims=self._keepdims) configs = [ ((2, 3, 4), 1, (2, 1)), @@ -5787,8 +5789,8 @@ class TestSVD(HybridBlock): def __init__(self): super(TestSVD, self).__init__() - def hybrid_forward(self, F, data): - return F.np.linalg.svd(data) + def forward(self, data): + return np.linalg.svd(data) def get_grad(UT, L, V): m = V.shape[-2] @@ -5874,8 +5876,8 @@ class TestQR(HybridBlock): def __init__(self): super(TestQR, self).__init__() - def hybrid_forward(self, F, data): - return F.np.linalg.qr(data) + def forward(self, data): + return np.linalg.qr(data) def get_expected_grad(a, q, r, dq, dr): # for all input shapes (..., m, n) @@ -6013,8 +6015,8 @@ class TestCholesky(HybridBlock): def __init__(self): super(TestCholesky, self).__init__() - def hybrid_forward(self, F, data): - return F.np.linalg.cholesky(data) + def forward(self, data): + return np.linalg.cholesky(data) def get_grad(L): # shape of m is [batch, n, n] @@ -6151,8 +6153,8 @@ class TestInverse(HybridBlock): def __init__(self): super(TestInverse, self).__init__() - def hybrid_forward(self, F, data): - return F.np.linalg.inv(data) + def forward(self, data): + return np.linalg.inv(data) def get_grad(A): if 0 in A.shape: @@ -6219,8 +6221,8 @@ class TestSolve(HybridBlock): def __init__(self): super(TestSolve, self).__init__() - def hybrid_forward(self, F, a, b): - return F.np.linalg.solve(a, b) + def forward(self, a, b): + return np.linalg.solve(a, b) def check_solve(x, a_np, b_np): try: @@ -6320,8 +6322,8 @@ def __init__(self, ind=2): super(TestTensorinv, self).__init__() self._ind = ind - def hybrid_forward(self, F, a): - return F.np.linalg.tensorinv(a, ind=self._ind) + def forward(self, a): + return np.linalg.tensorinv(a, ind=self._ind) def check_tensorinv(inv_a, a_np, ind): try: @@ -6416,8 +6418,8 @@ def __init__(self, axes): super(TestTensorsolve, self).__init__() self._axes = axes - def hybrid_forward(self, F, a, b): - return F.np.linalg.tensorsolve(a, b, axes=self._axes) + def forward(self, a, b): + return np.linalg.tensorsolve(a, b, axes=self._axes) def get_tensorsolve_backward(a_np, b_np, mx_out_np, a_axes, a_origin_axes, a_trans_shape): if (a_np.ndim == 0 or b_np.ndim == 0) or (a_np.ndim == b_np.ndim): @@ -6567,8 +6569,8 @@ def __init__(self, rcond): super(TestLstsq, self).__init__() self._rcond = rcond - def hybrid_forward(self, F, a, b, rcond='warn'): - return F.np.linalg.lstsq(a, b, rcond=self._rcond) + def forward(self, a, b, rcond='warn'): + return np.linalg.lstsq(a, b, rcond=self._rcond) def check_lstsq(a_np, b_np, rcond_np, x, residuals, rank, s): try: @@ -6642,8 +6644,8 @@ def __init__(self, hermitian): super(TestMatrixRank, self).__init__() self._hermitian = hermitian - def hybrid_forward(self, F, M, tol=None): - return F.np.linalg.matrix_rank(M, tol, hermitian=self._hermitian) + def forward(self, M, tol=None): + return np.linalg.matrix_rank(M, tol, hermitian=self._hermitian) def check_matrix_rank(rank, a_np, tol, hermitian): try: @@ 
-6718,8 +6720,8 @@ def __init__(self, hermitian): super(TestPinv, self).__init__() self._hermitian = hermitian - def hybrid_forward(self, F, a, rcond=1e-15): - return F.np.linalg.pinv(a, rcond, hermitian=self._hermitian) + def forward(self, a, rcond=1e-15): + return np.linalg.pinv(a, rcond, hermitian=self._hermitian) def check_pinv(x, a_np, rcond_np, hermitian, use_rcond): try: @@ -6795,8 +6797,8 @@ class TestEigvals(HybridBlock): def __init__(self): super(TestEigvals, self).__init__() - def hybrid_forward(self, F, a): - return F.np.linalg.eigvals(a) + def forward(self, a): + return np.linalg.eigvals(a) def check_eigvals(x, a_np): try: @@ -6863,8 +6865,8 @@ def __init__(self, UPLO): super(TestEigvalsh, self).__init__() self._UPLO = UPLO - def hybrid_forward(self, F, a): - return F.np.linalg.eigvalsh(a, UPLO=self._UPLO) + def forward(self, a): + return np.linalg.eigvalsh(a, UPLO=self._UPLO) def check_eigvalsh(w, a_np, UPLO): try: @@ -6938,8 +6940,8 @@ class TestEig(HybridBlock): def __init__(self): super(TestEig, self).__init__() - def hybrid_forward(self, F, a): - return F.np.linalg.eig(a) + def forward(self, a): + return np.linalg.eig(a) def check_eig(w, v, a_np): try: @@ -7018,8 +7020,8 @@ def __init__(self, UPLO): super(TestEigh, self).__init__() self._UPLO = UPLO - def hybrid_forward(self, F, a): - return F.np.linalg.eigh(a, UPLO=self._UPLO) + def forward(self, a): + return np.linalg.eigh(a, UPLO=self._UPLO) def check_eigh(w, v, a_np, UPLO): try: @@ -7122,8 +7124,8 @@ class TestDet(HybridBlock): def __init__(self): super(TestDet, self).__init__() - def hybrid_forward(self, F, a): - return F.np.linalg.det(a) + def forward(self, a): + return np.linalg.det(a) # test non zero size input tensor_shapes = [ @@ -7185,8 +7187,8 @@ class TestSlogdet(HybridBlock): def __init__(self): super(TestSlogdet, self).__init__() - def hybrid_forward(self, F, a): - return F.np.linalg.slogdet(a) + def forward(self, a): + return np.linalg.slogdet(a) test_slogdet = TestSlogdet() if hybridize: @@ -7217,8 +7219,8 @@ class TestVstack(HybridBlock): def __init__(self): super(TestVstack, self).__init__() - def hybrid_forward(self, F, a, *args): - return F.np.vstack([a] + list(args)) + def forward(self, a, *args): + return np.vstack([a] + list(args)) def g(data): return _np.ones_like(data) @@ -7274,8 +7276,8 @@ def __init__(self, shape, dtype=None): self._shape = shape self._dtype = dtype - def hybrid_forward(self, F, a): - return F.np.full(self._shape, a, dtype=self._dtype) + def forward(self, a): + return np.full(self._shape, a, dtype=self._dtype) configs = [ ((3, 4), 2.0), @@ -7326,8 +7328,8 @@ def __init__(self, fill_value, dtype, ctx): self._dtype = dtype self._ctx = ctx - def hybrid_forward(self, F, x, *args, **kwargs): - return F.np.full_like(x, self._fill_value, dtype=self._dtype, ctx=self._ctx) + def forward(self, x, *args, **kwargs): + return np.full_like(x, self._fill_value, dtype=self._dtype, ctx=self._ctx) if StrictVersion(platform.python_version()) < StrictVersion('3.0.0'): return @@ -7371,8 +7373,8 @@ def __init__(self, shift=None, axis=None): self._shift = shift self._axis = axis - def hybrid_forward(self, F, x): - return F.np.roll(x, shift=self._shift, axis=self._axis) + def forward(self, x): + return np.roll(x, shift=self._shift, axis=self._axis) dtypes = ['int32', 'int64', 'float16', 'float32', 'float64'] configs = [ @@ -7429,8 +7431,8 @@ def __init__(self, axis1, axis2, offset): self._axis2 = axis2 self._offset = offset - def hybrid_forward(self, F, data): - return F.np.trace(data, 
axis1=self._axis1, axis2=self._axis2, offset=self._offset) + def forward(self, data): + return np.trace(data, axis1=self._axis1, axis2=self._axis2, offset=self._offset) def g(data, axis1, axis2, offset): idx = _np.indices(data.shape) @@ -7509,8 +7511,8 @@ def __init__(self, func, M): self._func = func self._M = M - def hybrid_forward(self, F, x, *args, **kwargs): - op = getattr(F.np, self._func) + def forward(self, x, *args, **kwargs): + op = getattr(np, self._func) assert op is not None return x + op(M=self._M) @@ -7542,8 +7544,8 @@ def __init__(self, axis): super(TestFlip, self).__init__() self.axis = axis - def hybrid_forward(self, F, x): - return F.np.flip(x, self.axis) + def forward(self, x): + return np.flip(x, self.axis) shapes = [(1, 2, 3), (1, 0), ()] types = ['int32', 'int64', 'float16', 'float32', 'float64'] @@ -7580,15 +7582,15 @@ class TestFlipud(HybridBlock): def __init__(self): super(TestFlipud, self).__init__() - def hybrid_forward(self, F, x): - return F.np.flipud(x) + def forward(self, x): + return np.flipud(x) class TestFliplr(HybridBlock): def __init__(self): super(TestFliplr, self).__init__() - def hybrid_forward(self, F, x): - return F.np.fliplr(x) + def forward(self, x): + return np.fliplr(x) shapes = [(1, 2, 3), (1, 0)] types = ['int32', 'int64', 'float16', 'float32', 'float64'] @@ -7635,8 +7637,8 @@ def __init__(self, decimals): super(TestAround, self).__init__() self.decimals = decimals - def hybrid_forward(self, F, x): - return F.np.around(x, self.decimals) + def forward(self, x): + return np.around(x, self.decimals) shapes = [(), (1, 2, 3), (1, 0)] types = ['int32', 'int64', 'float32', 'float64'] @@ -7665,8 +7667,8 @@ class TestFlatnonzero(HybridBlock): def __init__(self): super(TestFlatnonzero, self).__init__() - def hybrid_forward(self, F, a): - return F.np.flatnonzero(a) + def forward(self, a): + return np.flatnonzero(a) shapes = [(1,), (4, 3), (4, 5), (2, 1), (6, 5, 6), (4, 2, 1, 2), (5, 1, 3, 3), (3, 3, 1, 0),] @@ -7696,8 +7698,8 @@ def __init__(self, func, decimals): self.func = func self.decimals = decimals - def hybrid_forward(self, F, x): - return getattr(F.np, self.func)(x, self.decimals) + def forward(self, x): + return getattr(np, self.func)(x, self.decimals) shapes = [(), (1, 2, 3), (1, 0)] types = ['int32', 'int64', 'float32', 'float64'] @@ -7726,8 +7728,8 @@ class TestNonzero(HybridBlock): def __init__(self): super(TestNonzero, self).__init__() - def hybrid_forward(self, F, x): - return F.npx.nonzero(x) + def forward(self, x): + return npx.nonzero(x) types = ['int32', 'int64', 'float64', 'float32', 'float16'] for hybridize in [True, False]: @@ -7761,8 +7763,8 @@ def __init__(self, return_index=False, return_inverse=False, return_counts=False self._return_counts = return_counts self._axis = axis - def hybrid_forward(self, F, a): - return F.np.unique(a, self._return_index, self._return_inverse, self._return_counts, self._axis) + def forward(self, a): + return np.unique(a, self._return_index, self._return_inverse, self._return_counts, self._axis) configs = [ ((), True, True, True, None), @@ -7833,8 +7835,8 @@ def __init__(self, axis, mode): self._axis = axis self._mode = mode - def hybrid_forward(self, F, a, indices): - return F.np.take(a, indices, axis=self._axis, mode=self._mode) + def forward(self, a, indices): + return np.take(a, indices, axis=self._axis, mode=self._mode) def grad_helper(grad_in, axis, idx, mode): k = grad_in.shape[axis] @@ -7940,8 +7942,8 @@ def __init__(self, n, k=0, m=None): m = n self._m = m - def hybrid_forward(self, F, x, 
*args, **kwargs): - return x, F.np.tril_indices(n=self._n, k=self._k, m=self._m) + def forward(self, x, *args, **kwargs): + return x, np.tril_indices(n=self._n, k=self._k, m=self._m) for n in _np.random.random_integers(-10, 50, 2): for k in _np.random.random_integers(-50, 50, 2): @@ -7973,8 +7975,8 @@ def __init__(self, val, wrap=False): self._val = val self._wrap= wrap - def hybrid_forward(self, F, x): - return F.np.fill_diagonal(x, val=self._val, wrap=self._wrap) + def forward(self, x): + return np.fill_diagonal(x, val=self._val, wrap=self._wrap) configs = [ ((10, 10), 2), @@ -8017,8 +8019,8 @@ def __init__(self, source=None, destination=None): self._source = source self._destination= destination - def hybrid_forward(self, F, x): - return F.np.moveaxis(x, source=self._source, destination=self._destination) + def forward(self, x): + return np.moveaxis(x, source=self._source, destination=self._destination) dtypes = ['int32', 'int64', 'float16', 'float32', 'float64'] for hybridize in [False, True]: @@ -8060,8 +8062,8 @@ def __init__(self, k=1, axes=(0, 1)): self._k = k self._axes = axes - def hybrid_forward(self, F, a, *args): - return F.np.rot90(a, self._k, self._axes) + def forward(self, a, *args): + return np.rot90(a, self._k, self._axes) configs = [ ((2, 3), 1, (0, 1)), @@ -8111,8 +8113,8 @@ def __init__(self, indices_or_sections): super(TestHSplit, self).__init__() self._indices_or_sections = indices_or_sections - def hybrid_forward(self, F, a, *args, **kwargs): - return F.np.hsplit(a, indices_or_sections=self._indices_or_sections) + def forward(self, a, *args, **kwargs): + return np.hsplit(a, indices_or_sections=self._indices_or_sections) shapes = [ (10,), @@ -8161,8 +8163,8 @@ def __init__(self, indices_or_sections): super(TestDSplit, self).__init__() self._indices_or_sections = indices_or_sections - def hybrid_forward(self, F, a, *args, **kwargs): - return F.np.dsplit(a, indices_or_sections=self._indices_or_sections) + def forward(self, a, *args, **kwargs): + return np.dsplit(a, indices_or_sections=self._indices_or_sections) shapes = [ (2, 4, 6), @@ -8210,8 +8212,8 @@ def __init__(self, subscripts, optimize): self.subscripts = subscripts self.optimize = optimize - def hybrid_forward(self, F, *operands): - return F.np.einsum(self.subscripts, *operands, optimize=self.optimize) + def forward(self, *operands): + return np.einsum(self.subscripts, *operands, optimize=self.optimize) def dbg(name, data): print('type of {} = {}'.format(name, type(data))) @@ -8366,8 +8368,8 @@ class TestDiagflat(HybridBlock): def __init__(self, k=0): super(TestDiagflat,self).__init__() self._k = k - def hybrid_forward(self,F,a): - return F.np.diagflat(a, k=self._k) + def forward(self, a): + return np.diagflat(a, k=self._k) shapes = [(2,),5 , (1,5), (2,2), (2,5), (3,3), (4,3),(4,4,5)] # test_shapes, remember to include zero-dim shape and zero-size shapes dtypes = [np.int8, np.uint8, np.int32, np.int64, np.float16, np.float32, np.float64] # remember to include all meaningful data types for the operator range_k = 6 @@ -8408,8 +8410,8 @@ def __init__(self, pad_width, mode='constant'): super(TestPad,self).__init__() self._pad_width = pad_width self._mode = mode - def hybrid_forward(self,F,A,**kwargs): - return F.np.pad(A, self._pad_width, mode=self._mode, **kwargs) + def forward(self, A, **kwargs): + return np.pad(A, self._pad_width, mode=self._mode, **kwargs) shapes = [6, (1,5), (2,2), (2,2), (3,3), (2,3), (3,4,5)] dtypes = [np.int8, np.uint8, np.int32, np.int64, np.float16, np.float32, np.float64] @@ -8580,8 
+8582,8 @@ class TestColumnStack(HybridBlock): def __init__(self): super(TestColumnStack, self).__init__() - def hybrid_forward(self, F, a, *args): - return F.np.column_stack([a] + list(args)) + def forward(self, a, *args): + return np.column_stack([a] + list(args)) def g(data): return _np.ones_like(data) @@ -8634,8 +8636,8 @@ def __init__(self, newshape, reverse): self._newshape = newshape self._reverse = reverse - def hybrid_forward(self, F, a, *args, **kwargs): - return F.npx.reshape(a, self._newshape, reverse=self._reverse) + def forward(self, a, *args, **kwargs): + return npx.reshape(a, self._newshape, reverse=self._reverse) test_cases = [ [(2, 3, 5, 5), (-2, -1), False, (2, 75)], @@ -8717,8 +8719,8 @@ def __init__(self, axis=None, keepdims=False): self._axis = axis self._keepdims = keepdims - def hybrid_forward(self, F, a): - return F.np.median(a, axis=self._axis, keepdims=self._keepdims) + def forward(self, a): + return np.median(a, axis=self._axis, keepdims=self._keepdims) flags = [True, False] dtypes = ['float16', 'float32', 'float64'] @@ -8759,8 +8761,8 @@ def __init__(self, axis=None, interpolation='linear', keepdims=False): self._interpolation = interpolation self._keepdims = keepdims - def hybrid_forward(self, F, a, q): - return F.np.quantile(a, q, axis=self._axis, interpolation=self._interpolation, keepdims=self._keepdims) + def forward(self, a, q): + return np.quantile(a, q, axis=self._axis, interpolation=self._interpolation, keepdims=self._keepdims) class TestQuantileScalar(HybridBlock): def __init__(self, q=None, axis=None, interpolation='linear', keepdims=False): @@ -8770,8 +8772,8 @@ def __init__(self, q=None, axis=None, interpolation='linear', keepdims=False): self._interpolation = interpolation self._keepdims = keepdims - def hybrid_forward(self, F, a): - return F.np.quantile(a, self._q, axis=self._axis, interpolation=self._interpolation, keepdims=self._keepdims) + def forward(self, a): + return np.quantile(a, self._q, axis=self._axis, interpolation=self._interpolation, keepdims=self._keepdims) flags = [True, False] interpolation_options = ['linear', 'lower', 'higher', 'nearest', 'midpoint'] @@ -8819,8 +8821,8 @@ def __init__(self, axis=None, interpolation='linear', keepdims=False): self._interpolation = interpolation self._keepdims = keepdims - def hybrid_forward(self, F, a, q): - return F.np.percentile(a, q, axis=self._axis, interpolation=self._interpolation, keepdims=self._keepdims) + def forward(self, a, q): + return np.percentile(a, q, axis=self._axis, interpolation=self._interpolation, keepdims=self._keepdims) class TestPercentileScalar(HybridBlock): def __init__(self, q=None, axis=None, interpolation='linear', keepdims=False): @@ -8830,8 +8832,8 @@ def __init__(self, q=None, axis=None, interpolation='linear', keepdims=False): self._interpolation = interpolation self._keepdims = keepdims - def hybrid_forward(self, F, a): - return F.np.percentile(a, self._q, axis=self._axis, interpolation=self._interpolation, keepdims=self._keepdims) + def forward(self, a): + return np.percentile(a, self._q, axis=self._axis, interpolation=self._interpolation, keepdims=self._keepdims) flags = [True, False] interpolation_options = ['linear', 'lower', 'higher', 'nearest', 'midpoint'] @@ -8886,8 +8888,8 @@ def __init__(self, n=1, axis=-1): self._n = n self._axis = axis - def hybrid_forward(self, F, a): - return F.np.diff(a, n=self._n, axis=self._axis) + def forward(self, a): + return np.diff(a, n=self._n, axis=self._axis) shapes = [tuple(random.randrange(10) for i in 
range(random.randrange(6))) for j in range(5)]
     for hybridize in [True, False]:
@@ -8937,8 +8939,8 @@ class TestEDiff1DCASE1(HybridBlock):
         def __init__(self):
             super(TestEDiff1DCASE1, self).__init__()
 
-        def hybrid_forward(self, F, a, b, c):
-            return F.np.ediff1d(a, to_end=b, to_begin=c)
+        def forward(self, a, b, c):
+            return np.ediff1d(a, to_end=b, to_begin=c)
 
     # case 2: only `to_end` is array but `to_begin` is scalar/None
     class TestEDiff1DCASE2(HybridBlock):
@@ -8946,8 +8948,8 @@ def __init__(self, to_begin=None):
             super(TestEDiff1DCASE2, self).__init__()
             self._to_begin = to_begin
 
-        def hybrid_forward(self, F, a, b):
-            return F.np.ediff1d(a, to_end=b, to_begin=self._to_begin)
+        def forward(self, a, b):
+            return np.ediff1d(a, to_end=b, to_begin=self._to_begin)
 
     # case 3: only `to_begin` is array but `to_end` is scalar/None
     class TestEDiff1DCASE3(HybridBlock):
@@ -8955,8 +8957,8 @@ def __init__(self, to_end=None):
             super(TestEDiff1DCASE3, self).__init__()
             self._to_end = to_end
 
-        def hybrid_forward(self, F, a, b):
-            return F.np.ediff1d(a, to_end=self._to_end, to_begin=b)
+        def forward(self, a, b):
+            return np.ediff1d(a, to_end=self._to_end, to_begin=b)
 
     # case 4: both `to_begin` and `to_end` are scalar/None
     class TestEDiff1DCASE4(HybridBlock):
@@ -8965,8 +8967,8 @@ def __init__(self, to_end=None, to_begin=None):
             self._to_begin = to_begin
             self._to_end = to_end
 
-        def hybrid_forward(self, F, a):
-            return F.np.ediff1d(a, to_end=self._to_end, to_begin=self._to_begin)
+        def forward(self, a):
+            return np.ediff1d(a, to_end=self._to_end, to_begin=self._to_begin)
 
     rtol = 1e-3
     atol = 1e-5
@@ -9042,8 +9044,8 @@ class TestColumnStack(HybridBlock):
         def __init__(self):
             super(TestColumnStack, self).__init__()
 
-        def hybrid_forward(self, F, a, *args):
-            return F.np.column_stack([a] + list(args))
+        def forward(self, a, *args):
+            return np.column_stack([a] + list(args))
 
     def g(data):
         return _np.ones_like(data)
@@ -9097,8 +9099,8 @@ def __init__(self, new_shape):
             super(TestResize, self).__init__()
             self._new_shape = new_shape
 
-        def hybrid_forward(self, F, x, *args, **kwargs):
-            return F.np.resize(x, self._new_shape)
+        def forward(self, x, *args, **kwargs):
+            return np.resize(x, self._new_shape)
 
     dtypes = [np.int8, np.uint8, np.int32, np.int64, np.float16, np.float32, np.float64, np.bool_]
     shape_config = [
@@ -9131,8 +9133,8 @@ def __init__(self, k=0):
             super(TestDiag, self).__init__()
             self._k = k
 
-        def hybrid_forward(self, F, a):
-            return F.np.diag(a, k=self._k)
+        def forward(self, a):
+            return np.diag(a, k=self._k)
 
     shapes = [(), (2,), (1, 5), (2, 2), (2, 5), (3, 3), (4, 3)]
     dtypes = [np.int8, np.uint8, np.int32, np.int64, np.float16, np.float32, np.float64]
@@ -9204,11 +9206,11 @@ def __init__(self, k=0, axis1=0, axis2=1, call_by_instance=False):
             self._axis2 = axis2
             self._call_by_instance = call_by_instance
 
-        def hybrid_forward(self, F, a):
+        def forward(self, a):
             if self._call_by_instance:
                 return a.diagonal(self._k, self._axis1, self._axis2)
             else:
-                return F.np.diagonal(a, self._k, self._axis1, self._axis2)
+                return np.diagonal(a, self._k, self._axis1, self._axis2)
 
     rtol = 1e-2 if dtype == np.float16 else 1e-3
     atol = 1e-4 if dtype == np.float16 else 1e-5
@@ -9284,8 +9286,8 @@ def __init__(self, copy=True, nan=0.0, posinf=None, neginf=None):
             self.neginf = neginf
             # necessary initializations
 
-        def hybrid_forward(self, F, a):
-            return F.np.nan_to_num(a, self.copy, self.nan, self.posinf, self.neginf)
+        def forward(self, a):
+            return np.nan_to_num(a, self.copy, self.nan, self.posinf, self.neginf)
 
     src_list = [
         _np.nan,
@@ -9306,7 +9308,9 @@ def hybrid_forward(self, F, a):
     dtype_list = ['float16', 'float32', 'float64']
     # [nan, inf, -inf]
     param_list = [[None, None, None], [0, 1000, -100], [0.0, 9999.9, -9999.9]]
-    copy_list = [True, False]
+    # Inplace operations are not supported when recording in deferred compute mode
+    # copy_list = [True, False]
+    copy_list = [True]
     hybridize_list = [True, False]
     atol, rtol = 1e-5, 1e-3
 
@@ -9373,8 +9377,8 @@ def __init__(self, func):
             super(TestUnary, self).__init__()
             self._func = func
 
-        def hybrid_forward(self, F, a):
-            return getattr(F.np, self._func)(a)
+        def forward(self, a):
+            return getattr(np, self._func)(a)
 
     src_list = [
         _np.nan,
@@ -9477,8 +9481,8 @@ class TestPolyval(HybridBlock):
         def __init__(self):
             super(TestPolyval, self).__init__()
 
-        def hybrid_forward(self, F, p, x, *args, **kwargs):
-            return F.np.polyval(p, x)
+        def forward(self, p, x, *args, **kwargs):
+            return np.polyval(p, x)
 
     def polyval_grad(p, x):
         x_shape = x.shape
@@ -9543,8 +9547,8 @@ class TestWhere(HybridBlock):
         def __init__(self):
             super(TestWhere, self).__init__()
 
-        def hybrid_forward(self, F, cond, x, y):
-            return F.np.where(cond, x, y)
+        def forward(self, cond, x, y):
+            return np.where(cond, x, y)
 
     dtypes = [np.int8, np.uint8, np.int32, np.int64, np.float16, np.float32, np.float64, np.bool]
     shape_configs = [
@@ -9611,8 +9615,8 @@ def __init__(self, axis):
             super(TestExpandDims, self).__init__()
             self._axis = axis
 
-        def hybrid_forward(self, F, x):
-            return F.np.expand_dims(x, self._axis)
+        def forward(self, x):
+            return np.expand_dims(x, self._axis)
 
     dtypes = [np.int8, np.uint8, np.int32, np.int64, np.float16, np.float32, np.float64, np.bool]
     shapes = [
@@ -9666,8 +9670,8 @@ def __init__(self, shape, order='C') :
             self._shape = shape
             self._order = order
 
-        def hybrid_forward(self, F, a):
-            return F.np.unravel_index(a, self._shape, self._order)
+        def forward(self, a):
+            return np.unravel_index(a, self._shape, self._order)
 
     in_shapes = [
         2, 5,
@@ -9715,8 +9719,8 @@ class TestDiag_indices_from(HybridBlock):
         def __init__(self) :
             super(TestDiag_indices_from, self).__init__()
 
-        def hybrid_forward(self, F, a):
-            return F.np.diag_indices_from(a)
+        def forward(self, a):
+            return np.diag_indices_from(a)
 
     dtypes = [np.int8, np.uint8, np.int32, np.int64, np.float16, np.float32, np.float64]
     shapes = [(2, 2), (4, 4), (5, 5, 5), (6, 6, 6, 6), (8, 8, 8, 8)]
@@ -9754,8 +9758,8 @@ def __init__(self, left=None, right=None, period=None):
             self._right = right
             self._period = period
 
-        def hybrid_forward(self, F, x, xp, fp):
-            return F.np.interp(x, xp, fp, left=self._left, right=self._right, period=self._period)
+        def forward(self, x, xp, fp):
+            return np.interp(x, xp, fp, left=self._left, right=self._right, period=self._period)
 
     class TestInterpScalar(HybridBlock):
         def __init__(self, x=None, left=None, right=None, period=None):
@@ -9765,8 +9769,8 @@ def __init__(self, x=None, left=None, right=None, period=None):
             self._right = right
             self._period = period
 
-        def hybrid_forward(self, F, xp, fp):
-            return F.np.interp(self._x, xp, fp, left=self._left, right=self._right, period=self._period)
+        def forward(self, xp, fp):
+            return np.interp(self._x, xp, fp, left=self._left, right=self._right, period=self._period)
 
     xtypes = [np.int64, np.float32, np.float64]
     dtypes = [np.int32, np.int64, np.float32, np.float64]
@@ -9818,16 +9822,16 @@ def __init__(self, minlength=0):
             super(TestBincount, self).__init__()
             self._minlength = minlength
 
-        def hybrid_forward(self, F, a):
-            return F.np.bincount(a, None, self._minlength)
+        def forward(self, a):
+            return np.bincount(a, None, self._minlength)
 
     class TestBincountWeights(HybridBlock):
         def __init__(self, minlength=0):
             super(TestBincountWeights, self).__init__()
             self._minlength = minlength
 
-        def hybrid_forward(self, F, a, weights):
-            return F.np.bincount(a, weights, self._minlength)
+        def forward(self, a, weights):
+            return np.bincount(a, weights, self._minlength)
 
     dtypes = [np.int8, np.uint8, np.int32, np.int64]
     weight_types = [np.int32, np.int64, np.float16, np.float32, np.float64]
@@ -9870,8 +9874,8 @@ def __init__(self, dtype, order, subok):
             self._order = order
             self._subok = subok
 
-        def hybrid_forward(self, F, x, *args, **kwargs):
-            return F.np.empty_like(x, self._dtype, self._order, self._subok)
+        def forward(self, x, *args, **kwargs):
+            return np.empty_like(x, self._dtype, self._order, self._subok)
 
     if StrictVersion(platform.python_version()) < StrictVersion('3.0.0'):
         return
@@ -10006,8 +10010,8 @@ def __init__(self, axisa=-1, axisb=-1, axisc=-1, axis=None):
             self._axisc = axisc
             self._axis = axis
 
-        def hybrid_forward(self, F, a, b):
-            return F.np.cross(a, b, self._axisa, self._axisb, self._axisc, self._axis)
+        def forward(self, a, b):
+            return np.cross(a, b, self._axisa, self._axisb, self._axisc, self._axis)
 
     def check_np_cross(x, a_np, b_np, axises):
         try:
@@ -10182,8 +10186,8 @@ def __init__(self, axis=0, start=0):
             self._axis = axis
             self._start = start
 
-        def hybrid_forward(self, F, a, *args, **kwargs):
-            return F.np.rollaxis(a, axis=self._axis, start=self._start)
+        def forward(self, a, *args, **kwargs):
+            return np.rollaxis(a, axis=self._axis, start=self._start)
 
     dtypes = ['int32', 'int64', 'float16', 'float32', 'float64']
     for hybridize in [False, True]:
@@ -10216,8 +10220,8 @@ def hybrid_forward(self, F, a, *args, **kwargs):
 @use_np
 def test_npx_stop_gradient():
     class TestStopGradient(HybridBlock):
-        def hybrid_forward(self, F, a):
-            return F.npx.stop_gradient(a)
+        def forward(self, a):
+            return npx.stop_gradient(a)
     dtypes = ['float16', 'float32', 'float64']
     for hybridize in [False, True]:
         for dtype in dtypes:
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index e0139888eb4f..891511b86d8d 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -9487,10 +9487,11 @@ def seq_reverse():
     seq_reverse()
     seq_mask()
 
+@mx.util.use_np
 def test_take_grads():
     # Test for https://github.com/apache/incubator-mxnet/issues/19817
     from mxnet.gluon.nn import HybridBlock, Conv1D, HybridSequential, HybridLambda, Dense
-    from mxnet import autograd, nd
+    from mxnet import autograd, np as mx_np, npx as mx_npx
    from mxnet.gluon.loss import L2Loss
 
     def get_grads(model, grads, ctx=mx.cpu()):
@@ -9536,12 +9537,13 @@ def __init__(self, use_take=False, **kwargs):
             self.use_take = use_take
             self.den = dense_layer()
 
-        def hybrid_forward(self, F, X, axis=1):
+        def forward(self, X, axis=1):
             X1 = self.den(X)
+            print(X1.shape)
             if self.use_take:
-                X2 = F.take(X1, nd.array([0]), axis=axis)
+                X2 = mx_np.take(X1, mx_np.array([0]), axis=axis)
             else:
-                X2 = F.slice_axis(X1, begin=0, end=1, axis=axis)
+                X2 = mx_npx.slice(X1.T, begin=0, end=1).T
             return X2
 
     N = 30
@@ -9550,17 +9552,17 @@ def hybrid_forward(self, F, X, axis=1):
 
     X = np.random.normal(size=(N, T, C))
     Y = np.random.normal(size=(N, 1))
-    X, Y = nd.array(X), nd.array(Y)
+    X, Y = mx_np.array(X), mx_np.array(Y)
     seed = np.random.randint(1000)
 
-    # Using F.take
+    # Using mx_np.take
     mx.random.seed(seed)
     model = Model(use_take=True)
     model.initialize()
     loss = L2Loss()
     grads1 = run_model(model, loss, X, Y)
 
-    # Using F.slice_axis
+    # Using mx_npx.slice
     mx.random.seed(seed)
     model2 = Model(use_take=False)
     model2.initialize()
diff --git a/tests/python/unittest/test_sparse_ndarray.py b/tests/python/unittest/test_sparse_ndarray.py
index 728bdbdc429e..e427377e62f6 100644
--- a/tests/python/unittest/test_sparse_ndarray.py
+++ b/tests/python/unittest/test_sparse_ndarray.py
@@ -32,6 +32,7 @@
 import pytest
 
+mx.npx.reset_np()
 
 def sparse_nd_ones(shape, stype):
     return mx.nd.ones(shape).tostype(stype)
diff --git a/tests/python/unittest/test_subgraph.py b/tests/python/unittest/test_subgraph.py
index 947d18e2d845..c4fe6dd39afe 100644
--- a/tests/python/unittest/test_subgraph.py
+++ b/tests/python/unittest/test_subgraph.py
@@ -100,7 +100,7 @@ def make_subgraph4(stype):
     model.hybridize()
     model.initialize()
     s = (1, 3, 32, 32)
-    data = mx.nd.random.normal(shape=s)
+    data = mx.np.random.normal(size=s)
     out = model(data)
     model.export('resnet18')
     orig = mx.sym.load('resnet18-symbol.json')
diff --git a/tests/python/unittest/test_subgraph_op.py b/tests/python/unittest/test_subgraph_op.py
index 51f79914bbad..c17400672fd5 100644
--- a/tests/python/unittest/test_subgraph_op.py
+++ b/tests/python/unittest/test_subgraph_op.py
@@ -25,10 +25,12 @@
 from mxnet.test_utils import assert_almost_equal, environment
 from mxnet import gluon
 from mxnet.gluon import nn
-from mxnet import nd
+from mxnet import nd, npx
 import pytest
 import tempfile
 
+mx.npx.reset_np()
+
 def network_structure_1():
     data1 = mx.sym.var('data1', shape=(2, 3, 10, 10))
     data2 = mx.sym.var('data2')
@@ -456,6 +458,7 @@ def test_subgraph_backend_gluon(sym, subgraph_backend, op_names, tmp_path):
         assert_almost_equal((outputs1[i] - outputs2[i]).abs().sum().asnumpy(), np.zeros(shape=(1,)))
 
 # Test Gluon HybridBlocks for graph partitioning a network created by HybridSequential.
+@mx.util.use_np
 @pytest.mark.serial
 def test_subgraph_backend_gluon_ext1(tmpdir):
     def get_net():
@@ -466,7 +469,7 @@ def get_net():
         return net
 
     # regular inference
-    x = nd.random.normal(shape=(1, 512),ctx=mx.current_context())
+    x = mx.np.random.normal(size=(1, 512),ctx=mx.current_context())
     net = get_net()
     net.initialize(ctx=mx.current_context())
     outputs1 = net(x)
@@ -487,9 +490,10 @@ def get_net():
     # compare outputs
     assert len(outputs1) == len(outputs2)
     for i in range(len(outputs1)):
-        assert_almost_equal((outputs1[i] - outputs2[i]).abs().sum().asnumpy(), np.zeros(shape=(1,)))
+        assert_almost_equal(mx.np.abs((outputs1[i] - outputs2[i])).sum().asnumpy(), np.zeros(shape=(1,)))
 
 # Test Gluon HybridBlocks for graph partitioning a network created by HybridBlock.
+@mx.util.use_np
 @pytest.mark.serial
 def test_subgraph_backend_gluon_ext2(tmpdir):
     class Net(gluon.HybridBlock):
@@ -499,12 +503,12 @@ def __init__(self, **kwargs):
             self.fc2 = nn.Dense(128)
             self.fc3 = nn.Dense(2)
 
-        def hybrid_forward(self, F, x):
-            x = F.relu(self.fc1(x))
-            x = F.relu(self.fc2(x))
+        def forward(self, x):
+            x = npx.relu(self.fc1(x))
+            x = npx.relu(self.fc2(x))
             return self.fc3(x)
     # regular inference
-    x = nd.random.normal(shape=(1, 512),ctx=mx.current_context())
+    x = mx.np.random.normal(size=(1, 512),ctx=mx.current_context())
     net = Net()
     net.initialize(ctx=mx.current_context())
     outputs1 = net(x)
@@ -525,7 +529,7 @@ def hybrid_forward(self, F, x):
     # compare outputs
     assert len(outputs1) == len(outputs2)
     for i in range(len(outputs1)):
-        assert_almost_equal((outputs1[i] - outputs2[i]).abs().sum().asnumpy(), np.zeros(shape=(1,)))
+        assert_almost_equal(mx.np.abs(outputs1[i] - outputs2[i]).sum().asnumpy(), np.zeros(shape=(1,)))
 
 
 if __name__ == "__main__":
diff --git a/tests/python/unittest/test_thread_local.py b/tests/python/unittest/test_thread_local.py
index 05308a2d3dfd..8e4370ea6466 100644
--- a/tests/python/unittest/test_thread_local.py
+++ b/tests/python/unittest/test_thread_local.py
@@ -153,7 +153,7 @@ def test_createblock():
     def f():
         net = mx.gluon.nn.Dense(2)
         net.initialize()
-        x = net(mx.nd.array([1, 2, 3]))
+        x = net(mx.np.array([1, 2, 3]))
         x.wait_to_read()
         status[0] = True