From b56f8bd6be8769e7d1c37b2a1d284018346c3ac3 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Wed, 8 Sep 2021 18:57:01 -0700 Subject: [PATCH 01/41] standardize mxnet numpy creation functions --- include/mxnet/runtime/c_runtime_api.h | 1 + include/mxnet/runtime/packed_func.h | 4 +- python/mxnet/_ffi/_ctypes/function.py | 12 +- python/mxnet/_ffi/_ctypes/types.py | 4 +- python/mxnet/_ffi/_cython/base.pxi | 3 +- python/mxnet/_ffi/_cython/function.pxi | 10 +- python/mxnet/base.py | 3 + python/mxnet/ndarray/numpy/_op.py | 11 +- python/mxnet/numpy/multiarray.py | 259 +++++++++++++++--- python/mxnet/util.py | 23 ++ src/api/operator/numpy/np_init_op.cc | Bin 13978 -> 14268 bytes .../numpy_extension/npx_convolution_op.cc | 2 +- .../numpy_extension/npx_deconvolution_op.cc | 2 +- src/operator/tensor/init_op.h | 38 ++- 14 files changed, 312 insertions(+), 60 deletions(-) diff --git a/include/mxnet/runtime/c_runtime_api.h b/include/mxnet/runtime/c_runtime_api.h index 69de9ca27d12..6a2948225ecc 100644 --- a/include/mxnet/runtime/c_runtime_api.h +++ b/include/mxnet/runtime/c_runtime_api.h @@ -75,6 +75,7 @@ typedef union { double v_float64; void* v_handle; const char* v_str; + uint64_t v_uint64; DLDataType v_type; } MXNetValue; diff --git a/include/mxnet/runtime/packed_func.h b/include/mxnet/runtime/packed_func.h index f498a692c1dc..82e5638b493a 100644 --- a/include/mxnet/runtime/packed_func.h +++ b/include/mxnet/runtime/packed_func.h @@ -421,8 +421,8 @@ class MXNetPODValue_ { return value_.v_int64; } operator uint64_t() const { - MXNET_CHECK_TYPE_CODE(type_code_, kDLInt); - return value_.v_int64; + MXNET_CHECK_TYPE_CODE(type_code_, kDLUInt); + return value_.v_uint64; } operator int() const { MXNET_CHECK_TYPE_CODE(type_code_, kDLInt); diff --git a/python/mxnet/_ffi/_ctypes/function.py b/python/mxnet/_ffi/_ctypes/function.py index 1bfaf0719747..8b9a797a68c0 100644 --- a/python/mxnet/_ffi/_ctypes/function.py +++ b/python/mxnet/_ffi/_ctypes/function.py @@ -24,7 +24,7 @@ from numbers import Number, Integral import numpy as onp -from ...base import get_last_ffi_error, _LIB, check_call +from ...base import get_last_ffi_error, _LIB, check_call, _MAX_VALUE_64_BIT_SIGNED_, _MAX_VALUE_64_BIT_UNSIGNED_ from ..base import c_str from .types import MXNetValue, TypeCode from .types import RETURN_SWITCH @@ -63,8 +63,14 @@ def _make_mxnet_args(args, temp_args): values[i].v_handle = arg.handle type_codes[i] = TypeCode.NDARRAYHANDLE elif isinstance(arg, Integral): - values[i].v_int64 = arg - type_codes[i] = TypeCode.INT + if arg > _MAX_VALUE_64_BIT_UNSIGNED_: + raise OverflowError("Integer out of bounds") + elif arg > _MAX_VALUE_64_BIT_SIGNED_: + values[i].v_uint64 = arg + type_codes[i] = TypeCode.UINT + else: + values[i].v_int64 = arg + type_codes[i] = TypeCode.INT elif isinstance(arg, ObjectBase): values[i].v_handle = arg.handle type_codes[i] = TypeCode.OBJECT_HANDLE diff --git a/python/mxnet/_ffi/_ctypes/types.py b/python/mxnet/_ffi/_ctypes/types.py index 7058f5400882..dc24cae42431 100644 --- a/python/mxnet/_ffi/_ctypes/types.py +++ b/python/mxnet/_ffi/_ctypes/types.py @@ -46,10 +46,12 @@ class MXNetValue(ctypes.Union): _fields_ = [("v_int64", ctypes.c_int64), ("v_float64", ctypes.c_double), ("v_handle", ctypes.c_void_p), - ("v_str", ctypes.c_char_p)] + ("v_str", ctypes.c_char_p), + ("v_uint64", ctypes.c_uint64)] RETURN_SWITCH = { TypeCode.INT: lambda x: x.v_int64, + TypeCode.UINT: lambda x: x.v_uint64, TypeCode.FLOAT: lambda x: x.v_float64, TypeCode.NULL: lambda x: None, TypeCode.STR: lambda x: py_str(x.v_str), diff --git 
a/python/mxnet/_ffi/_cython/base.pxi b/python/mxnet/_ffi/_cython/base.pxi index 55b69c65f078..b785e99c9f5f 100644 --- a/python/mxnet/_ffi/_cython/base.pxi +++ b/python/mxnet/_ffi/_cython/base.pxi @@ -22,7 +22,7 @@ from cpython.version cimport PY_MAJOR_VERSION from cpython cimport pycapsule from libc.stdint cimport int32_t, int64_t, uint64_t, uint8_t, uint16_t, uint32_t import ctypes -from ...base import get_last_ffi_error +from ...base import get_last_ffi_error, _MAX_VALUE_64_BIT_SIGNED_, _MAX_VALUE_64_BIT_UNSIGNED_ cdef enum MXNetTypeCode: kInt = 0 @@ -45,6 +45,7 @@ cdef extern from "mxnet/runtime/c_runtime_api.h": double v_float64 void* v_handle const char* v_str + uint64_t v_uint64 ctypedef void* MXNetRetValueHandle ctypedef void* MXNetFunctionHandle diff --git a/python/mxnet/_ffi/_cython/function.pxi b/python/mxnet/_ffi/_cython/function.pxi index 6d5751e1ad6a..7ea202400137 100644 --- a/python/mxnet/_ffi/_cython/function.pxi +++ b/python/mxnet/_ffi/_cython/function.pxi @@ -35,8 +35,14 @@ cdef inline int make_arg(object arg, value[0].v_handle = (arg._get_handle()) tcode[0] = kNDArrayHandle elif isinstance(arg, Integral): - value[0].v_int64 = arg - tcode[0] = kInt + if arg > _MAX_VALUE_64_BIT_UNSIGNED_: + raise OverflowError("Integer out of bounds") + elif arg > _MAX_VALUE_64_BIT_SIGNED_: + value[0].v_uint64 = arg + tcode[0] = kUInt + else: + value[0].v_int64 = arg + tcode[0] = kInt elif isinstance(arg, ObjectBase): value[0].v_handle = (arg).chandle tcode[0] = kObjectHandle diff --git a/python/mxnet/base.py b/python/mxnet/base.py index 2e8d4b484318..12b1d2d543c3 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -49,6 +49,9 @@ string_types = basestring, error_types = {} +_MAX_VALUE_64_BIT_SIGNED_ = 9_223_372_036_854_775_807 +_MAX_VALUE_64_BIT_UNSIGNED_ = 18_446_744_073_709_551_615 + # this function is needed for python3 # to convert ctypes.char_p .value back to python str py_str = lambda x: x.decode('utf-8') diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index fa255f0ccac4..5cf1b166733a 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -20,7 +20,7 @@ """Namespace for numpy operators used in Gluon dispatched by F=ndarray.""" import numpy as _np -from ...base import numeric_types, integer_types +from ...base import numeric_types, integer_types, _MAX_VALUE_64_BIT_SIGNED_ from ...util import _sanity_check_params, set_module from ...util import wrap_np_unary_func, wrap_np_binary_func from ...util import is_np_default_dtype @@ -383,6 +383,15 @@ def full(shape, fill_value, dtype=None, order='C', ctx=None, out=None): # pylin if isinstance(fill_value, bool): fill_value = int(fill_value) dtype = _np.bool if dtype is None else dtype + elif isinstance(fill_value, integer_types): + # fill_value is uint64 + if fill_value > _MAX_VALUE_64_BIT_SIGNED_: + dtype = _np.uint64 if dtype is None else dtype + else: + dtype = _np.int64 if dtype is None else dtype + elif isinstance(fill_value, numeric_types): + if dtype is None or dtype is float: + dtype = _np.float64 if dtype is not None and not isinstance(dtype, str): dtype = _np.dtype(dtype).name return _api_internal.full(shape, dtype, fill_value, ctx, out) diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index 5cca1fa9225a..9bcb57c946a6 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -45,12 +45,12 @@ from ..runtime import Features from ..context import Context from ..util import set_module, 
wrap_np_unary_func, wrap_np_binary_func,\ - is_np_default_dtype + is_np_default_dtype, wrap_data_api_creation_func from ..context import current_context from ..ndarray import numpy as _mx_nd_np from ..ndarray.numpy import _internal as _npi from ..ndarray.ndarray import _storage_type -from ..dlpack import ndarray_from_numpy +from ..dlpack import ndarray_from_numpy, ndarray_from_dlpack from .utils import _get_np_op from .fallback import * # pylint: disable=wildcard-import,unused-wildcard-import from . import fallback @@ -80,7 +80,7 @@ 'quantile', 'percentile', 'shares_memory', 'may_share_memory', 'diff', 'ediff1d', 'resize', 'matmul', 'nan_to_num', 'isnan', 'isinf', 'isposinf', 'isneginf', 'isfinite', 'polyval', 'where', 'bincount', 'atleast_1d', 'atleast_2d', 'atleast_3d', 'fill_diagonal', 'squeeze', - 'diagflat', 'repeat', 'prod', 'pad', 'cumsum', 'sum', 'rollaxis', 'diag', 'diagonal'] + 'diagflat', 'repeat', 'prod', 'pad', 'cumsum', 'sum', 'rollaxis', 'diag', 'diagonal', 'asarray', 'from_dlpack'] __all__ += fallback.__all__ @@ -1585,6 +1585,45 @@ def ctx(self): self.handle, ctypes.byref(dev_typeid), ctypes.byref(dev_id))) return Context(Context.devtype2str[dev_typeid.value], dev_id.value) + + def to_device(self, device): + """Returns an array on the target device with the same value as this array. + + If the target device is the same as ``self.device``, then ``self`` is + returned. Otherwise, a copy is made. + + Parameters + ---------- + device : Context + The target device. + + Returns + ------- + ndarray + The target array. + """ + if self.device == device: + return self + return self.copyto(device) + + @property + def device(self): + """Hardware device the array data resides on. + + Examples + -------- + >>> x = np.array([1, 2, 3, 4]) + >>> x.device + cpu(0) + >>> type(x.device) + + >>> y = np.zeros((2, 3), npx.gpu(0)) + >>> y.device + gpu(0) + """ + return self.ctx + + @property def context(self): """This function has been deprecated. Please refer to ``ndarray.ctx``.""" @@ -2456,7 +2495,8 @@ def tostype(self, stype): @set_module('mxnet.numpy') -def empty(shape, dtype=float, order='C', ctx=None): # pylint: disable=redefined-outer-name +@wrap_data_api_creation_func +def empty(shape, *, dtype=None, order='C', device=None): # pylint: disable=redefined-outer-name """Return a new array of given shape and type, without initializing entries. Parameters @@ -2472,7 +2512,7 @@ def empty(shape, dtype=float, order='C', ctx=None): # pylint: disable=redefined order : {'C'}, optional, default: 'C' How to store multi-dimensional data in memory, currently only row-major (C-style) is supported. - ctx : device context, optional + device : device context, optional Device context on which the memory is allocated. Default is `mxnet.context.current_context()`. 
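[Editorial sketch, not part of the patch] The hunks above introduce the array-API style `device` keyword plus the `ndarray.device` property and `to_device` helper. A minimal usage illustration, assuming the signatures in this series are applied as shown; `mx.cpu(0)` is used purely as an example device:

    import mxnet as mx
    from mxnet import np

    # Allocate arrays on an explicit device using the new keyword.
    a = np.empty((2, 3), dtype='float32', device=mx.cpu(0))
    b = np.zeros((2, 3), device=mx.cpu(0))

    # `device` mirrors the old `ctx` property; `to_device` returns the
    # same array (no copy) when the target device already matches.
    print(b.device)               # cpu(0)
    c = b.to_device(mx.cpu(0))

    # The wrap_data_api_creation_func decorator keeps the legacy keyword
    # working: a `ctx=` argument is remapped to `device=`.
    d = np.ones((2, 2), ctx=mx.cpu(0))
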
@@ -2494,13 +2534,13 @@ def empty(shape, dtype=float, order='C', ctx=None): # pylint: disable=redefined if order != 'C': raise NotImplementedError('`empty` only supports order equal to `C`, while received {}' .format(str(order))) - if ctx is None: - ctx = current_context() + if device is None: + device = current_context() if dtype is None or dtype is float: dtype = _np.float64 if is_np_default_dtype() else _np.float32 if isinstance(shape, int): shape = (shape,) - return ndarray(handle=_new_alloc_handle(shape, ctx, False, dtype)) + return ndarray(handle=_new_alloc_handle(shape, device, False, dtype)) # pylint: disable=redefined-outer-name @@ -2617,7 +2657,8 @@ def shape(a): @set_module('mxnet.numpy') -def zeros(shape, dtype=None, order='C', ctx=None): # pylint: disable=redefined-outer-name +@wrap_data_api_creation_func +def zeros(shape, *, dtype=None, order='C', device=None): # pylint: disable=redefined-outer-name """Return a new array of given shape and type, filled with zeros. This function currently only supports storing multi-dimensional data in row-major (C-style). @@ -2636,8 +2677,9 @@ def zeros(shape, dtype=None, order='C', ctx=None): # pylint: disable=redefined- order : {'C'}, optional, default: 'C' How to store multi-dimensional data in memory, currently only row-major (C-style) is supported. - ctx : Context, optional - An optional device context (default is the current default context). + device : device context, optional + Device context on which the memory is allocated. Default is + `mxnet.context.current_context()`. Returns ------- @@ -2656,11 +2698,12 @@ def zeros(shape, dtype=None, order='C', ctx=None): # pylint: disable=redefined- array([[0.], [0.]]) """ - return _mx_nd_np.zeros(shape, dtype, order, ctx) + return _mx_nd_np.zeros(shape, dtype, order, device) @set_module('mxnet.numpy') -def ones(shape, dtype=None, order='C', ctx=None): # pylint: disable=redefined-outer-name +@wrap_data_api_creation_func +def ones(shape, *, dtype=None, order='C', device=None): # pylint: disable=redefined-outer-name """Return a new array of given shape and type, filled with ones. This function currently only supports storing multi-dimensional data in row-major (C-style). @@ -2678,13 +2721,14 @@ def ones(shape, dtype=None, order='C', ctx=None): # pylint: disable=redefined-o order : {'C'}, optional, default: 'C' How to store multi-dimensional data in memory, currently only row-major (C-style) is supported. - ctx : Context, optional - An optional device context (default is the current default context). + device : device context, optional + Device context on which the memory is allocated. Default is + `mxnet.context.current_context()`. Returns ------- out : ndarray - Array of ones with the given shape, dtype, and ctx. + Array of ones with the given shape, dtype, and device. Examples -------- @@ -2703,7 +2747,7 @@ def ones(shape, dtype=None, order='C', ctx=None): # pylint: disable=redefined-o array([[1., 1.], [1., 1.]]) """ - return _mx_nd_np.ones(shape, dtype, order, ctx) + return _mx_nd_np.ones(shape, dtype, order, device) @set_module('mxnet.numpy') @@ -2736,7 +2780,8 @@ def broadcast_to(array, shape): # pylint: disable=redefined-outer-name # pylint: disable=too-many-arguments, redefined-outer-name @set_module('mxnet.numpy') -def full(shape, fill_value, dtype=None, order='C', ctx=None, out=None): +@wrap_data_api_creation_func +def full(shape, fill_value, *, dtype=None, order='C', device=None, out=None): r"""Return a new array of given shape and type, filled with `fill_value`. 
Parameters @@ -2751,8 +2796,9 @@ def full(shape, fill_value, dtype=None, order='C', ctx=None, out=None): order : {'C'}, optional Whether to store multidimensional data in C- or Fortran-contiguous (row- or column-wise) order in memory. Currently only supports C order. - ctx : mxnet.context.Context - The device, e.g. the i-th GPU. + device : device context, optional + Device context on which the memory is allocated. Default is + `mxnet.context.current_context()`. out : ndarray or None, optional A location into which the result is stored. If provided, it must have the same shape and dtype as input ndarray. @@ -2783,17 +2829,18 @@ def full(shape, fill_value, dtype=None, order='C', ctx=None, out=None): >>> np.full((2, 2), 10) array([[10., 10.], [10., 10.]]) - >>> np.full((2, 2), 2, dtype=np.int32, ctx=mx.cpu(0)) + >>> np.full((2, 2), 2, dtype=np.int32, device=mx.cpu(0)) array([[2, 2], [2, 2]], dtype=int32) """ - return _mx_nd_np.full(shape, fill_value, order=order, ctx=ctx, dtype=dtype, out=out) + return _mx_nd_np.full(shape, fill_value, order=order, ctx=device, dtype=dtype, out=out) # pylint: enable=too-many-arguments, redefined-outer-name # pylint: disable=redefined-outer-name @set_module('mxnet.numpy') -def empty_like(prototype, dtype=None, order='C', subok=False, shape=None): # pylint: disable=W0621 +@wrap_data_api_creation_func +def empty_like(prototype, /, *, dtype=None, device=None, order='C', subok=False, shape=None): # pylint: disable=W0621 """ Return a new array with the same shape and type as a given array. @@ -2804,6 +2851,9 @@ def empty_like(prototype, dtype=None, order='C', subok=False, shape=None): # pyl of the returned array. dtype : data-type, optional Overrides the data type of the result. + device : device context, optional + Device context on which the memory is allocated. Default is + `mxnet.context.current_context()`. order : {'C'}, optional Whether to store multidimensional data in C- or Fortran-contiguous (row- or column-wise) order in memory. Currently only supports C order. @@ -2848,7 +2898,10 @@ def empty_like(prototype, dtype=None, order='C', subok=False, shape=None): # pyl array([[4.9e-324, 9.9e-324, 1.5e-323], # uninitialized [2.0e-323, 2.5e-323, 3.0e-323]]) """ - return _mx_nd_np.empty_like(prototype, dtype=dtype, order=order, subok=subok, shape=shape) + ret = _mx_nd_np.empty_like(prototype, dtype=dtype, order=order, subok=subok, shape=shape) + if device is not None: + ret.to_device(device) + return ret # pylint: enable=redefined-outer-name @@ -5492,7 +5545,8 @@ def histogram(a, bins=10, range=None, normed=None, weights=None, density=None): # pylint: disable=redefined-outer-name @set_module('mxnet.numpy') -def eye(N, M=None, k=0, dtype=float, **kwargs): +@wrap_data_api_creation_func +def eye(N, M=None, /, *, k=0, dtype=None, device=None, **kwargs): """ Return a 2-D array with ones on the diagonal and zeros elsewhere. @@ -5510,6 +5564,9 @@ def eye(N, M=None, k=0, dtype=float, **kwargs): Data-type of the returned array. When npx.is_np_default_dtype() returns False, default dtype is float32; When npx.is_np_default_dtype() returns True, default dtype is float64. + device : device context, optional + Device context on which the memory is allocated. Default is + `mxnet.context.current_context()`. 
Returns ------- @@ -5527,13 +5584,14 @@ def eye(N, M=None, k=0, dtype=float, **kwargs): [0., 0., 1.], [0., 0., 0.]]) """ - return _mx_nd_np.eye(N, M, k, dtype, **kwargs) + return _mx_nd_np.eye(N, M, k, dtype, ctx=device, **kwargs) # pylint: enable=redefined-outer-name # pylint: disable=redefined-outer-name @set_module('mxnet.numpy') -def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis=0, ctx=None): # pylint: disable=too-many-arguments +@wrap_data_api_creation_func +def linspace(start, stop, /, num=50, *, endpoint=True, retstep=False, dtype=None, axis=0, device=None): # pylint: disable=too-many-arguments r""" Return evenly spaced numbers over a specified interval. @@ -5563,6 +5621,9 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis The axis in the result to store the samples. Relevant only if start or stop are array-like. By default (0), the samples will be along a new axis inserted at the beginning. Use -1 to get an axis at the end. + device : device context, optional + Device context on which the memory is allocated. Default is + `mxnet.context.current_context()`. Returns ------- @@ -5612,10 +5673,10 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis * `start` and `stop` do not support list, numpy ndarray and mxnet ndarray * axis could only be 0 - * There could be an additional `ctx` argument to specify the device, e.g. the i-th + * There could be an additional `device` argument to specify the device, e.g. the i-th GPU. """ - return _mx_nd_np.linspace(start, stop, num, endpoint, retstep, dtype, axis, ctx) + return _mx_nd_np.linspace(start, stop, num, endpoint, retstep, dtype, axis, device) # pylint: enable=redefined-outer-name @@ -6213,7 +6274,8 @@ def triu(m, k=0): @set_module('mxnet.numpy') -def arange(start, stop=None, step=1, dtype=None, ctx=None): +@wrap_data_api_creation_func +def arange(start, /, stop=None, step=1, *, dtype=None, device=None): """Return evenly spaced values within a given interval. Values are generated within the half-open interval ``[start, stop)`` @@ -6240,6 +6302,9 @@ def arange(start, stop=None, step=1, dtype=None, ctx=None): Default dtype can be set to be consistent with offical numpy by `npx.set_np(dtype=True)`. * When npx.is_np_default_dtype() returns False, default dtype is float32; * When npx.is_np_default_dtype() returns True, default dtype is int64. + device : device context, optional + Device context on which the memory is allocated. Default is + `mxnet.context.current_context()`. Returns ------- @@ -6271,7 +6336,7 @@ def arange(start, stop=None, step=1, dtype=None, ctx=None): >>> np.arange(3).dtype dtype('int64') """ - return _mx_nd_np.arange(start, stop, step, dtype, ctx) + return _mx_nd_np.arange(start, stop, step, dtype, device) # pylint: enable=redefined-outer-name @@ -10592,7 +10657,8 @@ def interp(x, xp, fp, left=None, right=None, period=None): # pylint: disable=to # pylint: disable=redefined-outer-name @set_module('mxnet.numpy') -def full_like(a, fill_value, dtype=None, order='C', ctx=None, out=None): # pylint: disable=too-many-arguments +@wrap_data_api_creation_func +def full_like(a, /, fill_value, *, dtype=None, order='C', device=None, out=None): # pylint: disable=too-many-arguments """ Return a full array with the same shape and type as a given array. 
@@ -10609,7 +10675,9 @@ def full_like(a, fill_value, dtype=None, order='C', ctx=None, out=None): # pylin order : {'C'}, optional Whether to store multidimensional data in C- or Fortran-contiguous (row- or column-wise) order in memory. Currently only supports C order. - ctx: to specify the device, e.g. the i-th GPU. + device : device context, optional + Device context on which the memory is allocated. Default is + `mxnet.context.current_context()`. out : ndarray or None, optional A location into which the result is stored. If provided, it must have the same shape and dtype as input ndarray. @@ -10642,13 +10710,14 @@ def full_like(a, fill_value, dtype=None, order='C', ctx=None, out=None): # pylin >>> np.full_like(y, 0.1) array([0.1, 0.1, 0.1, 0.1, 0.1, 0.1]) """ - return _mx_nd_np.full_like(a, fill_value=fill_value, dtype=dtype, order=order, ctx=ctx, out=out) + return _mx_nd_np.full_like(a, fill_value=fill_value, dtype=dtype, order=order, ctx=device, out=out) # pylint: enable=redefined-outer-name # pylint: disable=redefined-outer-name @set_module('mxnet.numpy') -def zeros_like(a, dtype=None, order='C', ctx=None, out=None): +@wrap_data_api_creation_func +def zeros_like(a, /, *, dtype=None, order='C', device=None, out=None): """ Return an array of zeros with the same shape and type as a given array. @@ -10663,7 +10732,9 @@ def zeros_like(a, dtype=None, order='C', ctx=None, out=None): order : {'C'}, optional Whether to store multidimensional data in C- or Fortran-contiguous (row- or column-wise) order in memory. Currently only supports C order. - ctx: to specify the device, e.g. the i-th GPU. + device : device context, optional + Device context on which the memory is allocated. Default is + `mxnet.context.current_context()`. out : ndarray or None, optional A location into which the result is stored. If provided, it must have the same shape and dtype as input ndarray. @@ -10700,13 +10771,14 @@ def zeros_like(a, dtype=None, order='C', ctx=None, out=None): >>> np.zeros_like(y) array([0., 0., 0.], dtype=float64) """ - return _mx_nd_np.full_like(a, fill_value=0, dtype=dtype, order=order, ctx=ctx, out=ctx) + return _mx_nd_np.full_like(a, fill_value=0, dtype=dtype, order=order, ctx=device, out=out) # pylint: enable=redefined-outer-name # pylint: disable=redefined-outer-name @set_module('mxnet.numpy') -def ones_like(a, dtype=None, order='C', ctx=None, out=None): +@wrap_data_api_creation_func +def ones_like(a, /, *, dtype=None, order='C', device=None, out=None): """ Return an array of ones with the same shape and type as a given array. @@ -10721,7 +10793,9 @@ def ones_like(a, dtype=None, order='C', ctx=None, out=None): order : {'C'}, optional Whether to store multidimensional data in C- or Fortran-contiguous (row- or column-wise) order in memory. Currently only supports C order. - ctx: to specify the device, e.g. the i-th GPU. + device : device context, optional + Device context on which the memory is allocated. Default is + `mxnet.context.current_context()`. out : ndarray or None, optional A location into which the result is stored. If provided, it must have the same shape and dtype as input ndarray. 
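[Editorial sketch, not part of the patch] The `*_like` creators follow the same convention after this change. A brief illustration, assuming the patched keyword-only signatures:

    import mxnet as mx
    from mxnet import np

    x = np.arange(6, dtype='float64').reshape(2, 3)

    # dtype defaults to the prototype's dtype; device is keyword-only.
    z = np.zeros_like(x, device=mx.cpu(0))   # float64 zeros on cpu(0)
    o = np.ones_like(x, dtype='int32')       # int32 ones, same shape
    f = np.full_like(x, fill_value=0.1)      # float64, filled with 0.1
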
@@ -10758,7 +10832,7 @@ def ones_like(a, dtype=None, order='C', ctx=None, out=None): >>> np.ones_like(y) array([1., 1., 1.], dtype=float64) """ - return _mx_nd_np.full_like(a, fill_value=1, dtype=dtype, order=order, ctx=ctx, out=out) + return _mx_nd_np.full_like(a, fill_value=1, dtype=dtype, order=order, ctx=device, out=out) # pylint: enable=redefined-outer-name @@ -12234,3 +12308,108 @@ def sum(a, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=N """ return _mx_nd_np.sum(a, axis=axis, dtype=dtype, out=out, keepdims=keepdims, initial=initial, where=where) # pylint: enable=redefined-outer-name, too-many-arguments + + +@set_module('mxnet.numpy') +@wrap_data_api_creation_func +def asarray(obj, /, *, dtype=None, device=None, copy=None): + """ + Convert the input to an array. + + Parameters + ---------- + obj : Union[ , bool, int, float, NestedSequence[ bool | int | float ], SupportsDLPack, SupportsBufferProtocol ] + Object to be converted to an array. Can be a Python scalar, + a (possibly nested) sequence of Python scalars, + or an object supporting DLPack or the Python buffer protocol. + dtype : dtype, Optional + output array data type. Default: None . + device : device context, optional + Device context on which the memory is allocated. Default is + `mxnet.context.current_context()`. + copy : bool, Optional + Whether or not to make a copy of the input. + If True, always copies. + If False, never copies for input which supports DLPack or the buffer protocol, + and raises ValueError in case that would be necessary. + If None, reuses existing memory buffer if possible, copies otherwise. Default: None . + + Returns + ------- + out : ndarray + An array containing the data from obj. + + Examples + -------- + >>> a = np.arange(4).reshape(2,2) + >>> a + array([[0, 1], + [2, 3]]) + >>> np.diagonal(a) + array([0, 3]) + >>> np.diagonal(a, 1) + array([1]) + + >>> a = np.arange(8).reshape(2,2,2) + >>>a + array([[[0, 1], + [2, 3]], + [[4, 5], + [6, 7]]]) + >>> np.diagonal(a, 0, 0, 1) + array([[0, 6], + [1, 7]]) + """ + if isinstance(obj, (integer_types, numeric_types)): + obj = _np.asarray(obj, dtype=dtype) + if isinstance(obj, _np.ndarray): + dtype = obj.dtype if dtype is None else dtype + if isinstance(obj, ndarray): + dtype = obj.dtype if dtype is None else dtype + else: + if dtype is None: + default_dtype = _np.float64 if is_np_default_dtype() else _np.float32 + dtype = obj.dtype if hasattr(obj, "dtype") else default_dtype + array = _as_mx_np_array(obj, ctx=device, zero_copy=copy) + return array.astype(dtype) + +@set_module('mxnet.numpy') +@wrap_data_api_creation_func +def from_dlpack(x, /): + """ + Returns a np.ndarray backed by a dlpack tensor. 
+ + Parameters + ---------- + dlpack: an object with __dlpack__ method or PyCapsule (the pointer of DLManagedTensor) + input data + + Returns + ------- + np.ndarray + an ndarray backed by a dlpack tensor + + Examples + -------- + >>> x = mx.np.ones((2,3)) + >>> y = mx.npx.to_dlpack_for_read(x) + >>> type(y) + + >>> z = mx.npx.from_dlpack(y) + >>> type(z) + + >>> z + array([[1., 1., 1.], + [1., 1., 1.]]) + + >>> w = mx.npx.to_dlpack_for_write(x) + >>> type(w) + + >>> u = mx.npx.from_dlpack(w) + >>> u += 1 + >>> x + array([[2., 2., 2.], + [2., 2., 2.]]) + """ + from_dlpack = ndarray_from_dlpack(ndarray) + return from_dlpack(x) diff --git a/python/mxnet/util.py b/python/mxnet/util.py index ea75030614be..ba9723be6fb8 100644 --- a/python/mxnet/util.py +++ b/python/mxnet/util.py @@ -646,6 +646,29 @@ def _wrap_np_binary_func(x1, x2, out=None, **kwargs): return _wrap_np_binary_func +def wrap_data_api_creation_func(func): + """A convenience decorator for wrapping data apis standardized creation functions to provide + context keyward backward compatibility + + Parameters + ---------- + func : a numpy-compatible array creation function to be wrapped for context keyward change. + + Returns + ------- + Function + A function wrapped with context keyward changes. + """ + @functools.wraps(func) + def _wrap_api_creation_func(*args, **kwargs): + if len(kwargs) != 0: + device = kwargs.pop('ctx', None) + if device is not None: + kwargs['device'] = device + return func(*args, **kwargs) + return _wrap_api_creation_func + + # pylint: disable=exec-used def numpy_fallback(func): """decorator for falling back to offical numpy for a specific function""" diff --git a/src/api/operator/numpy/np_init_op.cc b/src/api/operator/numpy/np_init_op.cc index 0c617725fa47d43eb6b56923a29fbe9e2deed07e..0821ea83fc7adef44ccf9c6e7c2e7e950011622a 100644 GIT binary patch literal 14268 zcmeHNZExE)5bo#v3N9ZCIn7k(W%;1I6ds&)23jYuy%#J91lpnN;P7t&ihcK>Ez0zEhJvmtd#Ig=;8jRV=Zl*P%?f(bG)b6_l2 znqHSK2O$a$Auixe*PV!qRd240*K&rmhz_AWmxqa1u^0=-vgT|eg6N{(?L-k>99+R7 zP}YX)2D&NzI|MDB7m7$~odyvxP_+www$I_;#d6u_rv=V25Jt><9T>?vlzSGh#7|=FBLlUmMDTOevUE zoy-qC*y#3{P9BYXtq@+R8H0EL;DEq&x+9|vt0}u_j@iuRMr(C|amQ$Zk|+4v0~}tD z?7{W>-f(n1usgTC6fvek%rPKl&y1O{$APmj?(YA045LT52{eW^hD*eyF}w!DCp(8> z^hje^=V2&CbROQEha<{oY-e(C=FL{@4$$k{a{ZhZ$g!XimMvI)-n9$GB9wDuFRP{9 zDUs#r-O;@%79q1IP^ud(a(PJ}`%tT@3RJ0-^|Rx96U)Lj({}*IYhjku4(z15*mV;9$PClO}6( zaQQI3=gXqnokR14eNF58D>sJT#&+10CZ4dRAcW{0n5e$|_<=gXUzYXQ*)dhmWAM2{WMF3N5VI zUUI{uk}nd&ihK_C%C4*@GKHxRFAvjCajQD zzKfyzs2)0WKS1YPIdyoTvNInI@q=-6xO$|K7a(blT&>ulFE8qlKr8k>H{(Cy2fP8B z78>W=%|`9dR{>k`B;_-^{d{%MRuykTV+a-7v63%aAXNR`wXj7>7^e+>dwj8We+q{P zy8^Y;mxfTo)1l#ndNk6YgVVb17wF@+gKiy`upFo%v^hRQ>>s424oW|WvzV=Gz*qFhJ zZDB2g)l*dkVkOd2&sz{UyHxkESkTozsG%mo6E*qc7jpikI?N>R@=q3d4P(d#a7*B7 z{+tDv!%49E>;x2LK958*cVtqm*A_Lvn9r8hqj;W`W+ wUJ{F4;oqe{4LP{Nydj?KnE$l`$#=J|KFpdQ5x#}$BZwrswNpjfx7hgjJH~;_u delta 1635 zcmah}O>Y}T7*;B16RUzqG^J50^u|r|VYxd_0+mWA$TUe=HL|PNZ50)2O?JojD7!Pu z%xoP~Aq#NrfSP(ZSwHU-3@Yfb0pTbeAXCu<5uownu36urT(`+9KIOU#2;k zG2XcxWw?u#9O*T$0?QcSdKUB3;>rfN?3ngJNCT;MU!@ORmWdid$=J zT|v~NoI4_7TBBZ{r(B`R`+cAZ!Wsm0AJIm%=cbqqD>DJY=qVIkbjq%T9vNYLQbP|o z*+v+V{CEHgM} zTx~L5JW{=6G!Zsf5@B>y) z?4NTp1)OZudPt*`S#Np4E_vtd12}JCe$qa;xL_++7j5XxT6uNDzF6avqip*#2KLt< z&($`oO3{7^$D5=^-~SA*!y~FX`hLZJzkYGzOZ(=`%*3iaerMTM&KK>?cYCKri0{jc z&l#RSACt=~*={|#O%;tHalab{078b(UWkJfnS&rV-d;a9 fdGyuv3pkx&^X${rS^H$|to{1J%F!QdzrFT1@4_tn diff --git a/src/api/operator/numpy_extension/npx_convolution_op.cc b/src/api/operator/numpy_extension/npx_convolution_op.cc index 
adb1ec379283..03d6e48ddb73 100644 --- a/src/api/operator/numpy_extension/npx_convolution_op.cc +++ b/src/api/operator/numpy_extension/npx_convolution_op.cc @@ -150,7 +150,7 @@ MXNET_REGISTER_API("_npx.convolution") // num_group param.num_group = (uint32_t) (args[num_inputs + 5].operator int()); // workspace - param.workspace = args[num_inputs + 6].operator uint64_t(); + param.workspace = args[num_inputs + 6].operator int64_t(); // cudnn_tune if (args[num_inputs + 8].type_code() == kNull) { param.cudnn_tune = dmlc::nullopt; diff --git a/src/api/operator/numpy_extension/npx_deconvolution_op.cc b/src/api/operator/numpy_extension/npx_deconvolution_op.cc index 838f4408bfa1..b9873615dc7a 100644 --- a/src/api/operator/numpy_extension/npx_deconvolution_op.cc +++ b/src/api/operator/numpy_extension/npx_deconvolution_op.cc @@ -172,7 +172,7 @@ MXNET_REGISTER_API("_npx.deconvolution") // num_group param.num_group = (uint32_t) (args[num_inputs + 7].operator int()); // workspace - param.workspace = args[num_inputs + 8].operator uint64_t(); + param.workspace = args[num_inputs + 8].operator int64_t(); // cudnn_tune if (args[num_inputs + 10].type_code() == kNull) { param.cudnn_tune = dmlc::nullopt; diff --git a/src/operator/tensor/init_op.h b/src/operator/tensor/init_op.h index c065c0877346..6ec55f51d08c 100644 --- a/src/operator/tensor/init_op.h +++ b/src/operator/tensor/init_op.h @@ -291,7 +291,10 @@ struct InitOpWithScalarParam : dmlc::Parameter { mxnet::TShape shape; std::string ctx; int dtype; - double value; + double double_value; + int64_t int_value; + uint64_t uint_value; + int value_type; DMLC_DECLARE_PARAMETER(InitOpWithScalarParam) { DMLC_DECLARE_FIELD(shape) .set_default(mxnet::TShape()) @@ -303,20 +306,33 @@ struct InitOpWithScalarParam : dmlc::Parameter { DMLC_DECLARE_FIELD(dtype) .set_default(-1) .add_enum("None", -1) - MXNET_ADD_ALL_TYPES_WITH_BOOL + MXNET_ADD_ALL_TYPES_EXT_WITH_BOOL .describe("Target data type."); - DMLC_DECLARE_FIELD(value) - .describe("Value with which to fill newly created tensor"); + DMLC_DECLARE_FIELD(double_value) + .describe("Float value with which to fill newly created tensor"); + DMLC_DECLARE_FIELD(int_value) + .describe("Integer value with which to fill newly created tensor"); + DMLC_DECLARE_FIELD(uint_value) + .describe("Unsigned integer value with which to fill newly created tensor"); + DMLC_DECLARE_FIELD(value_type) + .describe("Choose the value type"); } void SetAttrDict(std::unordered_map* dict) { - std::ostringstream shape_s, dtype_s, value_s; + std::ostringstream shape_s, dtype_s, double_value_s, int_value_s, value_type_s, + uint_value_s; shape_s << shape; dtype_s << dtype; - value_s << value; + double_value_s << double_value; + int_value_s << int_value; + uint_value_s << uint_value; + value_type_s << value_type; (*dict)["shape"] = shape_s.str(); (*dict)["dtype"] = MXNetTypeWithBool2String(dtype); - (*dict)["value"] = value_s.str(); + (*dict)["int_value"] = int_value_s.str(); + (*dict)["uint_value"] = uint_value_s.str(); + (*dict)["double_value"] = double_value_s.str(); + (*dict)["value_type"] = value_type_s.str(); // We do not set ctx, because ctx has been set in dict instead of InitOpParam. // Setting ctx here results in an error. 
} @@ -547,7 +563,13 @@ void InitFillWithScalarCompute(const nnvm::NodeAttrs &attrs, CHECK_EQ(inputs.size(), 0); CHECK_EQ(outputs.size(), 1U); const auto& param = nnvm::get(attrs.parsed); - Fill(ctx.get_stream(), outputs[0], req[0], param.value); + if (param.value_type == 0) { + Fill(ctx.get_stream(), outputs[0], req[0], param.int_value); + } else if (param.value_type == 1) { + Fill(ctx.get_stream(), outputs[0], req[0], param.uint_value); + } else { + Fill(ctx.get_stream(), outputs[0], req[0], param.double_value); + } } struct PopulateFullIdxRspKernel : public mxnet_op::tunable { From 89f9f6b17564231b33946df962433e71d4355d1d Mon Sep 17 00:00:00 2001 From: barry-jin Date: Thu, 9 Sep 2021 15:46:35 -0700 Subject: [PATCH 02/41] update --- python/mxnet/ndarray/numpy/_op.py | 7 +- python/mxnet/numpy/multiarray.py | 4 +- src/api/operator/numpy/np_init_op.cc | Bin 14268 -> 14278 bytes .../numpy/np_elemwise_broadcast_logic_op.cc | 4 +- src/operator/numpy/np_init_op.cc | 11 +-- src/operator/numpy/np_init_op.cu | 2 +- src/operator/numpy/np_init_op.h | 71 ++++++++++++++++++ src/operator/tensor/init_op.h | 44 ++++------- 8 files changed, 100 insertions(+), 43 deletions(-) diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index 5cf1b166733a..6433d0957af1 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -1877,7 +1877,8 @@ def eye(N, M=None, k=0, dtype=float, **kwargs): dtype = _np.float64 if is_np_default_dtype() else _np.float32 if dtype is not None and not isinstance(dtype, str): dtype = _np.dtype(dtype).name - return _api_internal.eye(N, M, k, ctx, dtype) + k = minimum(k, N) if M is None else minimum(k, M) + return _api_internal.eye(N, M, int(k), ctx, dtype) @set_module('mxnet.ndarray.numpy') @@ -1889,9 +1890,9 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis Parameters ---------- - start : real number + start : int or float The starting value of the sequence. - stop : real number + stop : int or float The end value of the sequence, unless endpoint is set to False. In that case, the sequence consists of all but the last of num + 1 evenly spaced samples, so that stop is excluded. Note that the step diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index 9bcb57c946a6..d07c7685fe57 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -5600,9 +5600,9 @@ def linspace(start, stop, /, num=50, *, endpoint=True, retstep=False, dtype=None Parameters ---------- - start : real number + start : int or float The starting value of the sequence. - stop : real number + stop : int or float The end value of the sequence, unless endpoint is set to False. In that case, the sequence consists of all but the last of num + 1 evenly spaced samples, so that stop is excluded. 
Note that the step diff --git a/src/api/operator/numpy/np_init_op.cc b/src/api/operator/numpy/np_init_op.cc index 0821ea83fc7adef44ccf9c6e7c2e7e950011622a..d295cb7250687a212c46ea727935856115cdacae 100644 GIT binary patch delta 1631 zcmah}-EJF27*z_PiG?5%O`%E!U)(f*a(A2rDwR->X_B&PWLL4ds5CqV7f89~f-5dj{Q{^@P=v$-@XdPN7+R^4e|BcRpL5Q4W`C8x%RX5x zz$)zWfC1FcA@Sx#?gK*U**f2wGT+h9P&o zX#`y0Mw}sWofT_S6>`zW-bqjKcE^D13Z`_2C)lvzxYTz>X!Ni!_RZQN*?v!^Ihiru zxf~_9i;LI z7?J#V018KiOD|UH;Ma@rX{GMhOR(=ZKC9hnzt-r@oB6YZ3?8MNgCri3$0 z!c@$4-;Gxz%n&^ibCWqsyT;G4^Kh3G3S@QO9eP;u4*AJJv5 zAKsaKl$G1Qn`$c+fwQwlcl1H4fJg*G@zxNL#z~L-zbYwQ;v$IBkikb`9J%=6b?`bL zj~&Iwf*C>+-o$I+ZdC6@=PDJ3$BdwFj#$`lIyC=@D|#MD4Ue?fDV|n$yTi{jum)cl zhIr*{l z(}aCIKVyIP-my~X>~9}j zJ9A~t&a5@$l_4BSs;`L{dfjPF|#ddCXdZKLqp1oxUi+8Pjd+N+u=(M-j&rO|t ZGo#;|tUBYK{6m6Q?34M)ll=|$>c3y%D)0aR literal 14268 zcmeHNZExE)5bo#v3N9ZCIn7k(W%;1I6ds&)23jYuy%#J91lpnN;P7t&ihcK>Ez0zEhJvmtd#Ig=;8jRV=Zl*P%?f(bG)b6_l2 znqHSK2O$a$Auixe*PV!qRd240*K&rmhz_AWmxqa1u^0=-vgT|eg6N{(?L-k>99+R7 zP}YX)2D&NzI|MDB7m7$~odyvxP_+www$I_;#d6u_rv=V25Jt><9T>?vlzSGh#7|=FBLlUmMDTOevUE zoy-qC*y#3{P9BYXtq@+R8H0EL;DEq&x+9|vt0}u_j@iuRMr(C|amQ$Zk|+4v0~}tD z?7{W>-f(n1usgTC6fvek%rPKl&y1O{$APmj?(YA045LT52{eW^hD*eyF}w!DCp(8> z^hje^=V2&CbROQEha<{oY-e(C=FL{@4$$k{a{ZhZ$g!XimMvI)-n9$GB9wDuFRP{9 zDUs#r-O;@%79q1IP^ud(a(PJ}`%tT@3RJ0-^|Rx96U)Lj({}*IYhjku4(z15*mV;9$PClO}6( zaQQI3=gXqnokR14eNF58D>sJT#&+10CZ4dRAcW{0n5e$|_<=gXUzYXQ*)dhmWAM2{WMF3N5VI zUUI{uk}nd&ihK_C%C4*@GKHxRFAvjCajQD zzKfyzs2)0WKS1YPIdyoTvNInI@q=-6xO$|K7a(blT&>ulFE8qlKr8k>H{(Cy2fP8B z78>W=%|`9dR{>k`B;_-^{d{%MRuykTV+a-7v63%aAXNR`wXj7>7^e+>dwj8We+q{P zy8^Y;mxfTo)1l#ndNk6YgVVb17wF@+gKiy`upFo%v^hRQ>>s424oW|WvzV=Gz*qFhJ zZDB2g)l*dkVkOd2&sz{UyHxkESkTozsG%mo6E*qc7jpikI?N>R@=q3d4P(d#a7*B7 z{+tDv!%49E>;x2LK958*cVtqm*A_Lvn9r8hqj;W`W+ wUJ{F4;oqe{4LP{Nydj?KnE$l`$#=J|KFpdQ5x#}$BZwrswNpjfx7hgjJH~;_u diff --git a/src/operator/numpy/np_elemwise_broadcast_logic_op.cc b/src/operator/numpy/np_elemwise_broadcast_logic_op.cc index 58ce40bb746d..d364dd64eee8 100644 --- a/src/operator/numpy/np_elemwise_broadcast_logic_op.cc +++ b/src/operator/numpy/np_elemwise_broadcast_logic_op.cc @@ -148,7 +148,9 @@ struct GetBinaryBroadcastCompute { const TBlob& a = inputs[0]; const TBlob& b = inputs[1]; - if (a.type_flag_ != b.type_flag_) { + if (a.type_flag_ != b.type_flag_ || + a.type_flag_ == mshadow::kBool || + outputs[0].shape_.ndim() > 5) { if (outputs[0].shape_.Size() == 0U) return; mxnet::TShape new_lshape, new_rshape, new_oshape; const TBlob& lhs = inputs[0]; diff --git a/src/operator/numpy/np_init_op.cc b/src/operator/numpy/np_init_op.cc index 6e7aca221e9e..3917c7ed72fc 100644 --- a/src/operator/numpy/np_init_op.cc +++ b/src/operator/numpy/np_init_op.cc @@ -36,6 +36,7 @@ DMLC_REGISTER_PARAMETER(IndicesOpParam); DMLC_REGISTER_PARAMETER(LogspaceParam); DMLC_REGISTER_PARAMETER(FullLikeOpParam); DMLC_REGISTER_PARAMETER(AtleastNDParam); +DMLC_REGISTER_PARAMETER(NumpyInitOpWithScalarParam); inline bool NumpyIndicesShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector* in_shapes, @@ -206,11 +207,11 @@ NNVM_REGISTER_OP(_npi_full) .describe("fill target with a scalar value") .set_num_inputs(0) .set_num_outputs(1) - .set_attr_parser(ParamParser) - .set_attr("FInferShape", InitShape) - .set_attr("FInferType", InitNumpyType) - .set_attr("FCompute", InitFillWithScalarCompute) -.add_arguments(InitOpWithScalarParam::__FIELDS__()); + .set_attr_parser(ParamParser) + .set_attr("FInferShape", InitShape) + .set_attr("FInferType", InitNumpyType) + .set_attr("FCompute", NumpyInitFillWithScalarCompute) 
+.add_arguments(NumpyInitOpWithScalarParam::__FIELDS__()); NNVM_REGISTER_OP(_npi_arange) .set_num_inputs(0) diff --git a/src/operator/numpy/np_init_op.cu b/src/operator/numpy/np_init_op.cu index 8b0760ed5765..df850c8d69d0 100644 --- a/src/operator/numpy/np_init_op.cu +++ b/src/operator/numpy/np_init_op.cu @@ -42,7 +42,7 @@ NNVM_REGISTER_OP(_npi_full_like) .set_attr("FCompute", FullLikeOpCompute); NNVM_REGISTER_OP(_npi_full) -.set_attr("FCompute", InitFillWithScalarCompute); +.set_attr("FCompute", NumpyInitFillWithScalarCompute); NNVM_REGISTER_OP(_npi_atleast_1d) .set_attr("FCompute", AtleastNDCompute); diff --git a/src/operator/numpy/np_init_op.h b/src/operator/numpy/np_init_op.h index 016c8892af98..5e618408179b 100644 --- a/src/operator/numpy/np_init_op.h +++ b/src/operator/numpy/np_init_op.h @@ -103,6 +103,77 @@ struct IndicesOpParam : public dmlc::Parameter { } }; +/*! \brief Initialize and fill output with an arbitrary value */ +struct NumpyInitOpWithScalarParam : public dmlc::Parameter { + mxnet::TShape shape; + std::string ctx; + int dtype; + double double_value; + int64_t int_value; + uint64_t uint_value; + int value_type; + DMLC_DECLARE_PARAMETER(NumpyInitOpWithScalarParam) { + DMLC_DECLARE_FIELD(shape) + .set_default(mxnet::TShape()) + .describe("The shape of the output"); + DMLC_DECLARE_FIELD(ctx) + .set_default("") + .describe("Context of output, in format [cpu|gpu|cpu_pinned](n)." + "Only used for imperative calls."); + DMLC_DECLARE_FIELD(dtype) + .set_default(-1) + .add_enum("None", -1) + MXNET_ADD_ALL_TYPES_EXT_WITH_BOOL + .describe("Target data type."); + DMLC_DECLARE_FIELD(double_value) + .describe("Float value with which to fill newly created tensor"); + DMLC_DECLARE_FIELD(int_value) + .describe("Integer value with which to fill newly created tensor"); + DMLC_DECLARE_FIELD(uint_value) + .describe("Unsigned integer value with which to fill newly created tensor"); + DMLC_DECLARE_FIELD(value_type) + .describe("Choose the value type"); + } + + void SetAttrDict(std::unordered_map* dict) { + std::ostringstream shape_s, dtype_s, double_value_s, int_value_s, value_type_s, + uint_value_s; + shape_s << shape; + dtype_s << dtype; + double_value_s << double_value; + int_value_s << int_value; + uint_value_s << uint_value; + value_type_s << value_type; + (*dict)["shape"] = shape_s.str(); + (*dict)["dtype"] = MXNetTypeWithBool2String(dtype); + (*dict)["int_value"] = int_value_s.str(); + (*dict)["uint_value"] = uint_value_s.str(); + (*dict)["double_value"] = double_value_s.str(); + (*dict)["value_type"] = value_type_s.str(); + // We do not set ctx, because ctx has been set in dict instead of InitOpParam. + // Setting ctx here results in an error. + } +}; + +/*! 
\brief Fill output with an arbitrary value */ +template +void NumpyInitFillWithScalarCompute(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + CHECK_EQ(inputs.size(), 0); + CHECK_EQ(outputs.size(), 1U); + const auto& param = nnvm::get(attrs.parsed); + if (param.value_type == 0) { + Fill(ctx.get_stream(), outputs[0], req[0], param.int_value); + } else if (param.value_type == 1) { + Fill(ctx.get_stream(), outputs[0], req[0], param.uint_value); + } else { + Fill(ctx.get_stream(), outputs[0], req[0], param.double_value); + } +} + inline bool NumpyRangeShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector* in_shapes, mxnet::ShapeVector* out_shapes) { diff --git a/src/operator/tensor/init_op.h b/src/operator/tensor/init_op.h index 6ec55f51d08c..54588d84d747 100644 --- a/src/operator/tensor/init_op.h +++ b/src/operator/tensor/init_op.h @@ -291,10 +291,7 @@ struct InitOpWithScalarParam : dmlc::Parameter { mxnet::TShape shape; std::string ctx; int dtype; - double double_value; - int64_t int_value; - uint64_t uint_value; - int value_type; + double value; DMLC_DECLARE_PARAMETER(InitOpWithScalarParam) { DMLC_DECLARE_FIELD(shape) .set_default(mxnet::TShape()) @@ -308,31 +305,18 @@ struct InitOpWithScalarParam : dmlc::Parameter { .add_enum("None", -1) MXNET_ADD_ALL_TYPES_EXT_WITH_BOOL .describe("Target data type."); - DMLC_DECLARE_FIELD(double_value) - .describe("Float value with which to fill newly created tensor"); - DMLC_DECLARE_FIELD(int_value) - .describe("Integer value with which to fill newly created tensor"); - DMLC_DECLARE_FIELD(uint_value) - .describe("Unsigned integer value with which to fill newly created tensor"); - DMLC_DECLARE_FIELD(value_type) - .describe("Choose the value type"); + DMLC_DECLARE_FIELD(value) + .describe("Value with which to fill newly created tensor"); } void SetAttrDict(std::unordered_map* dict) { - std::ostringstream shape_s, dtype_s, double_value_s, int_value_s, value_type_s, - uint_value_s; + std::ostringstream shape_s, dtype_s, value_s; shape_s << shape; dtype_s << dtype; - double_value_s << double_value; - int_value_s << int_value; - uint_value_s << uint_value; - value_type_s << value_type; + value_s << value; (*dict)["shape"] = shape_s.str(); (*dict)["dtype"] = MXNetTypeWithBool2String(dtype); - (*dict)["int_value"] = int_value_s.str(); - (*dict)["uint_value"] = uint_value_s.str(); - (*dict)["double_value"] = double_value_s.str(); - (*dict)["value_type"] = value_type_s.str(); + (*dict)["value"] = value_s.str(); // We do not set ctx, because ctx has been set in dict instead of InitOpParam. // Setting ctx here results in an error. 
} @@ -563,13 +547,7 @@ void InitFillWithScalarCompute(const nnvm::NodeAttrs &attrs, CHECK_EQ(inputs.size(), 0); CHECK_EQ(outputs.size(), 1U); const auto& param = nnvm::get(attrs.parsed); - if (param.value_type == 0) { - Fill(ctx.get_stream(), outputs[0], req[0], param.int_value); - } else if (param.value_type == 1) { - Fill(ctx.get_stream(), outputs[0], req[0], param.uint_value); - } else { - Fill(ctx.get_stream(), outputs[0], req[0], param.double_value); - } + Fill(ctx.get_stream(), outputs[0], req[0], param.value); } struct PopulateFullIdxRspKernel : public mxnet_op::tunable { @@ -766,9 +744,13 @@ inline bool RangeShape(const nnvm::NodeAttrs& attrs, struct linspace_fwd { template - MSHADOW_XINLINE static void Map(index_t i, double start, double stop, double step, - int req, DType* out) { + MSHADOW_XINLINE static void Map(index_t i, index_t size, double start, + double stop, bool endpoint, + double step, int req, DType* out) { KERNEL_ASSIGN(out[i], req, static_cast(start + step * i)); + if (endpoint && i != 0 && i == size - 1) { + KERNEL_ASSIGN(out[i], req, static_cast(stop)); + } } }; From 1d1b2e41e720e0dc8201dfe36b720ec1bb5850d7 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Thu, 9 Sep 2021 18:17:26 -0700 Subject: [PATCH 03/41] fix linspace --- src/api/operator/numpy/np_init_op.cc | 25 ++++++++-- src/operator/numpy/np_init_op.cc | 11 +++-- src/operator/numpy/np_init_op.cu | 2 +- src/operator/numpy/np_init_op.h | 71 ++++++++++++++++++++++++++++ 4 files changed, 100 insertions(+), 9 deletions(-) diff --git a/src/api/operator/numpy/np_init_op.cc b/src/api/operator/numpy/np_init_op.cc index d295cb725068..efae2dde6327 100644 --- a/src/api/operator/numpy/np_init_op.cc +++ b/src/api/operator/numpy/np_init_op.cc @@ -268,9 +268,28 @@ MXNET_REGISTER_API("_npi.linspace") using namespace runtime; const nnvm::Op* op = Op::Get("_npi_linspace"); nnvm::NodeAttrs attrs; - op::LinspaceParam param; - param.start = args[0].operator double(); - param.stop = args[1].operator double(); + op::NumpyLinspaceParam param; + if (args[0].type_code() == kDLFloat || args[1].type_code() == kDLFloat) { + param.start_double = args[0].operator double(); + param.stop_double = args[1].operator double(); + param.value_type = 2; + } else if (args[0].type_code() == kDLUInt || args[1].type_code() == kDLUInt) { + if (args[0].type_code() == kDLUInt) { + param.start_uint = args[0].operator uint64_t(); + } else { + param.start_uint = args[0].operator int64_t(); + } + if (args[1].type_code() == kDLUInt) { + param.stop_uint = args[1].operator uint64_t(); + } else { + param.stop_uint = args[1].operator int64_t(); + } + param.value_type = 1; + } else { + param.start_int = args[0].operator int64_t(); + param.stop_int = args[1].operator int64_t(); + param.value_type = 0; + } if (features::is_enabled(features::INT64_TENSOR_SIZE)) param.num = args[2].operator int64_t(); else diff --git a/src/operator/numpy/np_init_op.cc b/src/operator/numpy/np_init_op.cc index 3917c7ed72fc..461f1e449829 100644 --- a/src/operator/numpy/np_init_op.cc +++ b/src/operator/numpy/np_init_op.cc @@ -37,6 +37,7 @@ DMLC_REGISTER_PARAMETER(LogspaceParam); DMLC_REGISTER_PARAMETER(FullLikeOpParam); DMLC_REGISTER_PARAMETER(AtleastNDParam); DMLC_REGISTER_PARAMETER(NumpyInitOpWithScalarParam); +DMLC_REGISTER_PARAMETER(NumpyLinspaceParam); inline bool NumpyIndicesShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector* in_shapes, @@ -246,11 +247,11 @@ NNVM_REGISTER_OP(_npi_linspace) .describe("Return evenly spaced numbers over a specified interval. 
Similar to Numpy") .set_num_inputs(0) .set_num_outputs(1) -.set_attr_parser(ParamParser) -.set_attr("FInferShape", LinspaceShape) -.set_attr("FInferType", InitNumpyType) -.set_attr("FCompute", LinspaceCompute) -.add_arguments(RangeParam::__FIELDS__()); +.set_attr_parser(ParamParser) +.set_attr("FInferShape", NumpyLinspaceShape) +.set_attr("FInferType", InitNumpyType) +.set_attr("FCompute", NumpyLinspaceCompute) +.add_arguments(NumpyLinspaceParam::__FIELDS__()); NNVM_REGISTER_OP(_npi_logspace) .describe("Return numbers spaced evenly on a log scale.") diff --git a/src/operator/numpy/np_init_op.cu b/src/operator/numpy/np_init_op.cu index df850c8d69d0..c2eb5e1cd0de 100644 --- a/src/operator/numpy/np_init_op.cu +++ b/src/operator/numpy/np_init_op.cu @@ -63,7 +63,7 @@ NNVM_REGISTER_OP(_npi_indices) .set_attr("FCompute", IndicesCompute); NNVM_REGISTER_OP(_npi_linspace) -.set_attr("FCompute", LinspaceCompute); +.set_attr("FCompute", NumpyLinspaceParam); NNVM_REGISTER_OP(_npi_logspace) .set_attr("FCompute", LogspaceCompute); diff --git a/src/operator/numpy/np_init_op.h b/src/operator/numpy/np_init_op.h index 5e618408179b..b09dff768af1 100644 --- a/src/operator/numpy/np_init_op.h +++ b/src/operator/numpy/np_init_op.h @@ -174,6 +174,77 @@ void NumpyInitFillWithScalarCompute(const nnvm::NodeAttrs &attrs, } } +struct numpy_linspace_fwd { + template + MSHADOW_XINLINE static void Map(index_t i, index_t size, ValueType start, + ValueType stop, bool endpoint, + double step, int req, DType* out) { + if (i == 0) { + // Special cases : start = 9007199254740993 + KERNEL_ASSIGN(out[i], req, static_cast(start)); + } else { + KERNEL_ASSIGN(out[i], req, static_cast(start + step * i)); + } + if (endpoint && i != 0 && i == size - 1) { + KERNEL_ASSIGN(out[i], req, static_cast(stop)); + } + } +}; + +template +void NumpyLinspaceCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mxnet_op; + Stream *s = ctx.get_stream(); + const NumpyLinspaceParam& param = nnvm::get(attrs.parsed); + MSHADOW_TYPE_SWITCH_EXT_WITH_BOOL(outputs[0].type_flag_, DType, { + index_t step_num = param.endpoint ? param.num - 1 : param.num; + if (param.value_type == 0) { + int64_t start = param.start_int; + int64_t stop = param.stop_int; + double step = step_num > 0 ? (stop - start) / step_num : 0.0f; + Kernel::Launch(s, + outputs[0].Size(), + outputs[0].Size(), + start, + stop, + param.endpoint, + step, + req[0], + outputs[0].dptr()); + } else if (param.value_type == 1) { + uint64_t start = param.start_uint; + uint64_t stop = param.stop_uint; + double step = step_num > 0 ? (stop - start) / step_num : 0.0f; + Kernel::Launch(s, + outputs[0].Size(), + outputs[0].Size(), + start, + stop, + param.endpoint, + step, + req[0], + outputs[0].dptr()); + } else { + double start = param.start_double; + double stop = param.stop_double; + double step = step_num > 0 ? 
(stop - start) / step_num : 0.0f; + Kernel::Launch(s, + outputs[0].Size(), + outputs[0].Size(), + start, + stop, + param.endpoint, + step, + req[0], + outputs[0].dptr()); + } + }); +} + inline bool NumpyRangeShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector* in_shapes, mxnet::ShapeVector* out_shapes) { From 5014fade7f6f9586e7dfd9fda725d7c08979581f Mon Sep 17 00:00:00 2001 From: barry-jin Date: Thu, 9 Sep 2021 18:20:04 -0700 Subject: [PATCH 04/41] merge & add tests --- ci/docker/runtime_functions.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index a623bfdcd564..fe3e815a55c4 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -819,6 +819,8 @@ unittest_array_api_standardization() { export DMLC_LOG_STACK_TRACE_DEPTH=100 python3 -m pytest --durations=50 --cov-report xml:tests_api.xml --verbose \ array_api_tests/test_type_promotion.py::test_elementwise_function_two_arg_bool_type_promotion + python3 -m pytest --durations=50 --cov-report xml:tests_api.xml --verbose array_api_tests/test_creation_functions.py + python3 -m pytest --durations=50 --cov-report xml:tests_api.xml --verbose array_api_tests/test_constants.py popd } From 85dde73ff90f956d370c6298b8a5e22ff33e49b8 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Thu, 9 Sep 2021 18:30:26 -0700 Subject: [PATCH 05/41] add NumpyLinspaceParam --- src/api/operator/numpy/np_init_op.cc | 2 +- src/operator/numpy/np_init_op.h | 82 ++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 1 deletion(-) diff --git a/src/api/operator/numpy/np_init_op.cc b/src/api/operator/numpy/np_init_op.cc index efae2dde6327..6481d9617828 100644 --- a/src/api/operator/numpy/np_init_op.cc +++ b/src/api/operator/numpy/np_init_op.cc @@ -306,7 +306,7 @@ MXNET_REGISTER_API("_npi.linspace") } attrs.parsed = param; attrs.op = op; - SetAttrDict(&attrs); + SetAttrDict(&attrs); if (args[4].type_code() != kNull) { attrs.dict["ctx"] = args[4].operator std::string(); } diff --git a/src/operator/numpy/np_init_op.h b/src/operator/numpy/np_init_op.h index 9a45b9c3df17..93fcd8c8573c 100644 --- a/src/operator/numpy/np_init_op.h +++ b/src/operator/numpy/np_init_op.h @@ -174,6 +174,88 @@ void NumpyInitFillWithScalarCompute(const nnvm::NodeAttrs &attrs, } } +struct NumpyLinspaceParam : public dmlc::Parameter { + double start_double; + double stop_double; + int64_t start_int; + int64_t stop_int; + uint64_t start_uint; + uint64_t stop_uint; + index_t num; + bool endpoint; + std::string ctx; + int dtype; + int value_type; + DMLC_DECLARE_PARAMETER(NumpyLinspaceParam) { + DMLC_DECLARE_FIELD(start_double) + .describe("The double type starting value of the sequence."); + DMLC_DECLARE_FIELD(stop_double) + .describe("The double type ending value of the sequence"); + DMLC_DECLARE_FIELD(start_int) + .describe("The int type starting value of the sequence."); + DMLC_DECLARE_FIELD(stop_int) + .describe("The int type ending value of the sequence"); + DMLC_DECLARE_FIELD(start_uint) + .describe("The unsigned int type starting value of the sequence."); + DMLC_DECLARE_FIELD(stop_uint) + .describe("The unsigned int type ending value of the sequence"); + DMLC_DECLARE_FIELD(num) + .describe("Number of samples to generate. Must be non-negative."); + DMLC_DECLARE_FIELD(endpoint) + .set_default(true) + .describe("If True, stop is the last sample. Otherwise, it is not included."); + DMLC_DECLARE_FIELD(ctx) + .set_default("") + .describe("Context of output, in format [cpu|gpu|cpu_pinned](n)." 
+ "Only used for imperative calls."); + DMLC_DECLARE_FIELD(dtype) + .set_default(-1) + .add_enum("None", -1) + MXNET_ADD_ALL_TYPES_EXT_WITH_BOOL + .describe("Target data type."); + DMLC_DECLARE_FIELD(value_type) + .set_default(0) + .describe("Data type for start and stop value"); + } + void SetAttrDict(std::unordered_map* dict) { + std::ostringstream start_double_s, stop_double_s, num_s, endpoint_s, dtype_s, + start_int_s, stop_int_s, start_uint_s, stop_uint_s, value_type_s; + start_double_s << start_double; + stop_double_s << stop_double; + start_int_s << start_int; + stop_int_s << stop_int; + start_uint_s << start_uint; + stop_uint_s << stop_uint; + value_type_s << value_type; + num_s << num; + endpoint_s << endpoint; + dtype_s << dtype; + (*dict)["start_double"] = start_double_s.str(); + (*dict)["stop_double"] = stop_double_s.str(); + (*dict)["start_int"] = start_int_s.str(); + (*dict)["stop_int"] = stop_int_s.str(); + (*dict)["start_uint"] = start_uint_s.str(); + (*dict)["stop_uint"] = stop_uint_s.str(); + (*dict)["value_type"] = value_type_s.str(); + (*dict)["num"] = num_s.str(); + (*dict)["endpoint"] = endpoint_s.str(); + (*dict)["dtype"] = MXNetTypeWithBool2String(dtype); + } +}; + +inline bool NumpyLinspaceShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector *in_attrs, + mxnet::ShapeVector *out_attrs) { + const NumpyLinspaceParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(in_attrs->size(), 0U); + CHECK_EQ(out_attrs->size(), 1U); + CHECK_GE(param.num, 0) + << "Number of sequence should be non-negative, received " << param.num; + mxnet::TShape shape = mxnet::TShape({static_cast(param.num)}); + SHAPE_ASSIGN_CHECK(*out_attrs, 0, shape); + return true; +} + struct numpy_linspace_fwd { template MSHADOW_XINLINE static void Map(index_t i, index_t size, ValueType start, From 9cb5881c78684e43fe50de2b830200a4a1c5446a Mon Sep 17 00:00:00 2001 From: barry-jin Date: Thu, 9 Sep 2021 19:02:40 -0700 Subject: [PATCH 06/41] fix lint' ' --- python/mxnet/_ffi/_ctypes/function.py | 2 +- python/mxnet/gluon/loss.py | 1 + python/mxnet/gluon/parameter.py | 2 +- .../probability/distributions/categorical.py | 2 +- .../gluon/probability/distributions/cauchy.py | 2 +- .../probability/distributions/divergence.py | 2 +- .../gluon/probability/distributions/geometric.py | 2 +- .../distributions/multivariate_normal.py | 6 +++--- .../distributions/transformed_distribution.py | 4 ++-- .../probability/transformation/transformation.py | 4 ++-- python/mxnet/gluon/utils.py | 2 +- python/mxnet/numpy/multiarray.py | 16 ++++++++++------ 12 files changed, 25 insertions(+), 20 deletions(-) diff --git a/python/mxnet/_ffi/_ctypes/function.py b/python/mxnet/_ffi/_ctypes/function.py index 8b9a797a68c0..229b98727a00 100644 --- a/python/mxnet/_ffi/_ctypes/function.py +++ b/python/mxnet/_ffi/_ctypes/function.py @@ -65,7 +65,7 @@ def _make_mxnet_args(args, temp_args): elif isinstance(arg, Integral): if arg > _MAX_VALUE_64_BIT_UNSIGNED_: raise OverflowError("Integer out of bounds") - elif arg > _MAX_VALUE_64_BIT_SIGNED_: + if arg > _MAX_VALUE_64_BIT_SIGNED_: values[i].v_uint64 = arg type_codes[i] = TypeCode.UINT else: diff --git a/python/mxnet/gluon/loss.py b/python/mxnet/gluon/loss.py index 5bf2cd9be293..57364d06db2b 100644 --- a/python/mxnet/gluon/loss.py +++ b/python/mxnet/gluon/loss.py @@ -956,6 +956,7 @@ def _compute_distances(self, x1, x2): return squared_diffs.sum(axis=2) + # pylint: disable=too-many-function-args def _compute_labels(self, batch_size): """ The function creates the label matrix for the loss. 
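[Editorial sketch, not part of the patch] The `_make_mxnet_args` change above widens the FFI integer path: values that fit in int64 take the signed slot, values up to 2**64 - 1 take the new unsigned slot, and anything larger raises OverflowError. The snippet below is illustration only (the helper name `pick_fill_dtype` is invented here); it mirrors the combined effect of that dispatch and the default-dtype inference added to `full` in `_op.py`, assuming no explicit dtype was passed:

    # Thresholds defined in python/mxnet/base.py by this patch.
    _MAX_I64 = 9_223_372_036_854_775_807      # _MAX_VALUE_64_BIT_SIGNED_
    _MAX_U64 = 18_446_744_073_709_551_615     # _MAX_VALUE_64_BIT_UNSIGNED_

    def pick_fill_dtype(fill_value):
        # Hypothetical helper mirroring full()'s default-dtype selection.
        if isinstance(fill_value, bool):
            return 'bool'
        if isinstance(fill_value, int):
            if fill_value > _MAX_U64:
                raise OverflowError("Integer out of bounds")
            return 'uint64' if fill_value > _MAX_I64 else 'int64'
        return 'float64'

    assert pick_fill_dtype(7) == 'int64'
    assert pick_fill_dtype(2**64 - 1) == 'uint64'
    assert pick_fill_dtype(0.5) == 'float64'
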
diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 16e2957c2551..49405e189692 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -386,7 +386,7 @@ def _init_grad(self): if self._grad_stype != 'default': raise ValueError("mxnet.numpy.zeros does not support stype = {}" .format(self._grad_stype)) - self._grad = [_mx_np.zeros(shape=i.shape, dtype=i.dtype, ctx=i.ctx) + self._grad = [_mx_np.zeros(shape=i.shape, dtype=i.dtype, device=i.ctx) for i in self._data] else: self._grad = [ndarray.zeros(shape=i.shape, dtype=i.dtype, ctx=i.ctx, diff --git a/python/mxnet/gluon/probability/distributions/categorical.py b/python/mxnet/gluon/probability/distributions/categorical.py index ace162f28374..f95db52a249f 100644 --- a/python/mxnet/gluon/probability/distributions/categorical.py +++ b/python/mxnet/gluon/probability/distributions/categorical.py @@ -109,7 +109,7 @@ def log_prob(self, value): self._validate_samples(value) logit = self.logit indices = np.expand_dims(value, -1).astype('int') - expanded_logit = logit * np.ones_like(logit + indices) + expanded_logit = logit * np.ones_like(logit + indices) # pylint: disable=too-many-function-args return npx.pick(expanded_logit, indices).squeeze() def sample(self, size=None): diff --git a/python/mxnet/gluon/probability/distributions/cauchy.py b/python/mxnet/gluon/probability/distributions/cauchy.py index bd7804d869b5..b609dd4d8212 100644 --- a/python/mxnet/gluon/probability/distributions/cauchy.py +++ b/python/mxnet/gluon/probability/distributions/cauchy.py @@ -65,7 +65,7 @@ def sample(self, size=None): if (isinstance(self.loc, Number), isinstance(self.scale, Number)) == (True, True): u = np.random.uniform(size=size) else: - u = np.random.uniform(np.zeros_like( + u = np.random.uniform(np.zeros_like( # pylint: disable=too-many-function-args self.loc + self.scale), size=size) return self.icdf(u) diff --git a/python/mxnet/gluon/probability/distributions/divergence.py b/python/mxnet/gluon/probability/distributions/divergence.py index 90c0d1f989f7..5d25871ccad6 100644 --- a/python/mxnet/gluon/probability/distributions/divergence.py +++ b/python/mxnet/gluon/probability/distributions/divergence.py @@ -309,7 +309,7 @@ def _kl_mvn_mvn(p, q): # Batch matrix vector multiply np.einsum('...jk,...j->...k', q.precision, diff) ) * -0.5 - n = np.ones_like(diff).sum(-1) + n = np.ones_like(diff).sum(-1) # pylint: disable=too-many-function-args return 0.5 * (term1 + term2 + term3 - n) diff --git a/python/mxnet/gluon/probability/distributions/geometric.py b/python/mxnet/gluon/probability/distributions/geometric.py index b7cbbe1d2be4..fda2f01caae4 100644 --- a/python/mxnet/gluon/probability/distributions/geometric.py +++ b/python/mxnet/gluon/probability/distributions/geometric.py @@ -109,7 +109,7 @@ def sample(self, size=None): if isinstance(self.prob, Number): shape_tensor = np.zeros(()) else: - shape_tensor = np.zeros_like(self.prob) + shape_tensor = np.zeros_like(self.prob) # pylint: disable=too-many-function-args u = np.random.uniform(shape_tensor, size=size) samples = np.floor( np.log(u) / np.log1p(-self.prob) diff --git a/python/mxnet/gluon/probability/distributions/multivariate_normal.py b/python/mxnet/gluon/probability/distributions/multivariate_normal.py index 40e7c4c248d4..62387dd76451 100644 --- a/python/mxnet/gluon/probability/distributions/multivariate_normal.py +++ b/python/mxnet/gluon/probability/distributions/multivariate_normal.py @@ -115,8 +115,8 @@ def sample(self, size=None): if isinstance(size, 
int): size = (size,) shape_tensor = np.broadcast_to(shape_tensor, size + (-2,)) - noise = np.random.normal(np.zeros_like( - shape_tensor), np.ones_like(shape_tensor)) + noise = np.random.normal(np.zeros_like( # pylint: disable=too-many-function-args + shape_tensor), np.ones_like(shape_tensor)) # pylint: disable=too-many-function-args samples = self.loc + \ np.einsum('...jk,...j->...k', self.scale_tril, noise) return samples @@ -128,7 +128,7 @@ def sample_n(self, size=None): shape_tensor = self.loc + self.scale_tril[..., 0] if isinstance(size, int): size = (size,) - noise = np.random.normal(np.zeros_like(shape_tensor), np.ones_like(shape_tensor), + noise = np.random.normal(np.zeros_like(shape_tensor), np.ones_like(shape_tensor), # pylint: disable=too-many-function-args (-2,) + size) samples = self.loc + \ np.einsum('...jk,...j->...k', self.scale_tril, noise) diff --git a/python/mxnet/gluon/probability/distributions/transformed_distribution.py b/python/mxnet/gluon/probability/distributions/transformed_distribution.py index a879ba682c1f..bc7afe7c3cac 100644 --- a/python/mxnet/gluon/probability/distributions/transformed_distribution.py +++ b/python/mxnet/gluon/probability/distributions/transformed_distribution.py @@ -84,7 +84,7 @@ def cdf(self, value): """ Compute the cumulative distribution function(CDF) p(Y < `value`) """ - sign = np.ones_like(value) + sign = np.ones_like(value) # pylint: disable=too-many-function-args for t in reversed(self._transforms): value = t.inv(value) sign = sign * t.sign @@ -92,7 +92,7 @@ def cdf(self, value): return sign * (value - 0.5) + 0.5 def icdf(self, value): - sign = np.ones_like(value) + sign = np.ones_like(value) # pylint: disable=too-many-function-args for t in self._transforms: sign = sign * t.sign value = sign * (value - 0.5) + 0.5 # value or (1 - value) diff --git a/python/mxnet/gluon/probability/transformation/transformation.py b/python/mxnet/gluon/probability/transformation/transformation.py index a1a28294b025..c4826389c804 100644 --- a/python/mxnet/gluon/probability/transformation/transformation.py +++ b/python/mxnet/gluon/probability/transformation/transformation.py @@ -159,7 +159,7 @@ def inv(self): def log_det_jacobian(self, x, y): if not self._parts: - return np.zeros_like(x) + return np.zeros_like(x) # pylint: disable=too-many-function-args result = 0 x_prime = None for t in self._parts[:-1]: @@ -211,7 +211,7 @@ def _inverse_compute(self, y): def log_det_jacobian(self, x, y): # element-wise abs(log(dy/dx)) - value = np.ones_like(x) * np.log(np.abs(self._scale)) + value = np.ones_like(x) * np.log(np.abs(self._scale)) # pylint: disable=too-many-function-args return sum_right_most(value, self.event_dim) @property diff --git a/python/mxnet/gluon/utils.py b/python/mxnet/gluon/utils.py index df641cf1ace5..de47dd61cdd3 100644 --- a/python/mxnet/gluon/utils.py +++ b/python/mxnet/gluon/utils.py @@ -158,7 +158,7 @@ def multi_sum_sq(*args, ctx=None): UserWarning('nan or inf is detected. 
' 'Clipping results will be undefined.'), stacklevel=2) scale = max_norm / (total_norm + 1e-8) - scale = _mx_np.min(_mx_np.concatenate([scale, _mx_np.ones(1, ctx=ctx)], axis=0)) + scale = _mx_np.min(_mx_np.concatenate([scale, _mx_np.ones(1, device=ctx)], axis=0)) for arr in arrays: arr *= scale.item() if check_isfinite: diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index d07c7685fe57..e40ee2e557fe 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -80,7 +80,8 @@ 'quantile', 'percentile', 'shares_memory', 'may_share_memory', 'diff', 'ediff1d', 'resize', 'matmul', 'nan_to_num', 'isnan', 'isinf', 'isposinf', 'isneginf', 'isfinite', 'polyval', 'where', 'bincount', 'atleast_1d', 'atleast_2d', 'atleast_3d', 'fill_diagonal', 'squeeze', - 'diagflat', 'repeat', 'prod', 'pad', 'cumsum', 'sum', 'rollaxis', 'diag', 'diagonal', 'asarray', 'from_dlpack'] + 'diagflat', 'repeat', 'prod', 'pad', 'cumsum', 'sum', 'rollaxis', 'diag', 'diagonal', + 'asarray', 'from_dlpack'] __all__ += fallback.__all__ @@ -826,14 +827,14 @@ def __getitem__(self, key): elif indexing_dispatch_code == _NDARRAY_BASIC_INDEXING: if prepend == _NDARRAY_ZERO_DIM_BOOL_ARRAY_FALSE: return empty((0,) + self._get_np_basic_indexing(key).shape, - dtype=self.dtype, ctx=self.ctx) + dtype=self.dtype, device=self.ctx) if prepend == _NDARRAY_ZERO_DIM_BOOL_ARRAY_TRUE: key = (_np.newaxis,) + key return self._get_np_basic_indexing(key) elif indexing_dispatch_code == _NDARRAY_ADVANCED_INDEXING: if prepend == _NDARRAY_ZERO_DIM_BOOL_ARRAY_FALSE: return empty((0,) + self._get_np_adanced_indexing(key).shape, - dtype=self.dtype, ctx=self.ctx) + dtype=self.dtype, device=self.ctx) if prepend == _NDARRAY_ZERO_DIM_BOOL_ARRAY_TRUE: key = (_np.newaxis,) + key return self._get_np_advanced_indexing(key) @@ -967,7 +968,7 @@ def _prepare_value_nd(self, value, bcast_shape, squeeze_axes=None): Note: mxnet.numpy.ndarray not support NDArray as assigned value. """ if isinstance(value, numeric_types): - value_nd = full(bcast_shape, value, ctx=self.ctx, dtype=self.dtype) + value_nd = full(bcast_shape, value, device=self.ctx, dtype=self.dtype) elif isinstance(value, self.__class__): value_nd = value.as_in_ctx(self.ctx) if value_nd.dtype != self.dtype: @@ -2613,7 +2614,7 @@ def array(object, dtype=None, ctx=None): # printing out the error raised by official NumPy's array function # for transparency on users' side raise TypeError('{}'.format(str(e))) - ret = empty(object.shape, dtype=dtype, ctx=ctx) + ret = empty(object.shape, dtype=dtype, device=ctx) if len(object.shape) == 0: ret[()] = object else: @@ -12310,6 +12311,7 @@ def sum(a, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=N # pylint: enable=redefined-outer-name, too-many-arguments +# pylint: disable=redefined-outer-name @set_module('mxnet.numpy') @wrap_data_api_creation_func def asarray(obj, /, *, dtype=None, device=None, copy=None): @@ -12318,7 +12320,7 @@ def asarray(obj, /, *, dtype=None, device=None, copy=None): Parameters ---------- - obj : Union[ , bool, int, float, NestedSequence[ bool | int | float ], SupportsDLPack, SupportsBufferProtocol ] + obj : , bool, int, float, NestedSequence[ bool | int | float ] Object to be converted to an array. Can be a Python scalar, a (possibly nested) sequence of Python scalars, or an object supporting DLPack or the Python buffer protocol. 
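The asarray docstring above describes which inputs are accepted; a usage sketch of those conversions, assuming the signature introduced in this series, looks roughly like this.

from mxnet import np

np.asarray(3)                                  # Python scalar -> 0-d ndarray
np.asarray([[0, 6], [1, 7]])                   # nested sequence -> 2-D ndarray
np.asarray(np.ones((2, 2)), dtype="float64")   # existing ndarray, cast to float64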
@@ -12373,6 +12375,8 @@ def asarray(obj, /, *, dtype=None, device=None, copy=None): array = _as_mx_np_array(obj, ctx=device, zero_copy=copy) return array.astype(dtype) + +# pylint: disable=redefined-outer-name @set_module('mxnet.numpy') @wrap_data_api_creation_func def from_dlpack(x, /): From 6c032ea6ab604d5cd014a95235e30e564597bf44 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Fri, 10 Sep 2021 09:45:13 -0700 Subject: [PATCH 07/41] fix --- src/operator/tensor/init_op.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/operator/tensor/init_op.h b/src/operator/tensor/init_op.h index c53da9336ec3..bc7b629287ed 100644 --- a/src/operator/tensor/init_op.h +++ b/src/operator/tensor/init_op.h @@ -744,13 +744,9 @@ inline bool RangeShape(const nnvm::NodeAttrs& attrs, struct linspace_fwd { template - MSHADOW_XINLINE static void Map(index_t i, index_t size, double start, - double stop, bool endpoint, - double step, int req, DType* out) { + MSHADOW_XINLINE static void Map(index_t i, double start, double stop, double step, + int req, DType* out) { KERNEL_ASSIGN(out[i], req, static_cast(start + step * i)); - if (endpoint && i != 0 && i == size - 1) { - KERNEL_ASSIGN(out[i], req, static_cast(stop)); - } } }; From ba01eed0b772856199ad726db51f9b7d9599fead Mon Sep 17 00:00:00 2001 From: barry-jin Date: Fri, 10 Sep 2021 09:49:52 -0700 Subject: [PATCH 08/41] add indexing test --- ci/docker/runtime_functions.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index fe3e815a55c4..2056e1bc7524 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -821,6 +821,7 @@ unittest_array_api_standardization() { array_api_tests/test_type_promotion.py::test_elementwise_function_two_arg_bool_type_promotion python3 -m pytest --durations=50 --cov-report xml:tests_api.xml --verbose array_api_tests/test_creation_functions.py python3 -m pytest --durations=50 --cov-report xml:tests_api.xml --verbose array_api_tests/test_constants.py + python3 -m pytest --durations=50 --cov-report xml:tests_api.xml --verbose array_api_tests/test_indexing.py popd } From 3db3b95a57797c94def04c92c898428980df0809 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Fri, 10 Sep 2021 15:54:25 -0700 Subject: [PATCH 09/41] fix tests --- python/mxnet/gluon/loss.py | 5 +++-- python/mxnet/ndarray/numpy/_op.py | 8 ++++---- python/mxnet/numpy/multiarray.py | 10 +++++++--- python/mxnet/util.py | 13 +++++++++++++ src/operator/numpy/np_init_op.cu | 2 +- src/operator/numpy/np_init_op.h | 10 +++++++--- tests/python/unittest/test_numpy_default_dtype.py | 2 +- tests/python/unittest/test_numpy_ndarray.py | 4 ++-- tests/python/unittest/test_numpy_op.py | 4 ++-- 9 files changed, 40 insertions(+), 18 deletions(-) diff --git a/python/mxnet/gluon/loss.py b/python/mxnet/gluon/loss.py index 57364d06db2b..af191d9cb8fa 100644 --- a/python/mxnet/gluon/loss.py +++ b/python/mxnet/gluon/loss.py @@ -27,7 +27,7 @@ import numpy as _np from ..base import numeric_types from .block import HybridBlock -from ..util import use_np +from ..util import use_np, is_np_default_dtype from .. 
import np, npx @@ -894,7 +894,8 @@ def _cosine_similarity(self, x, y, axis=-1): x_norm = npx.reshape(npx.norm(x, axis=axis), (-1, 1)) y_norm = npx.reshape(npx.norm(y, axis=axis), (-1, 1)) x_dot_y = npx.reshape(np.sum(x * y, axis=axis), (-1, 1)) - eps_arr = np.full((1, 1), 1e-12) + default_dtype = "float64" if is_np_default_dtype() else "float32" + eps_arr = np.full((1, 1), 1e-12, dtype=default_dtype) return (x_dot_y / np.maximum(x_norm * y_norm, eps_arr)) diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index 27cc56c5578e..fbf70bcb4b4c 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -323,10 +323,10 @@ def full(shape, fill_value, dtype=None, order='C', ctx=None, out=None): # pylin fill_value : scalar or ndarray Fill value. dtype : data-type, optional - The desired data-type for the array. The default, `None`, means - `np.array(fill_value).dtype`. - - When npx.is_np_default_dtype() returns False, default dtype is float32; - - When npx.is_np_default_dtype() returns True, default dtype is float64. + If dtype is None, the output array data type must be inferred from fill_value. + If it’s an int, the output array dtype must be the default integer dtype; + If it’s a float, then the output array dtype must be the default floating-point data type; + If it’s a bool then the output array must have boolean dtype. Default: None. order : {'C'}, optional Whether to store multidimensional data in C- or Fortran-contiguous (row- or column-wise) order in memory. Currently only supports C order. diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index e40ee2e557fe..b29badcf9da8 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -45,7 +45,8 @@ from ..runtime import Features from ..context import Context from ..util import set_module, wrap_np_unary_func, wrap_np_binary_func,\ - is_np_default_dtype, wrap_data_api_creation_func + is_np_default_dtype, wrap_data_api_creation_func,\ + numpy_eye_standardized from ..context import current_context from ..ndarray import numpy as _mx_nd_np from ..ndarray.numpy import _internal as _npi @@ -2792,8 +2793,10 @@ def full(shape, fill_value, *, dtype=None, order='C', device=None, out=None): fill_value : scalar or ndarray Fill value. dtype : data-type, optional - The desired data-type for the array. The default, `None`, means - `np.array(fill_value).dtype`. + If dtype is None, the output array data type must be inferred from fill_value. + If it’s an int, the output array dtype must be the default integer dtype; + If it’s a float, then the output array dtype must be the default floating-point data type; + If it’s a bool then the output array must have boolean dtype. Default: None. order : {'C'}, optional Whether to store multidimensional data in C- or Fortran-contiguous (row- or column-wise) order in memory. Currently only supports C order. @@ -5547,6 +5550,7 @@ def histogram(a, bins=10, range=None, normed=None, weights=None, density=None): # pylint: disable=redefined-outer-name @set_module('mxnet.numpy') @wrap_data_api_creation_func +@numpy_eye_standardized def eye(N, M=None, /, *, k=0, dtype=None, device=None, **kwargs): """ Return a 2-D array with ones on the diagonal and zeros elsewhere. 
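With dtype now inferred from fill_value, the behaviour documented in the full docstring above works out roughly as below. This is a sketch: the integer default is platform-dependent and the float default follows the is_np_default_dtype() switch referenced in the patch.

from mxnet import np

np.full((2,), True).dtype   # bool
np.full((2,), 7).dtype      # default integer dtype (int64 on 64-bit builds)
np.full((2,), 7.0).dtype    # float32, or float64 when the numpy default dtype is enabled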
diff --git a/python/mxnet/util.py b/python/mxnet/util.py index ba9723be6fb8..63c1c05779f3 100644 --- a/python/mxnet/util.py +++ b/python/mxnet/util.py @@ -669,6 +669,19 @@ def _wrap_api_creation_func(*args, **kwargs): return _wrap_api_creation_func +def numpy_eye_standardized(func): + """decorator for numpy.eye operator because k is positional + arg in numpy while key-word in api standard + """ + @functools.wraps(func) + def _wrap_numpy_eye_func(*args, **kwargs): + if len(args) > 2: + kwargs["k"] = args[2] + args = args[0:2] + return func(*args, **kwargs) + return _wrap_numpy_eye_func + + # pylint: disable=exec-used def numpy_fallback(func): """decorator for falling back to offical numpy for a specific function""" diff --git a/src/operator/numpy/np_init_op.cu b/src/operator/numpy/np_init_op.cu index c2eb5e1cd0de..ac5f33973bea 100644 --- a/src/operator/numpy/np_init_op.cu +++ b/src/operator/numpy/np_init_op.cu @@ -63,7 +63,7 @@ NNVM_REGISTER_OP(_npi_indices) .set_attr("FCompute", IndicesCompute); NNVM_REGISTER_OP(_npi_linspace) -.set_attr("FCompute", NumpyLinspaceParam); +.set_attr("FCompute", NumpyLinspaceCompute); NNVM_REGISTER_OP(_npi_logspace) .set_attr("FCompute", LogspaceCompute); diff --git a/src/operator/numpy/np_init_op.h b/src/operator/numpy/np_init_op.h index 93fcd8c8573c..9daba471f91e 100644 --- a/src/operator/numpy/np_init_op.h +++ b/src/operator/numpy/np_init_op.h @@ -265,7 +265,11 @@ struct numpy_linspace_fwd { // Special cases : start = 9007199254740993 KERNEL_ASSIGN(out[i], req, static_cast(start)); } else { - KERNEL_ASSIGN(out[i], req, static_cast(start + step * i)); + if (std::is_integral::value) { + KERNEL_ASSIGN(out[i], req, static_cast(std::floor(start + step * i))); + } else { + KERNEL_ASSIGN(out[i], req, static_cast(start + step * i)); + } } if (endpoint && i != 0 && i == size - 1) { KERNEL_ASSIGN(out[i], req, static_cast(stop)); @@ -287,7 +291,7 @@ void NumpyLinspaceCompute(const nnvm::NodeAttrs& attrs, if (param.value_type == 0) { int64_t start = param.start_int; int64_t stop = param.stop_int; - double step = step_num > 0 ? (stop - start) / step_num : 0.0f; + double step = step_num > 0 ? ((double)stop - (double)start) / step_num : 0.0f; Kernel::Launch(s, outputs[0].Size(), outputs[0].Size(), @@ -300,7 +304,7 @@ void NumpyLinspaceCompute(const nnvm::NodeAttrs& attrs, } else if (param.value_type == 1) { uint64_t start = param.start_uint; uint64_t stop = param.stop_uint; - double step = step_num > 0 ? (stop - start) / step_num : 0.0f; + double step = step_num > 0 ? 
((double)stop - (double)start) / step_num : 0.0f; Kernel::Launch(s, outputs[0].Size(), outputs[0].Size(), diff --git a/tests/python/unittest/test_numpy_default_dtype.py b/tests/python/unittest/test_numpy_default_dtype.py index 4f9867a65b67..c6f66b972e81 100644 --- a/tests/python/unittest/test_numpy_default_dtype.py +++ b/tests/python/unittest/test_numpy_default_dtype.py @@ -43,7 +43,7 @@ def get_workloads(name): 'ones', 'zeros', 'eye', - 'full', + # 'full', dtype of mx.np.full now infers from fill_value 'identity', 'linspace', 'logspace', diff --git a/tests/python/unittest/test_numpy_ndarray.py b/tests/python/unittest/test_numpy_ndarray.py index 559b8a575f5d..ed2d4ac6ace8 100644 --- a/tests/python/unittest/test_numpy_ndarray.py +++ b/tests/python/unittest/test_numpy_ndarray.py @@ -69,12 +69,12 @@ def test_np_empty(): for order in orders: for ctx in ctxes: if order == 'C': - ret = np.empty(shape, dtype, order, ctx) + ret = np.empty(shape, dtype=dtype, order=order, device=ctx) assert ret.dtype == expected_dtype assert ret.shape == shape if isinstance(shape, tuple) else (shape,) assert ret.ctx == npx.current_context() else: - assert_exception(np.empty, NotImplementedError, shape, dtype, order, ctx) + assert_exception(np.empty, NotImplementedError, shape, dtype=dtype, order=order, device=ctx) @use_np diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index e21e8fdc49b8..104676767274 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -1218,8 +1218,8 @@ def forward(self, x): if self._retstep: raise ValueError("linspace didn't support retstep = True inside HybridBlock") else: - return x + np.linspace(self._start, self._stop, self._num, \ - self._endpoint, self._retstep, self._dtype) + return x + np.linspace(self._start, self._stop, num=self._num, \ + endpoint=self._endpoint, retstep=self._retstep, dtype=self._dtype) x = np.zeros(shape=(), dtype=dtype) if isinstance(config, tuple): From dbea44acefbf86ed7e4fc419baca16f60638f28d Mon Sep 17 00:00:00 2001 From: barry-jin Date: Fri, 10 Sep 2021 17:03:39 -0700 Subject: [PATCH 10/41] fix sanity --- src/operator/numpy/np_init_op.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/operator/numpy/np_init_op.h b/src/operator/numpy/np_init_op.h index 9daba471f91e..f16a0a3ac23f 100644 --- a/src/operator/numpy/np_init_op.h +++ b/src/operator/numpy/np_init_op.h @@ -291,7 +291,8 @@ void NumpyLinspaceCompute(const nnvm::NodeAttrs& attrs, if (param.value_type == 0) { int64_t start = param.start_int; int64_t stop = param.stop_int; - double step = step_num > 0 ? ((double)stop - (double)start) / step_num : 0.0f; + double step = step_num > 0 ? \ + (static_cast(stop) - static_cast(start)) / step_num : 0.0f; Kernel::Launch(s, outputs[0].Size(), outputs[0].Size(), @@ -304,7 +305,8 @@ void NumpyLinspaceCompute(const nnvm::NodeAttrs& attrs, } else if (param.value_type == 1) { uint64_t start = param.start_uint; uint64_t stop = param.stop_uint; - double step = step_num > 0 ? ((double)stop - (double)start) / step_num : 0.0f; + double step = step_num > 0 ? 
\ + (static_cast(stop) - static_cast(start)) / step_num : 0.0f; Kernel::Launch(s, outputs[0].Size(), outputs[0].Size(), From 91c9b75385de93d005df06a62f7a81a2b5121939 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Mon, 13 Sep 2021 10:16:21 -0700 Subject: [PATCH 11/41] fix lint --- src/operator/numpy/np_init_op.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/operator/numpy/np_init_op.cu b/src/operator/numpy/np_init_op.cu index d855afd9a85c..0766f844bef4 100644 --- a/src/operator/numpy/np_init_op.cu +++ b/src/operator/numpy/np_init_op.cu @@ -37,7 +37,8 @@ NNVM_REGISTER_OP(_npi_identity).set_attr("FCompute", IdentityComp NNVM_REGISTER_OP(_npi_full_like).set_attr("FCompute", FullLikeOpCompute); -NNVM_REGISTER_OP(_npi_full).set_attr("FCompute", NumpyInitFillWithScalarCompute); +NNVM_REGISTER_OP(_npi_full) +.set_attr("FCompute", NumpyInitFillWithScalarCompute); NNVM_REGISTER_OP(_npi_atleast_1d).set_attr("FCompute", AtleastNDCompute); From 47a61bd684799b894a5e4a24e385a4a42d3f01f2 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Mon, 13 Sep 2021 11:14:02 -0700 Subject: [PATCH 12/41] fix tests --- python/mxnet/gluon/loss.py | 5 ++-- python/mxnet/ndarray/numpy/_op.py | 2 +- src/operator/numpy/np_init_op.cc | 27 ++++++--------------- tests/python/unittest/test_numpy_ndarray.py | 2 +- 4 files changed, 12 insertions(+), 24 deletions(-) diff --git a/python/mxnet/gluon/loss.py b/python/mxnet/gluon/loss.py index af191d9cb8fa..57364d06db2b 100644 --- a/python/mxnet/gluon/loss.py +++ b/python/mxnet/gluon/loss.py @@ -27,7 +27,7 @@ import numpy as _np from ..base import numeric_types from .block import HybridBlock -from ..util import use_np, is_np_default_dtype +from ..util import use_np from .. import np, npx @@ -894,8 +894,7 @@ def _cosine_similarity(self, x, y, axis=-1): x_norm = npx.reshape(npx.norm(x, axis=axis), (-1, 1)) y_norm = npx.reshape(npx.norm(y, axis=axis), (-1, 1)) x_dot_y = npx.reshape(np.sum(x * y, axis=axis), (-1, 1)) - default_dtype = "float64" if is_np_default_dtype() else "float32" - eps_arr = np.full((1, 1), 1e-12, dtype=default_dtype) + eps_arr = np.full((1, 1), 1e-12) return (x_dot_y / np.maximum(x_norm * y_norm, eps_arr)) diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index fbf70bcb4b4c..cee771965dcc 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -391,7 +391,7 @@ def full(shape, fill_value, dtype=None, order='C', ctx=None, out=None): # pylin dtype = _np.int64 if dtype is None else dtype elif isinstance(fill_value, numeric_types): if dtype is None or dtype is float: - dtype = _np.float64 + dtype = _np.float64 if is_np_default_dtype() else _np.float32 if dtype is not None and not isinstance(dtype, str): dtype = _np.dtype(dtype).name return _api_internal.full(shape, dtype, fill_value, ctx, out) diff --git a/src/operator/numpy/np_init_op.cc b/src/operator/numpy/np_init_op.cc index 037c706b6c03..528fa0769460 100644 --- a/src/operator/numpy/np_init_op.cc +++ b/src/operator/numpy/np_init_op.cc @@ -205,25 +205,14 @@ NNVM_REGISTER_OP(_npi_full_like) .add_arguments(FullLikeOpParam::__FIELDS__()); NNVM_REGISTER_OP(_npi_full) -<<<<<<< HEAD - .describe("fill target with a scalar value") - .set_num_inputs(0) - .set_num_outputs(1) - .set_attr_parser(ParamParser) - .set_attr("FInferShape", InitShape) - .set_attr("FInferType", InitNumpyType) - .set_attr("FCompute", NumpyInitFillWithScalarCompute) -.add_arguments(NumpyInitOpWithScalarParam::__FIELDS__()); -======= - .describe("fill target with a 
scalar value") - .set_num_inputs(0) - .set_num_outputs(1) - .set_attr_parser(ParamParser) - .set_attr("FInferShape", InitShape) - .set_attr("FInferType", InitNumpyType) - .set_attr("FCompute", InitFillWithScalarCompute) - .add_arguments(InitOpWithScalarParam::__FIELDS__()); ->>>>>>> upstream/master + .describe("fill target with a scalar value") + .set_num_inputs(0) + .set_num_outputs(1) + .set_attr_parser(ParamParser) + .set_attr("FInferShape", InitShape) + .set_attr("FInferType", InitNumpyType) + .set_attr("FCompute", NumpyInitFillWithScalarCompute) + .add_arguments(NumpyInitOpWithScalarParam::__FIELDS__()); NNVM_REGISTER_OP(_npi_arange) .set_num_inputs(0) diff --git a/tests/python/unittest/test_numpy_ndarray.py b/tests/python/unittest/test_numpy_ndarray.py index ed2d4ac6ace8..332328d8c365 100644 --- a/tests/python/unittest/test_numpy_ndarray.py +++ b/tests/python/unittest/test_numpy_ndarray.py @@ -167,7 +167,7 @@ def __init__(self, shape, dtype=None): self._dtype = dtype def forward(self, x, *args, **kwargs): - return x * np.ones(shape, dtype) + return x * np.ones(shape, dtype=dtype) class TestOnesOutputType(HybridBlock): def forward(self, x, *args, **kwargs): From c6e5596af0e2966367038fca01d5bf1bbbac239a Mon Sep 17 00:00:00 2001 From: barry-jin Date: Mon, 13 Sep 2021 14:33:16 -0700 Subject: [PATCH 13/41] disable warning --- src/operator/numpy/np_init_op.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/operator/numpy/np_init_op.h b/src/operator/numpy/np_init_op.h index abdaa03942b8..6be6cc137a6d 100644 --- a/src/operator/numpy/np_init_op.h +++ b/src/operator/numpy/np_init_op.h @@ -164,12 +164,16 @@ void NumpyInitFillWithScalarCompute(const nnvm::NodeAttrs &attrs, } struct NumpyLinspaceParam : public dmlc::Parameter { +#if __GNUC__ >= 6 +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif double start_double; double stop_double; int64_t start_int; int64_t stop_int; uint64_t start_uint; uint64_t stop_uint; +#pragma GCC diagnostic pop index_t num; bool endpoint; std::string ctx; From b96e75ab4f215f2cb321b6146f6a91afa045049d Mon Sep 17 00:00:00 2001 From: barry-jin Date: Mon, 13 Sep 2021 18:11:50 -0700 Subject: [PATCH 14/41] fix --- python/mxnet/base.py | 4 +++ python/mxnet/ndarray/numpy/_op.py | 26 +++++++++------- python/mxnet/numpy/multiarray.py | 11 +++---- python/mxnet/util.py | 30 ++++++++++++++++++- src/operator/numpy/np_init_op.h | 16 +++++----- .../tensor/elemwise_binary_scalar_op.h | 2 +- .../unittest/test_numpy_interoperability.py | 12 ++++---- tests/python/unittest/test_numpy_ndarray.py | 2 +- 8 files changed, 68 insertions(+), 35 deletions(-) diff --git a/python/mxnet/base.py b/python/mxnet/base.py index 12b1d2d543c3..d1dbc0d64cb4 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -49,8 +49,12 @@ string_types = basestring, error_types = {} +# Upper bound of uint64 _MAX_VALUE_64_BIT_SIGNED_ = 9_223_372_036_854_775_807 +# Upper bound of int64 _MAX_VALUE_64_BIT_UNSIGNED_ = 18_446_744_073_709_551_615 +# Upper bound of float32 +_MAX_VALUE_FLOAT32_REPRESENT_ = 16_777_216 # this function is needed for python3 # to convert ctypes.char_p .value back to python str diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index cee771965dcc..05a0e394670d 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -20,10 +20,11 @@ """Namespace for numpy operators used in Gluon dispatched by F=ndarray.""" import numpy as _np -from ...base import numeric_types, integer_types, _MAX_VALUE_64_BIT_SIGNED_ 
+from ...base import numeric_types, integer_types,\ + _MAX_VALUE_64_BIT_SIGNED_, _MAX_VALUE_FLOAT32_REPRESENT_ from ...util import _sanity_check_params, set_module from ...util import wrap_np_unary_func, wrap_np_binary_func -from ...util import is_np_default_dtype +from ...util import is_np_default_dtype, dtype_from_number from ...context import current_context from . import _internal as _npi from . import _api_internal @@ -383,15 +384,9 @@ def full(shape, fill_value, dtype=None, order='C', ctx=None, out=None): # pylin if isinstance(fill_value, bool): fill_value = int(fill_value) dtype = _np.bool if dtype is None else dtype - elif isinstance(fill_value, integer_types): - # fill_value is uint64 - if fill_value > _MAX_VALUE_64_BIT_SIGNED_: - dtype = _np.uint64 if dtype is None else dtype - else: - dtype = _np.int64 if dtype is None else dtype - elif isinstance(fill_value, numeric_types): + elif isinstance(fill_value, (integer_types, numeric_types)): if dtype is None or dtype is float: - dtype = _np.float64 if is_np_default_dtype() else _np.float32 + dtype = dtype_from_number(fill_value) if dtype is not None and not isinstance(dtype, str): dtype = _np.dtype(dtype).name return _api_internal.full(shape, dtype, fill_value, ctx, out) @@ -1977,6 +1972,17 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis ctx = str(ctx) if dtype is not None and not isinstance(dtype, str): dtype = _np.dtype(dtype).name + if dtype is None: + if isinstance(start, numeric_types) or isinstance(stop, numeric_types): + if abs(start) > _MAX_VALUE_FLOAT32_REPRESENT_ or \ + abs(stop) > _MAX_VALUE_FLOAT32_REPRESENT_: + dtype = _np.float64 + else: + dtype = _np.float64 if is_np_default_dtype() else _np.float32 + elif start > _MAX_VALUE_64_BIT_SIGNED_ or stop > _MAX_VALUE_64_BIT_SIGNED_: + dtype = _np.uint64 + else: + dtype = _np.int64 if retstep: step = (stop - start) / (num - 1) return _api_internal.linspace(start, stop, num, endpoint, ctx, dtype), step diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index b29badcf9da8..1ef024d8bc02 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -46,7 +46,7 @@ from ..context import Context from ..util import set_module, wrap_np_unary_func, wrap_np_binary_func,\ is_np_default_dtype, wrap_data_api_creation_func,\ - numpy_eye_standardized + numpy_eye_standardized, dtype_from_number from ..context import current_context from ..ndarray import numpy as _mx_nd_np from ..ndarray.numpy import _internal as _npi @@ -12367,15 +12367,12 @@ def asarray(obj, /, *, dtype=None, device=None, copy=None): [1, 7]]) """ if isinstance(obj, (integer_types, numeric_types)): + dtype = dtype_from_number(obj) if dtype is None else dtype obj = _np.asarray(obj, dtype=dtype) - if isinstance(obj, _np.ndarray): + elif isinstance(obj, _np.ndarray): dtype = obj.dtype if dtype is None else dtype - if isinstance(obj, ndarray): + elif isinstance(obj, ndarray): dtype = obj.dtype if dtype is None else dtype - else: - if dtype is None: - default_dtype = _np.float64 if is_np_default_dtype() else _np.float32 - dtype = obj.dtype if hasattr(obj, "dtype") else default_dtype array = _as_mx_np_array(obj, ctx=device, zero_copy=copy) return array.astype(dtype) diff --git a/python/mxnet/util.py b/python/mxnet/util.py index 63c1c05779f3..6d942e8babe8 100644 --- a/python/mxnet/util.py +++ b/python/mxnet/util.py @@ -21,7 +21,12 @@ import inspect import threading -from .base import _LIB, check_call, c_str, py_str +from struct import calcsize 
+from .base import (_LIB, check_call, c_str, py_str, + numeric_types, integer_types, + _MAX_VALUE_64_BIT_UNSIGNED_, + _MAX_VALUE_64_BIT_SIGNED_, + _MAX_VALUE_FLOAT32_REPRESENT_) _np_ufunc_default_kwargs = { @@ -1260,3 +1265,26 @@ def set_flush_denorms(value): passed_value = ctypes.c_bool(value) check_call(_LIB.MXSetFlushDenorms(passed_value, ctypes.byref(ret))) return ret.value + + +def dtype_from_number(number): + """Get the data type from the given int or float number + """ + assert isinstance(number, (integer_types, numeric_types)),\ + "The input number should be either integer for float types" + import numpy as _np + if isinstance(number, integer_types): + if number > _MAX_VALUE_64_BIT_UNSIGNED_: + raise OverflowError("Integer out of bounds") + if number > _MAX_VALUE_64_BIT_SIGNED_: + return _np.uint64 + elif calcsize("P") == 8: + return _np.int64 + else: + return _np.int32 + else: + if abs(number) > _MAX_VALUE_FLOAT32_REPRESENT_: + return _np.float64 + else: + return _np.float64 if is_np_default_dtype() else _np.float32 + diff --git a/src/operator/numpy/np_init_op.h b/src/operator/numpy/np_init_op.h index 6be6cc137a6d..6c419fd4563c 100644 --- a/src/operator/numpy/np_init_op.h +++ b/src/operator/numpy/np_init_op.h @@ -164,16 +164,12 @@ void NumpyInitFillWithScalarCompute(const nnvm::NodeAttrs &attrs, } struct NumpyLinspaceParam : public dmlc::Parameter { -#if __GNUC__ >= 6 -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#endif double start_double; double stop_double; int64_t start_int; int64_t stop_int; uint64_t start_uint; uint64_t stop_uint; -#pragma GCC diagnostic pop index_t num; bool endpoint; std::string ctx; @@ -181,16 +177,22 @@ struct NumpyLinspaceParam : public dmlc::Parameter { int value_type; DMLC_DECLARE_PARAMETER(NumpyLinspaceParam) { DMLC_DECLARE_FIELD(start_double) + .set_default(0.0) .describe("The double type starting value of the sequence."); DMLC_DECLARE_FIELD(stop_double) + .set_default(0.0) .describe("The double type ending value of the sequence"); DMLC_DECLARE_FIELD(start_int) + .set_default(0) .describe("The int type starting value of the sequence."); DMLC_DECLARE_FIELD(stop_int) + .set_default(0) .describe("The int type ending value of the sequence"); DMLC_DECLARE_FIELD(start_uint) + .set_default(0) .describe("The unsigned int type starting value of the sequence."); DMLC_DECLARE_FIELD(stop_uint) + .set_default(0) .describe("The unsigned int type ending value of the sequence"); DMLC_DECLARE_FIELD(num) .describe("Number of samples to generate. 
Must be non-negative."); @@ -258,11 +260,7 @@ struct numpy_linspace_fwd { // Special cases : start = 9007199254740993 KERNEL_ASSIGN(out[i], req, static_cast(start)); } else { - if (std::is_integral::value) { - KERNEL_ASSIGN(out[i], req, static_cast(std::floor(start + step * i))); - } else { - KERNEL_ASSIGN(out[i], req, static_cast(start + step * i)); - } + KERNEL_ASSIGN(out[i], req, static_cast(start + step * i)); } if (endpoint && i != 0 && i == size - 1) { KERNEL_ASSIGN(out[i], req, static_cast(stop)); diff --git a/src/operator/tensor/elemwise_binary_scalar_op.h b/src/operator/tensor/elemwise_binary_scalar_op.h index 9c19b163d61d..511eb25b1376 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op.h +++ b/src/operator/tensor/elemwise_binary_scalar_op.h @@ -345,7 +345,7 @@ class BinaryScalarOp : public UnaryOp { } else { temp_tblob = inputs[0]; } - MSHADOW_TYPE_SWITCH_WITH_BOOL(temp_tblob.type_flag_, DType, { + MSHADOW_TYPE_SWITCH_EXT_WITH_BOOL(temp_tblob.type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req[0], Req, { mxnet_op::Kernel, xpu>::Launch( s, inputs[0].Size(), outputs[0].dptr(), temp_tblob.dptr(), DType(alpha)); diff --git a/tests/python/unittest/test_numpy_interoperability.py b/tests/python/unittest/test_numpy_interoperability.py index db643d1090dc..ca852a85de59 100644 --- a/tests/python/unittest/test_numpy_interoperability.py +++ b/tests/python/unittest/test_numpy_interoperability.py @@ -313,7 +313,7 @@ def _add_workload_concatenate(array_pool): OpArgMngr.add_workload('concatenate', (a0, a1, a2), axis=2) OpArgMngr.add_workload('concatenate', (a0, a1, a2), axis=-1) OpArgMngr.add_workload('concatenate', (a0.T, a1.T, a2.T), axis=0) - out = np.empty(4, np.float32) + out = np.empty(4, dtype=np.float32) OpArgMngr.add_workload('concatenate', (np.array([1, 2]), np.array([3, 4])), out=out) OpArgMngr.add_workload('concatenate', [array_pool['4x1'], array_pool['4x1']], axis=None) OpArgMngr.add_workload('concatenate', (np.arange(4).reshape((2, 2)), np.arange(4).reshape((2, 2))), axis=None) @@ -1324,17 +1324,17 @@ def _add_workload_var(array_pool): def _add_workload_zeros_like(array_pool): OpArgMngr.add_workload('zeros_like', array_pool['4x1']) - OpArgMngr.add_workload('zeros_like', np.random.uniform(size=(3, 3)).astype(np.float64), np.int64) - OpArgMngr.add_workload('zeros_like', np.random.uniform(size=(3, 3)).astype(np.float32), np.float64) - OpArgMngr.add_workload('zeros_like', np.random.randint(2, size = (3, 3)), int) + OpArgMngr.add_workload('zeros_like', np.random.uniform(size=(3, 3)).astype(np.float64), dtype=np.int64) + OpArgMngr.add_workload('zeros_like', np.random.uniform(size=(3, 3)).astype(np.float32), dtype=np.float64) + OpArgMngr.add_workload('zeros_like', np.random.randint(2, size = (3, 3)), dtype=int) def _add_workload_full_like(array_pool): OpArgMngr.add_workload('full_like', array_pool['4x1'], 1) OpArgMngr.add_workload('full_like', np.random.uniform(low=0, high=100, size=(1,3,4), dtype='float64'), 1) - OpArgMngr.add_workload('full_like', np.random.uniform(low=0, high=100, size=(9,3,1)), 2, np.int64) + OpArgMngr.add_workload('full_like', np.random.uniform(low=0, high=100, size=(9,3,1)), 2, dtype=np.int64) OpArgMngr.add_workload('full_like', np.random.uniform(low=0, high=100, size=(9,3)), np.nan) - OpArgMngr.add_workload('full_like', np.random.uniform(low=0, high=100, size=(2,0)), 0, np.float32) + OpArgMngr.add_workload('full_like', np.random.uniform(low=0, high=100, size=(2,0)), 0, dtype=np.float32) def _add_workload_outer(): diff --git 
a/tests/python/unittest/test_numpy_ndarray.py b/tests/python/unittest/test_numpy_ndarray.py index 332328d8c365..c107ca0a2d83 100644 --- a/tests/python/unittest/test_numpy_ndarray.py +++ b/tests/python/unittest/test_numpy_ndarray.py @@ -114,7 +114,7 @@ def __init__(self, shape, dtype=None): self._dtype = dtype def forward(self, x, *args, **kwargs): - return x + np.zeros(shape, dtype) + return x + np.zeros(shape, dtype=dtype) class TestZerosOutputType(HybridBlock): def forward(self, x, *args, **kwargs): From a7a671b9a70a6c910919ca5d354d82db438633d5 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Mon, 13 Sep 2021 22:32:11 -0700 Subject: [PATCH 15/41] update --- python/mxnet/util.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/mxnet/util.py b/python/mxnet/util.py index 6d942e8babe8..765a63080ef0 100644 --- a/python/mxnet/util.py +++ b/python/mxnet/util.py @@ -1287,4 +1287,3 @@ def dtype_from_number(number): return _np.float64 else: return _np.float64 if is_np_default_dtype() else _np.float32 - From cc01d7740acf396dd26b13fb6b04b093dacbc59c Mon Sep 17 00:00:00 2001 From: barry-jin Date: Wed, 15 Sep 2021 13:38:47 -0700 Subject: [PATCH 16/41] skip signature standardization --- python/mxnet/numpy/multiarray.py | 29 ++++++++-------- python/mxnet/util.py | 13 ------- src/operator/numpy/np_init_op.cc | 14 ++++---- src/operator/numpy/np_init_op.h | 58 ++++++++++++++------------------ 4 files changed, 47 insertions(+), 67 deletions(-) diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index 1ef024d8bc02..999db4d71ce6 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -46,7 +46,7 @@ from ..context import Context from ..util import set_module, wrap_np_unary_func, wrap_np_binary_func,\ is_np_default_dtype, wrap_data_api_creation_func,\ - numpy_eye_standardized, dtype_from_number + dtype_from_number from ..context import current_context from ..ndarray import numpy as _mx_nd_np from ..ndarray.numpy import _internal as _npi @@ -2498,7 +2498,7 @@ def tostype(self, stype): @set_module('mxnet.numpy') @wrap_data_api_creation_func -def empty(shape, *, dtype=None, order='C', device=None): # pylint: disable=redefined-outer-name +def empty(shape, dtype=None, order='C', device=None): # pylint: disable=redefined-outer-name """Return a new array of given shape and type, without initializing entries. Parameters @@ -2660,7 +2660,7 @@ def shape(a): @set_module('mxnet.numpy') @wrap_data_api_creation_func -def zeros(shape, *, dtype=None, order='C', device=None): # pylint: disable=redefined-outer-name +def zeros(shape, dtype=None, order='C', device=None): # pylint: disable=redefined-outer-name """Return a new array of given shape and type, filled with zeros. This function currently only supports storing multi-dimensional data in row-major (C-style). @@ -2705,7 +2705,7 @@ def zeros(shape, *, dtype=None, order='C', device=None): # pylint: disable=rede @set_module('mxnet.numpy') @wrap_data_api_creation_func -def ones(shape, *, dtype=None, order='C', device=None): # pylint: disable=redefined-outer-name +def ones(shape, dtype=None, order='C', device=None): # pylint: disable=redefined-outer-name """Return a new array of given shape and type, filled with ones. This function currently only supports storing multi-dimensional data in row-major (C-style). 
@@ -2783,7 +2783,7 @@ def broadcast_to(array, shape): # pylint: disable=redefined-outer-name # pylint: disable=too-many-arguments, redefined-outer-name @set_module('mxnet.numpy') @wrap_data_api_creation_func -def full(shape, fill_value, *, dtype=None, order='C', device=None, out=None): +def full(shape, fill_value, dtype=None, order='C', device=None, out=None): r"""Return a new array of given shape and type, filled with `fill_value`. Parameters @@ -2844,7 +2844,7 @@ def full(shape, fill_value, *, dtype=None, order='C', device=None, out=None): # pylint: disable=redefined-outer-name @set_module('mxnet.numpy') @wrap_data_api_creation_func -def empty_like(prototype, /, *, dtype=None, device=None, order='C', subok=False, shape=None): # pylint: disable=W0621 +def empty_like(prototype, dtype=None, device=None, order='C', subok=False, shape=None): # pylint: disable=W0621 """ Return a new array with the same shape and type as a given array. @@ -5550,8 +5550,7 @@ def histogram(a, bins=10, range=None, normed=None, weights=None, density=None): # pylint: disable=redefined-outer-name @set_module('mxnet.numpy') @wrap_data_api_creation_func -@numpy_eye_standardized -def eye(N, M=None, /, *, k=0, dtype=None, device=None, **kwargs): +def eye(N, M=None, k=0, dtype=None, device=None, **kwargs): """ Return a 2-D array with ones on the diagonal and zeros elsewhere. @@ -5596,7 +5595,7 @@ def eye(N, M=None, /, *, k=0, dtype=None, device=None, **kwargs): # pylint: disable=redefined-outer-name @set_module('mxnet.numpy') @wrap_data_api_creation_func -def linspace(start, stop, /, num=50, *, endpoint=True, retstep=False, dtype=None, axis=0, device=None): # pylint: disable=too-many-arguments +def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis=0, device=None): # pylint: disable=too-many-arguments r""" Return evenly spaced numbers over a specified interval. @@ -6280,7 +6279,7 @@ def triu(m, k=0): @set_module('mxnet.numpy') @wrap_data_api_creation_func -def arange(start, /, stop=None, step=1, *, dtype=None, device=None): +def arange(start, stop=None, step=1, dtype=None, device=None): """Return evenly spaced values within a given interval. Values are generated within the half-open interval ``[start, stop)`` @@ -10663,7 +10662,7 @@ def interp(x, xp, fp, left=None, right=None, period=None): # pylint: disable=to # pylint: disable=redefined-outer-name @set_module('mxnet.numpy') @wrap_data_api_creation_func -def full_like(a, /, fill_value, *, dtype=None, order='C', device=None, out=None): # pylint: disable=too-many-arguments +def full_like(a, fill_value, dtype=None, order='C', device=None, out=None): # pylint: disable=too-many-arguments """ Return a full array with the same shape and type as a given array. @@ -10722,7 +10721,7 @@ def full_like(a, /, fill_value, *, dtype=None, order='C', device=None, out=None) # pylint: disable=redefined-outer-name @set_module('mxnet.numpy') @wrap_data_api_creation_func -def zeros_like(a, /, *, dtype=None, order='C', device=None, out=None): +def zeros_like(a, dtype=None, order='C', device=None, out=None): """ Return an array of zeros with the same shape and type as a given array. 
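Dropping the '/' and '*' markers only changes how the creation functions may be called: with the standardized signature dtype had to be passed by keyword, while the relaxed signature accepts it positionally again. A small stand-alone illustration of the difference, using hypothetical stand-ins rather than mxnet code:

def zeros_like_standardized(a, /, *, dtype=None):   # keyword-only dtype
    return dtype

def zeros_like_relaxed(a, dtype=None):               # dtype may be positional
    return dtype

zeros_like_relaxed([0, 0], "float32")                # OK
zeros_like_standardized([0, 0], dtype="float32")     # OK
# zeros_like_standardized([0, 0], "float32")         # TypeError: dtype is keyword-only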
@@ -10783,7 +10782,7 @@ def zeros_like(a, /, *, dtype=None, order='C', device=None, out=None): # pylint: disable=redefined-outer-name @set_module('mxnet.numpy') @wrap_data_api_creation_func -def ones_like(a, /, *, dtype=None, order='C', device=None, out=None): +def ones_like(a, dtype=None, order='C', device=None, out=None): """ Return an array of ones with the same shape and type as a given array. @@ -12318,7 +12317,7 @@ def sum(a, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=N # pylint: disable=redefined-outer-name @set_module('mxnet.numpy') @wrap_data_api_creation_func -def asarray(obj, /, *, dtype=None, device=None, copy=None): +def asarray(obj, dtype=None, device=None, copy=None): """ Convert the input to an array. @@ -12380,7 +12379,7 @@ def asarray(obj, /, *, dtype=None, device=None, copy=None): # pylint: disable=redefined-outer-name @set_module('mxnet.numpy') @wrap_data_api_creation_func -def from_dlpack(x, /): +def from_dlpack(x): """ Returns a np.ndarray backed by a dlpack tensor. diff --git a/python/mxnet/util.py b/python/mxnet/util.py index 765a63080ef0..3ffca9246b4b 100644 --- a/python/mxnet/util.py +++ b/python/mxnet/util.py @@ -674,19 +674,6 @@ def _wrap_api_creation_func(*args, **kwargs): return _wrap_api_creation_func -def numpy_eye_standardized(func): - """decorator for numpy.eye operator because k is positional - arg in numpy while key-word in api standard - """ - @functools.wraps(func) - def _wrap_numpy_eye_func(*args, **kwargs): - if len(args) > 2: - kwargs["k"] = args[2] - args = args[0:2] - return func(*args, **kwargs) - return _wrap_numpy_eye_func - - # pylint: disable=exec-used def numpy_fallback(func): """decorator for falling back to offical numpy for a specific function""" diff --git a/src/operator/numpy/np_init_op.cc b/src/operator/numpy/np_init_op.cc index 528fa0769460..18421667b3cf 100644 --- a/src/operator/numpy/np_init_op.cc +++ b/src/operator/numpy/np_init_op.cc @@ -205,13 +205,13 @@ NNVM_REGISTER_OP(_npi_full_like) .add_arguments(FullLikeOpParam::__FIELDS__()); NNVM_REGISTER_OP(_npi_full) - .describe("fill target with a scalar value") - .set_num_inputs(0) - .set_num_outputs(1) - .set_attr_parser(ParamParser) - .set_attr("FInferShape", InitShape) - .set_attr("FInferType", InitNumpyType) - .set_attr("FCompute", NumpyInitFillWithScalarCompute) + .describe("fill target with a scalar value") + .set_num_inputs(0) + .set_num_outputs(1) + .set_attr_parser(ParamParser) + .set_attr("FInferShape", InitShape) + .set_attr("FInferType", InitNumpyType) + .set_attr("FCompute", NumpyInitFillWithScalarCompute) .add_arguments(NumpyInitOpWithScalarParam::__FIELDS__()); NNVM_REGISTER_OP(_npi_arange) diff --git a/src/operator/numpy/np_init_op.h b/src/operator/numpy/np_init_op.h index 6c419fd4563c..3cdf3e1ff35c 100644 --- a/src/operator/numpy/np_init_op.h +++ b/src/operator/numpy/np_init_op.h @@ -133,12 +133,12 @@ struct NumpyInitOpWithScalarParam : public dmlc::Parameter { - double start_double; - double stop_double; - int64_t start_int; - int64_t stop_int; - uint64_t start_uint; - uint64_t stop_uint; + double start_double = 0.0; + double stop_double = 0.0; + int64_t start_int = 0; + int64_t stop_int = 0; + uint64_t start_uint = 0; + uint64_t stop_uint = 0; index_t num; bool endpoint; std::string ctx; @@ -177,22 +177,16 @@ struct NumpyLinspaceParam : public dmlc::Parameter { int value_type; DMLC_DECLARE_PARAMETER(NumpyLinspaceParam) { DMLC_DECLARE_FIELD(start_double) - .set_default(0.0) .describe("The double type starting value of the sequence."); 
DMLC_DECLARE_FIELD(stop_double) - .set_default(0.0) .describe("The double type ending value of the sequence"); DMLC_DECLARE_FIELD(start_int) - .set_default(0) .describe("The int type starting value of the sequence."); DMLC_DECLARE_FIELD(stop_int) - .set_default(0) .describe("The int type ending value of the sequence"); DMLC_DECLARE_FIELD(start_uint) - .set_default(0) .describe("The unsigned int type starting value of the sequence."); DMLC_DECLARE_FIELD(stop_uint) - .set_default(0) .describe("The unsigned int type ending value of the sequence"); DMLC_DECLARE_FIELD(num) .describe("Number of samples to generate. Must be non-negative."); @@ -226,15 +220,15 @@ struct NumpyLinspaceParam : public dmlc::Parameter { endpoint_s << endpoint; dtype_s << dtype; (*dict)["start_double"] = start_double_s.str(); - (*dict)["stop_double"] = stop_double_s.str(); - (*dict)["start_int"] = start_int_s.str(); - (*dict)["stop_int"] = stop_int_s.str(); - (*dict)["start_uint"] = start_uint_s.str(); - (*dict)["stop_uint"] = stop_uint_s.str(); - (*dict)["value_type"] = value_type_s.str(); - (*dict)["num"] = num_s.str(); - (*dict)["endpoint"] = endpoint_s.str(); - (*dict)["dtype"] = MXNetTypeWithBool2String(dtype); + (*dict)["stop_double"] = stop_double_s.str(); + (*dict)["start_int"] = start_int_s.str(); + (*dict)["stop_int"] = stop_int_s.str(); + (*dict)["start_uint"] = start_uint_s.str(); + (*dict)["stop_uint"] = stop_uint_s.str(); + (*dict)["value_type"] = value_type_s.str(); + (*dict)["num"] = num_s.str(); + (*dict)["endpoint"] = endpoint_s.str(); + (*dict)["dtype"] = MXNetTypeWithBool2String(dtype); } }; @@ -281,8 +275,8 @@ void NumpyLinspaceCompute(const nnvm::NodeAttrs& attrs, index_t step_num = param.endpoint ? param.num - 1 : param.num; if (param.value_type == 0) { int64_t start = param.start_int; - int64_t stop = param.stop_int; - double step = step_num > 0 ? \ + int64_t stop = param.stop_int; + double step = step_num > 0 ? \ (static_cast(stop) - static_cast(start)) / step_num : 0.0f; Kernel::Launch(s, outputs[0].Size(), @@ -295,8 +289,8 @@ void NumpyLinspaceCompute(const nnvm::NodeAttrs& attrs, outputs[0].dptr()); } else if (param.value_type == 1) { uint64_t start = param.start_uint; - uint64_t stop = param.stop_uint; - double step = step_num > 0 ? \ + uint64_t stop = param.stop_uint; + double step = step_num > 0 ? \ (static_cast(stop) - static_cast(start)) / step_num : 0.0f; Kernel::Launch(s, outputs[0].Size(), @@ -309,8 +303,8 @@ void NumpyLinspaceCompute(const nnvm::NodeAttrs& attrs, outputs[0].dptr()); } else { double start = param.start_double; - double stop = param.stop_double; - double step = step_num > 0 ? (stop - start) / step_num : 0.0f; + double stop = param.stop_double; + double step = step_num > 0 ? 
(stop - start) / step_num : 0.0f; Kernel::Launch(s, outputs[0].Size(), outputs[0].Size(), From 6c2f48c1b47d5dc20ea18b4d1120e6af343ece01 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Wed, 15 Sep 2021 14:54:31 -0700 Subject: [PATCH 17/41] fix lint --- python/mxnet/numpy/multiarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index 999db4d71ce6..42935e4e43f2 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -2841,7 +2841,7 @@ def full(shape, fill_value, dtype=None, order='C', device=None, out=None): # pylint: enable=too-many-arguments, redefined-outer-name -# pylint: disable=redefined-outer-name +# pylint: disable=redefined-outer-name, too-many-arguments @set_module('mxnet.numpy') @wrap_data_api_creation_func def empty_like(prototype, dtype=None, device=None, order='C', subok=False, shape=None): # pylint: disable=W0621 From b78365eee3a371d23192365a074d9ec2832d314a Mon Sep 17 00:00:00 2001 From: barry-jin Date: Thu, 16 Sep 2021 11:21:21 -0700 Subject: [PATCH 18/41] update --- python/mxnet/ndarray/numpy/_op.py | 16 +++------------- python/mxnet/numpy/multiarray.py | 2 +- python/mxnet/util.py | 4 ++-- 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index 05a0e394670d..0d50a1861c97 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -20,8 +20,7 @@ """Namespace for numpy operators used in Gluon dispatched by F=ndarray.""" import numpy as _np -from ...base import numeric_types, integer_types,\ - _MAX_VALUE_64_BIT_SIGNED_, _MAX_VALUE_FLOAT32_REPRESENT_ +from ...base import numeric_types, integer_types from ...util import _sanity_check_params, set_module from ...util import wrap_np_unary_func, wrap_np_binary_func from ...util import is_np_default_dtype, dtype_from_number @@ -384,7 +383,7 @@ def full(shape, fill_value, dtype=None, order='C', ctx=None, out=None): # pylin if isinstance(fill_value, bool): fill_value = int(fill_value) dtype = _np.bool if dtype is None else dtype - elif isinstance(fill_value, (integer_types, numeric_types)): + elif isinstance(fill_value, numeric_types): if dtype is None or dtype is float: dtype = dtype_from_number(fill_value) if dtype is not None and not isinstance(dtype, str): @@ -1973,16 +1972,7 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis if dtype is not None and not isinstance(dtype, str): dtype = _np.dtype(dtype).name if dtype is None: - if isinstance(start, numeric_types) or isinstance(stop, numeric_types): - if abs(start) > _MAX_VALUE_FLOAT32_REPRESENT_ or \ - abs(stop) > _MAX_VALUE_FLOAT32_REPRESENT_: - dtype = _np.float64 - else: - dtype = _np.float64 if is_np_default_dtype() else _np.float32 - elif start > _MAX_VALUE_64_BIT_SIGNED_ or stop > _MAX_VALUE_64_BIT_SIGNED_: - dtype = _np.uint64 - else: - dtype = _np.int64 + dtype = _np.float64 if is_np_default_dtype() else _np.float32 if retstep: step = (stop - start) / (num - 1) return _api_internal.linspace(start, stop, num, endpoint, ctx, dtype), step diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index 42935e4e43f2..4e25fb2be1cc 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -12365,7 +12365,7 @@ def asarray(obj, dtype=None, device=None, copy=None): array([[0, 6], [1, 7]]) """ - if isinstance(obj, (integer_types, numeric_types)): + if isinstance(obj, 
numeric_types): dtype = dtype_from_number(obj) if dtype is None else dtype obj = _np.asarray(obj, dtype=dtype) elif isinstance(obj, _np.ndarray): diff --git a/python/mxnet/util.py b/python/mxnet/util.py index 3ffca9246b4b..d3315404fd4d 100644 --- a/python/mxnet/util.py +++ b/python/mxnet/util.py @@ -1257,8 +1257,8 @@ def set_flush_denorms(value): def dtype_from_number(number): """Get the data type from the given int or float number """ - assert isinstance(number, (integer_types, numeric_types)),\ - "The input number should be either integer for float types" + assert isinstance(number, numeric_types),\ + "The input number should be either int for float types" import numpy as _np if isinstance(number, integer_types): if number > _MAX_VALUE_64_BIT_UNSIGNED_: From 4a798daff11aa0301633d13b93621a60c0235668 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Fri, 17 Sep 2021 17:03:36 -0700 Subject: [PATCH 19/41] rm test_contants --- ci/docker/runtime_functions.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 2056e1bc7524..c4a47262fbca 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -820,7 +820,6 @@ unittest_array_api_standardization() { python3 -m pytest --durations=50 --cov-report xml:tests_api.xml --verbose \ array_api_tests/test_type_promotion.py::test_elementwise_function_two_arg_bool_type_promotion python3 -m pytest --durations=50 --cov-report xml:tests_api.xml --verbose array_api_tests/test_creation_functions.py - python3 -m pytest --durations=50 --cov-report xml:tests_api.xml --verbose array_api_tests/test_constants.py python3 -m pytest --durations=50 --cov-report xml:tests_api.xml --verbose array_api_tests/test_indexing.py popd } From b09814c7b328a821ef65a063d4d3439ff480e6bf Mon Sep 17 00:00:00 2001 From: barry-jin Date: Mon, 27 Sep 2021 10:20:01 -0700 Subject: [PATCH 20/41] Add Code Signing Key --- KEYS | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/KEYS b/KEYS index 2ec1c7ddab0f..1985d9a0ed25 100644 --- a/KEYS +++ b/KEYS @@ -1016,3 +1016,62 @@ mpeZ4OrvobSYYrFcMxHwzbFKqS1tbpbPHHYLjIrvl50rhrWgUaIrV4uZJSEQHl76 d+K3IWKQXHwuFIHhHcT5CI1c/2aL1hVhGrhexBQC =TIzk -----END PGP PUBLIC KEY BLOCK----- +pub rsa4096 2021-09-27 [SC] + 61C9B12002DA9BA1A44D02683A9F7FE20D4D3DEA +uid [ultimate] Zhenghui Jin (CODE SIGNING KEY) +sig 3 3A9F7FE20D4D3DEA 2021-09-27 Zhenghui Jin (CODE SIGNING KEY) +sub rsa4096 2021-09-27 [E] +sig 3A9F7FE20D4D3DEA 2021-09-27 Zhenghui Jin (CODE SIGNING KEY) + +-----BEGIN PGP PUBLIC KEY BLOCK----- + +mQINBGFR+9sBEACwc+Ozt7MSwkvz/JBBHJOMWzDgTZYR+kWmkZiX5PgYEeV0EkIJ +blsmr10o/pDstnABH3jMpMTbAlF8cxh6M2+zBj+oLiAtJ0ToIgCWiIg9f1KETp76 +4uz5xzPHtZEVd3EpO7DdsltgiAwa71rEJY4AO5nNs3INbr/6bk5xgeEj1GgIG1Jt +Gn1iVcvxTJh+g/1X3jW47UphHRzzeeJprT98xXKB5g2kdQVcRx62NdEhWJ2JjKY/ +EsBj3Fj/Jm4qJJ0Guj7/ovpk0E2QAt9kN6iAXDaBbFFP0WUPFaQRB5Wqdv9S4opz +P7CnGX+rhYH/A2HBuV1wAhkAQYmRdW37SX1VnaX1Rwyh4Yzr/UwmAx6ZXXCk55kk +H3XqZNSlq73s/s8gGfKiMw1DS3ZCCUE+lZH6nUJJV0/onBYWcu8pSJALSxPUDywm +1YcBnjRTzzBq1XV0tisy5Lzm6SMAbRUb876Pfz97T/3stT2mH8u+VVxESlCg98U1 +JUbM2z8XODF9A+xlVZfaCJF4LTeBWnsPg89D9u9v1v603C9bVZ99xoyqS2Q5mose +PQghCoH7ipvrGnHchN0gGecleVrZ5IO4TWSqloOtOzdr+5PyRQ86UzfJUXBaVZsH +utKo1YBVuMMNHyedyLEmeFCoW5X5v5znQMH9cP6B9PELKEErjSLvHhUXbQARAQAB +tDhaaGVuZ2h1aSBKaW4gKENPREUgU0lHTklORyBLRVkpIDx6aGVuZ2h1aWppbkBh +cGFjaGUub3JnPokCTgQTAQgAOBYhBGHJsSAC2puhpE0CaDqff+INTT3qBQJhUfvb +AhsDBQsJCAcCBhUKCQgLAgQWAgMBAh4BAheAAAoJEDqff+INTT3qBocP/32Nay7R 
+y1o8fq+EbKjduw98vvLkJU4+JCJ9/gVZigXHHB/ng0tkY0U3aKjASwfg7KmNcYPZ +qMj2r1Sru5pkqRwBusrl72KtyJUoX7HUQL4BegGv4NAJ/TemzpNer5Gjui2j1QfC +mY7mIdcL7fz/JcBICGxb98ogzx5b+0fHDvkn+iEjhJ0DlouL30h7uDEISYNrEuax +PPStIJTHBzFPEbPd0dhk7O2RzmPsI/BhphrUZYvkwnd39PPjWfq3axs7QaixMdMN +ZOMsY3+/0DFd8l2vUVKxwkUJ8jaZSc6FgkODD2MvLQKOcghujBLp31+xnV/y79MN +r9RyIJBwvjvBtZb+e7xMqBtygkR8SdakcrF+EhH7P/jloA7SxQFp8TpP+L1Psxt2 +b6XLbBO8rg65zQfl28DwcBA+f23VaN+P+CtGHF+ryPG1MRwisqoisjfqaH9TKE28 +5lqDgT+FKphjnuJRimAa5k2VzowI4QuY4Oq6b4ZOcuSiY94fGaCgFaTyF/qgqcPZ +pqwNpzS45/sAFkWLhBU/70QHIlF7c5BR9JNurXZhvu99iv8QdaHpR9ZlYH0GVdpZ +iMgTADhFvR4Tn9NqUnyYJ42hEoqSeueUx/ORYwRyII9m+XP+S4UAg4hSARcsK/SS +0S3wCx2aaXSF1tra4+igsm0QUoAEd2zrPkEfuQINBGFR+9sBEADhlb8YsCMt977e +eBU2IqQoFhj9dA0iNTPGLCW4jrtjL7RrurVg/WXzqAib5PU5nVxVKC4QUWauD5s9 +8fTOL2HcQiy6089yGoIjwKC4+30aQAULVT3wxAwzRmsM//QHryhkZUc5v8ty/GxR +78T6UOS+6Wvb4B+jjpcTPsjohxI2JoaSfFotCFT8t8e+Qrvu5xPrEoXUDwS1VUUG +omvJO7Zq7jXu5FVx6TFoBz/zPuaYHTlJR1h5kKnT/DBSid9G2feBOhyJUKcbhQGX +KpIyhkfexEvSj9j0OESExs5yCfPa7UjKwFD7hO5atylNYs1yCY9nWYm+Obi4pjHH +6AlbM3La/50waRtHb3q6j8E4QozJBaiturFNAebAhAACLLDRCG9llJPmb6gkwbzy +XnCQdaQdQDLEjqi1Z51/tTdNUiR1aeXfafkjPREJPwkcuGoUR5arxvRfzO27YOGT +T51C8rTbOsXtmmeAJxrdlerJg0ODsc5J3uFnX/KrejroS4XvVNj0WjwipHRoLFP9 +glCIp62U0gJAWAm4rpoT+/Ku3w+hvn2stKg7dbPV94wJiSXxFKoJJC46WcQn8rCc +ajSO49K0x+rc0zvgTNP4WjMNUb928AzwvlnpFtGoVlGIaQfDUt0fx0GEtncDRSEN +jwIfBjnAn3nKtn9MOzQaRdBUUEoCswARAQABiQI2BBgBCAAgFiEEYcmxIALam6Gk +TQJoOp9/4g1NPeoFAmFR+9sCGwwACgkQOp9/4g1NPeoHzw//SkVWW06Lj8RCbvaE +XwwydvV+6i1cMHjyOUJpSHDMDD55iFZMDoHiu+PTu+B5vwyYrgXzx/7SMxC1Solh +GdR2v1Er3jc7OJSatZuJvybAIQzHQbloXqQtjq8JYPEuP50s9gxZ23oAheaZlpfi +ye0hhl4aBjs0G0jJHS43ll7cD5/RahVsHh2ffPyAgww5Y2eGenHoXDcv/XYbkh7q +D+bVYx+dFB1x1hzds2Va9YbgPAl/VrcJDWum8wMr2QGV22m/HHffiBL3aLRd2PpK +DUeylBmU6qZJVN+bwoPWEek/g83G+7XTS1GYA2iKworTxqqCh+XMp5eXJFlUrX0t +FGjLVBuk5/3MgXeaJLzZUq3vTlCbghsXvVxLYuVVDREA2/hj8vh8JbZUrSc1w5Fq +wEbrFVzwvKydYx9GGpxG2GhWeFOncE46/P7hS8JGM2TMUAfDCULJCNBKpszK/vBs +ehjb47SjbwJwS6bBW4gaeTEBcnTtuR8ZTugQ8X+6FNVBbBHLjeiz7SOd+RAPqsEE +kMHW4da5KWNKDOGkuUul96mGl1H1PebLInMVzcs7UfOvOGm6Hd1W0BJbjfFPIEKJ +XaqZByJCnaJAEUESQ6Dt7oOAWu1WBJaQb82oBvkDAoEKz/6nBM+yDx/9SrTRTIc7 +fF1nH1ZRQw06zoBoIyvQuM4vykg= +=lW1K +-----END PGP PUBLIC KEY BLOCK----- From c9fe8893d8499f559d2c9648c61cdbf7b2d7c022 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Mon, 27 Sep 2021 10:21:10 -0700 Subject: [PATCH 21/41] Revert "Add Code Signing Key" This reverts commit b09814c7b328a821ef65a063d4d3439ff480e6bf. 
--- KEYS | 59 ----------------------------------------------------------- 1 file changed, 59 deletions(-) diff --git a/KEYS b/KEYS index 1985d9a0ed25..2ec1c7ddab0f 100644 --- a/KEYS +++ b/KEYS @@ -1016,62 +1016,3 @@ mpeZ4OrvobSYYrFcMxHwzbFKqS1tbpbPHHYLjIrvl50rhrWgUaIrV4uZJSEQHl76 d+K3IWKQXHwuFIHhHcT5CI1c/2aL1hVhGrhexBQC =TIzk -----END PGP PUBLIC KEY BLOCK----- -pub rsa4096 2021-09-27 [SC] - 61C9B12002DA9BA1A44D02683A9F7FE20D4D3DEA -uid [ultimate] Zhenghui Jin (CODE SIGNING KEY) -sig 3 3A9F7FE20D4D3DEA 2021-09-27 Zhenghui Jin (CODE SIGNING KEY) -sub rsa4096 2021-09-27 [E] -sig 3A9F7FE20D4D3DEA 2021-09-27 Zhenghui Jin (CODE SIGNING KEY) - ------BEGIN PGP PUBLIC KEY BLOCK----- - -mQINBGFR+9sBEACwc+Ozt7MSwkvz/JBBHJOMWzDgTZYR+kWmkZiX5PgYEeV0EkIJ -blsmr10o/pDstnABH3jMpMTbAlF8cxh6M2+zBj+oLiAtJ0ToIgCWiIg9f1KETp76 -4uz5xzPHtZEVd3EpO7DdsltgiAwa71rEJY4AO5nNs3INbr/6bk5xgeEj1GgIG1Jt -Gn1iVcvxTJh+g/1X3jW47UphHRzzeeJprT98xXKB5g2kdQVcRx62NdEhWJ2JjKY/ -EsBj3Fj/Jm4qJJ0Guj7/ovpk0E2QAt9kN6iAXDaBbFFP0WUPFaQRB5Wqdv9S4opz -P7CnGX+rhYH/A2HBuV1wAhkAQYmRdW37SX1VnaX1Rwyh4Yzr/UwmAx6ZXXCk55kk -H3XqZNSlq73s/s8gGfKiMw1DS3ZCCUE+lZH6nUJJV0/onBYWcu8pSJALSxPUDywm -1YcBnjRTzzBq1XV0tisy5Lzm6SMAbRUb876Pfz97T/3stT2mH8u+VVxESlCg98U1 -JUbM2z8XODF9A+xlVZfaCJF4LTeBWnsPg89D9u9v1v603C9bVZ99xoyqS2Q5mose -PQghCoH7ipvrGnHchN0gGecleVrZ5IO4TWSqloOtOzdr+5PyRQ86UzfJUXBaVZsH -utKo1YBVuMMNHyedyLEmeFCoW5X5v5znQMH9cP6B9PELKEErjSLvHhUXbQARAQAB -tDhaaGVuZ2h1aSBKaW4gKENPREUgU0lHTklORyBLRVkpIDx6aGVuZ2h1aWppbkBh -cGFjaGUub3JnPokCTgQTAQgAOBYhBGHJsSAC2puhpE0CaDqff+INTT3qBQJhUfvb -AhsDBQsJCAcCBhUKCQgLAgQWAgMBAh4BAheAAAoJEDqff+INTT3qBocP/32Nay7R -y1o8fq+EbKjduw98vvLkJU4+JCJ9/gVZigXHHB/ng0tkY0U3aKjASwfg7KmNcYPZ -qMj2r1Sru5pkqRwBusrl72KtyJUoX7HUQL4BegGv4NAJ/TemzpNer5Gjui2j1QfC -mY7mIdcL7fz/JcBICGxb98ogzx5b+0fHDvkn+iEjhJ0DlouL30h7uDEISYNrEuax -PPStIJTHBzFPEbPd0dhk7O2RzmPsI/BhphrUZYvkwnd39PPjWfq3axs7QaixMdMN -ZOMsY3+/0DFd8l2vUVKxwkUJ8jaZSc6FgkODD2MvLQKOcghujBLp31+xnV/y79MN -r9RyIJBwvjvBtZb+e7xMqBtygkR8SdakcrF+EhH7P/jloA7SxQFp8TpP+L1Psxt2 -b6XLbBO8rg65zQfl28DwcBA+f23VaN+P+CtGHF+ryPG1MRwisqoisjfqaH9TKE28 -5lqDgT+FKphjnuJRimAa5k2VzowI4QuY4Oq6b4ZOcuSiY94fGaCgFaTyF/qgqcPZ -pqwNpzS45/sAFkWLhBU/70QHIlF7c5BR9JNurXZhvu99iv8QdaHpR9ZlYH0GVdpZ -iMgTADhFvR4Tn9NqUnyYJ42hEoqSeueUx/ORYwRyII9m+XP+S4UAg4hSARcsK/SS -0S3wCx2aaXSF1tra4+igsm0QUoAEd2zrPkEfuQINBGFR+9sBEADhlb8YsCMt977e -eBU2IqQoFhj9dA0iNTPGLCW4jrtjL7RrurVg/WXzqAib5PU5nVxVKC4QUWauD5s9 -8fTOL2HcQiy6089yGoIjwKC4+30aQAULVT3wxAwzRmsM//QHryhkZUc5v8ty/GxR -78T6UOS+6Wvb4B+jjpcTPsjohxI2JoaSfFotCFT8t8e+Qrvu5xPrEoXUDwS1VUUG -omvJO7Zq7jXu5FVx6TFoBz/zPuaYHTlJR1h5kKnT/DBSid9G2feBOhyJUKcbhQGX -KpIyhkfexEvSj9j0OESExs5yCfPa7UjKwFD7hO5atylNYs1yCY9nWYm+Obi4pjHH -6AlbM3La/50waRtHb3q6j8E4QozJBaiturFNAebAhAACLLDRCG9llJPmb6gkwbzy -XnCQdaQdQDLEjqi1Z51/tTdNUiR1aeXfafkjPREJPwkcuGoUR5arxvRfzO27YOGT -T51C8rTbOsXtmmeAJxrdlerJg0ODsc5J3uFnX/KrejroS4XvVNj0WjwipHRoLFP9 -glCIp62U0gJAWAm4rpoT+/Ku3w+hvn2stKg7dbPV94wJiSXxFKoJJC46WcQn8rCc -ajSO49K0x+rc0zvgTNP4WjMNUb928AzwvlnpFtGoVlGIaQfDUt0fx0GEtncDRSEN -jwIfBjnAn3nKtn9MOzQaRdBUUEoCswARAQABiQI2BBgBCAAgFiEEYcmxIALam6Gk -TQJoOp9/4g1NPeoFAmFR+9sCGwwACgkQOp9/4g1NPeoHzw//SkVWW06Lj8RCbvaE -XwwydvV+6i1cMHjyOUJpSHDMDD55iFZMDoHiu+PTu+B5vwyYrgXzx/7SMxC1Solh -GdR2v1Er3jc7OJSatZuJvybAIQzHQbloXqQtjq8JYPEuP50s9gxZ23oAheaZlpfi -ye0hhl4aBjs0G0jJHS43ll7cD5/RahVsHh2ffPyAgww5Y2eGenHoXDcv/XYbkh7q -D+bVYx+dFB1x1hzds2Va9YbgPAl/VrcJDWum8wMr2QGV22m/HHffiBL3aLRd2PpK -DUeylBmU6qZJVN+bwoPWEek/g83G+7XTS1GYA2iKworTxqqCh+XMp5eXJFlUrX0t -FGjLVBuk5/3MgXeaJLzZUq3vTlCbghsXvVxLYuVVDREA2/hj8vh8JbZUrSc1w5Fq 
-wEbrFVzwvKydYx9GGpxG2GhWeFOncE46/P7hS8JGM2TMUAfDCULJCNBKpszK/vBs -ehjb47SjbwJwS6bBW4gaeTEBcnTtuR8ZTugQ8X+6FNVBbBHLjeiz7SOd+RAPqsEE -kMHW4da5KWNKDOGkuUul96mGl1H1PebLInMVzcs7UfOvOGm6Hd1W0BJbjfFPIEKJ -XaqZByJCnaJAEUESQ6Dt7oOAWu1WBJaQb82oBvkDAoEKz/6nBM+yDx/9SrTRTIc7 -fF1nH1ZRQw06zoBoIyvQuM4vykg= -=lW1K ------END PGP PUBLIC KEY BLOCK----- From 60ee272972f5d5a45ac38447c041ac95a8679728 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Mon, 11 Oct 2021 18:19:43 -0700 Subject: [PATCH 22/41] Replace context with device & update multiarray.py/_op.py --- python/mxnet/__init__.py | 3 +- python/mxnet/context.py | 285 +--------------------- python/mxnet/device.py | 297 +++++++++++++++++++++++ python/mxnet/ndarray/numpy/_op.py | 235 ++++++++++--------- python/mxnet/numpy/multiarray.py | 376 +++++++++++++++--------------- python/mxnet/util.py | 11 +- 6 files changed, 621 insertions(+), 586 deletions(-) create mode 100644 python/mxnet/device.py diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py index 7d5bff7f1e85..a89b8805e3ce 100644 --- a/python/mxnet/__init__.py +++ b/python/mxnet/__init__.py @@ -20,7 +20,8 @@ # coding: utf-8 """MXNet: a concise, fast and flexible framework for deep learning.""" -from .context import Context, current_context, cpu, gpu, cpu_pinned +from .context import Context, current_context +from .device import Device, current_device, cpu, gpu, cpu_pinned from . import engine, error from .base import MXNetError from .util import is_np_shape, set_np_shape, np_shape, use_np_shape diff --git a/python/mxnet/context.py b/python/mxnet/context.py index 5cd9c7fad766..b4c086cfeedb 100644 --- a/python/mxnet/context.py +++ b/python/mxnet/context.py @@ -15,283 +15,18 @@ # specific language governing permissions and limitations # under the License. """Context management API of mxnet.""" -import contextvars -import ctypes -from .base import _LIB -from .base import check_call +from warnings import warn +from .device import Device, _current -class Context: - """Constructs a context. - - MXNet can run operations on CPU and different GPUs. - A context describes the device type and ID on which computation should be carried on. - - One can use mx.cpu and mx.gpu for short. - - See also - ---------- - `How to run MXNet on multiple CPU/GPUs ` - for more details. - - Parameters - ---------- - device_type : {'cpu', 'gpu'} or Context. - String representing the device type. - - device_id : int (default=0) - The device id of the device, needed for GPU. - - Note - ---- - Context can also be used as a way to change the default context. - - Examples - -------- - >>> # array on cpu - >>> cpu_array = mx.nd.ones((2, 3)) - >>> # switch default context to GPU(2) - >>> with mx.Context(mx.gpu(2)): - ... gpu_array = mx.nd.ones((2, 3)) - >>> gpu_array.context - gpu(2) - - One can also explicitly specify the context when creating an array. - - >>> gpu_array = mx.nd.ones((2, 3), mx.gpu(1)) - >>> gpu_array.context - gpu(1) - """ - devtype2str = {1: 'cpu', 2: 'gpu', 3: 'cpu_pinned', 5: 'cpu_shared'} - devstr2type = {'cpu': 1, 'gpu': 2, 'cpu_pinned': 3, 'cpu_shared': 5} - def __init__(self, device_type, device_id=0): - if isinstance(device_type, Context): - self.device_typeid = device_type.device_typeid - self.device_id = device_type.device_id - else: - self.device_typeid = Context.devstr2type[device_type] - self.device_id = device_id - self._old_ctx = None - - @property - def device_type(self): - """Returns the device type of current context. 
- - Examples - ------- - >>> mx.context.current_context().device_type - 'cpu' - >>> mx.current_context().device_type - 'cpu' - - Returns - ------- - device_type : str - """ - return Context.devtype2str[self.device_typeid] - - def __hash__(self): - """Compute hash value of context for dictionary lookup""" - return hash((self.device_typeid, self.device_id)) - - def __eq__(self, other): - """Compares two contexts. Two contexts are equal if they - have the same device type and device id. - """ - return isinstance(other, Context) and \ - self.device_typeid == other.device_typeid and \ - self.device_id == other.device_id - - def __str__(self): - return '%s(%d)' % (self.device_type, self.device_id) - - def __repr__(self): - return self.__str__() - - def __enter__(self): - # Token can't be pickled and Token.old_value is Token.MISSING if _current.get() uses default value - self._old_ctx = _current.get() - _current.set(self) - return self - - def __exit__(self, ptype, value, trace): - _current.set(self._old_ctx) - - def empty_cache(self): - """Empties the memory cache for the current contexts device. - - MXNet utilizes a memory pool to avoid excessive allocations. - Calling empty_cache will empty the memory pool of the contexts - device. This will only free the memory of the unreferenced data. - - Examples - ------- - >>> ctx = mx.gpu(0) - >>> arr = mx.nd.ones((200,200), ctx=ctx) - >>> del arr - >>> ctx.empty_cache() # forces release of memory allocated for arr - """ - dev_type = ctypes.c_int(self.device_typeid) - dev_id = ctypes.c_int(self.device_id) - check_call(_LIB.MXStorageEmptyCache(dev_type, dev_id)) - - -def cpu(device_id=0): - """Returns a CPU context. - - This function is a short cut for ``Context('cpu', device_id)``. - For most operations, when no context is specified, the default context is `cpu()`. - - Examples - ---------- - >>> with mx.cpu(): - ... cpu_array = mx.nd.ones((2, 3)) - >>> cpu_array.context - cpu(0) - >>> cpu_array = mx.nd.ones((2, 3), ctx=mx.cpu()) - >>> cpu_array.context - cpu(0) - - Parameters - ---------- - device_id : int, optional - The device id of the device. `device_id` is not needed for CPU. - This is included to make interface compatible with GPU. - - Returns - ------- - context : Context - The corresponding CPU context. - """ - return Context('cpu', device_id) - - -def cpu_pinned(device_id=0): - """Returns a CPU pinned memory context. Copying from CPU pinned memory to GPU - is faster than from normal CPU memory. - - This function is a short cut for ``Context('cpu_pinned', device_id)``. - - Examples - ---------- - >>> with mx.cpu_pinned(): - ... cpu_array = mx.nd.ones((2, 3)) - >>> cpu_array.context - cpu_pinned(0) - >>> cpu_array = mx.nd.ones((2, 3), ctx=mx.cpu_pinned()) - >>> cpu_array.context - cpu_pinned(0) - - Parameters - ---------- - device_id : int, optional - The device id of the device. `device_id` is not needed for CPU. - This is included to make interface compatible with GPU. - - Returns - ------- - context : Context - The corresponding CPU pinned memory context. - """ - return Context('cpu_pinned', device_id) - - -def gpu(device_id=0): - """Returns a GPU context. - - This function is a short cut for Context('gpu', device_id). - The K GPUs on a node are typically numbered as 0,...,K-1. - - Examples - ---------- - >>> cpu_array = mx.nd.ones((2, 3)) - >>> cpu_array.context - cpu(0) - >>> with mx.gpu(1): - ... 
gpu_array = mx.nd.ones((2, 3)) - >>> gpu_array.context - gpu(1) - >>> gpu_array = mx.nd.ones((2, 3), ctx=mx.gpu(1)) - >>> gpu_array.context - gpu(1) - - Parameters - ---------- - device_id : int, optional - The device id of the device, needed for GPU. - - Returns - ------- - context : Context - The corresponding GPU context. - """ - return Context('gpu', device_id) - - -def num_gpus(): - """Query CUDA for the number of GPUs present. - - Raises - ------ - Will raise an exception on any CUDA error. - - Returns - ------- - count : int - The number of GPUs. - - """ - count = ctypes.c_int() - check_call(_LIB.MXGetGPUCount(ctypes.byref(count))) - return count.value - - -def gpu_memory_info(device_id=0): - """Query CUDA for the free and total bytes of GPU global memory. - - Parameters - ---------- - device_id : int, optional - The device id of the GPU device. - - Raises - ------ - Will raise an exception on any CUDA error. - - Returns - ------- - (free, total) : (int, int) - """ - free = ctypes.c_uint64() - total = ctypes.c_uint64() - dev_id = ctypes.c_int(device_id) - check_call(_LIB.MXGetGPUMemoryInformation64(dev_id, ctypes.byref(free), ctypes.byref(total))) - return (free.value, total.value) - - -_current = contextvars.ContextVar('namemanager', default=Context('cpu', 0)) - +def Context(*args, **kwargs): + """This class has been deprecated. Please refer to ``device.Device``.""" + warn('Directly use Context class to construct a device will be deprecated. ' + 'Please use Device class instead. ', DeprecationWarning) + return Device(*args, **kwargs) def current_context(): - """Returns the current context. - - By default, `mx.cpu()` is used for all the computations - and it can be overridden by using `with mx.Context(x)` statement where - x can be cpu(device_id) or gpu(device_id). - - Examples - ------- - >>> mx.current_context() - cpu(0) - >>> with mx.Context('gpu', 1): # Context changed in `with` block. - ... mx.current_context() # Computation done here will be on gpu(1). - ... - gpu(1) - >>> mx.current_context() # Back to default context. - cpu(0) - - Returns - ------- - default_ctx : Context - """ + """This function has been deprecated. Please refer to ``device.current_device``.""" + warn('Directly use current_context to get current device will be deprecated. ' + 'Please use current_device method instead. ', DeprecationWarning) return _current.get() diff --git a/python/mxnet/device.py b/python/mxnet/device.py new file mode 100644 index 000000000000..cf2c9a3af4ee --- /dev/null +++ b/python/mxnet/device.py @@ -0,0 +1,297 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Device management API of mxnet.""" +import contextvars +import ctypes +from .base import _LIB +from .base import check_call + + +class Device: + """Constructs a device structure. 
+ + MXNet can run operations on CPU and different GPUs. + A Device class describes the device type and ID on which computation should be carried on. + + One can use mx.cpu and mx.gpu for short. + + See also + ---------- + `How to run MXNet on multiple CPU/GPUs ` + for more details. + + Parameters + ---------- + device_type : {'cpu', 'gpu'} or Device. + String representing the device type. + + device_id : int (default=0) + The device id of the device, needed for GPU. + + Note + ---- + Device can also be used as a way to change the default device. + + Examples + -------- + >>> # array on cpu + >>> cpu_array = mx.np.ones((2, 3)) + >>> # switch default Device to GPU(2) + >>> with mx.Device(mx.gpu(2)): + ... gpu_array = mx.np.ones((2, 3)) + >>> gpu_array.device + gpu(2) + + One can also explicitly specify the device when creating an array. + + >>> gpu_array = mx.np.ones((2, 3), mx.gpu(1)) + >>> gpu_array.device + gpu(1) + """ + devtype2str = {1: 'cpu', 2: 'gpu', 3: 'cpu_pinned', 5: 'cpu_shared'} + devstr2type = {'cpu': 1, 'gpu': 2, 'cpu_pinned': 3, 'cpu_shared': 5} + def __init__(self, device_type, device_id=0): + if isinstance(device_type, Device): + self.device_typeid = device_type.device_typeid + self.device_id = device_type.device_id + else: + self.device_typeid = Device.devstr2type[device_type] + self.device_id = device_id + self._old_ctx = None + + @property + def device_type(self): + """Returns the device type of current device. + + Examples + ------- + >>> mx.device.current_device().device_type + 'cpu' + >>> mx.current_device().device_type + 'cpu' + + Returns + ------- + device_type : str + """ + return Device.devtype2str[self.device_typeid] + + def __hash__(self): + """Compute hash value of device for dictionary lookup""" + return hash((self.device_typeid, self.device_id)) + + def __eq__(self, other): + """Compares two devices. Two devices are equal if they + have the same device type and device id. + """ + return isinstance(other, Device) and \ + self.device_typeid == other.device_typeid and \ + self.device_id == other.device_id + + def __str__(self): + return '%s(%d)' % (self.device_type, self.device_id) + + def __repr__(self): + return self.__str__() + + def __enter__(self): + # Token can't be pickled and Token.old_value is Token.MISSING if _current.get() uses default value + self._old_ctx = _current.get() + _current.set(self) + return self + + def __exit__(self, ptype, value, trace): + _current.set(self._old_ctx) + + def empty_cache(self): + """Empties the memory cache for the current device. + + MXNet utilizes a memory pool to avoid excessive allocations. + Calling empty_cache will empty the memory pool of the + device. This will only free the memory of the unreferenced data. + + Examples + ------- + >>> ctx = mx.gpu(0) + >>> arr = mx.np.ones((200,200), ctx=ctx) + >>> del arr + >>> ctx.empty_cache() # forces release of memory allocated for arr + """ + dev_type = ctypes.c_int(self.device_typeid) + dev_id = ctypes.c_int(self.device_id) + check_call(_LIB.MXStorageEmptyCache(dev_type, dev_id)) + + +def cpu(device_id=0): + """Returns a CPU device. + + This function is a short cut for ``Device('cpu', device_id)``. + For most operations, when no device is specified, the default device is `cpu()`. + + Examples + ---------- + >>> with mx.cpu(): + ... cpu_array = mx.np.ones((2, 3)) + >>> cpu_array.device + cpu(0) + >>> cpu_array = mx.np.ones((2, 3), ctx=mx.cpu()) + >>> cpu_array.device + cpu(0) + + Parameters + ---------- + device_id : int, optional + The device id of the device. 
`device_id` is not needed for CPU.
+        This is included to make interface compatible with GPU.
+
+    Returns
+    -------
+    device : Device
+        The corresponding CPU device.
+    """
+    return Device('cpu', device_id)
+
+
+def cpu_pinned(device_id=0):
+    """Returns a CPU pinned memory device. Copying from CPU pinned memory to GPU
+    is faster than from normal CPU memory.
+
+    This function is a short cut for ``Device('cpu_pinned', device_id)``.
+
+    Examples
+    ----------
+    >>> with mx.cpu_pinned():
+    ...     cpu_array = mx.np.ones((2, 3))
+    >>> cpu_array.device
+    cpu_pinned(0)
+    >>> cpu_array = mx.np.ones((2, 3), ctx=mx.cpu_pinned())
+    >>> cpu_array.device
+    cpu_pinned(0)
+
+    Parameters
+    ----------
+    device_id : int, optional
+        The device id of the device. `device_id` is not needed for CPU.
+        This is included to make interface compatible with GPU.
+
+    Returns
+    -------
+    device : Device
+        The corresponding CPU pinned memory device.
+    """
+    return Device('cpu_pinned', device_id)
+
+
+def gpu(device_id=0):
+    """Returns a GPU device.
+
+    This function is a short cut for Device('gpu', device_id).
+    The K GPUs on a node are typically numbered as 0,...,K-1.
+
+    Examples
+    ----------
+    >>> cpu_array = mx.np.ones((2, 3))
+    >>> cpu_array.device
+    cpu(0)
+    >>> with mx.gpu(1):
+    ...     gpu_array = mx.np.ones((2, 3))
+    >>> gpu_array.device
+    gpu(1)
+    >>> gpu_array = mx.np.ones((2, 3), ctx=mx.gpu(1))
+    >>> gpu_array.device
+    gpu(1)
+
+    Parameters
+    ----------
+    device_id : int, optional
+        The device id of the device, needed for GPU.
+
+    Returns
+    -------
+    device : Device
+        The corresponding GPU device.
+    """
+    return Device('gpu', device_id)
+
+
+def num_gpus():
+    """Query CUDA for the number of GPUs present.
+
+    Raises
+    ------
+    Will raise an exception on any CUDA error.
+
+    Returns
+    -------
+    count : int
+        The number of GPUs.
+
+    """
+    count = ctypes.c_int()
+    check_call(_LIB.MXGetGPUCount(ctypes.byref(count)))
+    return count.value
+
+
+def gpu_memory_info(device_id=0):
+    """Query CUDA for the free and total bytes of GPU global memory.
+
+    Parameters
+    ----------
+    device_id : int, optional
+        The device id of the GPU device.
+
+    Raises
+    ------
+    Will raise an exception on any CUDA error.
+
+    Returns
+    -------
+    (free, total) : (int, int)
+    """
+    free = ctypes.c_uint64()
+    total = ctypes.c_uint64()
+    dev_id = ctypes.c_int(device_id)
+    check_call(_LIB.MXGetGPUMemoryInformation64(dev_id, ctypes.byref(free), ctypes.byref(total)))
+    return (free.value, total.value)
+
+
+_current = contextvars.ContextVar('namemanager', default=Device('cpu', 0))
+
+
+def current_device():
+    """Returns the current device.
+
+    By default, `mx.cpu()` is used for all the computations
+    and it can be overridden by using `with mx.Device(x)` statement where
+    x can be cpu(device_id) or gpu(device_id).
+
+    Examples
+    -------
+    >>> mx.current_device()
+    cpu(0)
+    >>> with mx.Device('gpu', 1):  # Device changed in `with` block.
+    ...     mx.current_device()  # Computation done here will be on gpu(1).
+    ...
+    gpu(1)
+    >>> mx.current_device()  # Back to default device.
+ cpu(0) + + Returns + ------- + default_device : Device + """ + return _current.get() diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index 0d50a1861c97..57f1f29de608 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -24,7 +24,7 @@ from ...util import _sanity_check_params, set_module from ...util import wrap_np_unary_func, wrap_np_binary_func from ...util import is_np_default_dtype, dtype_from_number -from ...context import current_context +from ...device import current_device from . import _internal as _npi from . import _api_internal from ..ndarray import NDArray @@ -88,7 +88,7 @@ def shape(a): @set_module('mxnet.ndarray.numpy') -def zeros(shape, dtype=None, order='C', ctx=None): # pylint: disable=redefined-outer-name +def zeros(shape, dtype=None, order='C', device=None): # pylint: disable=redefined-outer-name """Return a new array of given shape and type, filled with zeros. This function currently only supports storing multi-dimensional data in row-major (C-style). @@ -107,29 +107,30 @@ def zeros(shape, dtype=None, order='C', ctx=None): # pylint: disable=redefined- order : {'C'}, optional, default: 'C' How to store multi-dimensional data in memory, currently only row-major (C-style) is supported. - ctx : Context, optional - An optional device context (default is the current default context). + device : Device, optional + Device context on which the memory is allocated. Default is + `mxnet.device.current_device()`. Returns ------- out : ndarray - Array of zeros with the given shape, dtype, and ctx. + Array of zeros with the given shape, dtype, and device. """ if order != 'C': raise NotImplementedError - # If the following code (4 lines) regarding ctx is removed + # If the following code (4 lines) regarding device is removed # np.zeros((3, 4)) can be as fast as 4.96 us - if ctx is None: - ctx = str(current_context()) + if device is None: + device = str(current_device()) else: - ctx = str(ctx) + device = str(device) if dtype is not None and not isinstance(dtype, str): dtype = _np.dtype(dtype).name - return _api_internal.zeros(shape, dtype, ctx) + return _api_internal.zeros(shape, dtype, device) @set_module('mxnet.ndarray.numpy') -def ones(shape, dtype=None, order='C', ctx=None): # pylint: disable=redefined-outer-name +def ones(shape, dtype=None, order='C', device=None): # pylint: disable=redefined-outer-name """Return a new array of given shape and type, filled with ones. This function currently only supports storing multi-dimensional data in row-major (C-style). @@ -147,28 +148,29 @@ def ones(shape, dtype=None, order='C', ctx=None): # pylint: disable=redefined-o order : {'C'}, optional, default: 'C' How to store multi-dimensional data in memory, currently only row-major (C-style) is supported. - ctx : Context, optional - An optional device context (default is the current default context). + device : Device, optional + Device context on which the memory is allocated. Default is + `mxnet.device.current_device()`. Returns ------- out : ndarray - Array of ones with the given shape, dtype, and ctx. + Array of ones with the given shape, dtype, and device. 
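
A minimal usage sketch of the renamed keyword, assuming the patch series above is applied (so the creation functions accept device= and mx.Device / mx.cpu are exported):

    import mxnet as mx
    from mxnet import np

    a = np.zeros((2, 3), device=mx.cpu())            # explicit target device instead of ctx=
    b = np.ones((2, 3), dtype='float64', device=mx.cpu())
    assert a.device == mx.cpu()

    with mx.Device('cpu', 0):                        # Device also scopes the default device
        c = np.ones((2, 3))
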
""" if order != 'C': raise NotImplementedError - if ctx is None: - ctx = str(current_context()) + if device is None: + device = str(current_device()) else: - ctx = str(ctx) + device = str(device) if dtype is not None and not isinstance(dtype, str): dtype = _np.dtype(dtype).name - return _api_internal.ones(shape, dtype, ctx) + return _api_internal.ones(shape, dtype, device) # pylint: disable=too-many-arguments, redefined-outer-name @set_module('mxnet.ndarray.numpy') -def zeros_like(a, dtype=None, order='C', ctx=None, out=None): +def zeros_like(a, dtype=None, order='C', device=None, out=None): """ Return an array of zeros with the same shape and type as a given array. @@ -183,7 +185,9 @@ def zeros_like(a, dtype=None, order='C', ctx=None, out=None): order : {'C'}, optional Whether to store multidimensional data in C- or Fortran-contiguous (row- or column-wise) order in memory. Currently only supports C order. - ctx: to specify the device, e.g. the i-th GPU. + device : Device, optional + Device context on which the memory is allocated. Default is + `mxnet.device.current_device()`. out : ndarray or None, optional A location into which the result is stored. If provided, it must have the same shape and dtype as input ndarray. @@ -222,11 +226,11 @@ def zeros_like(a, dtype=None, order='C', ctx=None, out=None): """ if order != 'C': raise NotImplementedError - return full_like(a, 0, dtype=dtype, order=order, ctx=ctx, out=out) + return full_like(a, 0, dtype=dtype, order=order, device=device, out=out) @set_module('mxnet.ndarray.numpy') -def ones_like(a, dtype=None, order='C', ctx=None, out=None): +def ones_like(a, dtype=None, order='C', device=None, out=None): """ Return an array of ones with the same shape and type as a given array. @@ -241,7 +245,9 @@ def ones_like(a, dtype=None, order='C', ctx=None, out=None): order : {'C'}, optional Whether to store multidimensional data in C- or Fortran-contiguous (row- or column-wise) order in memory. Currently only supports C order. - ctx: to specify the device, e.g. the i-th GPU. + device : Device, optional + Device context on which the memory is allocated. Default is + `mxnet.device.current_device()`. out : ndarray or None, optional A location into which the result is stored. If provided, it must have the same shape and dtype as input ndarray. @@ -278,7 +284,7 @@ def ones_like(a, dtype=None, order='C', ctx=None, out=None): >>> np.ones_like(y) array([1., 1., 1.], dtype=float64) """ - return full_like(a, 1, dtype=dtype, order=order, ctx=ctx, out=out) + return full_like(a, 1, dtype=dtype, order=order, device=device, out=out) @set_module('mxnet.ndarray.numpy') @@ -312,7 +318,7 @@ def broadcast_to(array, shape): @set_module('mxnet.ndarray.numpy') -def full(shape, fill_value, dtype=None, order='C', ctx=None, out=None): # pylint: disable=too-many-arguments +def full(shape, fill_value, dtype=None, order='C', device=None, out=None): # pylint: disable=too-many-arguments """ Return a new array of given shape and type, filled with `fill_value`. @@ -330,7 +336,9 @@ def full(shape, fill_value, dtype=None, order='C', ctx=None, out=None): # pylin order : {'C'}, optional Whether to store multidimensional data in C- or Fortran-contiguous (row- or column-wise) order in memory. Currently only supports C order. - ctx: to specify the device, e.g. the i-th GPU. + device : Device, optional + Device context on which the memory is allocated. Default is + `mxnet.device.current_device()`. out : ndarray or None, optional A location into which the result is stored. 
If provided, it must have the same shape and dtype as input ndarray. @@ -340,15 +348,15 @@ def full(shape, fill_value, dtype=None, order='C', ctx=None, out=None): # pylin ------- out : ndarray Array of `fill_value` with the given shape, dtype, and order. - If `fill_value` is an ndarray, out will have the same context as `fill_value` - regardless of the provided `ctx`. + If `fill_value` is an ndarray, out will have the same device as `fill_value` + regardless of the provided `device`. Notes ----- This function differs from the original `numpy.full https://docs.scipy.org/doc/numpy/reference/generated/numpy.full.html`_ in the following way(s): - - Have an additional `ctx` argument to specify the device + - Have an additional `device` argument to specify the device - Have an additional `out` argument - Currently does not support `order` selection @@ -363,7 +371,7 @@ def full(shape, fill_value, dtype=None, order='C', ctx=None, out=None): # pylin >>> np.full((2, 2), 10) array([[10., 10.], [10., 10.]]) - >>> np.full((2, 2), 2, dtype=np.int32, ctx=mx.cpu(0)) + >>> np.full((2, 2), 2, dtype=np.int32, device=mx.cpu(0)) array([[2, 2], [2, 2]], dtype=int32) @@ -376,10 +384,10 @@ def full(shape, fill_value, dtype=None, order='C', ctx=None, out=None): # pylin else: ret = broadcast_to(fill_value, shape).astype(dtype) return ret - if ctx is None: - ctx = str(current_context()) + if device is None: + device = str(current_device()) else: - ctx = str(ctx) + device = str(device) if isinstance(fill_value, bool): fill_value = int(fill_value) dtype = _np.bool if dtype is None else dtype @@ -388,12 +396,12 @@ def full(shape, fill_value, dtype=None, order='C', ctx=None, out=None): # pylin dtype = dtype_from_number(fill_value) if dtype is not None and not isinstance(dtype, str): dtype = _np.dtype(dtype).name - return _api_internal.full(shape, dtype, fill_value, ctx, out) + return _api_internal.full(shape, dtype, fill_value, device, out) # pylint: enable=too-many-arguments, redefined-outer-name @set_module('mxnet.ndarray.numpy') -def full_like(a, fill_value, dtype=None, order='C', ctx=None, out=None): # pylint: disable=too-many-arguments +def full_like(a, fill_value, dtype=None, order='C', device=None, out=None): # pylint: disable=too-many-arguments """ Return a full array with the same shape and type as a given array. @@ -410,7 +418,9 @@ def full_like(a, fill_value, dtype=None, order='C', ctx=None, out=None): # pylin order : {'C'}, optional Whether to store multidimensional data in C- or Fortran-contiguous (row- or column-wise) order in memory. Currently only supports C order. - ctx: to specify the device, e.g. the i-th GPU. + device : Device, optional + Device context on which the memory is allocated. Default is + `mxnet.device.current_device()`. out : ndarray or None, optional A location into which the result is stored. If provided, it must have the same shape and dtype as input ndarray. 
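
A short, hedged sketch of full/full_like with the new keyword; the dtype notes reflect the dtype_from_number behaviour this patch relies on, on a build with the series applied:

    import mxnet as mx
    from mxnet import np

    x = np.full((2, 2), 7, device=mx.cpu())               # dtype inferred from the scalar fill value
    y = np.full((2, 2), 7.5)                              # a float fill picks the default float dtype
    z = np.full_like(x, 0, dtype='float32', device=mx.cpu())
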
@@ -447,13 +457,13 @@ def full_like(a, fill_value, dtype=None, order='C', ctx=None, out=None): # pylin raise NotImplementedError if isinstance(fill_value, bool): fill_value = int(fill_value) - if ctx is None: - ctx = str(current_context()) + if device is None: + device = str(current_device()) else: - ctx = str(ctx) + device = str(device) if dtype is not None and not isinstance(dtype, str): dtype = _np.dtype(dtype).name - return _api_internal.full_like(a, fill_value, dtype, ctx, out) + return _api_internal.full_like(a, fill_value, dtype, device, out) @set_module('mxnet.ndarray.numpy') @@ -531,7 +541,7 @@ def empty_like(prototype, dtype=None, order='C', subok=False, shape=None): # pyl @set_module('mxnet.ndarray.numpy') -def arange(start, stop=None, step=1, dtype=None, ctx=None): +def arange(start, stop=None, step=1, dtype=None, device=None): """Return evenly spaced values within a given interval. Values are generated within the half-open interval ``[start, stop)`` @@ -570,10 +580,10 @@ def arange(start, stop=None, step=1, dtype=None, ctx=None): """ if dtype is not None and not isinstance(dtype, str): dtype = _np.dtype(dtype).name - if ctx is None: - ctx = str(current_context()) + if device is None: + device = str(current_device()) else: - ctx = str(ctx) + device = str(device) if stop is None: stop = start start = 0 @@ -583,11 +593,11 @@ def arange(start, stop=None, step=1, dtype=None, ctx=None): raise ValueError('start and stop cannot be both None') if step == 0: raise ZeroDivisionError('step cannot be 0') - return _api_internal.arange(start, stop, step, dtype, ctx) + return _api_internal.arange(start, stop, step, dtype, device) @set_module('mxnet.ndarray.numpy') -def identity(n, dtype=None, ctx=None): +def identity(n, dtype=None, device=None): """ Return the identity array. @@ -602,8 +612,9 @@ def identity(n, dtype=None, ctx=None): Data-type of the output. - When npx.is_np_default_dtype() returns False, default dtype is float32; - When npx.is_np_default_dtype() returns True, default dtype is float64. - ctx : Context, optional - An optional device context (default is the current default context). + device : Device, optional + Device context on which the memory is allocated. Default is + `mxnet.device.current_device()`. Returns ------- @@ -622,14 +633,14 @@ def identity(n, dtype=None, ctx=None): raise TypeError("Input 'n' should be an integer") if n < 0: raise ValueError("Input 'n' cannot be negative") - if ctx is None: - ctx = str(current_context()) + if device is None: + device = str(current_device()) else: - ctx = str(ctx) + device = str(device) shape = (n, n) # pylint: disable=redefined-outer-name if dtype is not None and not isinstance(dtype, str): dtype = _np.dtype(dtype).name - return _api_internal.identity(shape, dtype, ctx) + return _api_internal.identity(shape, dtype, device) # pylint: disable=redefined-outer-name @@ -1864,21 +1875,21 @@ def eye(N, M=None, k=0, dtype=float, **kwargs): except for the k-th diagonal, whose values are equal to one. 
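
Assuming the same patched build, arange and identity take the device keyword in the same way; a small sketch:

    import mxnet as mx
    from mxnet import np

    idx = np.arange(0, 10, 2, device=mx.cpu())            # 0, 2, 4, 6, 8 on the chosen device
    eye3 = np.identity(3, dtype='float32', device=mx.cpu())
    assert idx.device == eye3.device
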
""" _sanity_check_params('eye', ['order'], kwargs) - ctx = kwargs.pop('ctx', current_context()) - if ctx is None: - ctx = str(current_context()) + device = kwargs.pop('device', current_device()) + if device is None: + device = str(current_device()) else: - ctx = str(ctx) + device = str(device) if dtype is None or dtype is float: dtype = _np.float64 if is_np_default_dtype() else _np.float32 if dtype is not None and not isinstance(dtype, str): dtype = _np.dtype(dtype).name k = minimum(k, N) if M is None else minimum(k, M) - return _api_internal.eye(N, M, int(k), ctx, dtype) + return _api_internal.eye(N, M, int(k), device, dtype) @set_module('mxnet.ndarray.numpy') -def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis=0, ctx=None): # pylint: disable=too-many-arguments +def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis=0, device=None): # pylint: disable=too-many-arguments r""" Return evenly spaced numbers over a specified interval. Returns num evenly spaced samples, calculated over the interval [start, stop]. @@ -1957,7 +1968,7 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis - `start` and `stop` do not support list, numpy ndarray and mxnet ndarray - axis could only be 0 - - There could be an additional `ctx` argument to specify the device, e.g. the i-th + - There could be an additional `device` argument to specify the device, e.g. the i-th GPU. """ if isinstance(start, (list, _np.ndarray, NDArray)) or \ @@ -1965,23 +1976,23 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis raise NotImplementedError('start and stop only support int') if axis != 0: raise NotImplementedError("the function only support axis 0") - if ctx is None: - ctx = str(current_context()) + if device is None: + device = str(current_device()) else: - ctx = str(ctx) + device = str(device) if dtype is not None and not isinstance(dtype, str): dtype = _np.dtype(dtype).name if dtype is None: dtype = _np.float64 if is_np_default_dtype() else _np.float32 if retstep: step = (stop - start) / (num - 1) - return _api_internal.linspace(start, stop, num, endpoint, ctx, dtype), step + return _api_internal.linspace(start, stop, num, endpoint, device, dtype), step else: - return _api_internal.linspace(start, stop, num, endpoint, ctx, dtype) + return _api_internal.linspace(start, stop, num, endpoint, device, dtype) @set_module('mxnet.ndarray.numpy') -def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None, axis=0, ctx=None): # pylint: disable=too-many-arguments +def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None, axis=0, device=None): # pylint: disable=too-many-arguments r"""Return numbers spaced evenly on a log scale. In linear space, the sequence starts at ``base ** start`` @@ -2015,8 +2026,9 @@ def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None, axis=0, The axis in the result to store the samples. Relevant only if start or stop are array-like. By default (0), the samples will be along a new axis inserted at the beginning. Now, axis only support axis = 0. - ctx : Context, optional - An optional device context (default is the current default context). + device : Device, optional + Device context on which the memory is allocated. Default is + `mxnet.device.current_device()`. Returns ------- @@ -2050,21 +2062,21 @@ def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None, axis=0, array([4. , 5.0396843, 6.349604 , 8. 
]) >>> np.logspace(2.0, 3.0, num=4, base=2.0, dtype=np.int32) array([4, 5, 6, 8], dtype=int32) - >>> np.logspace(2.0, 3.0, num=4, ctx=npx.gpu(0)) - array([ 100. , 215.44347, 464.15887, 1000. ], ctx=gpu(0)) + >>> np.logspace(2.0, 3.0, num=4, device=npx.gpu(0)) + array([ 100. , 215.44347, 464.15887, 1000. ], device=gpu(0)) """ if isinstance(start, (list, tuple, _np.ndarray, NDArray)) or \ isinstance(stop, (list, tuple, _np.ndarray, NDArray)): raise NotImplementedError('start and stop only support int and float') if axis != 0: raise NotImplementedError("the function only support axis 0") - if ctx is None: - ctx = str(current_context()) + if device is None: + device = str(current_device()) else: - ctx = str(ctx) + device = str(device) if dtype is not None and not isinstance(dtype, str): dtype = _np.dtype(dtype).name - return _api_internal.logspace(start, stop, num, endpoint, base, ctx, dtype) + return _api_internal.logspace(start, stop, num, endpoint, base, device, dtype) @set_module('mxnet.ndarray.numpy') @@ -2278,7 +2290,7 @@ def trace(a, offset=0, axis1=0, axis2=1, out=None): @set_module('mxnet.ndarray.numpy') -def tri(N, M=None, k=0, dtype=None, ctx=None): +def tri(N, M=None, k=0, dtype=None, device=None): r""" An array with ones at and below the given diagonal and zeros elsewhere. @@ -2314,13 +2326,13 @@ def tri(N, M=None, k=0, dtype=None, ctx=None): [1., 0., 0., 0., 0.], [1., 1., 0., 0., 0.]]) """ - if ctx is None: - ctx = str(current_context()) - return _api_internal.tri(N, M, k, dtype, ctx) + if device is None: + device = str(current_device()) + return _api_internal.tri(N, M, k, dtype, device) @set_module('mxnet.ndarray.numpy') -def triu_indices(n, k=0, m=None, ctx=None): +def triu_indices(n, k=0, m=None, device=None): r""" Return the indices for the upper-triangle of an (n, m) array. @@ -2394,7 +2406,7 @@ def triu_indices(n, k=0, m=None, ctx=None): [ 8, 9, -1, -1], [ 12, 13, 14, -1]]) """ - return nonzero(~tri(N=n, M=m, k=k-1, dtype=bool, ctx=ctx)) + return nonzero(~tri(N=n, M=m, k=k-1, dtype=bool, device=device)) @@ -3015,7 +3027,7 @@ def arcsin(x, out=None, **kwargs): For each value that cannot be expressed as a real number or infinity, it yields ``nan`` and sets the `invalid` floating point error flag. The inverse sine is also known as `asin` or sin^{-1}. - The output `ndarray` has the same `ctx` as the input `ndarray`. + The output `ndarray` has the same `device` as the input `ndarray`. This function differs from the original `numpy.arcsin `_ in the following aspects: @@ -3490,7 +3502,7 @@ def reciprocal(x, out=None, **kwargs): For integer arguments with absolute value larger than 1 the result is always zero because of the way Python handles integer division. For integer zero the result is an overflow. - The output `ndarray` has the same `ctx` as the input `ndarray`. + The output `ndarray` has the same `device` as the input `ndarray`. This function differs from the original `numpy.reciprocal `_ in the following aspects: @@ -3530,7 +3542,7 @@ def square(x, out=None, **kwargs): Notes ----- - The output `ndarray` has the same `ctx` as the input `ndarray`. + The output `ndarray` has the same `device` as the input `ndarray`. 
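
A hedged sketch of the simplified linspace/logspace paths above (the default dtype is float32 unless the numpy default dtype is set to float64), again assuming the patched build:

    import mxnet as mx
    from mxnet import np

    pts = np.linspace(0, 1, num=5, device=mx.cpu())
    vals, step = np.linspace(0, 10, num=11, retstep=True)   # step == 1.0
    freqs = np.logspace(2.0, 3.0, num=4, device=mx.cpu())
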
This function differs from the original `numpy.square `_ in the following aspects: @@ -5779,7 +5791,7 @@ def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False): # pylint: # pylint: disable=redefined-outer-name @set_module('mxnet.ndarray.numpy') -def indices(dimensions, dtype=None, ctx=None): +def indices(dimensions, dtype=None, device=None): """Return an array representing the indices of a grid. Compute an array where the subarrays contain index values 0,1,... @@ -5791,9 +5803,9 @@ def indices(dimensions, dtype=None, ctx=None): The shape of the grid. dtype : data-type, optional The desired data-type for the array. Default is `int64`. - ctx : device context, optional + device : Device, optional Device context on which the memory is allocated. Default is - `mxnet.context.current_context()`. + `mxnet.device.current_device()`. Returns ------- @@ -5837,13 +5849,13 @@ def indices(dimensions, dtype=None, ctx=None): extract the required elements directly with ``x[:2, :3]``. """ if isinstance(dimensions, (tuple, list)): - if ctx is None: - ctx = str(current_context()) + if device is None: + device = str(current_device()) else: - ctx = str(ctx) + device = str(device) if dtype is not None and not isinstance(dtype, str): dtype = _np.dtype(dtype).name - return _api_internal.indices(dimensions, dtype, ctx) + return _api_internal.indices(dimensions, dtype, device) else: raise ValueError("The dimensions must be sequence of ints") # pylint: enable=redefined-outer-name @@ -6070,7 +6082,7 @@ def diag_indices_from(arr): @set_module('mxnet.ndarray.numpy') -def hanning(M, dtype=None, ctx=None): +def hanning(M, dtype=None, device=None): r"""Return the Hanning window. The Hanning window is a taper formed by using a weighted cosine. @@ -6080,8 +6092,9 @@ def hanning(M, dtype=None, ctx=None): M : int Number of points in the output window. If zero or less, an empty array is returned. - ctx : Context, optional - An optional device context (default is the current default context). + device : Device, optional + Device context on which the memory is allocated. Default is + `mxnet.device.current_device()`. Returns ------- @@ -6146,17 +6159,17 @@ def hanning(M, dtype=None, ctx=None): Text(0.5, 0, 'Sample') >>> plt.show() """ - if ctx is None: - ctx = str(current_context()) + if device is None: + device = str(current_device()) else: - ctx = str(ctx) + device = str(device) if dtype is not None and not isinstance(dtype, str): dtype = _np.dtype(dtype).name - return _api_internal.hanning(M, dtype, ctx) + return _api_internal.hanning(M, dtype, device) @set_module('mxnet.ndarray.numpy') -def hamming(M, dtype=None, ctx=None): +def hamming(M, dtype=None, device=None): r"""Return the hamming window. The hamming window is a taper formed by using a weighted cosine. @@ -6166,8 +6179,9 @@ def hamming(M, dtype=None, ctx=None): M : int Number of points in the output window. If zero or less, an empty array is returned. - ctx : Context, optional - An optional device context (default is the current default context). + device : Device, optional + Device context on which the memory is allocated. Default is + `mxnet.device.current_device()`. 
Returns ------- @@ -6230,17 +6244,17 @@ def hamming(M, dtype=None, ctx=None): Text(0.5, 0, 'Sample') >>> plt.show() """ - if ctx is None: - ctx = str(current_context()) + if device is None: + device = str(current_device()) else: - ctx = str(ctx) + device = str(device) if dtype is not None and not isinstance(dtype, str): dtype = _np.dtype(dtype).name - return _api_internal.hamming(M, dtype, ctx) + return _api_internal.hamming(M, dtype, device) @set_module('mxnet.ndarray.numpy') -def blackman(M, dtype=None, ctx=None): +def blackman(M, dtype=None, device=None): r"""Return the Blackman window. The Blackman window is a taper formed by using the first three @@ -6253,8 +6267,9 @@ def blackman(M, dtype=None, ctx=None): M : int Number of points in the output window. If zero or less, an empty array is returned. - ctx : Context, optional - An optional device context (default is the current default context). + device : Device, optional + Device context on which the memory is allocated. Default is + `mxnet.device.current_device()`. Returns ------- @@ -6312,13 +6327,13 @@ def blackman(M, dtype=None, ctx=None): Text(0.5, 0, 'Sample') >>> plt.show() """ - if ctx is None: - ctx = str(current_context()) + if device is None: + device = str(current_device()) else: - ctx = str(ctx) + device = str(device) if dtype is not None and not isinstance(dtype, str): dtype = _np.dtype(dtype).name - return _api_internal.blackman(M, dtype, ctx) + return _api_internal.blackman(M, dtype, device) @set_module('mxnet.ndarray.numpy') diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index 0fb51e4d0c83..c4249505e26f 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -43,11 +43,11 @@ from ..base import check_call, _LIB, NDArrayHandle, c_array, mx_int, mx_int64 from ..base import mx_real_t, c_array_buf, mx_uint, numeric_types, integer_types from ..runtime import Features -from ..context import Context +from ..device import Device from ..util import set_module, wrap_np_unary_func, wrap_np_binary_func,\ - is_np_default_dtype, wrap_data_api_creation_func,\ + is_np_default_dtype, wrap_ctx_to_device_func,\ dtype_from_number -from ..context import current_context +from ..device import current_device from ..ndarray import numpy as _mx_nd_np from ..ndarray.numpy import _internal as _npi from ..ndarray.ndarray import _storage_type @@ -109,8 +109,8 @@ def _int64_enabled(): # This function is copied from ndarray.py since pylint # keeps giving false alarm error of undefined-all-variable -def _new_alloc_handle(shape, ctx, delay_alloc, dtype=mx_real_t): # pylint: disable=redefined-outer-name - """Return a new handle with specified shape and context. +def _new_alloc_handle(shape, device, delay_alloc, dtype=mx_real_t): # pylint: disable=redefined-outer-name + """Return a new handle with specified shape and device. Empty handle is only used to hold results. 
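
For the window creation functions changed just above (hanning, hamming, blackman), a usage sketch under the same assumption that the device keyword is available:

    import mxnet as mx
    from mxnet import np

    w_han = np.hanning(12, device=mx.cpu())
    w_ham = np.hamming(12, device=mx.cpu())
    w_blk = np.blackman(12, device=mx.cpu())
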
@@ -124,8 +124,8 @@ def _new_alloc_handle(shape, ctx, delay_alloc, dtype=mx_real_t): # pylint: disa check_call(_LIB.MXNDArrayCreate64( c_array_buf(mx_int64, native_array('q', shape)), ctypes.c_int(len(shape)), - ctypes.c_int(ctx.device_typeid), - ctypes.c_int(ctx.device_id), + ctypes.c_int(device.device_typeid), + ctypes.c_int(device.device_id), ctypes.c_int(int(delay_alloc)), ctypes.c_int(int(_DTYPE_NP_TO_MX[_np.dtype(dtype).type])), ctypes.byref(hdl))) @@ -146,8 +146,8 @@ def _new_alloc_handle(shape, ctx, delay_alloc, dtype=mx_real_t): # pylint: disa check_call(_LIB.MXNDArrayCreate( c_array_buf(mx_uint, native_array('I', shape)), mx_uint(len(shape)), - ctypes.c_int(ctx.device_typeid), - ctypes.c_int(ctx.device_id), + ctypes.c_int(device.device_typeid), + ctypes.c_int(device.device_id), ctypes.c_int(int(delay_alloc)), ctypes.c_int(int(_DTYPE_NP_TO_MX[dtype_type])), ctypes.byref(hdl))) @@ -182,8 +182,8 @@ def _reshape_view(a, *shape): # pylint: disable=redefined-outer-name ctypes.byref(handle))) return ndarray(handle=handle, writable=a.writable) -def _as_mx_np_array(object, ctx=None, zero_copy=False): - """Convert arrays or any array member of container to mxnet.numpy.ndarray on ctx.""" +def _as_mx_np_array(object, device=None, zero_copy=False): + """Convert arrays or any array member of container to mxnet.numpy.ndarray on device.""" if object is None or isinstance(object, ndarray): return object elif isinstance(object, _np.ndarray): @@ -192,43 +192,43 @@ def _as_mx_np_array(object, ctx=None, zero_copy=False): elif isinstance(object, (integer_types, numeric_types)): return object elif isinstance(object, (_np.bool_, _np.bool)): - return array(object, dtype=_np.bool_, ctx=ctx) + return array(object, dtype=_np.bool_, device=device) elif isinstance(object, (list, tuple)): - tmp = [_as_mx_np_array(arr, ctx=ctx, zero_copy=zero_copy) for arr in object] + tmp = [_as_mx_np_array(arr, device=device, zero_copy=zero_copy) for arr in object] return object.__class__(tmp) else: raise TypeError('Does not support converting {} to mx.np.ndarray.'.format(str(type(object)))) -def _as_onp_array(object, cur_ctx=None): +def _as_onp_array(object, cur_device=None): """Convert object to numpy.ndarray.""" - def _update_ctx(cur_ctx, tmp_ctx): - if cur_ctx is None: - cur_ctx = tmp_ctx - elif tmp_ctx is not None and cur_ctx != tmp_ctx: - raise ValueError('Ambiguous to set the context for the output ndarray since' # pylint: disable=too-few-format-args + def _update_device(cur_device, tmp_device): + if cur_device is None: + cur_device = tmp_device + elif tmp_device is not None and cur_device != tmp_device: + raise ValueError('Ambiguous to set the device for the output ndarray since' # pylint: disable=too-few-format-args ' input ndarrays are allocated on different devices: {} and {}' - .format(str(cur_ctx, tmp_ctx))) - return cur_ctx + .format(str(cur_device, tmp_device))) + return cur_device if isinstance(object, ndarray): - return object.asnumpy(), object.ctx + return object.asnumpy(), object.device elif isinstance(object, (list, tuple)): tmp = [] for arr in object: - arr, tmp_ctx = _as_onp_array(arr, cur_ctx) + arr, tmp_device = _as_onp_array(arr, cur_device) tmp.append(arr) - cur_ctx = _update_ctx(cur_ctx, tmp_ctx) - return object.__class__(tmp), cur_ctx + cur_device = _update_device(cur_device, tmp_device) + return object.__class__(tmp), cur_device elif isinstance(object, dict): tmp = dict() for key, value in object.items(): - value, tmp_ctx = _as_onp_array(value, cur_ctx) + value, tmp_device = _as_onp_array(value, 
cur_device) tmp[key] = value - cur_ctx = _update_ctx(cur_ctx, tmp_ctx) - return object.__class__(tmp), cur_ctx + cur_device = _update_device(cur_device, tmp_device) + return object.__class__(tmp), cur_device else: - return object, cur_ctx + return object, cur_device # Have to use 0 as default value for stype since pylint does not allow @@ -266,7 +266,7 @@ def wrap_mxnp_np_ufunc(func): @functools.wraps(func) def _wrap_mxnp_np_ufunc(x1, x2): if isinstance(x2, _np.ndarray): - x2 = _as_mx_np_array(x2, ctx=x1.ctx) + x2 = _as_mx_np_array(x2, device=x1.device) return func(x1, x2) return _wrap_mxnp_np_ufunc @@ -357,7 +357,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # pylint: disable= "which is actually using official numpy's implementation", name) _FALLBACK_ARRAY_UFUNC_WARNED_RECORD[onp_op] = True out = onp_op(*new_inputs, **kwargs) - return _as_mx_np_array(out, ctx=inputs[0].ctx) + return _as_mx_np_array(out, device=inputs[0].device) # ops with np mx_np elif name in ufunc_list and isinstance(inputs[0], _np.ndarray): # inplace @@ -365,7 +365,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # pylint: disable= new_inputs = [arg.asnumpy() if isinstance(arg, ndarray) else arg for arg in inputs] return onp_op(*new_inputs, **kwargs) else: - new_inputs = [_as_mx_np_array(arg, ctx=inputs[1].ctx) + new_inputs = [_as_mx_np_array(arg, device=inputs[1].device) if isinstance(arg, _np.ndarray) else arg for arg in inputs] return mx_ufunc(*new_inputs, **kwargs) else: @@ -387,11 +387,11 @@ def __array_function__(self, func, types, args, kwargs): # pylint: disable=bad- raise ValueError("Falling back to NumPy operator {} with autograd active is not supported." "Please consider moving the operator to the outside of the autograd scope.")\ .format(func) - cur_ctx = None - new_args, cur_ctx = _as_onp_array(args, cur_ctx) - new_kwargs, cur_ctx = _as_onp_array(kwargs, cur_ctx) - if cur_ctx is None: - raise ValueError('Unknown context for the input ndarrays. It is probably a bug. Please' + cur_device = None + new_args, cur_device = _as_onp_array(args, cur_device) + new_kwargs, cur_device = _as_onp_array(kwargs, cur_device) + if cur_device is None: + raise ValueError('Unknown device for the input ndarrays. It is probably a bug. 
Please' ' create an issue on GitHub.') if func not in _FALLBACK_ARRAY_FUNCTION_WARNED_RECORD: import logging @@ -399,18 +399,18 @@ def __array_function__(self, func, types, args, kwargs): # pylint: disable=bad- "which is actually using official numpy's implementation.", func_name) _FALLBACK_ARRAY_FUNCTION_WARNED_RECORD[func] = True out = func(*new_args, **new_kwargs) - return _as_mx_np_array(out, ctx=cur_ctx) + return _as_mx_np_array(out, device=cur_device) else: if py_all(issubclass(t, ndarray) for t in types): return mx_np_func(*args, **kwargs) else: try: - cur_ctx = next(a.ctx for a in args if hasattr(a, 'ctx')) + cur_device = next(a.device for a in args if hasattr(a, 'device')) except StopIteration: - cur_ctx = next(a.ctx for a in kwargs.values() if hasattr(a, 'ctx')) - new_args = _as_mx_np_array(args, ctx=cur_ctx, + cur_device = next(a.device for a in kwargs.values() if hasattr(a, 'device')) + new_args = _as_mx_np_array(args, device=cur_device, zero_copy=func_name in {'may_share_memory', 'shares_memory'}) - new_kwargs = {k: _as_mx_np_array(v, cur_ctx) for k, v in kwargs.items()} + new_kwargs = {k: _as_mx_np_array(v, cur_device) for k, v in kwargs.items()} return mx_np_func(*new_args, **new_kwargs) @@ -712,7 +712,7 @@ def __getitem__(self, key): ndim = self.ndim # pylint: disable=redefined-outer-name shape = self.shape # pylint: disable=redefined-outer-name if isinstance(key, bool): # otherwise will be treated as 0 and 1 - key = array(key, dtype=_np.bool, ctx=self.ctx) + key = array(key, dtype=_np.bool, device=self.device) if isinstance(key, list): try: new_key = _np.array(key) @@ -724,7 +724,7 @@ def __getitem__(self, key): if dc.is_deferred_compute(): raise TypeError('Indexing with a numpy array is not supported in HybridBlock.') if key.dtype == _np.bool_: - key = array(key, dtype='bool', ctx=self.ctx) + key = array(key, dtype='bool', device=self.device) # Handle single boolean index of matching dimensionality and size first for higher speed # If the boolean array is mixed with other idices, it is instead expanded into (multiple) @@ -828,14 +828,14 @@ def __getitem__(self, key): elif indexing_dispatch_code == _NDARRAY_BASIC_INDEXING: if prepend == _NDARRAY_ZERO_DIM_BOOL_ARRAY_FALSE: return empty((0,) + self._get_np_basic_indexing(key).shape, - dtype=self.dtype, device=self.ctx) + dtype=self.dtype, device=self.device) if prepend == _NDARRAY_ZERO_DIM_BOOL_ARRAY_TRUE: key = (_np.newaxis,) + key return self._get_np_basic_indexing(key) elif indexing_dispatch_code == _NDARRAY_ADVANCED_INDEXING: if prepend == _NDARRAY_ZERO_DIM_BOOL_ARRAY_FALSE: return empty((0,) + self._get_np_adanced_indexing(key).shape, - dtype=self.dtype, device=self.ctx) + dtype=self.dtype, device=self.device) if prepend == _NDARRAY_ZERO_DIM_BOOL_ARRAY_TRUE: key = (_np.newaxis,) + key return self._get_np_advanced_indexing(key) @@ -960,7 +960,7 @@ def __setitem__(self, key, value): ) def _prepare_value_nd(self, value, bcast_shape, squeeze_axes=None): - """Return a broadcast `ndarray` with same context and dtype as ``self``. + """Return a broadcast `ndarray` with same device and dtype as ``self``. For setting item, The returned `ndarray` is squeezed according to squeeze_axes since the value_nd is assigned to not yet expanded space in original array. `value`: numeric types or array like. @@ -969,14 +969,14 @@ def _prepare_value_nd(self, value, bcast_shape, squeeze_axes=None): Note: mxnet.numpy.ndarray not support NDArray as assigned value. 
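
A small sketch of what the assignment path described above means in practice (the value is materialised with the target array's dtype on its device, then broadcast); assumes the patched build:

    import mxnet as mx
    from mxnet import np

    x = np.zeros((2, 3), device=mx.cpu())
    x[0, :] = 5               # the scalar is expanded on x.device with x.dtype
    x[1, :] = np.ones(3)      # an ndarray value is cast/copied to match x
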
""" if isinstance(value, numeric_types): - value_nd = full(bcast_shape, value, device=self.ctx, dtype=self.dtype) + value_nd = full(bcast_shape, value, device=self.device, dtype=self.dtype) elif isinstance(value, self.__class__): - value_nd = value.as_in_ctx(self.ctx) + value_nd = value.to_device(self.device) if value_nd.dtype != self.dtype: value_nd = value_nd.astype(self.dtype) else: try: - value_nd = array(value, ctx=self.ctx, dtype=self.dtype) + value_nd = array(value, device=self.device, dtype=self.dtype) except: raise TypeError('mxnet.np.ndarray does not support assignment with non-array-like ' 'object {} of type {}'.format(value, type(value))) @@ -1312,7 +1312,7 @@ def __repr__(self): """ Returns a string representation of the array. The dtype of the ndarray will be appended if it's inconsistent with current dtype. - The context of the ndarray will be appended for devices other than CPU. + The device of the ndarray will be appended for devices other than CPU. Examples -------- @@ -1346,14 +1346,14 @@ def __repr__(self): >>> c = a.copyto(npx.gpu(0)) >>> c array([[0.5488135 , 0.5928446 , 0.71518934], - [0.84426576, 0.60276335, 0.8579456 ]], ctx=gpu(0)) + [0.84426576, 0.60276335, 0.8579456 ]], device=gpu(0)) >>> print(c) [[0.5488135 0.5928446 0.71518934] [0.84426576 0.60276335 0.8579456 ]] @gpu(0) >>> d = b.copyto(npx.gpu(0)) >>> d array([[0.54881352, 0.59284461, 0.71518934], - [0.84426576, 0.60276335, 0.85794562]], dtype=float64, ctx=gpu(0)) + [0.84426576, 0.60276335, 0.85794562]], dtype=float64, device=gpu(0)) >>> print(d) [[0.54881352 0.59284461 0.71518934] [0.84426576 0.60276335 0.85794562]] @gpu(0) @@ -1369,20 +1369,20 @@ def __repr__(self): elif dtype not in (default_dtype, _np.bool_): array_str = array_str[:-1] + ', dtype={})'.format(dtype) - context = self.ctx - if context.device_type == 'cpu': + device = self.device + if device.device_type == 'cpu': return array_str - return array_str[:-1] + ', ctx={})'.format(str(context)) + return array_str[:-1] + ', device={})'.format(str(device)) else: return ''.format(self.__class__.__name__) def __str__(self): """Returns a string representation of the array.""" array_str = self.asnumpy().__str__() - context = self.ctx - if context.device_type == 'cpu' or self.ndim == 0: + device = self.device + if device.device_type == 'cpu' or self.ndim == 0: return array_str - return '{array} @{ctx}'.format(array=array_str, ctx=context) + return '{array} @{device}'.format(array=array_str, device=device) def __format__(self, fmt): """Return value.__format__(format_spec). Overwrite to include 0-d array""" @@ -1462,7 +1462,7 @@ def astype(self, dtype, order='K', casting='unsafe', subok=True, copy=True): # the returned array will be forced to be a base-class array. copy : bool, optional Default `True`. By default, astype always returns a newly - allocated ndarray on the same context. If this is set to + allocated ndarray on the same device. If this is set to `False`, and the dtype requested is the same as the ndarray's dtype, the ndarray is returned instead of a copy. @@ -1501,13 +1501,13 @@ def copyto(self, other): ``self.shape`` should be the same. This function copies the value from ``self`` to ``other``. - If ``other`` is a context, a new ``np.ndarray`` will be first created on - the target context, and the value of ``self`` is copied. + If ``other`` is a device, a new ``np.ndarray`` will be first created on + the target device, and the value of ``self`` is copied. Parameters ---------- - other : ndarray or Context - The destination array or context. 
+ other : ndarray or Device + The destination array or device. Returns ------- @@ -1518,7 +1518,7 @@ def copyto(self, other): Examples -------- >>> x = np.ones((2, 3)) - >>> y = np.zeros((2, 3), ctx=npx.gpu(0)) + >>> y = np.zeros((2, 3), device=npx.gpu(0)) >>> z = x.copyto(y) >>> z is y True @@ -1531,7 +1531,7 @@ def copyto(self, other): warnings.warn('You are attempting to copy an array to itself', RuntimeWarning) return False return _npi.copyto(self, out=other) - elif isinstance(other, Context): + elif isinstance(other, Device): hret = ndarray(_new_alloc_handle(self.shape, other, True, self.dtype)) return _npi.copyto(self, out=hret) else: @@ -1546,51 +1546,22 @@ def argmax(self, axis=None, out=None): # pylint: disable=arguments-differ return argmax(self, axis, out) def as_in_context(self, context): - """This function has been deprecated. Please refer to ``ndarray.as_in_ctx``.""" + """This function has been deprecated. Please refer to ``ndarray.to_device``.""" warnings.warn('ndarray.as_in_context has been renamed to' ' ndarray.as_in_ctx', DeprecationWarning) return self.as_nd_ndarray().as_in_context(context).as_np_ndarray() def as_in_ctx(self, ctx): - """Returns an array on the target device with the same value as this array. - - If the target context is the same as ``self.context``, then ``self`` is - returned. Otherwise, a copy is made. - - Parameters - ---------- - context : Context - The target context. - - Returns - ------- - ndarray - The target array. - """ - if self.ctx == ctx: - return self - return self.copyto(ctx) + """This function has been deprecated. Please refer to ``ndarray.to_device``.""" + warnings.warn('ndarray.as_in_ctx has been renamed to' + ' ndarray.to_device', DeprecationWarning) + return self.to_device(ctx) @property def ctx(self): - """Device context of the array. - - Examples - -------- - >>> x = np.array([1, 2, 3, 4]) - >>> x.ctx - cpu(0) - >>> type(x.ctx) - - >>> y = np.zeros((2, 3), npx.gpu(0)) - >>> y.ctx - gpu(0) - """ - dev_typeid = ctypes.c_int() - dev_id = ctypes.c_int() - check_call(_LIB.MXNDArrayGetContext( - self.handle, ctypes.byref(dev_typeid), ctypes.byref(dev_id))) - return Context(Context.devtype2str[dev_typeid.value], dev_id.value) + """This property has been deprecated. Please refer to ``ndarray.device``.""" + warnings.warn('ndarray.ctx has been renamed to ndarray.device', DeprecationWarning) + return self.device def to_device(self, device): @@ -1601,7 +1572,7 @@ def to_device(self, device): Parameters ---------- - device : Context + device : Device The target device. Returns @@ -1623,12 +1594,16 @@ def device(self): >>> x.device cpu(0) >>> type(x.device) - + >>> y = np.zeros((2, 3), npx.gpu(0)) >>> y.device gpu(0) """ - return self.ctx + dev_typeid = ctypes.c_int() + dev_id = ctypes.c_int() + check_call(_LIB.MXNDArrayGetContext( + self.handle, ctypes.byref(dev_typeid), ctypes.byref(dev_id))) + return Device(Device.devtype2str[dev_typeid.value], dev_id.value) @property @@ -1638,7 +1613,7 @@ def context(self): return self.as_nd_ndarray().context def copy(self, order='C'): # pylint: disable=arguments-differ - """Return a coyp of the array, keeping the same context. + """Return a coyp of the array, keeping the same device. 
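A minimal sketch of the renamed device helpers above, assuming a CPU-only build so it runs anywhere (gpu(0) behaves the same on a CUDA build):

import mxnet as mx

a = mx.np.ones((2, 2))           # created on the current device, cpu(0) by default
print(a.device)                  # cpu(0); a.ctx still works but emits a DeprecationWarning
b = a.to_device(mx.cpu(0))       # returns self when the array is already on the target device
c = a.copyto(mx.cpu(0))          # a Device target always produces a new copy
d = a.as_in_ctx(mx.cpu(0))       # deprecated alias, forwards to to_device()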
Parameters ---------- @@ -1657,7 +1632,7 @@ def copy(self, order='C'): # pylint: disable=arguments-differ if order != 'C': raise NotImplementedError('ndarray.copy only supports order=\'C\', while ' 'received {}'.format(str(order))) - return self.copyto(self.ctx) + return self.copyto(self.device) def dot(self, b, out=None): """Dot product of two arrays. @@ -2425,7 +2400,7 @@ def _full(self, value): Currently for internal use only. Implemented for __setitem__. Assign to self an array of self's same shape and type, filled with value. """ - return _mx_nd_np.full(self.shape, value, ctx=self.ctx, dtype=self.dtype, out=self) + return _mx_nd_np.full(self.shape, value, device=self.device, dtype=self.dtype, out=self) # pylint: disable=redefined-outer-name def _scatter_set_nd(self, value_nd, indices): @@ -2502,7 +2477,7 @@ def tostype(self, stype): @set_module('mxnet.numpy') -@wrap_data_api_creation_func +@wrap_ctx_to_device_func def empty(shape, dtype=None, order='C', device=None): # pylint: disable=redefined-outer-name """Return a new array of given shape and type, without initializing entries. @@ -2519,9 +2494,9 @@ def empty(shape, dtype=None, order='C', device=None): # pylint: disable=redefin order : {'C'}, optional, default: 'C' How to store multi-dimensional data in memory, currently only row-major (C-style) is supported. - device : device context, optional + device : Device, optional Device context on which the memory is allocated. Default is - `mxnet.context.current_context()`. + `mxnet.device.current_device()`. Returns ------- @@ -2542,7 +2517,7 @@ def empty(shape, dtype=None, order='C', device=None): # pylint: disable=redefin raise NotImplementedError('`empty` only supports order equal to `C`, while received {}' .format(str(order))) if device is None: - device = current_context() + device = current_device() if dtype is None or dtype is float: dtype = _np.float64 if is_np_default_dtype() else _np.float32 if isinstance(shape, int): @@ -2552,7 +2527,8 @@ def empty(shape, dtype=None, order='C', device=None): # pylint: disable=redefin # pylint: disable=redefined-outer-name @set_module('mxnet.numpy') -def array(object, dtype=None, ctx=None): +@wrap_ctx_to_device_func +def array(object, dtype=None, device=None): """ Create an array. @@ -2569,9 +2545,9 @@ def array(object, dtype=None, ctx=None): * When npx.is_np_default_dtype() returns False, default dtype is float32; * When npx.is_np_default_dtype() returns True, default dtype is float64. - ctx : device context, optional + device : Device, optional Device context on which the memory is allocated. Default is - `mxnet.context.current_context()`. + `mxnet.device.current_device()`. 
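Combined with the `wrap_ctx_to_device_func` decorator, the standardized creation functions accept the new `device` keyword while silently remapping the legacy `ctx` keyword; a minimal sketch:

import mxnet as mx

a = mx.np.array([1, 2, 3], device=mx.cpu(0))   # new keyword
b = mx.np.empty((2, 3), dtype='float32')       # device defaults to current_device()
c = mx.np.zeros((2, 3), ctx=mx.cpu(0))         # legacy keyword, remapped to device= by the decorator
print(a.device, b.device, c.device)            # cpu(0) cpu(0) cpu(0)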
Returns ------- @@ -2598,8 +2574,8 @@ def array(object, dtype=None, ctx=None): >>> np.array([1, 2, 3]).dtype dtype('float64') """ - if ctx is None: - ctx = current_context() + if device is None: + device = current_device() if isinstance(object, _np.ndarray): if is_np_default_dtype(): dtype = object.dtype if dtype is None else dtype @@ -2620,7 +2596,7 @@ def array(object, dtype=None, ctx=None): # printing out the error raised by official NumPy's array function # for transparency on users' side raise TypeError('{}'.format(str(e))) - ret = empty(object.shape, dtype=dtype, device=ctx) + ret = empty(object.shape, dtype=dtype, device=device) if len(object.shape) == 0: ret[()] = object else: @@ -2664,7 +2640,7 @@ def shape(a): @set_module('mxnet.numpy') -@wrap_data_api_creation_func +@wrap_ctx_to_device_func def zeros(shape, dtype=None, order='C', device=None): # pylint: disable=redefined-outer-name """Return a new array of given shape and type, filled with zeros. This function currently only supports storing multi-dimensional data @@ -2684,14 +2660,14 @@ def zeros(shape, dtype=None, order='C', device=None): # pylint: disable=redefin order : {'C'}, optional, default: 'C' How to store multi-dimensional data in memory, currently only row-major (C-style) is supported. - device : device context, optional + device : Device, optional Device context on which the memory is allocated. Default is - `mxnet.context.current_context()`. + `mxnet.device.current_device()`. Returns ------- out : ndarray - Array of zeros with the given shape, dtype, and ctx. + Array of zeros with the given shape, dtype, and device. Examples -------- @@ -2709,7 +2685,7 @@ def zeros(shape, dtype=None, order='C', device=None): # pylint: disable=redefin @set_module('mxnet.numpy') -@wrap_data_api_creation_func +@wrap_ctx_to_device_func def ones(shape, dtype=None, order='C', device=None): # pylint: disable=redefined-outer-name """Return a new array of given shape and type, filled with ones. This function currently only supports storing multi-dimensional data @@ -2728,9 +2704,9 @@ def ones(shape, dtype=None, order='C', device=None): # pylint: disable=redefine order : {'C'}, optional, default: 'C' How to store multi-dimensional data in memory, currently only row-major (C-style) is supported. - device : device context, optional + device : Device, optional Device context on which the memory is allocated. Default is - `mxnet.context.current_context()`. + `mxnet.device.current_device()`. Returns ------- @@ -2787,7 +2763,7 @@ def broadcast_to(array, shape): # pylint: disable=redefined-outer-name # pylint: disable=too-many-arguments, redefined-outer-name @set_module('mxnet.numpy') -@wrap_data_api_creation_func +@wrap_ctx_to_device_func def full(shape, fill_value, dtype=None, order='C', device=None, out=None): r"""Return a new array of given shape and type, filled with `fill_value`. @@ -2805,9 +2781,9 @@ def full(shape, fill_value, dtype=None, order='C', device=None, out=None): order : {'C'}, optional Whether to store multidimensional data in C- or Fortran-contiguous (row- or column-wise) order in memory. Currently only supports C order. - device : device context, optional + device : Device, optional Device context on which the memory is allocated. Default is - `mxnet.context.current_context()`. + `mxnet.device.current_device()`. out : ndarray or None, optional A location into which the result is stored. If provided, it must have the same shape and dtype as input ndarray. 
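A short sketch of the dtype and `out=` behaviour documented above; the default dtype follows the NumPy-default-dtype mode:

import mxnet as mx

print(mx.np.zeros((2,)).dtype)                  # float32, or float64 when npx.is_np_default_dtype() is True
x = mx.np.full((2, 2), 7, dtype='int32', device=mx.cpu(0))
mx.np.full(x.shape, 0, dtype='int32', out=x)    # reuse x as the output buffer (same shape and dtype required)
print(x)                                        # all zeros, still int32 on cpu(0)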
@@ -2817,13 +2793,13 @@ def full(shape, fill_value, dtype=None, order='C', device=None, out=None): ------- out : ndarray Array of `fill_value` with the given shape, dtype, and order. - If `fill_value` is an ndarray, out will have the same context as `fill_value` - regardless of the provided `ctx`. + If `fill_value` is an ndarray, out will have the same device as `fill_value` + regardless of the provided `device`. .. note:: This function differs from the original numpy.full in the following way(s): - * Has an additional `ctx` argument to specify the device + * Has an additional `device` argument to specify the device * Has an additional `out` argument * Currently does not support `order` selection @@ -2842,13 +2818,13 @@ def full(shape, fill_value, dtype=None, order='C', device=None, out=None): array([[2, 2], [2, 2]], dtype=int32) """ - return _mx_nd_np.full(shape, fill_value, order=order, ctx=device, dtype=dtype, out=out) + return _mx_nd_np.full(shape, fill_value, order=order, device=device, dtype=dtype, out=out) # pylint: enable=too-many-arguments, redefined-outer-name # pylint: disable=redefined-outer-name, too-many-arguments @set_module('mxnet.numpy') -@wrap_data_api_creation_func +@wrap_ctx_to_device_func def empty_like(prototype, dtype=None, device=None, order='C', subok=False, shape=None): # pylint: disable=W0621 """ Return a new array with the same shape and type as a given array. @@ -2860,9 +2836,9 @@ def empty_like(prototype, dtype=None, device=None, order='C', subok=False, shape of the returned array. dtype : data-type, optional Overrides the data type of the result. - device : device context, optional + device : Device, optional Device context on which the memory is allocated. Default is - `mxnet.context.current_context()`. + `mxnet.device.current_device()`. order : {'C'}, optional Whether to store multidimensional data in C- or Fortran-contiguous (row- or column-wise) order in memory. Currently only supports C order. @@ -3020,7 +2996,8 @@ def any(a, axis=None, out=None, keepdims=False): @set_module('mxnet.numpy') -def identity(n, dtype=None, ctx=None): +@wrap_ctx_to_device_func +def identity(n, dtype=None, device=None): """ Return the identity array. @@ -3035,8 +3012,9 @@ def identity(n, dtype=None, ctx=None): Data-type of the output. When npx.is_np_default_dtype() returns False, default dtype is float32; When npx.is_np_default_dtype() returns True, default dtype is float64. - ctx : Context, optional - An optional device context (default is the current default context). + device : Device, optional + Device context on which the memory is allocated. Default is + `mxnet.device.current_device()`. Returns ------- @@ -3052,7 +3030,7 @@ def identity(n, dtype=None, ctx=None): [0., 1., 0.], [0., 0., 1.]]) """ - return _mx_nd_np.identity(n, dtype, ctx) + return _mx_nd_np.identity(n, dtype, device) # pylint: enable=redefined-outer-name @@ -4270,7 +4248,7 @@ def arcsin(x, out=None, **kwargs): For each value that cannot be expressed as a real number or infinity, it yields ``nan`` and sets the `invalid` floating point error flag. The inverse sine is also known as `asin` or sin^{-1}. - The output `ndarray` has the same `ctx` as the input `ndarray`. + The output `ndarray` has the same `device` as the input `ndarray`. This function differs from the original `numpy.arcsin `_ in the following aspects: @@ -4796,7 +4774,7 @@ def reciprocal(x, out=None, **kwargs): For integer arguments with absolute value larger than 1 the result is always zero because of the way Python handles integer division. 
For integer zero the result is an overflow. - The output `ndarray` has the same `ctx` as the input `ndarray`. + The output `ndarray` has the same `device` as the input `ndarray`. This function differs from the original `numpy.reciprocal `_ in the following aspects: @@ -4837,7 +4815,7 @@ def square(x, out=None, **kwargs): array([1., 4., 1.]) .. note:: - The output `ndarray` has the same `ctx` as the input `ndarray`. + The output `ndarray` has the same `device` as the input `ndarray`. This function differs from the original `numpy.square `_ in the following aspects: @@ -5554,7 +5532,7 @@ def histogram(a, bins=10, range=None, normed=None, weights=None, density=None): # pylint: disable=redefined-outer-name @set_module('mxnet.numpy') -@wrap_data_api_creation_func +@wrap_ctx_to_device_func def eye(N, M=None, k=0, dtype=None, device=None, **kwargs): """ Return a 2-D array with ones on the diagonal and zeros elsewhere. @@ -5573,9 +5551,9 @@ def eye(N, M=None, k=0, dtype=None, device=None, **kwargs): Data-type of the returned array. When npx.is_np_default_dtype() returns False, default dtype is float32; When npx.is_np_default_dtype() returns True, default dtype is float64. - device : device context, optional + device : Device, optional Device context on which the memory is allocated. Default is - `mxnet.context.current_context()`. + `mxnet.device.current_device()`. Returns ------- @@ -5593,13 +5571,13 @@ def eye(N, M=None, k=0, dtype=None, device=None, **kwargs): [0., 0., 1.], [0., 0., 0.]]) """ - return _mx_nd_np.eye(N, M, k, dtype, ctx=device, **kwargs) + return _mx_nd_np.eye(N, M, k, dtype, device=device, **kwargs) # pylint: enable=redefined-outer-name # pylint: disable=redefined-outer-name @set_module('mxnet.numpy') -@wrap_data_api_creation_func +@wrap_ctx_to_device_func def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis=0, device=None): # pylint: disable=too-many-arguments r""" Return evenly spaced numbers over a specified interval. @@ -5630,9 +5608,9 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis The axis in the result to store the samples. Relevant only if start or stop are array-like. By default (0), the samples will be along a new axis inserted at the beginning. Use -1 to get an axis at the end. - device : device context, optional + device : Device, optional Device context on which the memory is allocated. Default is - `mxnet.context.current_context()`. + `mxnet.device.current_device()`. Returns ------- @@ -5691,7 +5669,8 @@ def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis # pylint: disable=too-many-arguments, redefined-outer-name @set_module('mxnet.numpy') -def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None, axis=0, ctx=None): +@wrap_ctx_to_device_func +def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None, axis=0, device=None): r"""Return numbers spaced evenly on a log scale. In linear space, the sequence starts at ``base ** start`` @@ -5725,8 +5704,9 @@ def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None, axis=0, The axis in the result to store the samples. Relevant only if start or stop are array-like. By default (0), the samples will be along a new axis inserted at the beginning. Now, axis only support axis = 0. - ctx : Context, optional - An optional device context (default is the current default context). + device : Device, optional + Device context on which the memory is allocated. Default is + `mxnet.device.current_device()`. 
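Sketch of the spaced-value helpers with the renamed keyword (cpu is used so the snippet runs on any build):

import mxnet as mx

i = mx.np.eye(3, device=mx.cpu(0))                      # 2-D array with ones on the diagonal
l = mx.np.linspace(0, 1, num=5, device=mx.cpu(0))       # 5 evenly spaced samples in [0, 1]
g = mx.np.logspace(2.0, 3.0, num=4, device=mx.cpu(0))   # 100 ... 1000 on a log scale
print(i.device, l.device, g.device)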
Returns ------- @@ -5760,10 +5740,10 @@ def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None, axis=0, array([4. , 5.0396843, 6.349604 , 8. ]) >>> np.logspace(2.0, 3.0, num=4, base=2.0, dtype=np.int32) array([4, 5, 6, 8], dtype=int32) - >>> np.logspace(2.0, 3.0, num=4, ctx=npx.gpu(0)) - array([ 100. , 215.44347, 464.15887, 1000. ], ctx=gpu(0)) + >>> np.logspace(2.0, 3.0, num=4, device=npx.gpu(0)) + array([ 100. , 215.44347, 464.15887, 1000. ], device=gpu(0)) """ - return _mx_nd_np.logspace(start, stop, num, endpoint, base, dtype, axis, ctx=ctx) + return _mx_nd_np.logspace(start, stop, num, endpoint, base, dtype, axis, device=device) # pylint: enable=too-many-arguments, redefined-outer-name @@ -6058,7 +6038,8 @@ def tril(m, k=0): @set_module('mxnet.numpy') -def tri(N, M=None, k=0, dtype=None, ctx=None): # pylint: disable=redefined-outer-name +@wrap_ctx_to_device_func +def tri(N, M=None, k=0, dtype=None, device=None): # pylint: disable=redefined-outer-name r""" An array with ones at and below the given diagonal and zeros elsewhere. Parameters @@ -6090,11 +6071,11 @@ def tri(N, M=None, k=0, dtype=None, ctx=None): # pylint: disable=redefined-ou [1., 0., 0., 0., 0.], [1., 1., 0., 0., 0.]]) """ - return _mx_nd_np.tri(N, M, k, dtype, ctx) + return _mx_nd_np.tri(N, M, k, dtype, device) @set_module('mxnet.numpy') -def triu_indices(n, k=0, m=None, ctx=None): # pylint: disable=redefined-outer-name +def triu_indices(n, k=0, m=None, device=None): # pylint: disable=redefined-outer-name r""" Return the indices for the upper-triangle of an (n, m) array. Parameters @@ -6153,7 +6134,7 @@ def triu_indices(n, k=0, m=None, ctx=None): # pylint: disable=redefined-outer [ 8, 9, -1, -1], [ 12, 13, 14, -1]]) """ - return _mx_nd_np.triu_indices(n, k, m, ctx) + return _mx_nd_np.triu_indices(n, k, m, device) @set_module('mxnet.numpy') @@ -6283,7 +6264,7 @@ def triu(m, k=0): @set_module('mxnet.numpy') -@wrap_data_api_creation_func +@wrap_ctx_to_device_func def arange(start, stop=None, step=1, dtype=None, device=None): """Return evenly spaced values within a given interval. @@ -7914,7 +7895,8 @@ def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False): # pylint: # pylint: disable=redefined-outer-name @set_module('mxnet.numpy') -def indices(dimensions, dtype=None, ctx=None): +@wrap_ctx_to_device_func +def indices(dimensions, dtype=None, device=None): """Return an array representing the indices of a grid. Compute an array where the subarrays contain index values 0,1,... @@ -7926,9 +7908,9 @@ def indices(dimensions, dtype=None, ctx=None): The shape of the grid. dtype : data-type, optional The desired data-type for the array. Default is `int64`. - ctx : device context, optional + device : Device, optional Device context on which the memory is allocated. Default is - `mxnet.context.current_context()`. + `mxnet.device.current_device()`. Returns ------- @@ -7971,7 +7953,7 @@ def indices(dimensions, dtype=None, ctx=None): Note that it would be more straightforward in the above example to extract the required elements directly with ``x[:2, :3]``. """ - return _mx_nd_np.indices(dimensions=dimensions, dtype=dtype, ctx=ctx) + return _mx_nd_np.indices(dimensions=dimensions, dtype=dtype, device=device) # pylint: enable=redefined-outer-name @@ -8184,7 +8166,8 @@ def diag_indices_from(arr): # pylint: disable=redefined-outer-name @set_module('mxnet.numpy') -def hanning(M, dtype=None, ctx=None): +@wrap_ctx_to_device_func +def hanning(M, dtype=None, device=None): r"""Return the Hanning window. 
The Hanning window is a taper formed by using a weighted cosine. @@ -8194,8 +8177,9 @@ def hanning(M, dtype=None, ctx=None): M : int Number of points in the output window. If zero or less, an empty array is returned. - ctx : Context, optional - An optional device context (default is the current default context). + device : Device, optional + Device context on which the memory is allocated. Default is + `mxnet.device.current_device()`. Returns ------- @@ -8260,12 +8244,13 @@ def hanning(M, dtype=None, ctx=None): Text(0.5, 0, 'Sample') >>> plt.show() """ - return _mx_nd_np.hanning(M, dtype=dtype, ctx=ctx) + return _mx_nd_np.hanning(M, dtype=dtype, device=device) # pylint: disable=redefined-outer-name @set_module('mxnet.numpy') -def hamming(M, dtype=None, ctx=None): +@wrap_ctx_to_device_func +def hamming(M, dtype=None, device=None): r"""Return the hamming window. The hamming window is a taper formed by using a weighted cosine. @@ -8275,8 +8260,9 @@ def hamming(M, dtype=None, ctx=None): M : int Number of points in the output window. If zero or less, an empty array is returned. - ctx : Context, optional - An optional device context (default is the current default context). + device : Device, optional + Device context on which the memory is allocated. Default is + `mxnet.device.current_device()`. Returns ------- @@ -8339,12 +8325,13 @@ def hamming(M, dtype=None, ctx=None): Text(0.5, 0, 'Sample') >>> plt.show() """ - return _mx_nd_np.hamming(M, dtype=dtype, ctx=ctx) + return _mx_nd_np.hamming(M, dtype=dtype, device=device) # pylint: disable=redefined-outer-name @set_module('mxnet.numpy') -def blackman(M, dtype=None, ctx=None): +@wrap_ctx_to_device_func +def blackman(M, dtype=None, device=None): r"""Return the Blackman window. The Blackman window is a taper formed by using the first three @@ -8357,8 +8344,9 @@ def blackman(M, dtype=None, ctx=None): M : int Number of points in the output window. If zero or less, an empty array is returned. - ctx : Context, optional - An optional device context (default is the current default context). + device : Device, optional + Device context on which the memory is allocated. Default is + `mxnet.device.current_device()`. Returns ------- @@ -8416,7 +8404,7 @@ def blackman(M, dtype=None, ctx=None): Text(0.5, 0, 'Sample') >>> plt.show() """ - return _mx_nd_np.blackman(M, dtype=dtype, ctx=ctx) + return _mx_nd_np.blackman(M, dtype=dtype, device=device) @set_module('mxnet.numpy') @@ -10666,7 +10654,7 @@ def interp(x, xp, fp, left=None, right=None, period=None): # pylint: disable=to # pylint: disable=redefined-outer-name @set_module('mxnet.numpy') -@wrap_data_api_creation_func +@wrap_ctx_to_device_func def full_like(a, fill_value, dtype=None, order='C', device=None, out=None): # pylint: disable=too-many-arguments """ Return a full array with the same shape and type as a given array. @@ -10684,9 +10672,9 @@ def full_like(a, fill_value, dtype=None, order='C', device=None, out=None): # py order : {'C'}, optional Whether to store multidimensional data in C- or Fortran-contiguous (row- or column-wise) order in memory. Currently only supports C order. - device : device context, optional + device : Device, optional Device context on which the memory is allocated. Default is - `mxnet.context.current_context()`. + `mxnet.device.current_device()`. out : ndarray or None, optional A location into which the result is stored. If provided, it must have the same shape and dtype as input ndarray. 
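A small sketch of the window helpers with the new keyword; all three are wrapped by the compatibility decorator, so the legacy ctx= spelling still works:

import mxnet as mx

w1 = mx.np.hanning(5, device=mx.cpu(0))
w2 = mx.np.hamming(5)                       # device defaults to current_device()
w3 = mx.np.blackman(5, ctx=mx.cpu(0))       # legacy ctx= remapped by wrap_ctx_to_device_func
print(w1.device, w2.device, w3.device)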
@@ -10719,13 +10707,13 @@ def full_like(a, fill_value, dtype=None, order='C', device=None, out=None): # py >>> np.full_like(y, 0.1) array([0.1, 0.1, 0.1, 0.1, 0.1, 0.1]) """ - return _mx_nd_np.full_like(a, fill_value=fill_value, dtype=dtype, order=order, ctx=device, out=out) + return _mx_nd_np.full_like(a, fill_value=fill_value, dtype=dtype, order=order, device=device, out=out) # pylint: enable=redefined-outer-name # pylint: disable=redefined-outer-name @set_module('mxnet.numpy') -@wrap_data_api_creation_func +@wrap_ctx_to_device_func def zeros_like(a, dtype=None, order='C', device=None, out=None): """ Return an array of zeros with the same shape and type as a given array. @@ -10741,9 +10729,9 @@ def zeros_like(a, dtype=None, order='C', device=None, out=None): order : {'C'}, optional Whether to store multidimensional data in C- or Fortran-contiguous (row- or column-wise) order in memory. Currently only supports C order. - device : device context, optional + device : Device, optional Device context on which the memory is allocated. Default is - `mxnet.context.current_context()`. + `mxnet.device.current_device()`. out : ndarray or None, optional A location into which the result is stored. If provided, it must have the same shape and dtype as input ndarray. @@ -10780,13 +10768,13 @@ def zeros_like(a, dtype=None, order='C', device=None, out=None): >>> np.zeros_like(y) array([0., 0., 0.], dtype=float64) """ - return _mx_nd_np.full_like(a, fill_value=0, dtype=dtype, order=order, ctx=device, out=out) + return _mx_nd_np.full_like(a, fill_value=0, dtype=dtype, order=order, device=device, out=out) # pylint: enable=redefined-outer-name # pylint: disable=redefined-outer-name @set_module('mxnet.numpy') -@wrap_data_api_creation_func +@wrap_ctx_to_device_func def ones_like(a, dtype=None, order='C', device=None, out=None): """ Return an array of ones with the same shape and type as a given array. @@ -10802,9 +10790,9 @@ def ones_like(a, dtype=None, order='C', device=None, out=None): order : {'C'}, optional Whether to store multidimensional data in C- or Fortran-contiguous (row- or column-wise) order in memory. Currently only supports C order. - device : device context, optional + device : Device, optional Device context on which the memory is allocated. Default is - `mxnet.context.current_context()`. + `mxnet.device.current_device()`. out : ndarray or None, optional A location into which the result is stored. If provided, it must have the same shape and dtype as input ndarray. @@ -10841,7 +10829,7 @@ def ones_like(a, dtype=None, order='C', device=None, out=None): >>> np.ones_like(y) array([1., 1., 1.], dtype=float64) """ - return _mx_nd_np.full_like(a, fill_value=1, dtype=dtype, order=order, ctx=device, out=out) + return _mx_nd_np.full_like(a, fill_value=1, dtype=dtype, order=order, device=device, out=out) # pylint: enable=redefined-outer-name @@ -12321,7 +12309,7 @@ def sum(a, axis=None, dtype=None, out=None, keepdims=None, initial=None, where=N # pylint: disable=redefined-outer-name @set_module('mxnet.numpy') -@wrap_data_api_creation_func +@wrap_ctx_to_device_func def asarray(obj, dtype=None, device=None, copy=None): """ Convert the input to an array. @@ -12334,9 +12322,9 @@ def asarray(obj, dtype=None, device=None, copy=None): or an object supporting DLPack or the Python buffer protocol. dtype : dtype, Optional output array data type. Default: None . - device : device context, optional + device : Device, optional Device context on which the memory is allocated. 
Default is - `mxnet.context.current_context()`. + `mxnet.device.current_device()`. copy : bool, Optional Whether or not to make a copy of the input. If True, always copies. @@ -12377,13 +12365,13 @@ def asarray(obj, dtype=None, device=None, copy=None): dtype = obj.dtype if dtype is None else dtype elif isinstance(obj, ndarray): dtype = obj.dtype if dtype is None else dtype - array = _as_mx_np_array(obj, ctx=device, zero_copy=copy) + array = _as_mx_np_array(obj, device=device, zero_copy=copy) return array.astype(dtype) # pylint: disable=redefined-outer-name @set_module('mxnet.numpy') -@wrap_data_api_creation_func +@wrap_ctx_to_device_func def from_dlpack(x): """ Returns a np.ndarray backed by a dlpack tensor. diff --git a/python/mxnet/util.py b/python/mxnet/util.py index d3315404fd4d..406186e866f9 100644 --- a/python/mxnet/util.py +++ b/python/mxnet/util.py @@ -651,13 +651,12 @@ def _wrap_np_binary_func(x1, x2, out=None, **kwargs): return _wrap_np_binary_func -def wrap_data_api_creation_func(func): - """A convenience decorator for wrapping data apis standardized creation functions to provide - context keyward backward compatibility +def wrap_ctx_to_device_func(func): + """A convenience decorator for converting ctx to device keyward backward compatibility Parameters ---------- - func : a numpy-compatible array creation function to be wrapped for context keyward change. + func : a function to be wrapped for context keyward change. Returns ------- @@ -665,13 +664,13 @@ def wrap_data_api_creation_func(func): A function wrapped with context keyward changes. """ @functools.wraps(func) - def _wrap_api_creation_func(*args, **kwargs): + def _wrap_func_with_ctx(*args, **kwargs): if len(kwargs) != 0: device = kwargs.pop('ctx', None) if device is not None: kwargs['device'] = device return func(*args, **kwargs) - return _wrap_api_creation_func + return _wrap_func_with_ctx # pylint: disable=exec-used From eac4095a7b6779ab9b40c8eae5a1db328db65991 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Wed, 13 Oct 2021 18:56:47 -0700 Subject: [PATCH 23/41] ctx => device --- python/mxnet/device.py | 2 +- python/mxnet/gluon/block.py | 166 +++++++------ python/mxnet/gluon/parameter.py | 226 +++++++++--------- python/mxnet/test_utils.py | 20 +- python/mxnet/util.py | 74 +++--- .../numpy/np_elemwise_broadcast_logic_op.h | 3 +- src/operator/numpy/np_init_op.cu | 4 +- src/operator/numpy/np_init_op.h | 200 +++++++--------- 8 files changed, 351 insertions(+), 344 deletions(-) diff --git a/python/mxnet/device.py b/python/mxnet/device.py index cf2c9a3af4ee..3a69ee7bd76a 100644 --- a/python/mxnet/device.py +++ b/python/mxnet/device.py @@ -269,7 +269,7 @@ def gpu_memory_info(device_id=0): return (free.value, total.value) -_current = contextvars.DeviceVar('namemanager', default=Device('cpu', 0)) +_current = contextvars.ContextVar('namemanager', default=Device('cpu', 0)) def current_device(): diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 0c129912d169..8c0b416c8366 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -33,7 +33,7 @@ from ..base import mx_real_t, MXNetError, NDArrayHandle, SymbolHandle, py_str, check_call, _LIB from .. 
import symbol, ndarray, initializer, autograd, _deferred_compute as dc, name as _name, \ - profiler as _profiler, context as _context + profiler as _profiler, device as _device from ..symbol.numpy import _symbol as np_symbol from ..symbol import Symbol, fromjson from ..ndarray import NDArray @@ -42,7 +42,7 @@ from .utils import _check_same_symbol_type, _check_all_np_ndarrays, _check_block_input_np_ndarrays from .. import numpy_extension as _mx_npx from .. import numpy as _mx_np, ndarray as nd -from .. util import is_np_array, np_shape, np_array +from .. util import is_np_array, np_shape, np_array, wrap_ctx_to_device_func _naming_counter = contextvars.ContextVar('namecounter') @@ -67,11 +67,11 @@ def _block_scope(block): _prefix.reset(prefix_token) -def _gather_type_ctx_info(args): +def _gather_type_device_info(args): """Analyze the elements inside the nested args object and find: - If there exists ndarray - If there exists symbol - - All contexts appearing in args + - All devices appearing in args Parameters ---------- @@ -84,32 +84,32 @@ def _gather_type_ctx_info(args): Whether the elements in args contains symbols has_ndarray : bool Whether the elements in args contains ndarrays - ctx_set : set of mxnet.context.Context - Contains all possible contexts of the inner ndarrays in args. Can be empty if there is no + device_set : set of mxnet.device.Device + Contains all possible devices of the inner ndarrays in args. Can be empty if there is no ndarray inside args. - first_ctx : mxnet.context.Context or None - Context of the first appeared NDArray (for backward-compatibility) + first_device : mxnet.device.Device or None + Device of the first appeared NDArray (for backward-compatibility) """ if isinstance(args, NDArray): - return False, True, {args.ctx}, args.ctx + return False, True, {args.device}, args.device elif isinstance(args, Symbol): return True, False, set(), None elif isinstance(args, (list, tuple)): has_symbol = False has_ndarray = False - ctx_set = set() - first_ctx = None + device_set = set() + first_device = None for ele in args: - ele_has_sym, ele_has_nd, ele_ctx_set, ele_first_ctx =\ - _gather_type_ctx_info(ele) + ele_has_sym, ele_has_nd, ele_device_set, ele_first_device =\ + _gather_type_device_info(ele) has_symbol = has_symbol or ele_has_sym has_ndarray = has_ndarray or ele_has_nd - if first_ctx is None and ele_first_ctx is not None: - first_ctx = ele_first_ctx - ctx_set = ctx_set | ele_ctx_set + if first_device is None and ele_first_device is not None: + first_device = ele_first_device + device_set = device_set | ele_device_set if has_symbol and has_ndarray: break - return has_symbol, has_ndarray, ctx_set, first_ctx + return has_symbol, has_ndarray, device_set, first_device else: return False, False, set(), None @@ -220,8 +220,8 @@ def forward(self, x): return mx.npx.relu(self.dense1(x)) model = Model() - model.initialize(ctx=mx.cpu(0)) - model(mx.np.zeros((10, 10), ctx=mx.cpu(0))) + model.initialize(device=mx.cpu(0)) + model(mx.np.zeros((10, 10), device=mx.cpu(0))) Child :py:class:`Block` assigned this way will be registered and :py:meth:`collect_params` @@ -375,7 +375,8 @@ def save_parameters(self, filename, deduplicate=False): else: ndarray.save(filename, arg_dict) - def load_parameters(self, filename, ctx=None, allow_missing=False, + @wrap_ctx_to_device_func + def load_parameters(self, filename, device=None, allow_missing=False, ignore_extra=False, cast_dtype=False, dtype_source='current'): """Load parameters from file previously saved by `save_parameters`. 
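A minimal end-to-end sketch of the renamed Gluon entry points, assuming the NumPy-array frontend used throughout this branch; the Dense layer and the file name are purely illustrative:

import mxnet as mx
from mxnet.gluon import nn

net = nn.Dense(4, in_units=3)                            # in_units given, so no deferred init
net.initialize(device=mx.cpu(0))
net.save_parameters('dense.params')
net.load_parameters('dense.params', device=mx.cpu(0))    # ctx= is still accepted and remapped to device=
y = net(mx.np.ones((2, 3), device=mx.cpu(0)))
print(y.device)                                          # cpu(0)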
@@ -383,8 +384,8 @@ def load_parameters(self, filename, ctx=None, allow_missing=False, ---------- filename : str Path to parameter file. - ctx : Context or list of Context, default cpu() - Context(s) to initialize loaded parameters on. + device : Device or list of Device, default cpu() + Device(s) to initialize loaded parameters on. allow_missing : bool, default False Whether to silently skip loading parameters not represents in the file. ignore_extra : bool, default False @@ -428,9 +429,9 @@ def load_parameters(self, filename, ctx=None, allow_missing=False, if not loaded: return full_dict = {'params': loaded, 'filename': filename} - self.load_dict(full_dict, ctx, allow_missing, ignore_extra, cast_dtype, dtype_source) + self.load_dict(full_dict, device, allow_missing, ignore_extra, cast_dtype, dtype_source) - def load_dict(self, param_dict, ctx=None, allow_missing=False, + def load_dict(self, param_dict, device=None, allow_missing=False, ignore_extra=False, cast_dtype=False, dtype_source="current"): """Load parameters from dict @@ -438,8 +439,9 @@ def load_dict(self, param_dict, ctx=None, allow_missing=False, ---------- param_dict : dict Dictionary containing model parameters - ctx : Context or list of Context - Context(s) initialize loaded parameters on. + device : Device, optional + Device context on which the memory is allocated. Default is + `mxnet.device.current_device()`. allow_missing : bool, default False Whether to silently skip loading parameters not represented in the file. ignore_extra : bool, default False @@ -475,8 +477,8 @@ def load_dict(self, param_dict, ctx=None, allow_missing=False, "Set allow_missing=True to ignore missing parameters."%( name, error_str, _brief_print_list(loaded.keys())) - if ctx is None: - ctx = _context.current_context() + if device is None: + device = _device.current_device() for name in loaded: if not ignore_extra and name not in params: raise ValueError( @@ -487,7 +489,7 @@ def load_dict(self, param_dict, ctx=None, allow_missing=False, param = loaded[name] if isinstance(param, np.ndarray): param = _mx_np.array(param) if is_np_array() else nd.array(param) - params[name]._load_init(param, ctx, cast_dtype=cast_dtype, dtype_source=dtype_source) + params[name]._load_init(param, device, cast_dtype=cast_dtype, dtype_source=dtype_source) def register_child(self, block, name=None): """Registers block as a child of self. :py:class:`Block` s assigned to self as @@ -551,7 +553,8 @@ def apply(self, fn): fn(self) return self - def initialize(self, init=initializer.Uniform(), ctx=None, verbose=False, + @wrap_ctx_to_device_func + def initialize(self, init=initializer.Uniform(), device=None, verbose=False, force_reinit=False): """Initializes :py:class:`Parameter` s of this :py:class:`Block` and its children. @@ -560,8 +563,8 @@ def initialize(self, init=initializer.Uniform(), ctx=None, verbose=False, init : Initializer Global default Initializer to be used when :py:meth:`Parameter.init` is ``None``. Otherwise, :py:meth:`Parameter.init` takes precedence. - ctx : Context or list of Context - Keeps a copy of Parameters on one or many context(s). + device : Device or list of Device + Keeps a copy of Parameters on one or many device(s). verbose : bool, default False Whether to verbosely print out details on initialization. 
force_reinit : bool, default False @@ -571,7 +574,7 @@ def initialize(self, init=initializer.Uniform(), ctx=None, verbose=False, if verbose: init.set_verbosity(verbose=verbose) for v in params.values(): - v.initialize(None, ctx, init, force_reinit=force_reinit) + v.initialize(None, device, init, force_reinit=force_reinit) def save(self, prefix): """Save the model architecture and parameters to load again later @@ -731,7 +734,7 @@ def cast(self, dtype): def zero_grad(self): """Sets all Parameters' gradient buffer to 0.""" - # collect gradient arrays for each ctx + # collect gradient arrays for each device arrays = defaultdict(list) params = self.collect_params() for p in params.values(): @@ -742,9 +745,9 @@ def zero_grad(self): ndarray.zeros_like(g, out=g) else: if is_np_array(): - arrays[g.ctx].append(g.as_nd_ndarray()) + arrays[g.device].append(g.as_nd_ndarray()) else: - arrays[g.ctx].append(g) + arrays[g.device].append(g) if len(arrays) == 0: return @@ -752,18 +755,24 @@ def zero_grad(self): for arr in arrays.values(): ndarray.reset_arrays(*arr, num_arrays=len(arr)) - def reset_ctx(self, ctx): - """Re-assign all Parameters to other contexts. + def reset_device(self, device): + """Re-assign all Parameters to other devices. Parameters ---------- - ctx : Context or list of Context, default :py:meth:`context.current_context()`. - Assign Parameter to given context. If ctx is a list of Context, a - copy will be made for each context. + device : Device or list of Device, default :py:meth:`device.current_device()`. + Assign Parameter to given device. If device is a list of Device, a + copy will be made for each device. """ params = self.collect_params() for i in params.values(): - i.reset_ctx(ctx) + i.reset_device(device) + + def reset_ctx(self, ctx): + """This function has been deprecated. Please refer to ``Block.reset_device``.""" + warnings.warn('Block.reset_ctx has been renamed to' + ' Block.reset_device', DeprecationWarning) + self.reset_device(ctx) def setattr(self, name, value): """Set an attribute to a new value for all Parameters. @@ -1013,9 +1022,9 @@ def forward(self, x): return mx.npx.relu(self.dense1(x)) model = Model() - model.initialize(ctx=mx.cpu(0)) + model.initialize(device=mx.cpu(0)) model.hybridize() - model(mx.np.zeros((10, 10), ctx=mx.cpu(0))) + model(mx.np.zeros((10, 10), device=mx.cpu(0))) Forward computation in :py:class:`HybridBlock` must be static to work with :py:class:`Symbol` s, i.e. 
you cannot call :py:meth:`NDArray.asnumpy`, :py:attr:`NDArray.shape`, @@ -1130,9 +1139,9 @@ def _build_cache(self, *args, update_graph=True): arg_dict, aux_dict = dict(), dict() if self._backend: - # set context for inputs - _, _, ctx_set, _ = _gather_type_ctx_info(list(args)) - ctx = ctx_set.pop() if len(ctx_set) > 0 else None + # set device for inputs + _, _, device_set, _ = _gather_type_device_info(list(args)) + device = device_set.pop() if len(device_set) > 0 else None # get list of params in the order of out.list_arguments input_shapes = dict() for name in out.list_arguments(): @@ -1158,7 +1167,7 @@ def _build_cache(self, *args, update_graph=True): aux_dict[name] = params[name].data() # Partition the graph - out = out.optimize_for(self._backend, arg_dict, aux_dict, ctx, input_shapes, **self._backend_opts) + out = out.optimize_for(self._backend, arg_dict, aux_dict, device, input_shapes, **self._backend_opts) #update cached graph with partitioned graph if update_graph: @@ -1198,7 +1207,7 @@ def _build_cache(self, *args, update_graph=True): param = Parameter(name, dtype=param_data.dtype) param._var_name = name serialization_name = name # HybridBlock.export - param._load_init(param_data, param_data.context) + param._load_init(param_data, param_data.device) triple = (False, serialization_name, param) self._cached_op_args.append(triple) @@ -1334,16 +1343,16 @@ def optimize_for(self, x, *args, backend=None, clear=False, inline_limit, forward_bulk_size, backward_bulk_size) # do part of forward API call - has_symbol, has_ndarray, ctx_set, _ = _gather_type_ctx_info([x] + list(args)) + has_symbol, has_ndarray, device_set, _ = _gather_type_device_info([x] + list(args)) if not has_symbol and not has_ndarray: raise ValueError('In HybridBlock, there must be one NDArray or one Symbol in the input.' ' Please check the type of the args.\n') - if len(ctx_set) > 1: - raise ValueError('Found multiple contexts in the input, ' + if len(device_set) > 1: + raise ValueError('Found multiple devices in the input, ' 'After hybridized, the HybridBlock only supports one input ' - 'context. You can print the ele.ctx in the ' - 'input arguments to inspect their contexts. ' - 'Find all contexts = {}'.format(ctx_set)) + 'device. You can print the ele.device in the ' + 'input arguments to inspect their devices. ' + 'Find all devices = {}'.format(device_set)) self._build_cache(x, *args) assert self._cached_op, "Gluon failed to build the cache. " \ @@ -1580,18 +1589,18 @@ def __call__(self, x, *args): 'Must define {name}.forward. ' 'Defining {name}.hybrid_forward is deprecated.'.format(name=type(self).__name__)) - _, has_ndarray, ctx_set, first_ctx = _gather_type_ctx_info([x] + list(args)) + _, has_ndarray, device_set, first_device = _gather_type_device_info([x] + list(args)) if not has_ndarray: raise ValueError('In HybridBlock, there must be one NDArray in the input.' ' Please check the type of the args.\n') if self._active and not dc.is_deferred_compute(): # Do not call CachedOp if not hybridized or inside deferred compute mode. - if len(ctx_set) > 1: - raise ValueError('Find multiple contexts in the input, ' + if len(device_set) > 1: + raise ValueError('Find multiple devices in the input, ' 'After hybridized, the HybridBlock only supports one input ' - 'context. You can print the ele.ctx in the ' - 'input arguments to inspect their contexts. ' - 'Find all contexts = {}'.format(ctx_set)) + 'device. You can print the ele.device in the ' + 'input arguments to inspect their devices. 
' + 'Find all devices = {}'.format(device_set)) if not self._called_infer_shape_already: self.infer_shape(x, *args) @@ -1608,7 +1617,7 @@ def __call__(self, x, *args): # HybridBlock is a child block of a HybridBlock that has been hybridized. return super().__call__(x, *args) - with first_ctx: + with first_device: return self._call_cached_op(x, *args) def forward(self, x, *args): @@ -1617,23 +1626,30 @@ def forward(self, x, *args): raise NotImplementedError - def reset_ctx(self, ctx): - """Re-assign all Parameters to other contexts. If the Block is hybridized, it will reset the _cached_op_args. + def reset_device(self, device): + """Re-assign all Parameters to other devices. If the Block is hybridized, it will reset the _cached_op_args. Parameters ---------- - ctx : Context or list of Context, default :py:meth:`context.current_context()`. - Assign Parameter to given context. If ctx is a list of Context, a - copy will be made for each context. + device : Device or list of Device, default :py:meth:`device.current_device()`. + Assign Parameter to given device. If device is a list of Device, a + copy will be made for each device. """ params = self.collect_params() if self._cached_op: for p in self._cached_op_args: # resetting parameters creating by the partitioning backend if p.name not in params: - p.reset_ctx(ctx) + p.reset_device(device) for p in params.values(): - p.reset_ctx(ctx) + p.reset_device(device) + + def reset_ctx(self, ctx): + """This function has been deprecated. Please refer to ``HybridBlock.reset_device``.""" + warnings.warn('HybridBlock.reset_ctx has been renamed to' + ' HybridBlock.reset_device', DeprecationWarning) + self.reset_device(ctx) + class SymbolBlock(HybridBlock): """Construct block from symbol. This is useful for using pre-trained models @@ -1653,7 +1669,7 @@ class SymbolBlock(HybridBlock): Examples -------- >>> # To extract the feature from fc1 and fc2 layers of AlexNet: - >>> alexnet = gluon.model_zoo.vision.alexnet(pretrained=True, ctx=mx.cpu()) + >>> alexnet = gluon.model_zoo.vision.alexnet(pretrained=True, device=mx.cpu()) >>> inputs = mx.sym.var('data') >>> out = alexnet(inputs) >>> internals = out.get_internals() @@ -1667,7 +1683,7 @@ class SymbolBlock(HybridBlock): >>> print(feat_model(x)) """ @staticmethod - def imports(symbol_file, input_names, param_file=None, ctx=None, allow_missing=False, + def imports(symbol_file, input_names, param_file=None, device=None, allow_missing=False, ignore_extra=False): """Import model previously saved by `gluon.HybridBlock.export` as a `gluon.SymbolBlock` for use in Gluon. @@ -1680,8 +1696,8 @@ def imports(symbol_file, input_names, param_file=None, ctx=None, allow_missing=F List of input variable names param_file : str, optional Path to parameter file. - ctx : Context, default None - The context to initialize `gluon.SymbolBlock` on. + device : Device, default None + The device to initialize `gluon.SymbolBlock` on. allow_missing : bool, default False Whether to silently skip loading parameters not represents in the file. 
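Sketch of importing an exported model onto a given device; the file names are illustrative and not taken from the patch:

import mxnet as mx

net = mx.gluon.SymbolBlock.imports('model-symbol.json', ['data'],
                                   param_file='model-0000.params',
                                   device=mx.cpu(0))
net.reset_device(mx.cpu(0))     # re-assigns every Parameter; reset_ctx() is the deprecated alias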
ignore_extra : bool, default False @@ -1719,7 +1735,7 @@ def imports(symbol_file, input_names, param_file=None, ctx=None, allow_missing=F inputs = [symbol.var(i).as_np_ndarray() if is_np_array() else symbol.var(i) for i in input_names] ret = SymbolBlock(sym, inputs) if param_file is not None: - ret.load_parameters(param_file, ctx, allow_missing, ignore_extra, True, 'saved') + ret.load_parameters(param_file, device, allow_missing, ignore_extra, True, 'saved') return ret def __repr__(self): @@ -1817,7 +1833,7 @@ def forward(self, x, *args): 'is not yet supported in Gluon 2.') if isinstance(x, NDArray): - with x.ctx: + with x.device: return self._call_cached_op(x, *args) assert isinstance(x, Symbol), \ diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 49405e189692..1907a785a91d 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -29,11 +29,11 @@ import numpy as np from ..base import mx_real_t, MXNetError -from .. import symbol, ndarray, initializer, context, _deferred_compute as dc -from ..context import Context, cpu +from .. import symbol, ndarray, initializer, device as _device, _deferred_compute as dc +from ..device import Device, cpu from .. import autograd from .utils import shape_is_known -from ..util import is_np_shape, is_np_array +from ..util import is_np_shape, is_np_array, wrap_ctx_to_device_func from .. import numpy as _mx_np # pylint: disable=reimported # pylint: disable= invalid-name @@ -47,17 +47,17 @@ class DeferredInitializationError(MXNetError): class Parameter(object): """A Container holding parameters (weights) of Blocks. - :py:class:`Parameter` holds a copy of the parameter on each :py:class:`Context` after + :py:class:`Parameter` holds a copy of the parameter on each :py:class:`Device` after it is initialized with ``Parameter.initialize(...)``. 
If :py:attr:`grad_req` is - not ``'null'``, it will also hold a gradient array on each :py:class:`Context`:: + not ``'null'``, it will also hold a gradient array on each :py:class:`Device`:: - ctx = mx.gpu(0) - x = mx.np.zeros((16, 100), ctx=ctx) + device = mx.gpu(0) + x = mx.np.zeros((16, 100), device=device) w = mx.gluon.Parameter('fc_weight', shape=(64, 100), init=mx.init.Xavier()) b = mx.gluon.Parameter('fc_bias', shape=(64,), init=mx.init.Zero()) - w.initialize(ctx=ctx) - b.initialize(ctx=ctx) - out = mx.npx.fully_connected(x, w.data(ctx), b.data(ctx), num_hidden=64) + w.initialize(device=device) + b.initialize(device=device) + out = mx.npx.fully_connected(x, w.data(device), b.data(device), num_hidden=64) Parameters ---------- @@ -111,8 +111,8 @@ def __init__(self, name='weight', grad_req='write', shape=None, dtype=mx_real_t, self._var_name = None self._data = None self._grad = None - self._ctx_list = None - self._ctx_map = None + self._device_list = None + self._device_map = None self._trainer = None self._deferred_init = () self._differentiable = differentiable @@ -218,24 +218,24 @@ def _set_trainer(self, trainer): else: self._trainer = trainer - def _check_and_get(self, arr_list, ctx): + def _check_and_get(self, arr_list, device): if arr_list is not None: - if ctx is list: + if device is list: return arr_list - if ctx is None: + if device is None: if len(arr_list) == 1: return arr_list[0] else: - ctx = context.current_context() - ctx_list = self._ctx_map[ctx.device_typeid&1] - if ctx.device_id < len(ctx_list): - idx = ctx_list[ctx.device_id] + device = _device.current_device() + device_list = self._device_map[device.device_typeid&1] + if device.device_id < len(device_list): + idx = device_list[device.device_id] if idx is not None: return arr_list[idx] raise RuntimeError( - "Parameter '%s' was not initialized on context %s. " + "Parameter '%s' was not initialized on device %s. " "It was only initialized on %s."%( - self.name, str(ctx), str(self._ctx_list))) + self.name, str(device), str(self._device_list))) if self._deferred_init: raise DeferredInitializationError( "Parameter '%s' has not been initialized yet because initialization was " \ @@ -250,7 +250,8 @@ def _check_and_get(self, arr_list, ctx): "because the later does not include Parameters of " \ "nested child Blocks"%(self.name)) - def _get_row_sparse(self, arr_list, ctx, row_id): + @wrap_ctx_to_device_func + def _get_row_sparse(self, arr_list, device, row_id): """ Get row_sparse data from row_sparse parameters based on row_id. """ # get row sparse params based on row ids if not isinstance(row_id, ndarray.NDArray): @@ -259,21 +260,22 @@ def _get_row_sparse(self, arr_list, ctx, row_id): if not trainer: raise RuntimeError("Cannot get row_sparse data for Parameter '%s' when no " \ "Trainer is created with it."%self.name) - results = self._check_and_get(arr_list, ctx) + results = self._check_and_get(arr_list, device) # fetch row sparse params from the trainer trainer._row_sparse_pull(self, results, row_id) return results - def _load_init(self, data, ctx, cast_dtype=False, dtype_source='current'): + @wrap_ctx_to_device_func + def _load_init(self, data, device, cast_dtype=False, dtype_source='current'): """ (Re)initializes by loading from data. Parameters ---------- data : NDArray The data to load - ctx : Context or list of Context - Context(s) initialize loaded parameters on. + device : Device or list of Device + Device(s) initialize loaded parameters on. 
cast_dtype : bool, default False Cast the data type of the parameter dtype_source : str, default 'current' @@ -313,23 +315,23 @@ def _load_init(self, data, ctx, cast_dtype=False, dtype_source='current'): self.name, str(self.dtype), str(data.dtype)) if self._stype != data.stype: data = data.tostype(self._stype) - if isinstance(ctx, Context): - ctx = [ctx] + if isinstance(device, Device): + device = [device] if self._data is None: if self._deferred_init: - assert ctx is None or set(ctx) == set(self._deferred_init[1]), \ + assert device is None or set(device) == set(self._deferred_init[1]), \ "Failed to load Parameter '%s' on %s because it was " \ "previous initialized on %s."%( - self.name, str(ctx), str(self.list_ctx())) - ctx = self._deferred_init[1] - elif ctx is None: - ctx = [cpu()] - self._init_impl(data, ctx) + self.name, str(device), str(self.list_device())) + device = self._deferred_init[1] + elif device is None: + device = [cpu()] + self._init_impl(data, device) else: - assert ctx is None or set(ctx) == set(self.list_ctx()), \ + assert device is None or set(device) == set(self.list_device()), \ "Failed to load Parameter '%s' on %s because it was " \ "previous initialized on %s."%( - self.name, str(ctx), str(self.list_ctx())) + self.name, str(device), str(self.list_device())) self.set_data(data) self._deferred_init = () @@ -337,7 +339,7 @@ def _finish_deferred_init(self): """Finishes deferred initialization.""" if not self._deferred_init: return - init, ctx, default_init, data = self._deferred_init + init, device, default_init, data = self._deferred_init self._deferred_init = () assert shape_is_known(self.shape), \ @@ -348,7 +350,7 @@ def _finish_deferred_init(self): with autograd.pause(), dc.context(False): if data is None: - kwargs = {'shape': self.shape, 'dtype': self.dtype, 'ctx': context.cpu()} + kwargs = {'shape': self.shape, 'dtype': self.dtype, 'device': cpu()} if is_np_array(): if self._stype != 'default': raise ValueError("mxnet.numpy.zeros does not support stype = {}" @@ -361,19 +363,19 @@ def _finish_deferred_init(self): initializer.create(default_init)( initializer.InitDesc(self.name, {'__init__': init}), data) - self._init_impl(data, ctx) + self._init_impl(data, device) - def _init_impl(self, data, ctx_list): + def _init_impl(self, data, device_list): """Sets data and grad.""" - self._ctx_list = list(ctx_list) - self._ctx_map = [[], []] - for i, ctx in enumerate(self._ctx_list): - dev_list = self._ctx_map[ctx.device_typeid&1] - while len(dev_list) <= ctx.device_id: + self._device_list = list(device_list) + self._device_map = [[], []] + for i, device in enumerate(self._device_list): + dev_list = self._device_map[device.device_typeid&1] + while len(dev_list) <= device.device_id: dev_list.append(None) - dev_list[ctx.device_id] = i + dev_list[device.device_id] = i - self._data = [data.copyto(ctx) for ctx in self._ctx_list] + self._data = [data.copyto(device) for device in self._device_list] self._init_grad() def _init_grad(self): @@ -386,31 +388,31 @@ def _init_grad(self): if self._grad_stype != 'default': raise ValueError("mxnet.numpy.zeros does not support stype = {}" .format(self._grad_stype)) - self._grad = [_mx_np.zeros(shape=i.shape, dtype=i.dtype, device=i.ctx) + self._grad = [_mx_np.zeros(shape=i.shape, dtype=i.dtype, device=i.device) for i in self._data] else: - self._grad = [ndarray.zeros(shape=i.shape, dtype=i.dtype, ctx=i.ctx, + self._grad = [ndarray.zeros(shape=i.shape, dtype=i.dtype, device=i.device, stype=self._grad_stype) for i in self._data] 
autograd.mark_variables(self._check_and_get(self._data, list), self._grad, self.grad_req) def _reduce(self): - """Reduce data from multiple context to cpu.""" - ctx = context.cpu() + """Reduce data from multiple device to cpu.""" + device = cpu() if self._stype == 'default': block = self.list_data() if len(block) > 1: if is_np_array(): - data = sum([w.copyto(ctx) for w in block]) / len(block) + data = sum([w.copyto(device) for w in block]) / len(block) else: - data = ndarray.add_n(*(w.copyto(ctx) for w in block)) / len(block) + data = ndarray.add_n(*(w.copyto(device) for w in block)) / len(block) else: - data = self.data().copyto(ctx) + data = self.data().copyto(device) else: # fetch all rows for 'row_sparse' param - all_row_ids = ndarray.arange(0, self.shape[0], dtype='int64', ctx=ctx) - data = ndarray.zeros(self.shape, stype='row_sparse', ctx=ctx) + all_row_ids = ndarray.arange(0, self.shape[0], dtype='int64', device=device) + data = ndarray.zeros(self.shape, stype='row_sparse', device=device) trainer = self._trainer() if self._trainer else None if not trainer: raise RuntimeError("Cannot reduce row_sparse data for Parameter '%s' when no " \ @@ -418,7 +420,8 @@ def _reduce(self): trainer._row_sparse_pull(self, data, all_row_ids, full_idx=True) return data - def initialize(self, init=None, ctx=None, default_init=initializer.Uniform(), + @wrap_ctx_to_device_func + def initialize(self, init=None, device=None, default_init=initializer.Uniform(), force_reinit=False): """Initializes parameter and gradient arrays. Only used for :py:class:`NDArray` API. @@ -426,9 +429,9 @@ def initialize(self, init=None, ctx=None, default_init=initializer.Uniform(), ---------- init : Initializer The initializer to use. Overrides :py:meth:`Parameter.init` and default_init. - ctx : Context or list of Context, defaults to :py:meth:`context.current_context()`. - Initialize Parameter on given context. If ctx is a list of Context, a - copy will be made for each context. + device : Device or list of Device, default :py:meth:`device.current_device()`. + Assign Parameter to given device. If device is a list of Device, a + copy will be made for each device. .. note:: Copies are independent arrays. User is responsible for keeping @@ -443,7 +446,7 @@ def initialize(self, init=None, ctx=None, default_init=initializer.Uniform(), Examples -------- >>> weight = mx.gluon.Parameter('weight', shape=(2, 2)) - >>> weight.initialize(ctx=mx.cpu(0)) + >>> weight.initialize(device=mx.cpu(0)) >>> weight.data() [[-0.01068833 0.01729892] [ 0.02042518 -0.01618656]] @@ -452,7 +455,7 @@ def initialize(self, init=None, ctx=None, default_init=initializer.Uniform(), [[ 0. 0.] [ 0. 
0.]] - >>> weight.initialize(ctx=[mx.gpu(0), mx.gpu(1)]) + >>> weight.initialize(device=[mx.gpu(0), mx.gpu(1)]) >>> weight.data(mx.gpu(0)) [[-0.00873779 -0.02834515] [ 0.05484822 -0.06206018]] @@ -468,48 +471,54 @@ def initialize(self, init=None, ctx=None, default_init=initializer.Uniform(), stacklevel=2) return self._data = self._grad = None - if ctx is None: - ctx = [context.current_context()] - if isinstance(ctx, Context): - ctx = [ctx] + if device is None: + device = [device.current_device()] + if isinstance(device, Device): + device = [device] if init is None: init = default_init if self.init is None else self.init if not shape_is_known(self.shape): if self._allow_deferred_init: - self._deferred_init = (init, ctx, default_init, None) + self._deferred_init = (init, device, default_init, None) return raise ValueError("Cannot initialize Parameter '%s' because it has " \ "invalid shape: %s."%(self.name, str(self.shape))) - self._deferred_init = (init, ctx, default_init, None) + self._deferred_init = (init, device, default_init, None) self._finish_deferred_init() - def reset_ctx(self, ctx): - """Re-assign Parameter to other contexts. + def reset_device(self, device): + """Re-assign Parameter to other devices. Parameters ---------- - ctx : Context or list of Context, default ``context.current_context()``. - Assign Parameter to given context. If ctx is a list of Context, a - copy will be made for each context. + device : Device or list of Device, default ``device.current_device()``. + Assign Parameter to given device. If device is a list of Device, a + copy will be made for each device. """ - if ctx is None: - ctx = [context.current_context()] - if isinstance(ctx, Context): - ctx = [ctx] + if device is None: + device = [device.current_device()] + if isinstance(device, Device): + device = [device] if self._data: data = self._reduce() with autograd.pause(): - self._init_impl(data, ctx) + self._init_impl(data, device) elif self._deferred_init: init, _, default_init, data = self._deferred_init - self._deferred_init = (init, ctx, default_init, data) + self._deferred_init = (init, device, default_init, data) else: - raise ValueError("Cannot reset context for Parameter '%s' because it " + raise ValueError("Cannot reset device for Parameter '%s' because it " "has not been initialized."%self.name) + def reset_ctx(self, ctx): + """This function has been deprecated. Please refer to ``Parameter.reset_device``.""" + warnings.warn('Parameter.reset_ctx has been renamed to' + ' Parameter.reset_device', DeprecationWarning) + self.reset_device(ctx) + def set_data(self, data): - """Sets this parameter's value on all contexts.""" + """Sets this parameter's value on all devices.""" self.shape = data.shape if self._data is None: @@ -528,9 +537,9 @@ def set_data(self, data): arr[:] = data def row_sparse_data(self, row_id): - """Returns a copy of the 'row_sparse' parameter on the same context as row_id's. + """Returns a copy of the 'row_sparse' parameter on the same device as row_id's. The copy only retains rows whose ids occur in provided row ids. - The parameter must have been initialized on this context before. + The parameter must have been initialized on this device before. Parameters ---------- @@ -539,16 +548,16 @@ def row_sparse_data(self, row_id): Returns ------- - NDArray on row_id's context + NDArray on row_id's device """ if self._stype != 'row_sparse': raise RuntimeError("Cannot return a copy of Parameter %s via row_sparse_data() " \ "because its storage type is %s. Please use data() instead." 
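A CPU-only variant of the docstring example above, shown as a sketch; it assumes the renamed `device=` keyword introduced by this patch and relies on `reset_ctx` merely warning and forwarding to `reset_device`.

```python
import mxnet as mx

weight = mx.gluon.Parameter('fc_weight', shape=(2, 2))
weight.initialize(device=mx.cpu(0))      # one copy per device in the list
print(weight.data(mx.cpu(0)))

# Later, move the parameter (and its gradient buffers) elsewhere.
weight.reset_device([mx.cpu(0)])
weight.reset_ctx([mx.cpu(0)])            # still works, but warns and forwards
```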
\ %(self.name, self._stype)) - return self._get_row_sparse(self._data, row_id.ctx, row_id) + return self._get_row_sparse(self._data, row_id.device, row_id) def list_row_sparse_data(self, row_id): - """Returns copies of the 'row_sparse' parameter on all contexts, in the same order + """Returns copies of the 'row_sparse' parameter on all devices, in the same order as creation. The copy only retains rows whose ids occur in provided row ids. The parameter must have been initialized before. @@ -562,35 +571,36 @@ def list_row_sparse_data(self, row_id): list of NDArrays """ if self._stype != 'row_sparse': - raise RuntimeError("Cannot return copies of Parameter '%s' on all contexts via " \ + raise RuntimeError("Cannot return copies of Parameter '%s' on all devices via " \ "list_row_sparse_data() because its storage type is %s. Please " \ "use data() instead." % (self.name, self._stype)) return self._get_row_sparse(self._data, list, row_id) - def data(self, ctx=None): - """Returns a copy of this parameter on one context. Must have been - initialized on this context before. For sparse parameters, use + @wrap_ctx_to_device_func + def data(self, device=None): + """Returns a copy of this parameter on one device. Must have been + initialized on this device before. For sparse parameters, use :py:meth:`Parameter.row_sparse_data` instead. Parameters ---------- - ctx : Context - Desired context. + device : Device + Desired device. Returns ------- - NDArray on ctx + NDArray on device """ if self._stype != 'default': - raise RuntimeError("Cannot return a copy of Parameter '%s' on ctx %s via data() " \ + raise RuntimeError("Cannot return a copy of Parameter '%s' on device %s via data() " \ "because its storage type is %s. Please use row_sparse_data() " \ - "instead." % (self.name, str(ctx), self._stype)) - data = self._check_and_get(self._data, ctx) + "instead." % (self.name, str(device), self._stype)) + data = self._check_and_get(self._data, device) dc.set_variable(data, self.var()) return data def list_data(self): - """Returns copies of this parameter on all contexts, in the same order + """Returns copies of this parameter on all devices, in the same order as creation. For sparse parameters, use :py:meth:`Parameter.list_row_sparse_data` instead. @@ -599,27 +609,27 @@ def list_data(self): list of NDArrays """ if self._stype != 'default': - raise RuntimeError("Cannot return copies of Parameter '%s' on all contexts via " \ + raise RuntimeError("Cannot return copies of Parameter '%s' on all devices via " \ "list_data() because its storage type is %s. Please use " \ "row_sparse_data() instead." % (self.name, self._stype)) return self._check_and_get(self._data, list) - def grad(self, ctx=None): - """Returns a gradient buffer for this parameter on one context. + def grad(self, device=None): + """Returns a gradient buffer for this parameter on one device. Parameters ---------- - ctx : Context - Desired context. + device : Device + Desired device. 
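The accessors above split along storage type: `data`/`list_data` serve 'default'-storage parameters, while 'row_sparse' ones must go through `row_sparse_data`/`list_row_sparse_data`. A small dense-side sketch, again assuming the `device=` keyword from this patch:

```python
import mxnet as mx

w = mx.gluon.Parameter('w', shape=(4, 4))
w.initialize(device=mx.cpu(0))

a = w.data(mx.cpu(0))      # the copy held on the requested device
copies = w.list_data()     # all copies, in the order passed to initialize()
g = w.grad(mx.cpu(0))      # gradient buffer on the same device (grad_req='write')
```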
""" if self._data is not None and self._grad is None: raise RuntimeError( "Cannot get gradient array for Parameter '%s' " \ "because grad_req='null'"%(self.name)) - return self._check_and_get(self._grad, ctx) + return self._check_and_get(self._grad, device) def list_grad(self): - """Returns gradient buffers on all contexts, in the same order + """Returns gradient buffers on all devices, in the same order as :py:meth:`values`.""" if self._data is not None and self._grad is None: raise RuntimeError( @@ -627,16 +637,16 @@ def list_grad(self): "because grad_req='null'"%(self.name)) return self._check_and_get(self._grad, list) - def list_ctx(self): - """Returns a list of contexts this parameter is initialized on.""" + def list_device(self): + """Returns a list of devices this parameter is initialized on.""" if self._data is None: if self._deferred_init: return self._deferred_init[1] raise RuntimeError("Parameter '%s' has not been initialized"%self.name) - return self._ctx_list + return self._device_list def zero_grad(self): - """Sets gradient buffer on all contexts to 0. No action is taken if + """Sets gradient buffer on all devices to 0. No action is taken if parameter is uninitialized or doesn't require gradient.""" if self._grad is None: return diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index 2fdb913f7872..d91fb5452a79 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -45,7 +45,7 @@ # in rare cases requests may be not installed pass import mxnet as mx -from .context import current_context +from .device import current_device from .ndarray.ndarray import _STORAGE_TYPE_STR_TO_ID from .symbol import Symbol from .symbol.numpy import _Symbol as np_symbol @@ -59,7 +59,7 @@ def default_context(): """Get default context for regression test.""" # _TODO: get context from environment variable to support # testing with GPUs - return current_context() + return current_device() def set_default_context(ctx): @@ -2461,13 +2461,13 @@ def has_tvm_ops(): is GPU, it only returns True for CUDA compute capability > 52 where FP16 is supported. """ built_with_tvm_op = _features.is_enabled("TVM_OP") - ctx = current_context() - if ctx.device_type == 'gpu': + device = current_device() + if device.device_type == 'gpu': try: - cc = get_cuda_compute_capability(ctx) + cc = get_cuda_compute_capability(device) except: # pylint: disable=bare-except print('Failed to get CUDA compute capability for context {}. The operators ' - 'built with USE_TVM_OP=1 will not be run in unit tests.'.format(ctx)) + 'built with USE_TVM_OP=1 will not be run in unit tests.'.format(device)) return False print('Cuda arch compute capability: sm_{}'.format(str(cc))) return built_with_tvm_op and cc >= 53 @@ -2479,16 +2479,16 @@ def is_op_runnable(): 1. Built with USE_TVM_OP=0. 2. Built with USE_TVM_OP=1, but with compute capability >= 53. """ - ctx = current_context() - if ctx.device_type == 'gpu': + device = current_device() + if device.device_type == 'gpu': if not _features.is_enabled("TVM_OP"): return True else: try: - cc = get_cuda_compute_capability(ctx) + cc = get_cuda_compute_capability(device) except: # pylint: disable=bare-except print('Failed to get CUDA compute capability for context {}. 
The operators ' - 'built with USE_TVM_OP=1 will not be run in unit tests.'.format(ctx)) + 'built with USE_TVM_OP=1 will not be run in unit tests.'.format(device)) return False print('Cuda arch compute capability: sm_{}'.format(str(cc))) return cc >= 53 diff --git a/python/mxnet/util.py b/python/mxnet/util.py index 406186e866f9..3adebced29e6 100644 --- a/python/mxnet/util.py +++ b/python/mxnet/util.py @@ -676,52 +676,52 @@ def _wrap_func_with_ctx(*args, **kwargs): # pylint: disable=exec-used def numpy_fallback(func): """decorator for falling back to offical numpy for a specific function""" - def get_ctx(ctx, new_ctx): - if ctx is None: - return new_ctx + def get_device(device, new_device): + if device is None: + return new_device else: - if new_ctx is None: - new_ctx = ctx - assert ctx == new_ctx, "inconsistent context %s and %s" % (str(ctx), str(new_ctx)) - return ctx + if new_device is None: + new_device = device + assert device == new_device, "inconsistent device %s and %s" % (str(device), str(new_device)) + return device def _as_official_np_array(object): - ctx = None + device = None if hasattr(object, 'asnumpy'): - return object.asnumpy(), object.ctx + return object.asnumpy(), object.device elif isinstance(object, (list, tuple)): tmp = [] for arr in object: - new_arr, new_ctx = _as_official_np_array(arr) - ctx = get_ctx(ctx, new_ctx) + new_arr, new_device = _as_official_np_array(arr) + device = get_device(device, new_device) tmp.append(new_arr) - return object.__class__(tmp), ctx + return object.__class__(tmp), device elif isinstance(object, dict): tmp = {} for k, v in object.items(): - new_v, new_ctx = _as_official_np_array(v) - ctx = get_ctx(ctx, new_ctx) + new_v, new_device = _as_official_np_array(v) + device = get_device(device, new_device) tmp[k] = new_v - return tmp, ctx + return tmp, device else: return object, None from .ndarray import from_numpy from .numpy import array - from .context import current_context - def _as_mx_np_array(object, ctx=current_context()): + from .device import current_device + def _as_mx_np_array(object, device=current_device()): import numpy as _np if isinstance(object, _np.ndarray): try: ret = from_numpy(object).as_np_ndarray() except ValueError: - ret = array(object, dtype=object.dtype, ctx=ctx) - return (ret if ('cpu' in str(ctx)) else ret.as_in_ctx(ctx)) + ret = array(object, dtype=object.dtype, device=device) + return (ret if ('cpu' in str(device)) else ret.to_device(device)) elif isinstance(object, (list, tuple)): - tmp = [_as_mx_np_array(arr, ctx) for arr in object] + tmp = [_as_mx_np_array(arr, device) for arr in object] return object.__class__(tmp) elif isinstance(object, dict): - return {k:_as_mx_np_array(v, ctx) for k, v in object} + return {k:_as_mx_np_array(v, device) for k, v in object} else: return object @@ -747,13 +747,13 @@ def _as_mx_np_array(object, ctx=current_context()): @functools.wraps(func) def _fallback_to_official_np(*args, **kwargs): # for every ndarray input, fallback - new_args, ctx0 = _as_official_np_array(args) - new_kwargs, ctx1 = _as_official_np_array(kwargs) - ctx = get_ctx(ctx0, ctx1) + new_args, device0 = _as_official_np_array(args) + new_kwargs, device1 = _as_official_np_array(kwargs) + device = get_device(device0, device1) ret = func(*new_args, **new_kwargs) if ret is None: raise ValueError("Only functions with return values are allowed to use this decorator") - ret = _as_mx_np_array(ret, ctx=ctx) + ret = _as_mx_np_array(ret, device=device) return ret return _fallback_to_official_np @@ -881,12 +881,12 @@ def 
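`has_tvm_ops` and `is_op_runnable` above share one gate: on GPU, operators built with USE_TVM_OP=1 are only exercised when the CUDA compute capability is at least sm_53. A minimal sketch of that check, assuming the module paths used in this diff (`mxnet.device.current_device`, `mxnet.util.get_cuda_compute_capability`); the helper name is hypothetical.

```python
from mxnet.device import current_device
from mxnet.util import get_cuda_compute_capability

def tvm_fp16_ok():
    """Hypothetical helper mirroring the capability gate above."""
    dev = current_device()
    if dev.device_type != 'gpu':
        return True                        # only GPU runs are capability-gated here
    try:
        return get_cuda_compute_capability(dev) >= 53
    except Exception:                      # e.g. libcuda could not be loaded
        return False
```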
reset_np(): _CUDA_SUCCESS = 0 -def get_cuda_compute_capability(ctx): - """Returns the cuda compute capability of the input `ctx`. +def get_cuda_compute_capability(device): + """Returns the cuda compute capability of the input `device`. Parameters ---------- - ctx : Context + device : Device GPU context whose corresponding cuda compute capability is to be retrieved. Returns @@ -898,9 +898,9 @@ def get_cuda_compute_capability(ctx): ---------- https://gist.github.com/f0k/63a664160d016a491b2cbea15913d549#file-cuda_check-py """ - if ctx.device_type != 'gpu': + if device.device_type != 'gpu': raise ValueError('Expecting a gpu context to get cuda compute capability, ' - 'while received ctx {}'.format(str(ctx))) + 'while received device {}'.format(str(device))) libnames = ('libcuda.so', 'libcuda.dylib', 'nvcuda.dll', 'cuda.dll') for libname in libnames: @@ -926,7 +926,7 @@ def get_cuda_compute_capability(ctx): raise RuntimeError('cuInit failed with erro code {}: {}' .format(ret, error_str.value.decode())) - ret = cuda.cuDeviceGet(ctypes.byref(device), ctx.device_id) + ret = cuda.cuDeviceGet(ctypes.byref(device), device.device_id) if ret != _CUDA_SUCCESS: cuda.cuGetErrorString(ret, ctypes.byref(error_str)) raise RuntimeError('cuDeviceGet failed with error code {}: {}' @@ -939,7 +939,7 @@ def get_cuda_compute_capability(ctx): return cc_major.value * 10 + cc_minor.value -def default_array(source_array, ctx=None, dtype=None): +def default_array(source_array, device=None, dtype=None): """Creates an array from any object exposing the default(nd or np) array interface. Parameters @@ -947,7 +947,7 @@ def default_array(source_array, ctx=None, dtype=None): source_array : array_like An object exposing the array interface, an object whose `__array__` method returns an array, or any (nested) sequence. - ctx : Context, optional + device : Device, optional Device context (default is the current default context). dtype : str or numpy.dtype, optional The data type of the output array. The default dtype is ``source_array.dtype`` @@ -961,9 +961,9 @@ def default_array(source_array, ctx=None, dtype=None): from . import nd as _mx_nd from . import np as _mx_np if is_np_array(): - return _mx_np.array(source_array, ctx=ctx, dtype=dtype) + return _mx_np.array(source_array, device=device, dtype=dtype) else: - return _mx_nd.array(source_array, ctx=ctx, dtype=dtype) + return _mx_nd.array(source_array, device=device, dtype=dtype) class _NumpyDefaultDtypeScope(object): """Scope for managing NumPy default dtype semantics. 
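Stepping back to the `numpy_fallback` decorator in the util.py hunk above: it converts MXNet ndarray arguments to official NumPy, runs the wrapped function, then moves the result back onto the device the inputs lived on. A usage sketch, assuming the decorator stays importable from `mxnet.util`; the wrapped function is purely illustrative.

```python
import numpy as onp
import mxnet as mx
from mxnet.util import numpy_fallback

@numpy_fallback
def official_unique(a):
    """Body runs in official NumPy; the result comes back as mx.np on a's device."""
    return onp.unique(a)

out = official_unique(mx.np.array([3, 1, 2, 3]))   # mx.np.ndarray, device taken from the input
```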
@@ -1213,10 +1213,10 @@ def get_max_supported_compute_capability(): return max_supported_cc.value -def get_rtc_compile_opts(ctx): +def get_rtc_compile_opts(device): """Get the compile ops suitable for the context, given the toolkit/driver config """ - device_cc = get_cuda_compute_capability(ctx) + device_cc = get_cuda_compute_capability(device) max_supported_cc = get_max_supported_compute_capability() # CUDA toolkits starting with 11.1 (first to support arch 86) can compile directly to SASS diff --git a/src/operator/numpy/np_elemwise_broadcast_logic_op.h b/src/operator/numpy/np_elemwise_broadcast_logic_op.h index da37adc4a595..9d25615757a6 100644 --- a/src/operator/numpy/np_elemwise_broadcast_logic_op.h +++ b/src/operator/numpy/np_elemwise_broadcast_logic_op.h @@ -158,8 +158,7 @@ struct GetBinaryBroadcastCompute { const TBlob& a = inputs[0]; const TBlob& b = inputs[1]; - if (a.type_flag_ != b.type_flag_ || - a.type_flag_ == mshadow::kBool || + if (a.type_flag_ != b.type_flag_ || a.type_flag_ == mshadow::kBool || outputs[0].shape_.ndim() > 5) { if (outputs[0].shape_.Size() == 0U) return; diff --git a/src/operator/numpy/np_init_op.cu b/src/operator/numpy/np_init_op.cu index 4bd594cb86dd..46caa499581f 100644 --- a/src/operator/numpy/np_init_op.cu +++ b/src/operator/numpy/np_init_op.cu @@ -36,8 +36,8 @@ NNVM_REGISTER_OP(_npi_identity).set_attr("FCompute", IdentityComp NNVM_REGISTER_OP(_npi_full_like).set_attr("FCompute", FullLikeOpCompute); -NNVM_REGISTER_OP(_npi_full) -.set_attr("FCompute", NumpyInitFillWithScalarCompute); +NNVM_REGISTER_OP(_npi_full).set_attr("FCompute", + NumpyInitFillWithScalarCompute); NNVM_REGISTER_OP(_npi_atleast_1d).set_attr("FCompute", AtleastNDCompute); diff --git a/src/operator/numpy/np_init_op.h b/src/operator/numpy/np_init_op.h index b5373dfb6ced..261c0321256d 100644 --- a/src/operator/numpy/np_init_op.h +++ b/src/operator/numpy/np_init_op.h @@ -101,31 +101,22 @@ struct NumpyInitOpWithScalarParam : public dmlc::Parameter* dict) { - std::ostringstream shape_s, dtype_s, double_value_s, int_value_s, value_type_s, - uint_value_s; + std::ostringstream shape_s, dtype_s, double_value_s, int_value_s, value_type_s, uint_value_s; shape_s << shape; dtype_s << dtype; double_value_s << double_value; @@ -144,12 +135,12 @@ struct NumpyInitOpWithScalarParam : public dmlc::Parameter -void NumpyInitFillWithScalarCompute(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { +template +void NumpyInitFillWithScalarCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { CHECK_EQ(inputs.size(), 0); CHECK_EQ(outputs.size(), 1U); const auto& param = nnvm::get(attrs.parsed); @@ -175,39 +166,26 @@ struct NumpyLinspaceParam : public dmlc::Parameter { int dtype; int value_type; DMLC_DECLARE_PARAMETER(NumpyLinspaceParam) { - DMLC_DECLARE_FIELD(start_double) - .describe("The double type starting value of the sequence."); - DMLC_DECLARE_FIELD(stop_double) - .describe("The double type ending value of the sequence"); - DMLC_DECLARE_FIELD(start_int) - .describe("The int type starting value of the sequence."); - DMLC_DECLARE_FIELD(stop_int) - .describe("The int type ending value of the sequence"); + DMLC_DECLARE_FIELD(start_double).describe("The double type starting value of the sequence."); + DMLC_DECLARE_FIELD(stop_double).describe("The double type ending value of the sequence"); + 
DMLC_DECLARE_FIELD(start_int).describe("The int type starting value of the sequence."); + DMLC_DECLARE_FIELD(stop_int).describe("The int type ending value of the sequence"); DMLC_DECLARE_FIELD(start_uint) - .describe("The unsigned int type starting value of the sequence."); - DMLC_DECLARE_FIELD(stop_uint) - .describe("The unsigned int type ending value of the sequence"); - DMLC_DECLARE_FIELD(num) - .describe("Number of samples to generate. Must be non-negative."); - DMLC_DECLARE_FIELD(endpoint) - .set_default(true) - .describe("If True, stop is the last sample. Otherwise, it is not included."); - DMLC_DECLARE_FIELD(ctx) - .set_default("") - .describe("Context of output, in format [cpu|gpu|cpu_pinned](n)." - "Only used for imperative calls."); - DMLC_DECLARE_FIELD(dtype) - .set_default(-1) - .add_enum("None", -1) - MXNET_ADD_ALL_TYPES_EXT_WITH_BOOL - .describe("Target data type."); - DMLC_DECLARE_FIELD(value_type) - .set_default(0) - .describe("Data type for start and stop value"); + .describe("The unsigned int type starting value of the sequence."); + DMLC_DECLARE_FIELD(stop_uint).describe("The unsigned int type ending value of the sequence"); + DMLC_DECLARE_FIELD(num).describe("Number of samples to generate. Must be non-negative."); + DMLC_DECLARE_FIELD(endpoint).set_default(true).describe( + "If True, stop is the last sample. Otherwise, it is not included."); + DMLC_DECLARE_FIELD(ctx).set_default("").describe( + "Context of output, in format [cpu|gpu|cpu_pinned](n)." + "Only used for imperative calls."); + DMLC_DECLARE_FIELD(dtype).set_default(-1).add_enum("None", -1) + MXNET_ADD_ALL_TYPES_EXT_WITH_BOOL.describe("Target data type."); + DMLC_DECLARE_FIELD(value_type).set_default(0).describe("Data type for start and stop value"); } void SetAttrDict(std::unordered_map* dict) { - std::ostringstream start_double_s, stop_double_s, num_s, endpoint_s, dtype_s, - start_int_s, stop_int_s, start_uint_s, stop_uint_s, value_type_s; + std::ostringstream start_double_s, stop_double_s, num_s, endpoint_s, dtype_s, start_int_s, + stop_int_s, start_uint_s, stop_uint_s, value_type_s; start_double_s << start_double; stop_double_s << stop_double; start_int_s << start_int; @@ -232,23 +210,27 @@ struct NumpyLinspaceParam : public dmlc::Parameter { }; inline bool NumpyLinspaceShape(const nnvm::NodeAttrs& attrs, - mxnet::ShapeVector *in_attrs, - mxnet::ShapeVector *out_attrs) { + mxnet::ShapeVector* in_attrs, + mxnet::ShapeVector* out_attrs) { const NumpyLinspaceParam& param = nnvm::get(attrs.parsed); CHECK_EQ(in_attrs->size(), 0U); CHECK_EQ(out_attrs->size(), 1U); - CHECK_GE(param.num, 0) - << "Number of sequence should be non-negative, received " << param.num; + CHECK_GE(param.num, 0) << "Number of sequence should be non-negative, received " << param.num; mxnet::TShape shape = mxnet::TShape({static_cast(param.num)}); SHAPE_ASSIGN_CHECK(*out_attrs, 0, shape); return true; } struct numpy_linspace_fwd { - template - MSHADOW_XINLINE static void Map(index_t i, index_t size, ValueType start, - ValueType stop, bool endpoint, - double step, int req, DType* out) { + template + MSHADOW_XINLINE static void Map(index_t i, + index_t size, + ValueType start, + ValueType stop, + bool endpoint, + double step, + int req, + DType* out) { if (i == 0) { // Special cases : start = 9007199254740993 KERNEL_ASSIGN(out[i], req, static_cast(start)); @@ -261,59 +243,59 @@ struct numpy_linspace_fwd { } }; -template +template void NumpyLinspaceCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, const 
std::vector& req, const std::vector& outputs) { using namespace mxnet_op; - Stream *s = ctx.get_stream(); + Stream* s = ctx.get_stream(); const NumpyLinspaceParam& param = nnvm::get(attrs.parsed); MSHADOW_TYPE_SWITCH_EXT_WITH_BOOL(outputs[0].type_flag_, DType, { - index_t step_num = param.endpoint ? param.num - 1 : param.num; - if (param.value_type == 0) { - int64_t start = param.start_int; - int64_t stop = param.stop_int; - double step = step_num > 0 ? \ - (static_cast(stop) - static_cast(start)) / step_num : 0.0f; - Kernel::Launch(s, - outputs[0].Size(), - outputs[0].Size(), - start, - stop, - param.endpoint, - step, - req[0], - outputs[0].dptr()); - } else if (param.value_type == 1) { - uint64_t start = param.start_uint; - uint64_t stop = param.stop_uint; - double step = step_num > 0 ? \ - (static_cast(stop) - static_cast(start)) / step_num : 0.0f; - Kernel::Launch(s, - outputs[0].Size(), - outputs[0].Size(), - start, - stop, - param.endpoint, - step, - req[0], - outputs[0].dptr()); - } else { - double start = param.start_double; - double stop = param.stop_double; - double step = step_num > 0 ? (stop - start) / step_num : 0.0f; - Kernel::Launch(s, - outputs[0].Size(), - outputs[0].Size(), - start, - stop, - param.endpoint, - step, - req[0], - outputs[0].dptr()); - } + index_t step_num = param.endpoint ? param.num - 1 : param.num; + if (param.value_type == 0) { + int64_t start = param.start_int; + int64_t stop = param.stop_int; + double step = + step_num > 0 ? (static_cast(stop) - static_cast(start)) / step_num : 0.0f; + Kernel::Launch(s, + outputs[0].Size(), + outputs[0].Size(), + start, + stop, + param.endpoint, + step, + req[0], + outputs[0].dptr()); + } else if (param.value_type == 1) { + uint64_t start = param.start_uint; + uint64_t stop = param.stop_uint; + double step = + step_num > 0 ? (static_cast(stop) - static_cast(start)) / step_num : 0.0f; + Kernel::Launch(s, + outputs[0].Size(), + outputs[0].Size(), + start, + stop, + param.endpoint, + step, + req[0], + outputs[0].dptr()); + } else { + double start = param.start_double; + double stop = param.stop_double; + double step = step_num > 0 ? 
(stop - start) / step_num : 0.0f; + Kernel::Launch(s, + outputs[0].Size(), + outputs[0].Size(), + start, + stop, + param.endpoint, + step, + req[0], + outputs[0].dptr()); + } }); } From 74c8fc62ddda716606df1c819ca4c8ee1ef0610f Mon Sep 17 00:00:00 2001 From: barry-jin Date: Wed, 13 Oct 2021 22:35:15 -0700 Subject: [PATCH 24/41] ctx/context => device --- benchmark/opperf/opperf.py | 2 +- benchmark/python/sparse/cast_storage.py | 2 +- benchmark/python/sparse/dot.py | 4 +- benchmark/python/sparse/sparse_op.py | 4 +- docs/python_docs/python/api/npx/index.rst | 2 +- .../inference/image_classification_jetson.md | 2 +- .../crash-course/6-train-nn.md | 6 +- .../getting-started/gluon_migration_guide.md | 8 +- .../packages/gluon/blocks/save_load_params.md | 2 +- .../tutorials/packages/gluon/image/mnist.md | 12 +- .../tutorials/performance/backend/amp.md | 2 +- example/bi-lstm-sort/bi-lstm-sort.ipynb | 2 +- example/gluon/mnist/mnist.py | 8 +- example/multi-task/multi-task-learning.ipynb | 14 +- example/quantization/imagenet_inference.py | 10 +- example/recommenders/demo1-MF.ipynb | 8 +- python/mxnet/_ctypes/cached_op.py | 14 +- python/mxnet/amp/amp.py | 10 +- python/mxnet/context.py | 2 +- python/mxnet/contrib/quantization.py | 67 ++--- .../data/vision/transforms/bbox/bbox.py | 2 +- .../gluon/contrib/estimator/estimator.py | 66 +++-- python/mxnet/gluon/data/batchify.py | 14 +- python/mxnet/gluon/loss.py | 2 +- python/mxnet/gluon/metric.py | 42 +-- .../mxnet/gluon/model_zoo/vision/alexnet.py | 13 +- .../mxnet/gluon/model_zoo/vision/densenet.py | 29 +- .../mxnet/gluon/model_zoo/vision/inception.py | 13 +- .../mxnet/gluon/model_zoo/vision/mobilenet.py | 62 +++-- python/mxnet/gluon/model_zoo/vision/resnet.py | 63 +++-- .../gluon/model_zoo/vision/squeezenet.py | 23 +- python/mxnet/gluon/model_zoo/vision/vgg.py | 53 ++-- python/mxnet/gluon/nn/activations.py | 4 +- python/mxnet/gluon/nn/basic_layers.py | 56 ++-- python/mxnet/gluon/nn/conv_layers.py | 32 +-- python/mxnet/gluon/parameter.py | 6 + python/mxnet/gluon/rnn/conv_rnn_cell.py | 16 +- python/mxnet/gluon/rnn/rnn_cell.py | 72 ++--- python/mxnet/gluon/rnn/rnn_layer.py | 20 +- python/mxnet/gluon/utils.py | 4 +- python/mxnet/model.py | 2 +- python/mxnet/ndarray/contrib.py | 2 +- python/mxnet/ndarray/numpy/random.py | 239 ++++++++-------- .../mxnet/ndarray/numpy_extension/random.py | 60 ++-- python/mxnet/numpy/io.py | 12 +- python/mxnet/numpy/multiarray.py | 8 +- python/mxnet/numpy_extension/__init__.py | 2 +- python/mxnet/numpy_op_fallback.py | 8 +- python/mxnet/optimizer/optimizer.py | 6 + python/mxnet/optimizer/updater.py | 2 +- python/mxnet/random.py | 30 +- python/mxnet/test_utils.py | 84 +++--- .../dist_device_sync_kvstore_byteps.py | 14 +- tests/nightly/test_large_array.py | 4 +- tests/nightly/test_np_large_array.py | 2 +- tests/nightly/test_np_random.py | 18 +- .../python/dnnl/subgraphs/subgraph_common.py | 6 +- .../dnnl/subgraphs/test_conv_subgraph.py | 32 +-- .../python/dnnl/subgraphs/test_fc_subgraph.py | 12 +- tests/python/dnnl/test_amp.py | 2 +- tests/python/dnnl/test_bf16_operator.py | 2 +- tests/python/gpu/test_amp.py | 6 +- tests/python/gpu/test_deferred_compute_gpu.py | 2 +- tests/python/gpu/test_extensions_gpu.py | 2 +- tests/python/gpu/test_gluon_gpu.py | 118 ++++---- tests/python/gpu/test_gluon_transforms.py | 4 +- tests/python/gpu/test_kvstore_gpu.py | 4 +- tests/python/gpu/test_numpy_fallback.py | 14 +- tests/python/gpu/test_numpy_op.py | 4 +- tests/python/gpu/test_operator_gpu.py | 20 +- tests/python/gpu/test_profiler_gpu.py | 6 +- 
tests/python/gpu/test_tvm_op_gpu.py | 4 +- .../python/quantization/test_quantization.py | 80 +++--- tests/python/test_quantization_gpu.py | 4 +- tests/python/unittest/common.py | 2 +- .../unittest/test_contrib_control_flow.py | 12 +- .../python/unittest/test_contrib_operator.py | 12 +- tests/python/unittest/test_contrib_stes_op.py | 18 +- .../python/unittest/test_deferred_compute.py | 42 +-- tests/python/unittest/test_exc_handling.py | 24 +- tests/python/unittest/test_extensions.py | 2 +- tests/python/unittest/test_gluon.py | 138 +++++----- .../unittest/test_gluon_control_flow.py | 52 ++-- tests/python/unittest/test_gluon_rnn.py | 72 ++--- tests/python/unittest/test_loss.py | 6 +- tests/python/unittest/test_ndarray.py | 8 +- tests/python/unittest/test_numpy_gluon.py | 14 +- tests/python/unittest/test_numpy_loss.py | 6 +- tests/python/unittest/test_numpy_ndarray.py | 22 +- tests/python/unittest/test_numpy_op.py | 128 ++++----- tests/python/unittest/test_operator.py | 256 +++++++++--------- tests/python/unittest/test_optimizer.py | 2 +- tests/python/unittest/test_sparse_ndarray.py | 8 +- tests/python/unittest/test_sparse_operator.py | 28 +- tests/python/unittest/test_subgraph.py | 14 +- tests/python/unittest/test_thread_local.py | 30 +- 96 files changed, 1284 insertions(+), 1200 deletions(-) diff --git a/benchmark/opperf/opperf.py b/benchmark/opperf/opperf.py index 47bd970f930d..5aa466eef20a 100644 --- a/benchmark/opperf/opperf.py +++ b/benchmark/opperf/opperf.py @@ -198,7 +198,7 @@ def main(): "Output file {output_file} already exists.".format(output_file=args.output_file) # 2. RUN BENCHMARKS - ctx = _parse_mxnet_context(args.ctx) + ctx = _parse_mxnet_context(args.device) dtype = args.dtype profiler = args.profiler int64_tensor = args.int64_tensor diff --git a/benchmark/python/sparse/cast_storage.py b/benchmark/python/sparse/cast_storage.py index 7ae537398c42..6f4fc413edc2 100644 --- a/benchmark/python/sparse/cast_storage.py +++ b/benchmark/python/sparse/cast_storage.py @@ -41,7 +41,7 @@ def measure_cost(repeat, f, *args, **kwargs): def run_cast_storage_synthetic(): def dense_to_sparse(m, n, density, ctx, repeat, stype): - set_default_context(ctx) + set_default_device(ctx) data_shape = (m, n) dns_data = rand_ndarray(data_shape, stype, density).tostype('default') dns_data.wait_to_read() diff --git a/benchmark/python/sparse/dot.py b/benchmark/python/sparse/dot.py index 5cfd540c04be..a2dfd03a6bd3 100644 --- a/benchmark/python/sparse/dot.py +++ b/benchmark/python/sparse/dot.py @@ -26,7 +26,7 @@ import mxnet as mx import numpy as np import numpy.random as rnd -from mxnet.test_utils import rand_ndarray, set_default_context, assert_almost_equal, get_bz2_data +from mxnet.test_utils import rand_ndarray, set_default_device, assert_almost_equal, get_bz2_data from mxnet.base import check_call, _LIB from util import estimate_density @@ -267,7 +267,7 @@ def test_dot_synthetic(data_dict): # Benchmark MXNet and Scipys dot operator def bench_dot(lhs_shape, rhs_shape, lhs_stype, rhs_stype, lhs_den, rhs_den, trans_lhs, ctx, num_repeat=10, fw="mxnet", distribution="uniform"): - set_default_context(ctx) + set_default_device(ctx) assert fw == "mxnet" or fw == "scipy" # Set funcs dot_func_sparse = mx.nd.sparse.dot if fw == "mxnet" else sp.spmatrix.dot diff --git a/benchmark/python/sparse/sparse_op.py b/benchmark/python/sparse/sparse_op.py index ffa6de6d762e..6c4fe8188c7e 100644 --- a/benchmark/python/sparse/sparse_op.py +++ b/benchmark/python/sparse/sparse_op.py @@ -155,7 +155,7 @@ def 
measure_cost_backward_baseline(repeat, dot, transpose, lhs, rhs): return diff / repeat def bench_dot_forward(m, k, n, density, ctx, repeat): - set_default_context(ctx) + set_default_device(ctx) dns = mx.nd.random.uniform(shape=(k, n)).copyto(ctx) data_shape = (m, k) csr_data = rand_ndarray(data_shape, 'csr', density) @@ -184,7 +184,7 @@ def bench_dot_forward(m, k, n, density, ctx, repeat): ratio_baseline, costs_baseline[0], costs_baseline[1])) def bench_dot_backward(m, k, n, density, ctx, repeat): - set_default_context(ctx) + set_default_device(ctx) dns = mx.nd.random.uniform(shape=(m, n)).copyto(ctx) data_shape = (m, k) csr_data = rand_ndarray(data_shape, 'csr', density) diff --git a/docs/python_docs/python/api/npx/index.rst b/docs/python_docs/python/api/npx/index.rst index e89ad3d138a8..e89a9cc96d6e 100644 --- a/docs/python_docs/python/api/npx/index.rst +++ b/docs/python_docs/python/api/npx/index.rst @@ -50,7 +50,7 @@ Devices cpu_pinned gpu gpu_memory_info - current_context + current_device num_gpus Nerual networks diff --git a/docs/python_docs/python/tutorials/deploy/inference/image_classification_jetson.md b/docs/python_docs/python/tutorials/deploy/inference/image_classification_jetson.md index 85aca898643b..152dc6d80c0e 100644 --- a/docs/python_docs/python/tutorials/deploy/inference/image_classification_jetson.md +++ b/docs/python_docs/python/tutorials/deploy/inference/image_classification_jetson.md @@ -99,7 +99,7 @@ img = mx.image.color_normalize(img.astype(dtype='float32')/255, std=mx.np.array([0.229, 0.224, 0.225])) # normalize img = img.transpose((2, 0, 1)) # channel first img = mx.np.expand_dims(img, axis=0) # batchify -img = img.as_in_ctx(ctx) +img = img.to_device(ctx) prob = mx.npx.softmax(net(img)) # predict and normalize output idx = mx.npx.topk(prob, k=5)[0] # get top 5 result diff --git a/docs/python_docs/python/tutorials/getting-started/crash-course/6-train-nn.md b/docs/python_docs/python/tutorials/getting-started/crash-course/6-train-nn.md index 8f1b23b1e4ae..b4498853c08c 100644 --- a/docs/python_docs/python/tutorials/getting-started/crash-course/6-train-nn.md +++ b/docs/python_docs/python/tutorials/getting-started/crash-course/6-train-nn.md @@ -368,7 +368,7 @@ def test(val_data): for batch in val_data: data = batch[0] labels = batch[1] - outputs = model(data.as_in_ctx(ctx)) + outputs = model(data.to_device(ctx)) acc.update([labels], [outputs]) _, accuracy = acc.get() @@ -396,8 +396,8 @@ for epoch in range(epochs): data = batch[0] label = batch[1] with mx.autograd.record(): - outputs = model(data.as_in_ctx(ctx)) - loss = loss_fn(outputs, label.as_in_ctx(ctx)) + outputs = model(data.to_device(ctx)) + loss = loss_fn(outputs, label.to_device(ctx)) mx.autograd.backward(loss) trainer.step(batch_size) accuracy.update([label], [outputs]) diff --git a/docs/python_docs/python/tutorials/getting-started/gluon_migration_guide.md b/docs/python_docs/python/tutorials/getting-started/gluon_migration_guide.md index 98a4c266d030..54a3cfd41bed 100644 --- a/docs/python_docs/python/tutorials/getting-started/gluon_migration_guide.md +++ b/docs/python_docs/python/tutorials/getting-started/gluon_migration_guide.md @@ -78,8 +78,8 @@ MXNet [NumPy ndarray(i.e. 
`mx.np.ndarray`)](../../api/np/arrays.ndarray.html) is | Deprecated Attributes | NumPy ndarray Equivalent | | ----------------------------------------------------- | ------------------------------ | | `a.asscalar()` | `a.item()` | - | `a.as_in_context()` | `a.as_in_ctx()` | - | `a.context` | `a.ctx` | + | `a.as_in_context()` | `a.to_device()` | + | `a.context` | `a.device` | | `a.reshape_like(b)` | `a.reshape(b.shape)` | | `a.zeros_like(b)` | `mx.np.zeros_like(b)` | | `a.ones_like(b)` | `mx.np.ones_like(b)` | @@ -223,7 +223,7 @@ Now, in deferred computation mode of Gluon2.0, the divergence of NDArray and Sym # forward interface, no F any more def forward(self, x): # get the context information of input array and make parameters run on the same context - ctx = x.ctx + ctx = x.device # use np/npx interfaces instead of F act = npx.fully_connected(x, self.weight.data(ctx), self.bias.data(ctx) if self.bias is not None else None, @@ -276,7 +276,7 @@ class Dense(HybridBlock): self.act = None def forward(self, x): - ctx = x.ctx + ctx = x.device act = npx.fully_connected(x, self.weight.data(ctx), self.bias.data(ctx) if self.bias is not None else None, no_bias=self.bias is None, diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/save_load_params.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/save_load_params.md index 932175d3dc1b..738f128bb6d8 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/save_load_params.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/save_load_params.md @@ -190,7 +190,7 @@ def verify_loaded_model(net): # Display the predictions data = np.transpose(data, (0, 3, 1, 2)) - out = net(data.as_in_ctx(ctx)) + out = net(data.to_device(ctx)) predictions = np.argmax(out, axis=1) print('Model predictions: ', predictions.asnumpy()) diff --git a/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md b/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md index 64d81e463dde..bdd2b06deed0 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md +++ b/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md @@ -163,9 +163,9 @@ for i in range(epoch): # Inside training scope with ag.record(): for x, y in zip(data, label): - z = net(x.as_in_ctx(ctx)) + z = net(x.to_device(ctx)) # Computes softmax cross entropy loss. - loss = softmax_cross_entropy_loss(z, y.as_in_ctx(ctx)) + loss = softmax_cross_entropy_loss(z, y.to_device(ctx)) # Backpropagate the error for one iteration. loss.backward() outputs.append(z) @@ -192,7 +192,7 @@ metric = mx.gluon.metric.Accuracy() for batch_num, (data, label) in enumerate(val_data): outputs = [] for x in data: - outputs.append(net(x.as_in_ctx(ctx))) + outputs.append(net(x.to_device(ctx))) # Updates internal evaluation metric.update(label, outputs) print('validation acc: %s=%f'%metric.get()) @@ -279,9 +279,9 @@ for i in range(epoch): # Inside training scope with ag.record(): for x, y in zip(data, label): - z = net(x.as_in_ctx(ctx)) + z = net(x.to_device(ctx)) # Computes softmax cross entropy loss. - loss = softmax_cross_entropy_loss(z, y.as_in_ctx(ctx)) + loss = softmax_cross_entropy_loss(z, y.to_device(ctx)) # Backpropogate the error for one iteration. 
loss.backward() outputs.append(z) @@ -308,7 +308,7 @@ metric = mx.gluon.metric.Accuracy() for batch_num, (data, label) in enumerate(val_data): outputs = [] for x in data: - outputs.append(net(x.as_in_ctx(ctx))) + outputs.append(net(x.to_device(ctx))) # Updates internal evaluation metric.update(label, outputs) print('validation acc: %s=%f'%metric.get()) diff --git a/docs/python_docs/python/tutorials/performance/backend/amp.md b/docs/python_docs/python/tutorials/performance/backend/amp.md index 9656441ef4d0..52ec2c4d3040 100644 --- a/docs/python_docs/python/tutorials/performance/backend/amp.md +++ b/docs/python_docs/python/tutorials/performance/backend/amp.md @@ -272,7 +272,7 @@ with mx.Context(mx.gpu(0)): with warnings.catch_warnings(record=True) as w: warnings.simplefilter("ignore") model = get_model("resnet50_v1") - model.initialize(ctx=mx.current_context()) + model.initialize(ctx=mx.current_device()) model.hybridize() model(mx.np.zeros((1, 3, 224, 224))) converted_model = amp.convert_hybrid_block(model) diff --git a/example/bi-lstm-sort/bi-lstm-sort.ipynb b/example/bi-lstm-sort/bi-lstm-sort.ipynb index 4b9f1da7d37a..3eb08f1508d2 100644 --- a/example/bi-lstm-sort/bi-lstm-sort.ipynb +++ b/example/bi-lstm-sort/bi-lstm-sort.ipynb @@ -485,7 +485,7 @@ "source": [ "def get_pred(x):\n", " x, _ = transform(x, x)\n", - " output = net(mx.np.expand_dims(x.as_in_ctx(ctx), axis=0))\n", + " output = net(mx.np.expand_dims(x.to_device(ctx), axis=0))\n", "\n", " # Convert output back to string\n", " pred = ''.join([vocab[int(o)] for o in output[0].argmax(axis=1).asnumpy().tolist()])\n", diff --git a/example/gluon/mnist/mnist.py b/example/gluon/mnist/mnist.py index 121fcdf12250..a660cd57f286 100644 --- a/example/gluon/mnist/mnist.py +++ b/example/gluon/mnist/mnist.py @@ -71,8 +71,8 @@ def transformer(data, label): def test(ctx): metric = mx.gluon.metric.Accuracy() for data, label in val_data: - data = data.as_in_ctx(ctx) - label = label.as_in_ctx(ctx) + data = data.to_device(ctx) + label = label.to_device(ctx) output = net(data) metric.update([label], [output]) @@ -93,8 +93,8 @@ def train(epochs, ctx): metric.reset() for i, (data, label) in enumerate(train_data): # Copy data to ctx if necessary - data = data.as_in_ctx(ctx) - label = label.as_in_ctx(ctx) + data = data.to_device(ctx) + label = label.to_device(ctx) # Start recording computation graph with record() section. # Recorded graphs can then be differentiated with backward. with autograd.record(): diff --git a/example/multi-task/multi-task-learning.ipynb b/example/multi-task/multi-task-learning.ipynb index 608f5be95fec..460e56ee7c3d 100644 --- a/example/multi-task/multi-task-learning.ipynb +++ b/example/multi-task/multi-task-learning.ipynb @@ -290,9 +290,9 @@ " acc_odd_even = mx.gluon.metric.Accuracy(name='odd_even')\n", " \n", " for i, (data, label_digit, label_odd_even) in enumerate(data_iterator):\n", - " data = data.as_in_ctx(ctx)\n", - " label_digit = label_digit.as_in_ctx(ctx)\n", - " label_odd_even = label_odd_even.as_in_ctx(ctx).reshape(-1,1)\n", + " data = data.to_device(ctx)\n", + " label_digit = label_digit.to_device(ctx)\n", + " label_odd_even = label_odd_even.to_device(ctx).reshape(-1,1)\n", "\n", " output_digit, output_odd_even = net(data)\n", " \n", @@ -339,9 +339,9 @@ " l_odd_even_ = 0. 
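The tutorial and example hunks in this commit all follow the same post-rename pattern: each batch is moved onto the target device with `to_device` before the forward pass. A compact sketch of one such training step, kept on CPU so it runs without a GPU; the toy network, shapes and hyper-parameters are placeholders, and the renamed `device=` keyword from this patch series is assumed.

```python
import mxnet as mx
from mxnet import autograd, gluon

device = mx.cpu(0)
net = gluon.nn.Dense(10)
net.initialize(device=device)
loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.1})

data = mx.np.random.uniform(size=(4, 8))
label = mx.np.array([0, 1, 2, 3])
with autograd.record():
    out = net(data.to_device(device))                 # was: data.as_in_ctx(ctx)
    loss = loss_fn(out, label.to_device(device))
loss.backward()
trainer.step(4)
```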
\n", " \n", " for i, (data, label_digit, label_odd_even) in enumerate(train_data):\n", - " data = data.as_in_ctx(ctx)\n", - " label_digit = label_digit.as_in_ctx(ctx)\n", - " label_odd_even = label_odd_even.as_in_ctx(ctx).reshape(-1,1)\n", + " data = data.to_device(ctx)\n", + " label_digit = label_digit.to_device(ctx)\n", + " label_odd_even = label_odd_even.to_device(ctx).reshape(-1,1)\n", " \n", " with autograd.record():\n", " output_digit, output_odd_even = net(data)\n", @@ -407,7 +407,7 @@ "\n", " img = test_dataset[idx][0]\n", " data, _, _ = test_dataset_t[idx]\n", - " data = np.expand_dims(data.as_in_ctx(ctx), axis=0)\n", + " data = np.expand_dims(data.to_device(ctx), axis=0)\n", "\n", " plt.imshow(img.squeeze().asnumpy(), cmap='gray')\n", " \n", diff --git a/example/quantization/imagenet_inference.py b/example/quantization/imagenet_inference.py index 7d51408d350a..69f42a0b5794 100644 --- a/example/quantization/imagenet_inference.py +++ b/example/quantization/imagenet_inference.py @@ -44,8 +44,8 @@ def score(symblock, data, ctx, max_num_examples, skip_num_batches, logger=None): for i, input_data in enumerate(data): if i < skip_num_batches: continue - x = input_data[0].as_in_ctx(ctx) - label = input_data[1].as_in_ctx(ctx) + x = input_data[0].to_device(ctx) + label = input_data[1].to_device(ctx) outputs = symblock.forward(x) for m in metrics: m.update(label, outputs) @@ -126,13 +126,13 @@ def benchmark_score(symblock, ctx, batch_size, warmup_batches, num_batches, data logger = logging.getLogger('logger') logger.setLevel(logging.INFO) - if args.ctx == 'cpu': + if args.device == 'cpu': ctx = mx.cpu(0) - elif args.ctx == 'gpu': + elif args.device == 'gpu': ctx = mx.gpu(0) logger.warning('Notice that oneDNN optimized and quantized model may not work with GPU context') else: - raise ValueError('ctx %s is not supported in this script' % args.ctx) + raise ValueError('ctx %s is not supported in this script' % args.device) symbol_file = args.symbol_file param_file = args.param_file diff --git a/example/recommenders/demo1-MF.ipynb b/example/recommenders/demo1-MF.ipynb index 611141d9f36e..71ea91801d71 100644 --- a/example/recommenders/demo1-MF.ipynb +++ b/example/recommenders/demo1-MF.ipynb @@ -203,7 +203,7 @@ "cell_type": "code", "execution_count": 7, "source": [ - "net1.summary(user.as_in_ctx(ctx[0]), item.as_in_ctx(ctx[0]))" + "net1.summary(user.to_device(ctx[0]), item.to_device(ctx[0]))" ], "outputs": [ { @@ -589,7 +589,7 @@ "cell_type": "code", "execution_count": 16, "source": [ - "net2.summary(user.as_in_ctx(ctx[0]), item.as_in_ctx(ctx[0]))" + "net2.summary(user.to_device(ctx[0]), item.to_device(ctx[0]))" ], "outputs": [ { @@ -959,8 +959,8 @@ "dataloader = gluon.data.DataLoader(dataset, batch_size=batch_size, shuffle=False)\n", "ratings = onp.zeros((max_user+1, max_item+1))\n", "for users, items in dataloader:\n", - " users = users.as_in_ctx(ctx[0])\n", - " items = items.as_in_ctx(ctx[0])\n", + " users = users.to_device(ctx[0])\n", + " items = items.to_device(ctx[0])\n", " scores = net3(users, items).asnumpy()\n", " ratings[users.asnumpy().astype('int32'), items.asnumpy().astype('int32')] = scores.reshape(-1)" ], diff --git a/python/mxnet/_ctypes/cached_op.py b/python/mxnet/_ctypes/cached_op.py index 856eb5dbdf91..03715ef612a5 100644 --- a/python/mxnet/_ctypes/cached_op.py +++ b/python/mxnet/_ctypes/cached_op.py @@ -73,7 +73,7 @@ def get_optimized_symbol(self): def __call__(self, *args, **kwargs): """ctypes implementation of imperative invoke wrapper""" # New FFI only supports numpy ndarray 
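`imagenet_inference.py` above resolves its `--device` flag to a device object before doing anything else. A minimal sketch of that mapping; the helper name is hypothetical.

```python
import mxnet as mx

def device_from_flag(name):
    """Map a --device string onto an mx device (sketch of the script's logic)."""
    if name == 'cpu':
        return mx.cpu(0)
    if name == 'gpu':
        return mx.gpu(0)
    raise ValueError('device %s is not supported in this script' % name)
```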
- default_ctx = kwargs.pop('default_ctx', None) + default_device = kwargs.pop('default_device', None) out = kwargs.pop('out', None) if kwargs: raise TypeError( @@ -82,8 +82,8 @@ def __call__(self, *args, **kwargs): if self.is_np_sym: if len(args) == 1 and args[0] is None: args = [] - type_id = default_ctx.device_typeid if default_ctx else None - device_id = default_ctx.device_id if default_ctx else None + type_id = default_device.device_typeid if default_device else None + device_id = default_device.device_id if default_device else None out_arg = out if out is not None and not isinstance(out, NDArrayBase) else (out, ) output_vars = _api_internal.invoke( self.handle, @@ -119,16 +119,16 @@ def __call__(self, *args, **kwargs): # (None, ) -> [] if len(args) == 1 and args[0] is None: args = [] - assert default_ctx is not None, 'default_ctx is required if no input is provided' + assert default_device is not None, 'default_device is required if no input is provided' else: - default_ctx = args[0].ctx if default_ctx is None else default_ctx + default_device = args[0].device if default_device is None else default_device check_call(_LIB.MXInvokeCachedOp( self.handle, ctypes.c_int(len(args)), c_handle_array(args), - ctypes.c_int(default_ctx.device_typeid), - ctypes.c_int(default_ctx.device_id), + ctypes.c_int(default_device.device_typeid), + ctypes.c_int(default_device.device_id), ctypes.byref(num_output), ctypes.byref(output_vars), ctypes.byref(out_stypes))) diff --git a/python/mxnet/amp/amp.py b/python/mxnet/amp/amp.py index 99272bb46bca..c7aab71d5a54 100644 --- a/python/mxnet/amp/amp.py +++ b/python/mxnet/amp/amp.py @@ -33,7 +33,7 @@ from mxnet import numpy from .. import symbol -from ..context import gpu +from ..device import gpu from ..symbol import Symbol from ..symbol import contrib as symbol_contrib from .. import ndarray @@ -47,6 +47,7 @@ from .. import optimizer as opt from .loss_scaler import LossScaler from ..operator import get_all_registered_operators_grouped +from ..util import wrap_ctx_to_device_func bfloat16 = np.dtype([('bfloat16', np.uint16)]) @@ -667,9 +668,10 @@ def convert_model(sym, arg_params, aux_params, target_dtype="float16", target_dt # Return the converted symbol and casted params return sym, arg_params, aux_params +@wrap_ctx_to_device_func def convert_hybrid_block(block, target_dtype="float16", target_dtype_ops=None, fp32_ops=None, conditional_fp32_ops=None, - excluded_sym_names=None, ctx=gpu(0), + excluded_sym_names=None, device=gpu(0), cast_optional_params=False): """Given a hybrid block/symbol block representing a FP32 model and a target_dtype, return a block with mixed precision support which can be used for inference use cases. 
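`convert_hybrid_block` now takes `device` (default `gpu(0)`) where it used to take `ctx`. A usage sketch modelled on the amp.md hunk earlier in this commit; it needs a CUDA-enabled build and a visible GPU to actually run, and the model choice is only an example.

```python
import mxnet as mx
from mxnet import amp
from mxnet.gluon.model_zoo.vision import get_model

device = mx.gpu(0)
model = get_model('resnet50_v1', pretrained=False)
model.initialize(device=device)
model.hybridize()
model(mx.np.zeros((1, 3, 224, 224), device=device))   # trace once before converting
converted = amp.convert_hybrid_block(model, device=device)
```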
@@ -692,7 +694,7 @@ def convert_hybrid_block(block, target_dtype="float16", target_dtype_ops=None, excluded_sym_names : list of strs A list of strings that represent the names of symbols that users want to exclude from being quantized - ctx : Context + device : Context Context on which model parameters should live cast_optional_params : bool, default False Whether to cast the arg_params and aux_params that don't require to be in LP16 @@ -757,7 +759,7 @@ def convert_hybrid_block(block, target_dtype="float16", target_dtype_ops=None, if aux_param_name in arg_dict and param.dtype != arg_dict[aux_param_name].dtype: param.cast(arg_dict[aux_param_name].dtype) - ret.load_dict(arg_dict, ctx=ctx) + ret.load_dict(arg_dict, device=device) return ret def list_lp16_ops(target_dtype): diff --git a/python/mxnet/context.py b/python/mxnet/context.py index b4c086cfeedb..668f9cc6e25c 100644 --- a/python/mxnet/context.py +++ b/python/mxnet/context.py @@ -16,7 +16,7 @@ # under the License. """Context management API of mxnet.""" from warnings import warn -from .device import Device, _current +from .device import Device, _current, cpu, gpu, cpu_pinned def Context(*args, **kwargs): diff --git a/python/mxnet/contrib/quantization.py b/python/mxnet/contrib/quantization.py index 4444c4b0fc97..972bdac04518 100644 --- a/python/mxnet/contrib/quantization.py +++ b/python/mxnet/contrib/quantization.py @@ -29,8 +29,8 @@ from ..symbol import Symbol from .. import ndarray from ..io import DataDesc -from ..context import cpu, Context -from ..util import is_np_array +from ..device import cpu, Device +from ..util import is_np_array, wrap_ctx_to_device_func def _quantize_params(qsym, params, min_max_dict): @@ -88,7 +88,7 @@ def _quantize_params(qsym, params, min_max_dict): return quantized_params -def _quantize_symbol(sym, ctx, excluded_symbols=None, excluded_operators=None, +def _quantize_symbol(sym, device, excluded_symbols=None, excluded_operators=None, offline_params=None, quantized_dtype='int8', quantize_mode='smart', quantize_granularity='tensor-wise'): """Given a symbol object representing a neural network of data type FP32, @@ -98,7 +98,7 @@ def _quantize_symbol(sym, ctx, excluded_symbols=None, excluded_operators=None, ---------- sym : Symbol FP32 neural network symbol. - ctx : Context + device : Device Defines the device that users want to run quantized symbol. excluded_symbols : list of strings A list of strings representing the names of the symbols that users want to excluding @@ -144,7 +144,7 @@ def _quantize_symbol(sym, ctx, excluded_symbols=None, excluded_operators=None, calib_str = ctypes.POINTER(ctypes.c_char_p)() check_call(_LIB.MXQuantizeSymbol(sym.handle, ctypes.byref(out), - ctypes.byref(ctypes.c_int(ctx.device_typeid)), + ctypes.byref(ctypes.c_int(device.device_typeid)), mx_uint(num_excluded_symbols), c_str_array(excluded_symbols), mx_uint(num_excluded_ops), @@ -259,8 +259,8 @@ def get_optimal_threshold(hist_data, quantized_dtype, num_quantized_bins=255): if min_val >= 0 and quantized_dtype in ['auto', 'uint8']: # We need to move negative bins to positive bins to fit uint8 range. 
num_quantized_bins = num_quantized_bins * 2 + 1 - hist = ndarray.array(hist, ctx=cpu()) - hist_edges = ndarray.array(hist_edges, ctx=cpu()) + hist = ndarray.array(hist, device=cpu()) + hist_edges = ndarray.array(hist_edges, device=cpu()) threshold, divergence = ndarray.contrib.calibrate_entropy(hist=hist, hist_edges=hist_edges, num_quantized_bins=num_quantized_bins) @@ -382,8 +382,9 @@ def _generate_list_of_data_desc(data_shapes, data_types): raise ValueError('data_shapes must be either a list of DataDesc or a list of Tuple') +@wrap_ctx_to_device_func def quantize_model(sym, arg_params, aux_params, data_names=('data',), - ctx=cpu(), excluded_sym_names=None, excluded_op_names=None, calib_mode='entropy', + device=cpu(), excluded_sym_names=None, excluded_op_names=None, calib_mode='entropy', calib_data=None, num_calib_batches=None, quantized_dtype='int8', quantize_mode='smart', quantize_granularity='tensor-wise', logger=None): @@ -408,9 +409,9 @@ def quantize_model(sym, arg_params, aux_params, data_names=('data',), data_names : a list of strs Data names required for creating a Module object to run forward propagation on the calibration dataset. - ctx : Context + device : Device Defines the device that users want to run forward propagation on the calibration - dataset for collecting layer output statistics. Currently, only supports single context. + dataset for collecting layer output statistics. Currently, only supports single device. excluded_sym_names : list of strings A list of strings representing the names of the symbols that users want to excluding from being quantized. @@ -475,7 +476,7 @@ def quantize_model(sym, arg_params, aux_params, data_names=('data',), if quantize_granularity not in ('tensor-wise', 'channel-wise'): raise ValueError('unkonwn quantize_granularity %s received,' ' expected `tensor-wise` or `channel-wise`.' % quantize_granularity) - qsym, calib_layers = _quantize_symbol(sym, ctx, excluded_symbols=excluded_sym_names, + qsym, calib_layers = _quantize_symbol(sym, device, excluded_symbols=excluded_sym_names, excluded_operators=excluded_op_names, offline_params=list(arg_params.keys()), quantized_dtype=quantized_dtype, @@ -483,8 +484,8 @@ def quantize_model(sym, arg_params, aux_params, data_names=('data',), quantize_granularity=quantize_granularity) min_max_dict = {} if calib_mode is not None and calib_mode != 'none': - if not isinstance(ctx, Context): - raise ValueError('currently only supports single ctx, while received %s' % str(ctx)) + if not isinstance(device, Device): + raise ValueError('currently only supports single device, while received %s' % str(device)) if calib_data is None: raise ValueError('calib_data must be provided when calib_mode=%s' % calib_mode) if not isinstance(calib_data, mx.gluon.data.DataLoader): @@ -529,8 +530,9 @@ def quantize_model(sym, arg_params, aux_params, data_names=('data',), return qsym, qarg_params, aux_params +@wrap_ctx_to_device_func def quantize_model_onednn(sym, arg_params, aux_params, data_names=('data',), - ctx=cpu(), excluded_sym_names=None, excluded_op_names=None, + device=cpu(), excluded_sym_names=None, excluded_op_names=None, calib_mode='entropy', calib_data=None, num_calib_batches=None, quantized_dtype='int8', quantize_mode='smart', quantize_granularity='tensor-wise', logger=None): @@ -550,16 +552,16 @@ def quantize_model_onednn(sym, arg_params, aux_params, data_names=('data',), quantized_model: tuple A tuple of quantized symbol, quantized arg_params, and aux_params. 
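Returning to the entropy-calibration helper earlier in this file's hunk: `get_optimal_threshold` histograms the collected activations and lets `calibrate_entropy` pick the clipping threshold with the smallest KL divergence. A rough sketch of that flow with synthetic activations; the bin count and range here are illustrative, not the values the helper computes internally.

```python
import numpy as onp
import mxnet as mx
from mxnet import ndarray

activations = onp.random.randn(10000).astype('float32')      # stand-in for collected outputs
hist, hist_edges = onp.histogram(activations, bins=8001, range=(-4.0, 4.0))
hist = ndarray.array(hist, device=mx.cpu())
hist_edges = ndarray.array(hist_edges, device=mx.cpu())
threshold, divergence = ndarray.contrib.calibrate_entropy(
    hist=hist, hist_edges=hist_edges, num_quantized_bins=255)
```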
""" - if not isinstance(ctx, Context): - raise ValueError('currently only supports single ctx, while received %s' % str(ctx)) - if ctx.device_type != 'cpu': + if not isinstance(device, Device): + raise ValueError('currently only supports single device, while received %s' % str(device)) + if device.device_type != 'cpu': raise ValueError( 'quantize_model_onednn only support Intel cpu platform with oneDNN Backend') sym = sym.optimize_for(backend='ONEDNN_QUANTIZE') qsym, qarg_params, aux_params = quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params, - data_names=data_names, ctx=ctx, + data_names=data_names, device=device, excluded_sym_names=excluded_sym_names, excluded_op_names=excluded_op_names, calib_mode=calib_mode, calib_data=calib_data, @@ -571,7 +573,7 @@ def quantize_model_onednn(sym, arg_params, aux_params, data_names=('data',), return qsym, qarg_params, aux_params -def quantize_graph(sym, arg_params, aux_params, ctx=cpu(), +def quantize_graph(sym, arg_params, aux_params, device=cpu(), excluded_sym_names=None, excluded_op_names=None, calib_mode='entropy', quantized_dtype='int8', quantize_mode='full', quantize_granularity='tensor-wise', @@ -584,9 +586,9 @@ def quantize_graph(sym, arg_params, aux_params, ctx=cpu(), ---------- sym : str or Symbol Defines the structure of a neural network for FP32 data types. - ctx : Context + device : Device Defines the device that users want to run forward propagation on the calibration - dataset for collecting layer output statistics. Currently, only supports single context. + dataset for collecting layer output statistics. Currently, only supports single device. arg_params : dict Dictionary of name to `NDArray`. aux_params : dict @@ -633,8 +635,8 @@ def quantize_graph(sym, arg_params, aux_params, ctx=cpu(), raise ValueError('excluded_sym_names must be a list of strings representing' ' the names of the symbols that will not be quantized,' ' while received type %s' % str(type(excluded_sym_names))) - if not isinstance(ctx, Context): - raise ValueError('currently only supports single ctx, while received %s' % str(ctx)) + if not isinstance(device, Device): + raise ValueError('currently only supports single device, while received %s' % str(device)) if logger: os.environ['MXNET_QUANTIZATION_VERBOSE'] = '1' logger.info('Quantizing graph') @@ -644,7 +646,7 @@ def quantize_graph(sym, arg_params, aux_params, ctx=cpu(), if quantize_granularity not in ('tensor-wise', 'channel-wise'): raise ValueError('unkonwn quantize_granularity %s received,' ' expected `tensor-wise` or `channel-wise`.' % quantize_granularity) - qsym, calib_layers = _quantize_symbol(sym, ctx, excluded_symbols=excluded_sym_names, + qsym, calib_layers = _quantize_symbol(sym, device, excluded_symbols=excluded_sym_names, excluded_operators=excluded_op_names, offline_params=list(arg_params.keys()), quantized_dtype=quantized_dtype, @@ -754,10 +756,11 @@ def calib_graph(qsym, arg_params, aux_params, collector, return qsym, qarg_params, aux_params +@wrap_ctx_to_device_func def quantize_net(network, quantized_dtype='auto', quantize_mode='full', quantize_granularity='tensor-wise', exclude_layers=None, exclude_layers_match=None, exclude_operators=None, calib_data=None, data_shapes=None, calib_mode='none', - num_calib_batches=None, ctx=cpu(), LayerOutputCollector=None, logger=None): + num_calib_batches=None, device=cpu(), LayerOutputCollector=None, logger=None): """User-level API for Gluon users to generate a quantized SymbolBlock from a FP32 HybridBlock w/ or w/o calibration. 
The backend quantized operators are only enabled for Linux systems. Please do not run inference using the quantized models on Windows for now. @@ -805,9 +808,9 @@ def quantize_net(network, quantized_dtype='auto', quantize_mode='full', quantize num_calib_batches : int or None The maximum number of batches that user would like to use for calibration. If not provided, the whole calibration dataset will be used. - ctx : Context + device : Device Defines the device that users want to run forward propagation on the calibration - dataset for collecting layer output statistics. Currently, only supports single context. + dataset for collecting layer output statistics. Currently, only supports single device. LayerOutputCollector : subclass of CalibrationCollector For `custom` calibration method usage. Passed object's include_layers attribute will be feed with names of layers which needs calibration @@ -821,8 +824,8 @@ def quantize_net(network, quantized_dtype='auto', quantize_mode='full', quantize """ from ..gluon import SymbolBlock - if ctx != mx.cpu(): - raise ValueError('Quantization currently supports only CPU context') + if device != mx.cpu(): + raise ValueError('Quantization currently supports only CPU device') backend = 'ONEDNN_QUANTIZE' network.hybridize(static_alloc=False, static_shape=False) @@ -899,16 +902,16 @@ def quantize_net(network, quantized_dtype='auto', quantize_mode='full', quantize logger.info('These layers have been excluded %s' % exclude_layers) qsym, qarg_params, aux_params, collector, _ = quantize_graph( - sym=symnet, arg_params=args, aux_params=auxs, ctx=ctx, + sym=symnet, arg_params=args, aux_params=auxs, device=device, excluded_sym_names=exclude_layers, excluded_op_names=exclude_operators, calib_mode=calib_mode, quantized_dtype=quantized_dtype, quantize_mode=quantize_mode, quantize_granularity=quantize_granularity, LayerOutputCollector=LayerOutputCollector, logger=logger) if calib_mode is not None and calib_mode != 'none': - if not isinstance(ctx, Context): + if not isinstance(device, Device): raise ValueError( - 'currently only supports single ctx, while received %s' % str(ctx)) + 'currently only supports single device, while received %s' % str(device)) if calib_data is None: raise ValueError( 'calib_data must be provided when calib_mode=%s' % calib_mode) diff --git a/python/mxnet/gluon/contrib/data/vision/transforms/bbox/bbox.py b/python/mxnet/gluon/contrib/data/vision/transforms/bbox/bbox.py index 65a18aaf80cd..146e529425a2 100644 --- a/python/mxnet/gluon/contrib/data/vision/transforms/bbox/bbox.py +++ b/python/mxnet/gluon/contrib/data/vision/transforms/bbox/bbox.py @@ -277,7 +277,7 @@ def forward(self, img, bbox): if isinstance(self._fill, numeric_types): dst = F.full(shape=(oh, ow, c), val=self._fill, dtype=img.dtype) else: - fill = F.array(self._fill, dtype=img.dtype, ctx=img.ctx) + fill = F.array(self._fill, dtype=img.dtype, ctx=img.device) if not c == fill.size: raise ValueError("Channel and fill size mismatch, {} vs {}".format(c, fill.size)) dst = F.tile(fill.reshape((1, c)), reps=(oh * ow, 1)).reshape((oh, ow, c)) diff --git a/python/mxnet/gluon/contrib/estimator/estimator.py b/python/mxnet/gluon/contrib/estimator/estimator.py index 73ddcbc0ff8e..423d198be17e 100644 --- a/python/mxnet/gluon/contrib/estimator/estimator.py +++ b/python/mxnet/gluon/contrib/estimator/estimator.py @@ -32,7 +32,7 @@ from ...loss import Loss as gluon_loss from ...trainer import Trainer from ...utils import split_and_load -from ....context import Context, cpu, gpu, num_gpus +from 
....device import Device, cpu, gpu, num_gpus from ...metric import Loss as metric_loss from .batch_processor import BatchProcessor @@ -59,7 +59,7 @@ class Estimator(object): Initializer to initialize the network. trainer : Trainer Trainer to apply optimizer on network parameters. - context : Context or list of Context + device : Device or list of Device Device(s) to run the training on. val_net : gluon.Block The model used for validation. The validation model does not necessarily belong to @@ -73,7 +73,7 @@ class Estimator(object): >>> net = _get_train_network() >>> val_net = _get_test_network() >>> val_net.share_parameters(net.collect_params()) - >>> net.initialize(ctx=ctx) + >>> net.initialize(device=device) >>> est = Estimator(net, loss, val_net=val_net) Proper namespace match is required for weight sharing between two networks. Most networks @@ -113,7 +113,7 @@ def __init__(self, net, val_metrics=None, initializer=None, trainer=None, - context=None, + device=None, val_net=None, val_loss=None, batch_processor=None): @@ -133,7 +133,7 @@ def __init__(self, net, self.logger = logging.Logger(name='Estimator', level=logging.INFO) self.logger.addHandler(logging.StreamHandler(sys.stdout)) - self.context = self._check_context(context) + self.device = self._check_devices(device) self._initialize(initializer) self.trainer = self._check_trainer(trainer) self.batch_processor = self._check_batch_processor(batch_processor) @@ -145,37 +145,43 @@ def _check_loss(self, loss): return loss def _check_context(self, context): - # infer available context + """This function has been deprecated. Please refer to ``Estimator._check_devices``.""" + warnings.warn('Estimator._check_context has been renamed to' + ' Estimator._check_devices', DeprecationWarning) + return self._check_devices(context) + + def _check_devices(self, devices): + # infer available devices gpus = num_gpus() available_gpus = [gpu(i) for i in range(gpus)] - if context: - # check context values, only accept Context or a list of Context - if isinstance(context, Context): - context = [context] - elif isinstance(context, list) and all([isinstance(c, Context) for c in context]): - context = context + if devices: + # check devices values, only accept Device or a list of Device + if isinstance(devices, Device): + devices = [devices] + elif isinstance(devices, list) and all([isinstance(c, Device) for c in devices]): + devices = devices else: - raise ValueError("context must be a Context or a list of Context, " + raise ValueError("devices must be a Device or a list of Device, " "for example mx.cpu() or [mx.gpu(0), mx.gpu(1)], " - "refer to mxnet.Context:{}".format(context)) - for ctx in context: - assert ctx in available_gpus or str(ctx).startswith('cpu'), \ + "refer to mxnet.Device:{}".format(devices)) + for device in devices: + assert device in available_gpus or str(device).startswith('cpu'), \ "%s is not available, please make sure " \ - "your context is in one of: mx.cpu(), %s" % \ - (ctx, ", ".join([str(ctx) for ctx in available_gpus])) + "your device is in one of: mx.cpu(), %s" % \ + (device, ", ".join([str(device) for device in available_gpus])) else: - # provide default context + # provide default device if gpus > 0: # only use 1 GPU by default if gpus > 1: warnings.warn("You have multiple GPUs, gpu(0) will be used by default." - "To utilize all your GPUs, specify context as a list of gpus, " - "e.g. context=[mx.gpu(0), mx.gpu(1)] ") - context = [gpu(0)] + "To utilize all your GPUs, specify device as a list of gpus, " + "e.g. 
devices=[mx.gpu(0), mx.gpu(1)] ") + devices = [gpu(0)] else: - context = [cpu()] - return context + devices = [cpu()] + return devices def _check_batch_processor(self, batch_processor): # check whether the batch processor contains fit_batch() and evaluate_batch() methods @@ -197,9 +203,9 @@ def _initialize(self, initializer): # if initializer is None, default initializer will be used # do not re-init layers already initialized if initializer: - self.net.initialize(init=initializer, ctx=self.context) + self.net.initialize(init=initializer, device=self.device) else: - self.net.initialize(ctx=self.context) + self.net.initialize(device=self.device) elif initializer: # net is fully initialized, and user passed not None initializer # do not force reinitialize, give warning @@ -225,16 +231,16 @@ def _is_initialized(self): param_dict = self.net.collect_params() for param in param_dict: try: - param_dict[param].list_ctx() + param_dict[param].list_device() except RuntimeError: return False return True - def _get_data_and_label(self, batch, ctx, batch_axis=0): + def _get_data_and_label(self, batch, device, batch_axis=0): data = batch[0] label = batch[1] - data = split_and_load(data, ctx_list=ctx, batch_axis=batch_axis) - label = split_and_load(label, ctx_list=ctx, batch_axis=batch_axis) + data = split_and_load(data, device_list=device, batch_axis=batch_axis) + label = split_and_load(label, device_list=device, batch_axis=batch_axis) return data, label def _add_default_training_metrics(self): diff --git a/python/mxnet/gluon/data/batchify.py b/python/mxnet/gluon/data/batchify.py index 7140a5962881..2d5ae0cb3a63 100644 --- a/python/mxnet/gluon/data/batchify.py +++ b/python/mxnet/gluon/data/batchify.py @@ -22,7 +22,7 @@ import warnings import numpy as np -from ...context import Context, cpu +from ...device import Device, cpu from ... import ndarray as nd from ... 
import numpy as _np from ...util import is_np_array @@ -82,7 +82,7 @@ def __call__(self, data): dtype = data[0].dtype if self._use_shared_mem: out = _arr.empty((len(data),) + data[0].shape, dtype=dtype, - ctx=Context('cpu_shared', 0)) + device=Device('cpu_shared', 0)) return _arr.stack(data, out=out) if is_np_array() else _arr.stack(*data, out=out) else: return _arr.stack(data) if is_np_array() else _arr.stack(*data) @@ -93,7 +93,7 @@ def __call__(self, data): out = np.asarray(data) dtype = out.dtype if self._use_shared_mem: - return _arr.array(out, ctx=Context('cpu_shared', 0), dtype=dtype) + return _arr.array(out, device=Device('cpu_shared', 0), dtype=dtype) else: return _arr.array(out, dtype=dtype) @@ -148,8 +148,8 @@ def _pad_arrs_to_max_length(arrs, pad_val, use_shared_mem, dtype, round_to=None) ret[tuple(slices)] = arr - ctx = Context('cpu_shared', 0) if use_shared_mem else cpu() - ret = _arr.array(ret, ctx=ctx, dtype=dtype) + device = Device('cpu_shared', 0) if use_shared_mem else cpu() + ret = _arr.array(ret, device=device, dtype=dtype) return ret @@ -261,12 +261,12 @@ def _append_arrs(arrs, use_shared_mem=False, expand=False, batch_axis=0): _arr = _np if is_np_array() else nd if isinstance(arrs[0], _arr.NDArray): if use_shared_mem: - out = [x.as_in_context(Context('cpu_shared', 0)) for x in arrs] + out = [x.as_in_context(Device('cpu_shared', 0)) for x in arrs] else: out = arrs else: if use_shared_mem: - out = [_arr.array(x, ctx=Context('cpu_shared', 0)) for x in arrs] + out = [_arr.array(x, device=Device('cpu_shared', 0)) for x in arrs] else: out = [_arr.array(x) for x in arrs] diff --git a/python/mxnet/gluon/loss.py b/python/mxnet/gluon/loss.py index 57364d06db2b..14195b8e0902 100644 --- a/python/mxnet/gluon/loss.py +++ b/python/mxnet/gluon/loss.py @@ -1006,4 +1006,4 @@ def forward(self, x1, x2): # multiply for the number of labels to obtain the correct loss (gluon kl_loss averages instead of sum) # PR#18423:multiply for the number of labels should multiply x1.shape[1] rather than x1.shape[0]) # After PR#18423, it is no need to multiply it anymore. - return self.kl_loss(log_probabilities, labels.as_in_ctx(distances.ctx)) + return self.kl_loss(log_probabilities, labels.to_device(distances.device)) diff --git a/python/mxnet/gluon/metric.py b/python/mxnet/gluon/metric.py index b99a902bcfdb..974ce0cd6b93 100644 --- a/python/mxnet/gluon/metric.py +++ b/python/mxnet/gluon/metric.py @@ -421,7 +421,7 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred_label in zip(labels, preds): - pred_label = pred_label.as_np_ndarray().as_in_ctx(label.ctx) + pred_label = pred_label.as_np_ndarray().to_device(label.device) label = label.as_np_ndarray() if pred_label.shape != label.shape: pred_label = pred_label.argmax(axis=self.axis) @@ -503,8 +503,8 @@ def update(self, labels, preds): # we do not care about the order of top k elements. It is # much faster, which is important since that computation is # single-threaded due to Python GIL. 
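The batchify and metric hunks in this area rely on two pieces of the renamed device API: `Device('cpu_shared', 0)` for the shared-memory CPU allocations used by DataLoader workers, and `ndarray.to_device()` as the replacement for `as_in_context()`/`as_in_ctx()`. A minimal sketch, assuming the numpy creation functions accept the `device=` keyword standardized in this patch series:

    import mxnet as mx
    from mxnet.device import Device, cpu

    a = mx.np.ones((2, 3), device=Device('cpu_shared', 0))  # shared-memory CPU array
    b = mx.np.zeros((2, 3), device=cpu())
    b = b.to_device(a.device)   # replaces b.as_in_ctx(a.ctx)
    assert b.device == a.device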
- pred_label = pred_label.as_np_ndarray().as_in_ctx(label.ctx).astype('float32') - pred_label = numpy.argpartition(pred_label, -self.top_k).as_in_ctx(label.ctx) + pred_label = pred_label.as_np_ndarray().to_device(label.device).astype('float32') + pred_label = numpy.argpartition(pred_label, -self.top_k).to_device(label.device) label = label.as_np_ndarray().astype('int32') check_label_shapes(label, pred_label) num_samples = pred_label.shape[0] @@ -574,13 +574,13 @@ def __init__(self, class_type="binary", threshold=0.5, beta=1): self.beta = beta self.reset_stats() - def _set(self, num, ctx): + def _set(self, num, device): if self.num_classes is None: self.num_classes = num - self.true_positives = numpy.zeros(num, dtype='float64').as_in_ctx(ctx) - self.false_negatives = numpy.zeros(num, dtype='float64').as_in_ctx(ctx) - self.false_positives = numpy.zeros(num, dtype='float64').as_in_ctx(ctx) - self.true_negatives = numpy.zeros(num, dtype='float64').as_in_ctx(ctx) + self.true_positives = numpy.zeros(num, dtype='float64').to_device(device) + self.false_negatives = numpy.zeros(num, dtype='float64').to_device(device) + self.false_positives = numpy.zeros(num, dtype='float64').to_device(device) + self.true_negatives = numpy.zeros(num, dtype='float64').to_device(device) else: assert self.num_classes == num, \ "Input number of classes has changed from {} to {}".format(self.num_classes, num) @@ -596,10 +596,10 @@ def update_stats(self, label, pred): pred : `NDArray` Predicted values. """ - pred = pred.as_np_ndarray().as_in_ctx(label.ctx) + pred = pred.as_np_ndarray().to_device(label.device) label = label.as_np_ndarray().astype('int32') if self.class_type == "binary": - self._set(1, label.ctx) + self._set(1, label.device) if label.max() > 1: raise ValueError("Wrong label for binary classification.") if pred.shape == label.shape: @@ -613,14 +613,14 @@ def update_stats(self, label, pred): elif self.class_type == "multiclass": num = pred.shape[-1] - self._set(num, label.ctx) + self._set(num, label.device) assert label.max() < num, "pred contains fewer classes than label!" pred_label = one_hot(pred.argmax(axis=-1).reshape(-1), num) label = one_hot(label.reshape(-1), num) elif self.class_type == "multilabel": num = pred.shape[-1] - self._set(num, label.ctx) + self._set(num, label.device) assert pred.shape == label.shape, \ "The shape of label should be same as that of prediction for multilabel classification." 
pred_label = predict_with_threshold(pred, self.threshold).reshape(-1, num) @@ -922,7 +922,7 @@ def update(self, labels, preds): for label, pred_label in zip(labels, preds): pred_label = predict_with_threshold(pred_label, self.threshold) - pred_label = pred_label.as_np_ndarray().astype('int32').as_in_ctx(label.ctx) + pred_label = pred_label.as_np_ndarray().astype('int32').to_device(label.device) label = label.as_np_ndarray().astype('int32') # flatten before checking shapes to avoid shape miss match label = label.reshape(-1) @@ -1085,7 +1085,7 @@ def update(self, labels, preds): for label, pred in zip(labels, preds): label = label.as_np_ndarray() - pred = pred.as_np_ndarray().as_in_ctx(label.ctx) + pred = pred.as_np_ndarray().to_device(label.device) num_inst = label.shape[0] mae = numpy.abs(label - pred).reshape(num_inst, -1).mean(axis=-1).sum() @@ -1145,7 +1145,7 @@ def update(self, labels, preds): for label, pred in zip(labels, preds): label = label.as_np_ndarray() - pred = pred.as_np_ndarray().as_in_ctx(label.ctx) + pred = pred.as_np_ndarray().to_device(label.device) num_inst = label.shape[0] mse = ((label - pred)**2.0).reshape(num_inst, -1).mean(axis=-1).sum() @@ -1251,7 +1251,7 @@ def update(self, labels, preds): for label, pred in zip(labels, preds): label = label.as_np_ndarray() - pred = pred.as_np_ndarray().as_in_ctx(label.ctx) + pred = pred.as_np_ndarray().to_device(label.device) label = label.reshape(label.shape[0], -1) pred = pred.reshape(pred.shape[0], -1) @@ -1320,7 +1320,7 @@ def update(self, labels, preds): for label, pred in zip(labels, preds): label = label.as_np_ndarray() - pred = pred.as_np_ndarray().as_in_ctx(label.ctx) + pred = pred.as_np_ndarray().to_device(label.device) if len(label.shape) == 1: label = label.reshape(1, label.shape[0]) @@ -1415,7 +1415,7 @@ def update(self, labels, preds): label = label.reshape((label.size,)) if self.from_logits: pred = npx.softmax(pred, axis=self.axis) - pred = npx.pick(pred.as_in_ctx(label.ctx), label.astype(dtype='int32'), axis=self.axis) + pred = npx.pick(pred.to_device(label.device), label.astype(dtype='int32'), axis=self.axis) if self.ignore_label is not None: ignore = (label == self.ignore_label).astype(pred.dtype) num -= ignore.sum() @@ -1573,7 +1573,7 @@ def update(self, labels, preds): for label, pred in zip(labels, preds): check_label_shapes(label, pred, False, True) label = label.as_np_ndarray().reshape(-1).astype(numpy.float64) - pred = pred.as_np_ndarray().as_in_ctx(label.ctx).reshape(-1).astype(numpy.float64) + pred = pred.as_np_ndarray().to_device(label.device).reshape(-1).astype(numpy.float64) self.num_inst += 1 self._label_nums, self._mean_l, self._sse_l = \ @@ -1685,7 +1685,7 @@ def update(self, labels, preds): # update the confusion matrix for label, pred in zip(labels, preds): label = label.astype('int32', copy=False).as_np_ndarray() - pred = pred.as_np_ndarray().as_in_ctx(label.ctx) + pred = pred.as_np_ndarray().to_device(label.device) if pred.shape != label.shape: pred = pred.argmax(axis=1).astype(label, copy=False) else: @@ -1816,7 +1816,7 @@ def update(self, labels, preds): for pred, label in zip(preds, labels): label = label.as_np_ndarray() - pred = pred.as_np_ndarray().as_in_ctx(label.ctx) + pred = pred.as_np_ndarray().to_device(label.device) reval = self._feval(label, pred) if isinstance(reval, tuple): diff --git a/python/mxnet/gluon/model_zoo/vision/alexnet.py b/python/mxnet/gluon/model_zoo/vision/alexnet.py index 713ed9a82329..d8d5c5caa7c5 100644 --- a/python/mxnet/gluon/model_zoo/vision/alexnet.py +++ 
b/python/mxnet/gluon/model_zoo/vision/alexnet.py @@ -22,11 +22,11 @@ import os -from ....context import cpu +from ....device import cpu from ...block import HybridBlock from ... import nn from .... import base -from ....util import use_np +from ....util import use_np, wrap_ctx_to_device_func # Net @use_np @@ -68,7 +68,8 @@ def forward(self, x): return x # Constructor -def alexnet(pretrained=False, ctx=cpu(), +@wrap_ctx_to_device_func +def alexnet(pretrained=False, device=cpu(), root=os.path.join(base.data_dir(), 'models'), **kwargs): r"""AlexNet model from the `"One weird trick..." `_ paper. @@ -76,13 +77,13 @@ def alexnet(pretrained=False, ctx=cpu(), ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default $MXNET_HOME/models Location for keeping the model parameters. """ net = AlexNet(**kwargs) if pretrained: from ..model_store import get_model_file - net.load_parameters(get_model_file('alexnet', root=root), ctx=ctx) + net.load_parameters(get_model_file('alexnet', root=root), device=device) return net diff --git a/python/mxnet/gluon/model_zoo/vision/densenet.py b/python/mxnet/gluon/model_zoo/vision/densenet.py index ffa06c03637d..088596d96aa2 100644 --- a/python/mxnet/gluon/model_zoo/vision/densenet.py +++ b/python/mxnet/gluon/model_zoo/vision/densenet.py @@ -22,11 +22,11 @@ import os -from ....context import cpu +from ....device import cpu from ...block import HybridBlock from ... import nn from .... import base -from ....util import use_np +from ....util import use_np, wrap_ctx_to_device_func # Helpers def _make_dense_block(num_layers, bn_size, growth_rate, dropout): @@ -121,7 +121,8 @@ def forward(self, x): # Constructor -def get_densenet(num_layers, pretrained=False, ctx=cpu(), +@wrap_ctx_to_device_func +def get_densenet(num_layers, pretrained=False, device=cpu(), root=os.path.join(base.data_dir(), 'models'), **kwargs): r"""Densenet-BC model from the `"Densely Connected Convolutional Networks" `_ paper. @@ -132,8 +133,8 @@ def get_densenet(num_layers, pretrained=False, ctx=cpu(), Number of layers for the variant of densenet. Options are 121, 161, 169, 201. pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default $MXNET_HOME/models Location for keeping the model parameters. """ @@ -141,7 +142,7 @@ def get_densenet(num_layers, pretrained=False, ctx=cpu(), net = DenseNet(num_init_features, growth_rate, block_config, **kwargs) if pretrained: from ..model_store import get_model_file - net.load_parameters(get_model_file('densenet%d'%(num_layers), root=root), ctx=ctx) + net.load_parameters(get_model_file('densenet%d'%(num_layers), root=root), device=device) return net def densenet121(**kwargs): @@ -152,8 +153,8 @@ def densenet121(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. 
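The model-zoo constructors now take `device=` directly, with the decorator preserving the old `ctx=` spelling. A usage sketch (not part of this change; loading pretrained weights downloads them on first use):

    import mxnet as mx
    from mxnet.gluon.model_zoo import vision

    net = vision.alexnet(pretrained=True, device=mx.cpu())    # new keyword
    legacy = vision.alexnet(pretrained=True, ctx=mx.cpu())    # still accepted, emits a DeprecationWarning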
""" @@ -167,8 +168,8 @@ def densenet161(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ @@ -182,8 +183,8 @@ def densenet169(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ @@ -197,8 +198,8 @@ def densenet201(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ diff --git a/python/mxnet/gluon/model_zoo/vision/inception.py b/python/mxnet/gluon/model_zoo/vision/inception.py index ca62d4eb69ef..17181eed5902 100644 --- a/python/mxnet/gluon/model_zoo/vision/inception.py +++ b/python/mxnet/gluon/model_zoo/vision/inception.py @@ -22,11 +22,11 @@ import os -from ....context import cpu +from ....device import cpu from ...block import HybridBlock from ... import nn from .... import base -from ....util import use_np +from ....util import use_np, wrap_ctx_to_device_func # Helpers def _make_basic_conv(**kwargs): @@ -194,7 +194,8 @@ def forward(self, x): return x # Constructor -def inception_v3(pretrained=False, ctx=cpu(), +@wrap_ctx_to_device_func +def inception_v3(pretrained=False, device=cpu(), root=os.path.join(base.data_dir(), 'models'), **kwargs): r"""Inception v3 model from `"Rethinking the Inception Architecture for Computer Vision" @@ -204,13 +205,13 @@ def inception_v3(pretrained=False, ctx=cpu(), ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default $MXNET_HOME/models Location for keeping the model parameters. """ net = Inception3(**kwargs) if pretrained: from ..model_store import get_model_file - net.load_parameters(get_model_file('inceptionv3', root=root), ctx=ctx) + net.load_parameters(get_model_file('inceptionv3', root=root), device=device) return net diff --git a/python/mxnet/gluon/model_zoo/vision/mobilenet.py b/python/mxnet/gluon/model_zoo/vision/mobilenet.py index 40dc40d340ef..4b21a9005143 100644 --- a/python/mxnet/gluon/model_zoo/vision/mobilenet.py +++ b/python/mxnet/gluon/model_zoo/vision/mobilenet.py @@ -28,10 +28,10 @@ import os from ... import nn -from ....context import cpu +from ....device import cpu from ...block import HybridBlock from .... 
import base, np -from ....util import use_np +from ....util import use_np, wrap_ctx_to_device_func # Helpers @@ -187,7 +187,8 @@ def forward(self, x): # Constructor -def get_mobilenet(multiplier, pretrained=False, ctx=cpu(), +@wrap_ctx_to_device_func +def get_mobilenet(multiplier, pretrained=False, device=cpu(), root=os.path.join(base.data_dir(), 'models'), **kwargs): r"""MobileNet model from the `"MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" @@ -201,8 +202,8 @@ def get_mobilenet(multiplier, pretrained=False, ctx=cpu(), channel size multiplied by this multiplier. pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default $MXNET_HOME/models Location for keeping the model parameters. """ @@ -214,11 +215,12 @@ def get_mobilenet(multiplier, pretrained=False, ctx=cpu(), if version_suffix in ('1.00', '0.50'): version_suffix = version_suffix[:-1] net.load_parameters( - get_model_file('mobilenet%s' % version_suffix, root=root), ctx=ctx) + get_model_file('mobilenet%s' % version_suffix, root=root), device=device) return net -def get_mobilenet_v2(multiplier, pretrained=False, ctx=cpu(), +@wrap_ctx_to_device_func +def get_mobilenet_v2(multiplier, pretrained=False, device=cpu(), root=os.path.join(base.data_dir(), 'models'), **kwargs): r"""MobileNetV2 model from the `"Inverted Residuals and Linear Bottlenecks: @@ -233,8 +235,8 @@ def get_mobilenet_v2(multiplier, pretrained=False, ctx=cpu(), channel size multiplied by this multiplier. pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default $MXNET_HOME/models Location for keeping the model parameters. """ @@ -246,10 +248,11 @@ def get_mobilenet_v2(multiplier, pretrained=False, ctx=cpu(), if version_suffix in ('1.00', '0.50'): version_suffix = version_suffix[:-1] net.load_parameters( - get_model_file('mobilenetv2_%s' % version_suffix, root=root), ctx=ctx) + get_model_file('mobilenetv2_%s' % version_suffix, root=root), device=device) return net +@wrap_ctx_to_device_func def mobilenet1_0(**kwargs): r"""MobileNet model from the `"MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" @@ -259,12 +262,13 @@ def mobilenet1_0(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. """ return get_mobilenet(1.0, **kwargs) +@wrap_ctx_to_device_func def mobilenet_v2_1_0(**kwargs): r"""MobileNetV2 model from the `"Inverted Residuals and Linear Bottlenecks: @@ -275,12 +279,13 @@ def mobilenet_v2_1_0(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. 
""" return get_mobilenet_v2(1.0, **kwargs) +@wrap_ctx_to_device_func def mobilenet0_75(**kwargs): r"""MobileNet model from the `"MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" @@ -290,12 +295,13 @@ def mobilenet0_75(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. """ return get_mobilenet(0.75, **kwargs) +@wrap_ctx_to_device_func def mobilenet_v2_0_75(**kwargs): r"""MobileNetV2 model from the `"Inverted Residuals and Linear Bottlenecks: @@ -306,12 +312,13 @@ def mobilenet_v2_0_75(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. """ return get_mobilenet_v2(0.75, **kwargs) +@wrap_ctx_to_device_func def mobilenet0_5(**kwargs): r"""MobileNet model from the `"MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" @@ -321,12 +328,13 @@ def mobilenet0_5(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. """ return get_mobilenet(0.5, **kwargs) +@wrap_ctx_to_device_func def mobilenet_v2_0_5(**kwargs): r"""MobileNetV2 model from the `"Inverted Residuals and Linear Bottlenecks: @@ -337,12 +345,13 @@ def mobilenet_v2_0_5(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. """ return get_mobilenet_v2(0.5, **kwargs) +@wrap_ctx_to_device_func def mobilenet0_25(**kwargs): r"""MobileNet model from the `"MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" @@ -352,12 +361,13 @@ def mobilenet0_25(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. """ return get_mobilenet(0.25, **kwargs) +@wrap_ctx_to_device_func def mobilenet_v2_0_25(**kwargs): r"""MobileNetV2 model from the `"Inverted Residuals and Linear Bottlenecks: @@ -368,7 +378,7 @@ def mobilenet_v2_0_25(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. """ return get_mobilenet_v2(0.25, **kwargs) diff --git a/python/mxnet/gluon/model_zoo/vision/resnet.py b/python/mxnet/gluon/model_zoo/vision/resnet.py index 33d921bb398a..fe0aa68667c2 100644 --- a/python/mxnet/gluon/model_zoo/vision/resnet.py +++ b/python/mxnet/gluon/model_zoo/vision/resnet.py @@ -28,11 +28,11 @@ import os -from ....context import cpu +from ....device import cpu from ...block import HybridBlock from ... import nn from .... import base -from .... util import use_np +from .... 
util import use_np, wrap_ctx_to_device_func from .... import npx # Helpers @@ -357,7 +357,8 @@ def forward(self, x): # Constructor -def get_resnet(version, num_layers, pretrained=False, ctx=cpu(), +@wrap_ctx_to_device_func +def get_resnet(version, num_layers, pretrained=False, device=cpu(), root=os.path.join(base.data_dir(), 'models'), **kwargs): r"""ResNet V1 model from `"Deep Residual Learning for Image Recognition" `_ paper. @@ -372,8 +373,8 @@ def get_resnet(version, num_layers, pretrained=False, ctx=cpu(), Numbers of layers. Options are 18, 34, 50, 101, 152. pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default $MXNET_HOME/models Location for keeping the model parameters. """ @@ -389,9 +390,10 @@ def get_resnet(version, num_layers, pretrained=False, ctx=cpu(), if pretrained: from ..model_store import get_model_file net.load_parameters(get_model_file('resnet%d_v%d'%(num_layers, version), - root=root), ctx=ctx) + root=root), device=device) return net +@wrap_ctx_to_device_func def resnet18_v1(**kwargs): r"""ResNet-18 V1 model from `"Deep Residual Learning for Image Recognition" `_ paper. @@ -400,13 +402,14 @@ def resnet18_v1(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_resnet(1, 18, **kwargs) +@wrap_ctx_to_device_func def resnet34_v1(**kwargs): r"""ResNet-34 V1 model from `"Deep Residual Learning for Image Recognition" `_ paper. @@ -415,13 +418,14 @@ def resnet34_v1(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_resnet(1, 34, **kwargs) +@wrap_ctx_to_device_func def resnet50_v1(**kwargs): r"""ResNet-50 V1 model from `"Deep Residual Learning for Image Recognition" `_ paper. @@ -430,13 +434,14 @@ def resnet50_v1(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_resnet(1, 50, **kwargs) +@wrap_ctx_to_device_func def resnet101_v1(**kwargs): r"""ResNet-101 V1 model from `"Deep Residual Learning for Image Recognition" `_ paper. @@ -445,13 +450,14 @@ def resnet101_v1(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. 
""" return get_resnet(1, 101, **kwargs) +@wrap_ctx_to_device_func def resnet152_v1(**kwargs): r"""ResNet-152 V1 model from `"Deep Residual Learning for Image Recognition" `_ paper. @@ -460,13 +466,14 @@ def resnet152_v1(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_resnet(1, 152, **kwargs) +@wrap_ctx_to_device_func def resnet18_v2(**kwargs): r"""ResNet-18 V2 model from `"Identity Mappings in Deep Residual Networks" `_ paper. @@ -475,13 +482,14 @@ def resnet18_v2(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_resnet(2, 18, **kwargs) +@wrap_ctx_to_device_func def resnet34_v2(**kwargs): r"""ResNet-34 V2 model from `"Identity Mappings in Deep Residual Networks" `_ paper. @@ -490,13 +498,14 @@ def resnet34_v2(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_resnet(2, 34, **kwargs) +@wrap_ctx_to_device_func def resnet50_v2(**kwargs): r"""ResNet-50 V2 model from `"Identity Mappings in Deep Residual Networks" `_ paper. @@ -505,13 +514,14 @@ def resnet50_v2(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_resnet(2, 50, **kwargs) +@wrap_ctx_to_device_func def resnet101_v2(**kwargs): r"""ResNet-101 V2 model from `"Identity Mappings in Deep Residual Networks" `_ paper. @@ -520,13 +530,14 @@ def resnet101_v2(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_resnet(2, 101, **kwargs) +@wrap_ctx_to_device_func def resnet152_v2(**kwargs): r"""ResNet-152 V2 model from `"Identity Mappings in Deep Residual Networks" `_ paper. @@ -535,8 +546,8 @@ def resnet152_v2(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. 
""" diff --git a/python/mxnet/gluon/model_zoo/vision/squeezenet.py b/python/mxnet/gluon/model_zoo/vision/squeezenet.py index ac04129369c9..571e6dc38a1b 100644 --- a/python/mxnet/gluon/model_zoo/vision/squeezenet.py +++ b/python/mxnet/gluon/model_zoo/vision/squeezenet.py @@ -22,11 +22,11 @@ import os -from ....context import cpu +from ....device import cpu from ...block import HybridBlock from ... import nn from .... import base -from ....util import use_np +from ....util import use_np, wrap_ctx_to_device_func # Helpers def _make_fire(squeeze_channels, expand1x1_channels, expand3x3_channels): @@ -110,7 +110,8 @@ def forward(self, x): return x # Constructor -def get_squeezenet(version, pretrained=False, ctx=cpu(), +@wrap_ctx_to_device_func +def get_squeezenet(version, pretrained=False, device=cpu(), root=os.path.join(base.data_dir(), 'models'), **kwargs): r"""SqueezeNet model from the `"SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and <0.5MB model size" `_ paper. @@ -125,17 +126,18 @@ def get_squeezenet(version, pretrained=False, ctx=cpu(), Version of squeezenet. Options are '1.0', '1.1'. pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default $MXNET_HOME/models Location for keeping the model parameters. """ net = SqueezeNet(version, **kwargs) if pretrained: from ..model_store import get_model_file - net.load_parameters(get_model_file('squeezenet%s'%version, root=root), ctx=ctx) + net.load_parameters(get_model_file('squeezenet%s'%version, root=root), device=device) return net +@wrap_ctx_to_device_func def squeezenet1_0(**kwargs): r"""SqueezeNet 1.0 model from the `"SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and <0.5MB model size" `_ paper. @@ -144,13 +146,14 @@ def squeezenet1_0(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_squeezenet('1.0', **kwargs) +@wrap_ctx_to_device_func def squeezenet1_1(**kwargs): r"""SqueezeNet 1.1 model from the `official SqueezeNet repo `_. @@ -161,8 +164,8 @@ def squeezenet1_1(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ diff --git a/python/mxnet/gluon/model_zoo/vision/vgg.py b/python/mxnet/gluon/model_zoo/vision/vgg.py index 0734bd370169..dd657a6f75c7 100644 --- a/python/mxnet/gluon/model_zoo/vision/vgg.py +++ b/python/mxnet/gluon/model_zoo/vision/vgg.py @@ -25,12 +25,12 @@ import os -from ....context import cpu +from ....device import cpu from ....initializer import Xavier from ...block import HybridBlock from ... import nn from .... 
import base -from ....util import use_np +from ....util import use_np, wrap_ctx_to_device_func @use_np @@ -94,7 +94,8 @@ def forward(self, x): # Constructors -def get_vgg(num_layers, pretrained=False, ctx=cpu(), +@wrap_ctx_to_device_func +def get_vgg(num_layers, pretrained=False, device=cpu(), root=os.path.join(base.data_dir(), 'models'), **kwargs): r"""VGG model from the `"Very Deep Convolutional Networks for Large-Scale Image Recognition" `_ paper. @@ -105,8 +106,8 @@ def get_vgg(num_layers, pretrained=False, ctx=cpu(), Number of layers for the variant of densenet. Options are 11, 13, 16, 19. pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default $MXNET_HOME/models Location for keeping the model parameters. """ @@ -116,9 +117,10 @@ def get_vgg(num_layers, pretrained=False, ctx=cpu(), from ..model_store import get_model_file batch_norm_suffix = '_bn' if kwargs.get('batch_norm') else '' net.load_parameters(get_model_file('vgg%d%s'%(num_layers, batch_norm_suffix), - root=root), ctx=ctx) + root=root), device=device) return net +@wrap_ctx_to_device_func def vgg11(**kwargs): r"""VGG-11 model from the `"Very Deep Convolutional Networks for Large-Scale Image Recognition" `_ paper. @@ -127,13 +129,14 @@ def vgg11(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_vgg(11, **kwargs) +@wrap_ctx_to_device_func def vgg13(**kwargs): r"""VGG-13 model from the `"Very Deep Convolutional Networks for Large-Scale Image Recognition" `_ paper. @@ -142,13 +145,14 @@ def vgg13(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_vgg(13, **kwargs) +@wrap_ctx_to_device_func def vgg16(**kwargs): r"""VGG-16 model from the `"Very Deep Convolutional Networks for Large-Scale Image Recognition" `_ paper. @@ -157,13 +161,14 @@ def vgg16(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ return get_vgg(16, **kwargs) +@wrap_ctx_to_device_func def vgg19(**kwargs): r"""VGG-19 model from the `"Very Deep Convolutional Networks for Large-Scale Image Recognition" `_ paper. @@ -172,13 +177,14 @@ def vgg19(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. 
""" return get_vgg(19, **kwargs) +@wrap_ctx_to_device_func def vgg11_bn(**kwargs): r"""VGG-11 model with batch normalization from the `"Very Deep Convolutional Networks for Large-Scale Image Recognition" @@ -188,14 +194,15 @@ def vgg11_bn(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ kwargs['batch_norm'] = True return get_vgg(11, **kwargs) +@wrap_ctx_to_device_func def vgg13_bn(**kwargs): r"""VGG-13 model with batch normalization from the `"Very Deep Convolutional Networks for Large-Scale Image Recognition" @@ -205,14 +212,15 @@ def vgg13_bn(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ kwargs['batch_norm'] = True return get_vgg(13, **kwargs) +@wrap_ctx_to_device_func def vgg16_bn(**kwargs): r"""VGG-16 model with batch normalization from the `"Very Deep Convolutional Networks for Large-Scale Image Recognition" @@ -222,14 +230,15 @@ def vgg16_bn(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ kwargs['batch_norm'] = True return get_vgg(16, **kwargs) +@wrap_ctx_to_device_func def vgg19_bn(**kwargs): r"""VGG-19 model with batch normalization from the `"Very Deep Convolutional Networks for Large-Scale Image Recognition" @@ -239,8 +248,8 @@ def vgg19_bn(**kwargs): ---------- pretrained : bool, default False Whether to load the pretrained weights for model. - ctx : Context, default CPU - The context in which to load the pretrained weights. + device : Device, default CPU + The device in which to load the pretrained weights. root : str, default '$MXNET_HOME/models' Location for keeping the model parameters. """ diff --git a/python/mxnet/gluon/nn/activations.py b/python/mxnet/gluon/nn/activations.py index 02f62821223b..26b85bb95564 100644 --- a/python/mxnet/gluon/nn/activations.py +++ b/python/mxnet/gluon/nn/activations.py @@ -139,8 +139,8 @@ def __init__(self, alpha_initializer=initializer.Constant(0.25), self.alpha = Parameter('alpha', shape=(in_channels,), init=alpha_initializer) def forward(self, x): - ctx = x.ctx - return npx.leaky_relu(x, gamma=self.alpha.data(ctx), act_type='prelu', name='fwd') + device = x.device + return npx.leaky_relu(x, gamma=self.alpha.data(device), act_type='prelu', name='fwd') @use_np diff --git a/python/mxnet/gluon/nn/basic_layers.py b/python/mxnet/gluon/nn/basic_layers.py index 167fab550a54..b8a85f269e6f 100644 --- a/python/mxnet/gluon/nn/basic_layers.py +++ b/python/mxnet/gluon/nn/basic_layers.py @@ -28,7 +28,7 @@ from .activations import Activation from ..block import Block, HybridBlock from ..utils import _indent -from ... import np, npx, context +from ... 
import np, npx, device as _device from ...util import use_np from ..parameter import Parameter @@ -223,9 +223,9 @@ def __init__(self, units, activation=None, use_bias=True, flatten=True, self.act = None def forward(self, x): - ctx = x.ctx - act = npx.fully_connected(x, self.weight.data(ctx), - self.bias.data(ctx) if self.bias is not None else None, + device = x.device + act = npx.fully_connected(x, self.weight.data(device), + self.bias.data(device) if self.bias is not None else None, no_bias=self.bias is None, num_hidden=self._units, flatten=self._flatten, name='fwd') if self.act is not None: @@ -382,16 +382,16 @@ def cast(self, dtype): super(_BatchNorm, self).cast(dtype) def forward(self, x): - ctx = x.ctx + device = x.device if self.fuse_relu: - return npx.batch_norm_with_relu(x, self.gamma.data(ctx), self.beta.data(ctx), - self.running_mean.data(ctx), - self.running_var.data(ctx), + return npx.batch_norm_with_relu(x, self.gamma.data(device), self.beta.data(device), + self.running_mean.data(device), + self.running_var.data(device), name='fwd', **self._kwargs) else: - return npx.batch_norm(x, self.gamma.data(ctx), self.beta.data(ctx), - self.running_mean.data(ctx), - self.running_var.data(ctx), + return npx.batch_norm(x, self.gamma.data(device), self.beta.data(device), + self.running_mean.data(device), + self.running_var.data(device), name='fwd', **self._kwargs) def infer_shape(self, x, *args): @@ -583,8 +583,8 @@ def __init__(self, input_dim, output_dim, dtype='float32', allow_deferred_init=True, grad_stype=grad_stype) def forward(self, x): - ctx = x.ctx - return npx.embedding(x, self.weight.data(ctx), name='fwd', **self._kwargs) + device = x.device + return npx.embedding(x, self.weight.data(device), name='fwd', **self._kwargs) def __repr__(self): s = '{block_name}({input_dim} -> {output_dim}, {dtype})' @@ -671,7 +671,7 @@ class InstanceNorm(HybridBlock): ... [[ 3.3, 4.4]]]) >>> # Instance normalization is calculated with the above formula >>> layer = InstanceNorm() - >>> layer.initialize(ctx=mx.cpu(0)) + >>> layer.initialize(device=mx.cpu(0)) >>> layer(x) [[[-0.99998355 0.99998331]] [[-0.99998319 0.99998361]]] @@ -691,12 +691,12 @@ def __init__(self, axis=1, epsilon=1e-5, center=True, scale=False, allow_deferred_init=True) def forward(self, x): - ctx = x.ctx + device = x.device if self._axis == 1: - return npx.instance_norm(x, self.gamma.data(ctx), self.beta.data(ctx), + return npx.instance_norm(x, self.gamma.data(device), self.beta.data(device), name='fwd', eps=self._epsilon) x = x.swapaxes(1, self._axis) - return npx.instance_norm(x, self.gamma.data(ctx), self.beta.data(ctx), + return npx.instance_norm(x, self.gamma.data(device), self.beta.data(device), name='fwd', eps=self._epsilon).swapaxes(1, self._axis) def infer_shape(self, x, *args): @@ -762,7 +762,7 @@ class LayerNorm(HybridBlock): >>> x = mx.np.array([[1, 2, 3, 4, 5], [1, 1, 2, 2, 2]]) >>> # Layer normalization is calculated with the above formula >>> layer = LayerNorm() - >>> layer.initialize(ctx=mx.cpu(0)) + >>> layer.initialize(device=mx.cpu(0)) >>> layer(x) [[-1.41421 -0.707105 0. 
0.707105 1.41421 ] [-1.2247195 -1.2247195 0.81647956 0.81647956 0.81647956]] @@ -784,9 +784,9 @@ def __init__(self, axis=-1, epsilon=1e-5, center=True, scale=True, allow_deferred_init=True) def forward(self, data): - ctx = data.ctx - return npx.layer_norm(data, gamma=self.gamma.data(ctx), - beta=self.beta.data(ctx), axis=self._axis, eps=self._epsilon) + device = data.device + return npx.layer_norm(data, gamma=self.gamma.data(device), + beta=self.beta.data(device), axis=self._axis, eps=self._epsilon) def infer_shape(self, data, *args): channel_axis = self._axis if self._axis >= 0 else self._axis + data.ndim @@ -856,7 +856,7 @@ class GroupNorm(HybridBlock): [20, 21, 22, 23]]]) >>> # Group normalization is calculated with the above formula >>> layer = GroupNorm() - >>> layer.initialize(ctx=mx.cpu(0)) + >>> layer.initialize(device=mx.cpu(0)) >>> layer(x) [[[-1.5932543 -1.3035717 -1.0138891 -0.7242065] [-0.4345239 -0.1448413 0.1448413 0.4345239] @@ -882,8 +882,8 @@ def __init__(self, num_groups=1, epsilon=1e-5, center=True, scale=True, allow_deferred_init=True) def forward(self, data): - ctx = data.ctx - norm_data = npx.group_norm(data, gamma=self.gamma.data(ctx), beta=self.beta.data(ctx), + device = data.device + norm_data = npx.group_norm(data, gamma=self.gamma.data(device), beta=self.beta.data(device), num_groups=self._num_groups, eps=self._epsilon) return norm_data @@ -1160,12 +1160,12 @@ def _get_num_devices(self): warnings.warn("Caution using SyncBatchNorm: " "if not using all the GPUs, please mannually set num_devices", UserWarning) - num_devices = context.num_gpus() + num_devices = _device.num_gpus() num_devices = num_devices if num_devices > 0 else 1 return num_devices def forward(self, x): - ctx = x.ctx - return npx.sync_batch_norm(x, self.gamma.data(ctx), self.beta.data(ctx), - self.running_mean.data(ctx), self.running_var.data(ctx), + device = x.device + return npx.sync_batch_norm(x, self.gamma.data(device), self.beta.data(device), + self.running_mean.data(device), self.running_var.data(device), name='fwd', **self._kwargs) diff --git a/python/mxnet/gluon/nn/conv_layers.py b/python/mxnet/gluon/nn/conv_layers.py index 65e22d82eded..e103640cb477 100644 --- a/python/mxnet/gluon/nn/conv_layers.py +++ b/python/mxnet/gluon/nn/conv_layers.py @@ -124,11 +124,11 @@ def __init__(self, channels, kernel_size, strides, padding, dilation, self.act = None def forward(self, x): - ctx = x.ctx + device = x.device if self.bias is None: - act = getattr(npx, self._op_name)(x, self.weight.data(ctx), **self._kwargs) + act = getattr(npx, self._op_name)(x, self.weight.data(device), **self._kwargs) else: - act = getattr(npx, self._op_name)(x, self.weight.data(ctx), self.bias.data(ctx), + act = getattr(npx, self._op_name)(x, self.weight.data(device), self.bias.data(device), **self._kwargs) if self.act is not None: act = self.act(act) @@ -1416,21 +1416,21 @@ def __init__(self, channels, kernel_size=(1, 1), strides=(1, 1), padding=(0, 0), self.act = None def forward(self, x): - ctx = x.ctx + device = x.device if self.offset_bias is None: - offset = npx.convolution(x, self.offset_weight.data(ctx), cudnn_off=True, **self._kwargs_offset) + offset = npx.convolution(x, self.offset_weight.data(device), cudnn_off=True, **self._kwargs_offset) else: - offset = npx.convolution(x, self.offset_weight.data(ctx), self.offset_bias.data(ctx), + offset = npx.convolution(x, self.offset_weight.data(device), self.offset_bias.data(device), cudnn_off=True, **self._kwargs_offset) if self.deformable_conv_bias is None: act = 
npx.deformable_convolution(data=x, offset=offset, - weight=self.deformable_conv_weight.data(ctx), + weight=self.deformable_conv_weight.data(device), name='fwd', **self._kwargs_deformable_conv) else: act = npx.deformable_convolution(data=x, offset=offset, - weight=self.deformable_conv_weight.data(ctx), - bias=self.deformable_conv_bias.data(ctx), name='fwd', + weight=self.deformable_conv_weight.data(device), + bias=self.deformable_conv_bias.data(device), name='fwd', **self._kwargs_deformable_conv) if self.act: @@ -1639,13 +1639,13 @@ def __init__(self, channels, kernel_size=(1, 1), strides=(1, 1), padding=(0, 0), self.act = None def forward(self, x): - ctx = x.ctx + device = x.device if self.offset_bias is None: - offset = npx.convolution(x, self.offset_weight.data(ctx), + offset = npx.convolution(x, self.offset_weight.data(device), cudnn_off=True, **self._kwargs_offset) else: - offset = npx.convolution(x, self.offset_weight.data(ctx), - self.offset_bias.data(ctx), cudnn_off=True, **self._kwargs_offset) + offset = npx.convolution(x, self.offset_weight.data(device), + self.offset_bias.data(device), cudnn_off=True, **self._kwargs_offset) offset_t = npx.slice_axis(offset, axis=1, begin=0, end=self.offset_split_index) mask = npx.slice_axis(offset, axis=1, begin=self.offset_split_index, end=None) @@ -1653,12 +1653,12 @@ def forward(self, x): if self.deformable_conv_bias is None: act = npx.modulated_deformable_convolution(data=x, offset=offset_t, mask=mask, - weight=self.deformable_conv_weight.data(ctx), + weight=self.deformable_conv_weight.data(device), name='fwd', **self._kwargs_deformable_conv) else: act = npx.modulated_deformable_convolution(data=x, offset=offset_t, mask=mask, - weight=self.deformable_conv_weight.data(ctx), - bias=self.deformable_conv_bias.data(ctx), name='fwd', + weight=self.deformable_conv_weight.data(device), + bias=self.deformable_conv_bias.data(device), name='fwd', **self._kwargs_deformable_conv) if self.act: diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 1907a785a91d..950e7c121396 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -637,6 +637,12 @@ def list_grad(self): "because grad_req='null'"%(self.name)) return self._check_and_get(self._grad, list) + def list_ctx(self): + """This function has been deprecated. 
Please refer to ``Parameter.list_device``.""" + warnings.warn('Parameter.list_ctx has been renamed to' + ' Parameter.list_device', DeprecationWarning) + return self.list_device() + def list_device(self): """Returns a list of devices this parameter is initialized on.""" if self._data is None: diff --git a/python/mxnet/gluon/rnn/conv_rnn_cell.py b/python/mxnet/gluon/rnn/conv_rnn_cell.py index d951002bd41a..7ac6db4049fc 100644 --- a/python/mxnet/gluon/rnn/conv_rnn_cell.py +++ b/python/mxnet/gluon/rnn/conv_rnn_cell.py @@ -146,24 +146,24 @@ def _num_gates(self): return len(self._gate_names) def _conv_forward(self, inputs, states): - ctx = inputs.ctx + device = inputs.device i2h = npx.convolution(data=inputs, num_filter=self._hidden_channels*self._num_gates, kernel=self._i2h_kernel, stride=self._stride, pad=self._i2h_pad, dilate=self._i2h_dilate, - weight=self.i2h_weight.data(ctx), - bias=self.i2h_bias.data(ctx), + weight=self.i2h_weight.data(device), + bias=self.i2h_bias.data(device), layout=self._conv_layout) - h2h = npx.convolution(data=states[0].as_in_ctx(ctx), + h2h = npx.convolution(data=states[0].to_device(device), num_filter=self._hidden_channels*self._num_gates, kernel=self._h2h_kernel, dilate=self._h2h_dilate, pad=self._h2h_pad, stride=self._stride, - weight=self.h2h_weight.data(ctx), - bias=self.h2h_bias.data(ctx), + weight=self.h2h_weight.data(device), + bias=self.h2h_bias.data(device), layout=self._conv_layout) return i2h, h2h @@ -443,7 +443,7 @@ def forward(self, inputs, states): forget_gate = npx.activation(slice_gates[1], act_type="sigmoid") in_transform = self._get_activation(slice_gates[2], self._activation) out_gate = npx.activation(slice_gates[3], act_type="sigmoid") - next_c = forget_gate * states[1].as_in_ctx(inputs.ctx) + in_gate * in_transform + next_c = forget_gate * states[1].to_device(inputs.device) + in_gate * in_transform next_h = np.multiply(out_gate, self._get_activation(next_c, self._activation)) return next_h, [next_h, next_c] @@ -706,7 +706,7 @@ def forward(self, inputs, states): next_h_tmp = self._get_activation(i2h + reset_gate * h2h, self._activation) next_h = (1. - update_gate) * next_h_tmp + update_gate * \ - states[0].as_in_ctx(inputs.ctx) + states[0].to_device(inputs.device) return next_h, [next_h] diff --git a/python/mxnet/gluon/rnn/rnn_cell.py b/python/mxnet/gluon/rnn/rnn_cell.py index 0f1c31a875e9..d189e397582a 100644 --- a/python/mxnet/gluon/rnn/rnn_cell.py +++ b/python/mxnet/gluon/rnn/rnn_cell.py @@ -26,7 +26,7 @@ 'ModifierCell', 'ZoneoutCell', 'ResidualCell', 'BidirectionalCell', 'VariationalDropoutCell', 'LSTMPCell'] -from ... import np, npx, context +from ... 
import np, npx, cpu from ...util import use_np from ...base import string_types, numeric_types, _as_list from ..block import Block, HybridBlock @@ -44,8 +44,8 @@ def _cells_begin_state(cells, **kwargs): def _get_begin_state(cell, begin_state, inputs, batch_size): if begin_state is None: - ctx = inputs.context if isinstance(inputs, tensor_types) else inputs[0].context - with ctx: + device = inputs.device if isinstance(inputs, tensor_types) else inputs[0].device + with device: begin_state = cell.begin_state(func=np.zeros, batch_size=batch_size) return begin_state @@ -163,7 +163,7 @@ def begin_state(self, batch_size=0, func=np.zeros, **kwargs): else: info = kwargs state = func(shape=info.pop("shape", ()), - ctx=info.pop("ctx", context.cpu()), + device=info.pop("device", cpu()), dtype=info.pop("dtype", "float32")) states.append(state) return states @@ -384,14 +384,14 @@ def __repr__(self): **self.__dict__) def forward(self, inputs, states): - ctx = inputs.ctx - i2h = npx.fully_connected(inputs, weight=self.i2h_weight.data(ctx), - bias=self.i2h_bias.data(ctx), + device = inputs.device + i2h = npx.fully_connected(inputs, weight=self.i2h_weight.data(device), + bias=self.i2h_bias.data(device), num_hidden=self._hidden_size, no_bias=False) - h2h = npx.fully_connected(states[0].as_in_ctx(ctx), - weight=self.h2h_weight.data(ctx), - bias=self.h2h_bias.data(ctx), + h2h = npx.fully_connected(states[0].to_device(device), + weight=self.h2h_weight.data(device), + bias=self.h2h_bias.data(device), num_hidden=self._hidden_size, no_bias=False) i2h_plus_h2h = i2h + h2h @@ -507,13 +507,13 @@ def __repr__(self): def forward(self, inputs, states): # pylint: disable=too-many-locals - ctx = inputs.ctx - i2h = npx.fully_connected(inputs, weight=self.i2h_weight.data(ctx), - bias=self.i2h_bias.data(ctx), + device = inputs.device + i2h = npx.fully_connected(inputs, weight=self.i2h_weight.data(device), + bias=self.i2h_bias.data(device), num_hidden=self._hidden_size*4, no_bias=False) - h2h = npx.fully_connected(states[0].as_in_ctx(ctx), - weight=self.h2h_weight.data(ctx), - bias=self.h2h_bias.data(ctx), + h2h = npx.fully_connected(states[0].to_device(device), + weight=self.h2h_weight.data(device), + bias=self.h2h_bias.data(device), num_hidden=self._hidden_size*4, no_bias=False) gates = i2h + h2h slice_gates = npx.slice_channel(gates, num_outputs=4) @@ -521,7 +521,7 @@ def forward(self, inputs, states): forget_gate = self._get_activation(slice_gates[1], self._recurrent_activation) in_transform = self._get_activation(slice_gates[2], self._activation) out_gate = self._get_activation(slice_gates[3], self._recurrent_activation) - next_c = np.multiply(forget_gate, states[1].as_in_ctx(ctx)) + \ + next_c = np.multiply(forget_gate, states[1].to_device(device)) + \ np.multiply(in_gate, in_transform) next_h = np.multiply(out_gate, npx.activation(next_c, act_type=self._activation)) @@ -630,16 +630,16 @@ def __repr__(self): def forward(self, inputs, states): # pylint: disable=too-many-locals - ctx = inputs.ctx - prev_state_h = states[0].as_in_ctx(ctx) + device = inputs.device + prev_state_h = states[0].to_device(device) i2h = npx.fully_connected(inputs, - weight=self.i2h_weight.data(ctx), - bias=self.i2h_bias.data(ctx), + weight=self.i2h_weight.data(device), + bias=self.i2h_bias.data(device), num_hidden=self._hidden_size * 3, no_bias=False) h2h = npx.fully_connected(prev_state_h, - weight=self.h2h_weight.data(ctx), - bias=self.h2h_bias.data(ctx), + weight=self.h2h_weight.data(device), + bias=self.h2h_bias.data(device), 
num_hidden=self._hidden_size * 3, no_bias=False) @@ -959,7 +959,7 @@ def reset(self): self._prev_output = None def forward(self, inputs, states): - ctx = inputs.ctx + device = inputs.device cell, p_outputs, p_states = self.base_cell, self.zoneout_outputs, self.zoneout_states next_output, next_states = cell(inputs, states) mask = (lambda p, like: npx.dropout(np.ones(like.shape), p=p)) @@ -970,7 +970,7 @@ def forward(self, inputs, states): output = (np.where(mask(p_outputs, next_output), next_output, prev_output) if p_outputs != 0. else next_output) - states = ([np.where(mask(p_states, new_s), new_s, old_s.as_in_ctx(ctx)) for new_s, old_s in + states = ([np.where(mask(p_states, new_s), new_s, old_s.to_device(device)) for new_s, old_s in zip(next_states, states)] if p_states != 0. else next_states) self._prev_output = output @@ -1172,14 +1172,14 @@ def _initialize_output_mask(self, output): def forward(self, inputs, states): - ctx = inputs.ctx + device = inputs.device cell = self.base_cell self._initialize_input_masks(inputs, states) if self.drop_states: states = list(states) # state dropout only needs to be applied on h, which is always the first state. - states[0] = states[0].as_in_ctx(ctx) * self.drop_states_mask + states[0] = states[0].to_device(device) * self.drop_states_mask if self.drop_inputs: inputs = inputs * self.drop_inputs_mask @@ -1380,13 +1380,13 @@ def __repr__(self): # pylint: disable= arguments-differ def forward(self, inputs, states): - ctx = inputs.ctx - i2h = npx.fully_connected(inputs, weight=self.i2h_weight.data(ctx), - bias=self.i2h_bias.data(ctx), + device = inputs.device + i2h = npx.fully_connected(inputs, weight=self.i2h_weight.data(device), + bias=self.i2h_bias.data(device), num_hidden=self._hidden_size*4, no_bias=False) - h2h = npx.fully_connected(states[0].as_in_ctx(ctx), - weight=self.h2h_weight.data(ctx), - bias=self.h2h_bias.data(ctx), + h2h = npx.fully_connected(states[0].to_device(device), + weight=self.h2h_weight.data(device), + bias=self.h2h_bias.data(device), num_hidden=self._hidden_size*4, no_bias=False) gates = i2h + h2h slice_gates = npx.slice_channel(gates, num_outputs=4) @@ -1394,10 +1394,10 @@ def forward(self, inputs, states): forget_gate = npx.activation(slice_gates[1], act_type="sigmoid") in_transform = npx.activation(slice_gates[2], act_type="tanh") out_gate = npx.activation(slice_gates[3], act_type="sigmoid") - next_c = forget_gate * states[1].as_in_ctx(ctx) + in_gate * in_transform + next_c = forget_gate * states[1].to_device(device) + in_gate * in_transform hidden = np.multiply(out_gate, npx.activation(next_c, act_type="tanh")) next_r = npx.fully_connected(hidden, num_hidden=self._projection_size, - weight=self.h2r_weight.data(ctx), no_bias=True) + weight=self.h2r_weight.data(device), no_bias=True) return next_r, [next_r, next_c] @@ -1459,7 +1459,7 @@ def dynamic_unroll(cell, inputs, begin_state, drop_inputs=0, drop_outputs=0, >>> batch_size = 2 >>> input_size = 5 >>> cell = mx.gluon.rnn.LSTMCell(input_size) - >>> cell.initialize(ctx=mx.cpu()) + >>> cell.initialize(device=mx.cpu()) >>> rnn_data = mx.np.normal(loc=0, scale=1, shape=(seq_len, batch_size, input_size)) >>> state_shape = (batch_size, input_size) >>> states = [mx.np.normal(loc=0, scale=1, shape=state_shape) for i in range(2)] diff --git a/python/mxnet/gluon/rnn/rnn_layer.py b/python/mxnet/gluon/rnn/rnn_layer.py index 59056de6ce7b..31f3a0e4c8ee 100644 --- a/python/mxnet/gluon/rnn/rnn_layer.py +++ b/python/mxnet/gluon/rnn/rnn_layer.py @@ -23,7 +23,7 @@ __all__ = ['RNN', 'LSTM', 
'GRU'] -from ... import np, npx, context +from ... import np, npx, cpu from .. import HybridBlock, tensor_types from ..parameter import Parameter from ...util import use_np @@ -162,7 +162,7 @@ def begin_state(self, batch_size=0, func=np.zeros, **kwargs): else: info = kwargs state = func(shape=info.pop("shape", ()), - ctx=info.pop("ctx", context.cpu()), + device=info.pop("device", cpu()), dtype=info.pop("dtype", "float32")) states.append(state) return states @@ -171,7 +171,7 @@ def __call__(self, inputs, states=None, sequence_length=None, **kwargs): self.skip_states = states is None if states is None: batch_size = inputs.shape[self._layout.find('N')] - states = self.begin_state(batch_size, ctx=inputs.context, dtype=inputs.dtype) + states = self.begin_state(batch_size, device=inputs.device, dtype=inputs.dtype) if isinstance(states, tensor_types): states = [states] @@ -209,17 +209,17 @@ def infer_shape(self, inputs, *args): def _forward_kernel(self, inputs, states, sequence_length): """ forward using CUDNN or CPU kenrel""" - ctx = inputs.ctx + device = inputs.device if self._layout == 'NTC': inputs = np.swapaxes(inputs, 0, 1) if self._projection_size is None: - params = (getattr(self, '{}{}_{}_{}'.format(d, l, g, t)).data(ctx).reshape(-1) + params = (getattr(self, '{}{}_{}_{}'.format(d, l, g, t)).data(device).reshape(-1) for t in ['weight', 'bias'] for l in range(self._num_layers) for d in ['l', 'r'][:self._dir] for g in ['i2h', 'h2h']) else: - params = (getattr(self, '{}{}_{}_{}'.format(d, l, g, t)).data(ctx).reshape(-1) + params = (getattr(self, '{}{}_{}_{}'.format(d, l, g, t)).data(device).reshape(-1) for t in ['weight', 'bias'] for l in range(self._num_layers) for d in ['l', 'r'][:self._dir] @@ -233,12 +233,12 @@ def _forward_kernel(self, inputs, states, sequence_length): else: rnn_args = states - rnn_args_ctx = [] + rnn_args_device = [] for args in rnn_args: - new_args = args.as_in_ctx(ctx) - rnn_args_ctx.append(new_args) + new_args = args.to_device(device) + rnn_args_device.append(new_args) - rnn = npx.rnn(inputs, params, *rnn_args_ctx, use_sequence_length=self._use_sequence_length, + rnn = npx.rnn(inputs, params, *rnn_args_device, use_sequence_length=self._use_sequence_length, state_size=self._hidden_size, projection_size=self._projection_size, num_layers=self._num_layers, bidirectional=self._dir == 2, p=self._dropout, state_outputs=True, mode=self._mode, diff --git a/python/mxnet/gluon/utils.py b/python/mxnet/gluon/utils.py index de47dd61cdd3..8c3b490875fa 100644 --- a/python/mxnet/gluon/utils.py +++ b/python/mxnet/gluon/utils.py @@ -136,7 +136,7 @@ def clip_global_norm(arrays, max_norm, check_isfinite=True): def group_by_ctx(arr_list): groups = collections.defaultdict(list) for arr in arr_list: - ctx = arr.ctx + ctx = arr.device groups[ctx].append(arr) return groups def multi_sum_sq(*args, ctx=None): @@ -146,7 +146,7 @@ def multi_sum_sq(*args, ctx=None): return sum arrays_groups = group_by_ctx(arrays) all_ctx_sum = _mx_np.array([0]) - ctx = arrays[0].ctx + ctx = arrays[0].device for group in arrays_groups: sum_sq = multi_sum_sq(*arrays_groups[group], ctx=ctx) all_ctx_sum += sum_sq diff --git a/python/mxnet/model.py b/python/mxnet/model.py index 7aee2578f2a3..e634aef25656 100644 --- a/python/mxnet/model.py +++ b/python/mxnet/model.py @@ -27,7 +27,7 @@ from . import ndarray as nd from . import symbol as sym from . 
import kvstore as kvs -from .context import cpu +from .device import cpu BASE_ESTIMATOR = object diff --git a/python/mxnet/ndarray/contrib.py b/python/mxnet/ndarray/contrib.py index ed70f8ccfc6e..5188628a1b7a 100644 --- a/python/mxnet/ndarray/contrib.py +++ b/python/mxnet/ndarray/contrib.py @@ -21,7 +21,7 @@ import math import numpy as np import mxnet as mx -from ..context import current_context +from ..device import current_device from ..random import uniform from ..base import _as_list from . import ndarray diff --git a/python/mxnet/ndarray/numpy/random.py b/python/mxnet/ndarray/numpy/random.py index 141177b033fd..e54201a26526 100644 --- a/python/mxnet/ndarray/numpy/random.py +++ b/python/mxnet/ndarray/numpy/random.py @@ -18,9 +18,10 @@ """Namespace for operators used in Gluon dispatched by F=ndarray.""" import numpy as np from ...util import is_np_default_dtype -from ...context import current_context +from ...device import current_device from . import _internal as _npi from . import _api_internal +from ...util import wrap_device_to_device_func __all__ = ['randint', 'uniform', 'normal', "choice", "rand", "multinomial", "multivariate_normal", @@ -29,7 +30,8 @@ "shuffle", 'gamma', 'beta', 'chisquare', 'exponential', 'lognormal', 'weibull', 'pareto', 'power'] -def randint(low, high=None, size=None, dtype=None, ctx=None, out=None): +@wrap_device_to_device_func +def randint(low, high=None, size=None, dtype=None, device=None, out=None): r"""Return random integers from `low` (inclusive) to `high` (exclusive). Return random integers from the "discrete uniform" distribution of @@ -54,8 +56,8 @@ def randint(low, high=None, size=None, dtype=None, ctx=None, out=None): name, i.e., 'int64', 'int', etc, so byteorder is not available and a specific precision may have different C types depending on the platform. The default value is 'np.int'. - ctx : Context, optional - Device context of output. Default is current context. + device : Device, optional + Device context of output. Default is current device. out : ndarray, optional The output ndarray (default is `None`). @@ -82,19 +84,20 @@ def randint(low, high=None, size=None, dtype=None, ctx=None, out=None): dtype = 'int64' elif not isinstance(dtype, str): dtype = np.dtype(dtype).name - if ctx is None: - ctx = str(current_context()) + if device is None: + device = str(current_device()) else: - ctx = str(ctx) + device = str(device) if size is None: size = () if high is None: high = low low = 0 - return _api_internal.randint(low, high, size, dtype, ctx, out) + return _api_internal.randint(low, high, size, dtype, device, out) -def uniform(low=0.0, high=1.0, size=None, dtype=None, ctx=None, out=None): +@wrap_device_to_device_func +def uniform(low=0.0, high=1.0, size=None, dtype=None, device=None, out=None): r"""Draw samples from a uniform distribution. Samples are uniformly distributed over the half-open interval @@ -119,8 +122,8 @@ def uniform(low=0.0, high=1.0, size=None, dtype=None, ctx=None, out=None): Data type of output samples. When npx.is_np_default_dtype() returns False, default dtype is float32; When npx.is_np_default_dtype() returns True, default dtype is float64. - ctx : Context, optional - Device context of output. Default is current context. + device : Device, optional + Device context of output. Default is current device. out : ``ndarray``, optional Store output to an existing ``ndarray``. 
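A minimal usage sketch of the renamed samplers (not part of the patch itself): shapes and values are illustrative, `device=` is the new keyword, and the decorator imported above presumably keeps the legacy ``ctx`` keyword working, although its implementation is not shown in this hunk.

    import mxnet as mx

    # `device` replaces `ctx` in the mx.np.random creation functions.
    samples = mx.np.random.uniform(low=0.0, high=1.0, size=(2, 3), device=mx.cpu(0))
    ints = mx.np.random.randint(0, 10, size=(4,), dtype='int64', device=mx.cpu(0))
    print(samples.device, ints.device)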
@@ -129,18 +132,19 @@ def uniform(low=0.0, high=1.0, size=None, dtype=None, ctx=None, out=None): out : ndarray Drawn samples from the parameterized uniform distribution. """ - if ctx is None: - ctx = str(current_context()) + if device is None: + device = str(current_device()) else: - ctx = str(ctx) + device = str(device) if dtype is not None and not isinstance(dtype, str): dtype = np.dtype(dtype).name if size == (): size = None - return _api_internal.uniform(low, high, size, ctx, dtype, out) + return _api_internal.uniform(low, high, size, device, dtype, out) -def normal(loc=0.0, scale=1.0, size=None, dtype=None, ctx=None, out=None): +@wrap_device_to_device_func +def normal(loc=0.0, scale=1.0, size=None, dtype=None, device=None, out=None): r"""Draw random samples from a normal (Gaussian) distribution. Samples are distributed according to a normal distribution parametrized @@ -161,8 +165,8 @@ def normal(loc=0.0, scale=1.0, size=None, dtype=None, ctx=None, out=None): Data type of output samples. When npx.is_np_default_dtype() returns False, default dtype is float32; When npx.is_np_default_dtype() returns True, default dtype is float64. - ctx : Context, optional - Device context of output. Default is current context. + device : Device, optional + Device context of output. Default is current device. out : ``ndarray``, optional Store output to an existing ``ndarray``. @@ -171,18 +175,19 @@ def normal(loc=0.0, scale=1.0, size=None, dtype=None, ctx=None, out=None): out : ndarray Drawn samples from the parameterized normal distribution. """ - if ctx is None: - ctx = str(current_context()) + if device is None: + device = str(current_device()) else: - ctx = str(ctx) + device = str(device) if dtype is not None and not isinstance(dtype, str): dtype = np.dtype(dtype).name if size == (): size = None - return _api_internal.normal(loc, scale, size, ctx, dtype, out) + return _api_internal.normal(loc, scale, size, device, dtype, out) -def lognormal(mean=0.0, sigma=1.0, size=None, dtype=None, ctx=None, out=None): +@wrap_device_to_device_func +def lognormal(mean=0.0, sigma=1.0, size=None, dtype=None, device=None, out=None): r"""Draw samples from a log-normal distribution. Draw samples from a log-normal distribution with specified mean, @@ -204,8 +209,8 @@ def lognormal(mean=0.0, sigma=1.0, size=None, dtype=None, ctx=None, out=None): Otherwise, ``np.broadcast(mean, sigma).size`` samples are drawn. dtype : {'float16', 'float32', 'float64'}, optional Data type of output samples. Default is 'float32' - ctx : Context, optional - Device context of output. Default is current context. + device : Device, optional + Device context of output. Default is current device. out : ``ndarray``, optional Store output to an existing ``ndarray``. @@ -215,10 +220,11 @@ def lognormal(mean=0.0, sigma=1.0, size=None, dtype=None, ctx=None, out=None): Drawn samples from the parameterized log-normal distribution. """ from . import _op as _mx_np_op - return _mx_np_op.exp(normal(loc=mean, scale=sigma, size=size, dtype=dtype, ctx=ctx, out=out)) + return _mx_np_op.exp(normal(loc=mean, scale=sigma, size=size, dtype=dtype, device=device, out=out)) -def logistic(loc=0.0, scale=1.0, size=None, ctx=None, out=None): +@wrap_device_to_device_func +def logistic(loc=0.0, scale=1.0, size=None, device=None, out=None): r"""Draw samples from a logistic distribution. Samples are drawn from a logistic distribution with specified @@ -236,8 +242,8 @@ def logistic(loc=0.0, scale=1.0, size=None, ctx=None, out=None): ``m * n * k`` samples are drawn. 
If size is ``None`` (default), a single value is returned if ``loc`` and ``scale`` are both scalars. Otherwise, ``np.broadcast(loc, scale).size`` samples are drawn. - ctx : Context, optional - Device context of output. Default is current context. + device : Device, optional + Device context of output. Default is current device. out : ``ndarray``, optional Store output to an existing ``ndarray``. @@ -246,16 +252,17 @@ def logistic(loc=0.0, scale=1.0, size=None, ctx=None, out=None): out : ndarray or scalar Drawn samples from the parameterized logistic distribution. """ - if ctx is None: - ctx = str(current_context()) + if device is None: + device = str(current_device()) else: - ctx = str(ctx) + device = str(device) if size == (): size = None - return _api_internal.logistic(loc, scale, size, ctx, out) + return _api_internal.logistic(loc, scale, size, device, out) -def gumbel(loc=0.0, scale=1.0, size=None, ctx=None, out=None): +@wrap_device_to_device_func +def gumbel(loc=0.0, scale=1.0, size=None, device=None, out=None): r"""Draw samples from a Gumbel distribution. Draw samples from a Gumbel distribution with specified location and @@ -273,8 +280,8 @@ def gumbel(loc=0.0, scale=1.0, size=None, ctx=None, out=None): ``m * n * k`` samples are drawn. If size is ``None`` (default), a single value is returned if ``loc`` and ``scale`` are both scalars. Otherwise, ``np.broadcast(loc, scale).size`` samples are drawn. - ctx : Context, optional - Device context of output. Default is current context. + device : Device, optional + Device context of output. Default is current device. out : ``ndarray``, optional Store output to an existing ``ndarray``. @@ -283,13 +290,13 @@ def gumbel(loc=0.0, scale=1.0, size=None, ctx=None, out=None): out : ndarray or scalar Drawn samples from the parameterized Gumbel distribution. """ - if ctx is None: - ctx = str(current_context()) + if device is None: + device = str(current_device()) else: - ctx = str(ctx) + device = str(device) if size == (): size = None - return _api_internal.gumbel(loc, scale, size, ctx, out) + return _api_internal.gumbel(loc, scale, size, device, out) def multinomial(n, pvals, size=None): @@ -341,7 +348,8 @@ def multinomial(n, pvals, size=None): return _api_internal.multinomial(n, pvals, size) -def rayleigh(scale=1.0, size=None, ctx=None, out=None): +@wrap_device_to_device_func +def rayleigh(scale=1.0, size=None, device=None, out=None): r"""Draw samples from a Rayleigh distribution. The :math:`\chi` and Weibull distributions are generalizations of the @@ -356,8 +364,8 @@ def rayleigh(scale=1.0, size=None, ctx=None, out=None): ``m * n * k`` samples are drawn. If size is ``None`` (default), a single value is returned if ``scale`` is a scalar. Otherwise, ``np.array(scale).size`` samples are drawn. - ctx : Context, optional - Device context of output. Default is current context. + device : Device, optional + Device context of output. Default is current device. out : ``ndarray``, optional Store output to an existing ``ndarray``. @@ -366,13 +374,13 @@ def rayleigh(scale=1.0, size=None, ctx=None, out=None): out : ndarray or scalar Drawn samples from the parameterized Rayleigh distribution. 
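Each sampler in this file repeats the same normalization before handing off to ``_api_internal``: fall back to the current device and stringify it for the FFI. Written out as a stand-alone helper for clarity (``_normalize_device`` is purely illustrative and not part of this patch):

    from mxnet.device import current_device

    def _normalize_device(device):
        # Same boilerplate the samplers above perform inline before the FFI call.
        return str(current_device()) if device is None else str(device)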
""" - if ctx is None: - ctx = str(current_context()) + if device is None: + device = str(current_device()) else: - ctx = str(ctx) + device = str(device) if size == (): size = None - return _api_internal.rayleigh(scale, size, ctx, out) + return _api_internal.rayleigh(scale, size, device, out) def multivariate_normal(mean, cov, size=None, check_valid=None, tol=None): @@ -456,7 +464,8 @@ def multivariate_normal(mean, cov, size=None, check_valid=None, tol=None): return _npi.mvn_fallback(mean, cov, size=size) -def choice(a, size=None, replace=True, p=None, ctx=None, out=None): +@wrap_device_to_device_func +def choice(a, size=None, replace=True, p=None, device=None, out=None): r"""Generates a random sample from a given 1-D array Parameters @@ -474,8 +483,8 @@ def choice(a, size=None, replace=True, p=None, ctx=None, out=None): The probabilities associated with each entry in a. If not given the sample assumes a uniform distribution over all entries in a. - ctx : Context, optional - Device context of output. Default is current context. + device : Device, optional + Device context of output. Default is current device. Returns -------- @@ -509,20 +518,21 @@ def choice(a, size=None, replace=True, p=None, ctx=None, out=None): array([2, 3, 0]) """ from ...numpy import ndarray as np_ndarray - if ctx is None: - ctx = str(current_context()) + if device is None: + device = str(current_device()) else: - ctx = str(ctx) + device = str(device) if size == (): size = None if isinstance(a, np_ndarray): - indices = _api_internal.choice(a, size, replace, p, ctx, out) + indices = _api_internal.choice(a, size, replace, p, device, out) return _api_internal.take(a, indices, 0, 'raise', out) else: - return _api_internal.choice(a, size, replace, p, ctx, out) + return _api_internal.choice(a, size, replace, p, device, out) -def exponential(scale=1.0, size=None, ctx=None, out=None): +@wrap_device_to_device_func +def exponential(scale=1.0, size=None, device=None, out=None): r"""Draw samples from an exponential distribution. Parameters @@ -535,8 +545,8 @@ def exponential(scale=1.0, size=None, ctx=None, out=None): ``m * n * k`` samples are drawn. If size is ``None`` (default), a single value is returned if ``scale`` is a scalar. Otherwise, ``np.array(scale).size`` samples are drawn. - ctx : Context, optional - Device context of output. Default is current context. + device : Device, optional + Device context of output. Default is current device. out : ``ndarray``, optional Store output to an existing ``ndarray``. @@ -545,16 +555,17 @@ def exponential(scale=1.0, size=None, ctx=None, out=None): out : ndarray or scalar Drawn samples from the parameterized exponential distribution. """ - if ctx is None: - ctx = str(current_context()) + if device is None: + device = str(current_device()) else: - ctx = str(ctx) + device = str(device) if size == (): size = None - return _api_internal.exponential(scale, size, ctx, out) + return _api_internal.exponential(scale, size, device, out) -def weibull(a, size=None, ctx=None, out=None): +@wrap_device_to_device_func +def weibull(a, size=None, device=None, out=None): r"""Draw samples from a 1-parameter Weibull distribution with given parameter a, via inversion. @@ -596,16 +607,17 @@ def weibull(a, size=None, ctx=None, out=None): model time to failure, in modeling particle sizes, in information retrieval to model dwell time on pages, in quantitative finance to model risk etc. 
""" - if ctx is None: - ctx = str(current_context()) + if device is None: + device = str(current_device()) else: - ctx = str(ctx) + device = str(device) if size == (): size = None - return _api_internal.weibull(a, size, ctx, out) + return _api_internal.weibull(a, size, device, out) -def pareto(a, size=None, ctx=None, out=None): +@wrap_device_to_device_func +def pareto(a, size=None, device=None, out=None): r"""Draw samples from a Pareto II or Lomax distribution with specified shape a. Parameters @@ -637,16 +649,17 @@ def pareto(a, size=None, ctx=None, out=None): where a is the shape and m the scale. Here m is assumed 1. The Pareto distribution is a power law distribution. Pareto created it to describe the wealth in the economy. """ - if ctx is None: - ctx = str(current_context()) + if device is None: + device = str(current_device()) else: - ctx = str(ctx) + device = str(device) if size == (): size = None - return _api_internal.pareto(a, size, ctx, out) + return _api_internal.pareto(a, size, device, out) -def power(a, size=None, ctx=None, out=None): +@wrap_device_to_device_func +def power(a, size=None, device=None, out=None): r"""Draw samples in [0, 1] from a power distribution with given parameter a. Parameters @@ -678,16 +691,17 @@ def power(a, size=None, ctx=None, out=None): The power distribution is just the inverse of the Pareto distribution and a special case of the Beta distribution. """ - if ctx is None: - ctx = str(current_context()) + if device is None: + device = str(current_device()) else: - ctx = str(ctx) + device = str(device) if size == (): size = None - return _api_internal.powerd(a, size, ctx, out) + return _api_internal.powerd(a, size, device, out) -def gamma(shape, scale=1.0, size=None, dtype=None, ctx=None, out=None): +@wrap_device_to_device_func +def gamma(shape, scale=1.0, size=None, dtype=None, device=None, out=None): """Draw samples from a Gamma distribution. Samples are drawn from a Gamma distribution with specified parameters, @@ -710,8 +724,8 @@ def gamma(shape, scale=1.0, size=None, dtype=None, ctx=None, out=None): Data type of output samples. When npx.is_np_default_dtype() returns False, default dtype is float32; When npx.is_np_default_dtype() returns True, default dtype is float64. - ctx : Context, optional - Device context of output. Default is current context. + device : Device, optional + Device context of output. Default is current device. Returns ------- @@ -726,16 +740,17 @@ def gamma(shape, scale=1.0, size=None, dtype=None, ctx=None, out=None): size = out.shape if size == (): size = None - if ctx is None: - ctx = str(current_context()) + if device is None: + device = str(current_device()) else: - ctx = str(ctx) + device = str(device) if dtype is not None and not isinstance(dtype, str): dtype = np.dtype(dtype).name - return _api_internal.gamma(shape, scale, size, ctx, dtype, out) + return _api_internal.gamma(shape, scale, size, device, dtype, out) -def beta(a, b, size=None, dtype=None, ctx=None): +@wrap_device_to_device_func +def beta(a, b, size=None, dtype=None, device=None): r"""Draw samples from a Beta distribution. The Beta distribution is a special case of the Dirichlet distribution, @@ -767,8 +782,8 @@ def beta(a, b, size=None, dtype=None, ctx=None): Data type of output samples. When npx.is_np_default_dtype() returns False, default dtype is float32; When npx.is_np_default_dtype() returns True, default dtype is float64. - ctx : Context, optional - Device context of output. Default is current context. 
+ device : Device, optional + Device context of output. Default is current device. Notes ------- @@ -781,18 +796,19 @@ def beta(a, b, size=None, dtype=None, ctx=None): """ if dtype is None: dtype = np.float64 if is_np_default_dtype() else np.float32 - if ctx is None: - ctx = current_context() + if device is None: + device = current_device() if size == (): size = None # use fp64 to prevent precision loss - X = gamma(a, 1, size=size, dtype='float64', ctx=ctx) - Y = gamma(b, 1, size=size, dtype='float64', ctx=ctx) + X = gamma(a, 1, size=size, dtype='float64', device=device) + Y = gamma(b, 1, size=size, dtype='float64', device=device) out = X / (X + Y) return out.astype(dtype) -def f(dfnum, dfden, size=None, ctx=None): +@wrap_device_to_device_func +def f(dfnum, dfden, size=None, device=None): r"""Draw samples from an F distribution. Samples are drawn from an F distribution with specified parameters, @@ -816,8 +832,8 @@ def f(dfnum, dfden, size=None, ctx=None): ``m * n * k`` samples are drawn. If size is ``None`` (default), a single value is returned if ``dfnum`` and ``dfden`` are both scalars. Otherwise, ``np.broadcast(dfnum, dfden).size`` samples are drawn. - ctx : Context, optional - Device context of output. Default is current context. + device : Device, optional + Device context of output. Default is current device. Returns ------- @@ -851,14 +867,15 @@ def f(dfnum, dfden, size=None, ctx=None): the measured value is 36, so the null hypothesis is rejected at the 1% level. """ - X = chisquare(df=dfnum, size=size, ctx=ctx) - Y = chisquare(df=dfden, size=size, ctx=ctx) + X = chisquare(df=dfnum, size=size, device=device) + Y = chisquare(df=dfden, size=size, device=device) return (X * dfden) / (Y * dfnum) -def chisquare(df, size=None, dtype=None, ctx=None): +@wrap_device_to_device_func +def chisquare(df, size=None, dtype=None, device=None): r""" - chisquare(df, size=None, dtype=None, ctx=None) + chisquare(df, size=None, dtype=None, device=None) Draw samples from a chi-square distribution. @@ -882,8 +899,8 @@ def chisquare(df, size=None, dtype=None, ctx=None): When npx.is_np_default_dtype() returns True, default dtype is float64. Dtype 'float32' or 'float64' is strongly recommended, since lower precision might lead to out of range issue. - ctx : Context, optional - Device context of output. Default is current context. + device : Device, optional + Device context of output. Default is current device. Returns ------- @@ -928,13 +945,14 @@ def chisquare(df, size=None, dtype=None, ctx=None): """ if dtype is None: dtype = np.float64 if is_np_default_dtype() else np.float32 - if ctx is None: - ctx = current_context() + if device is None: + device = current_device() if size == (): size = None - return gamma(df/2, 2, size=size, dtype=dtype, ctx=ctx) + return gamma(df/2, 2, size=size, dtype=dtype, device=device) +@wrap_device_to_device_func def rand(*size, **kwargs): r"""Random values in a given shape. @@ -998,7 +1016,8 @@ def shuffle(x): _api_internal.shuffle(x, x) -def laplace(loc=0.0, scale=1.0, size=None, dtype=None, ctx=None, out=None): +@wrap_device_to_device_func +def laplace(loc=0.0, scale=1.0, size=None, dtype=None, device=None, out=None): r"""Draw random samples from a Laplace distribution. Samples are distributed according to a Laplace distribution parametrized @@ -1017,8 +1036,8 @@ def laplace(loc=0.0, scale=1.0, size=None, dtype=None, ctx=None, out=None): dtype : {'float16', 'float32', 'float64'}, optional Data type of output samples. 
Default is 'float32' - ctx : Context, optional - Device context of output. Default is current context. + device : Device, optional + Device context of output. Default is current device. out : ``ndarray``, optional Store output to an existing ``ndarray``. @@ -1027,12 +1046,12 @@ def laplace(loc=0.0, scale=1.0, size=None, dtype=None, ctx=None, out=None): out : ndarray Drawn samples from the parameterized Laplace distribution. """ - if ctx is None: - ctx = str(current_context()) + if device is None: + device = str(current_device()) else: - ctx = str(ctx) + device = str(device) if dtype is not None and not isinstance(dtype, str): dtype = np.dtype(dtype).name if size == (): size = None - return _api_internal.laplace(loc, scale, size, dtype, ctx, out) + return _api_internal.laplace(loc, scale, size, dtype, device, out) diff --git a/python/mxnet/ndarray/numpy_extension/random.py b/python/mxnet/ndarray/numpy_extension/random.py index fcc65b3084ee..3ed627661126 100644 --- a/python/mxnet/ndarray/numpy_extension/random.py +++ b/python/mxnet/ndarray/numpy_extension/random.py @@ -16,14 +16,16 @@ # under the License. """Namespace for operators used in Gluon dispatched by F=ndarray.""" -from ...context import current_context +from ...device import current_device from ..numpy import _internal as _npi +from ...util import wrap_ctx_to_device_func __all__ = ['bernoulli', 'normal_n', 'uniform_n'] -def bernoulli(prob=None, logit=None, size=None, dtype=None, ctx=None, out=None): +@wrap_ctx_to_device_func +def bernoulli(prob=None, logit=None, size=None, dtype=None, device=None, out=None): """Creates a Bernoulli distribution parameterized by :attr:`prob` or :attr:`logit` (but not both). @@ -47,8 +49,8 @@ def bernoulli(prob=None, logit=None, size=None, dtype=None, ctx=None, out=None): name, i.e., 'int64', 'int', etc, so byteorder is not available and a specific precision may have different C types depending on the platform. The default value is 'np.float32'. - ctx : Context, optional - Device context of output. Default is current context. + device : Device, optional + Device context of output. Default is current device. out : symbol, optional The output symbol (default is `None`). @@ -81,29 +83,30 @@ def bernoulli(prob=None, logit=None, size=None, dtype=None, ctx=None, out=None): "Received prob={}, logit={}".format(prob, logit)) if dtype is None: dtype = 'float32' - if ctx is None: - ctx = current_context() + if device is None: + device = current_device() if size == (): size = None if prob is not None: is_tensor = isinstance(prob, tensor_type_name) if is_tensor: return _npi.bernoulli(prob, prob=None, logit=None, is_logit=False, - size=size, ctx=ctx, dtype=dtype, out=out) + size=size, device=device, dtype=dtype, out=out) else: return _npi.bernoulli(prob=prob, logit=None, is_logit=False, - size=size, ctx=ctx, dtype=dtype, out=out) + size=size, device=device, dtype=dtype, out=out) else: is_tensor = isinstance(logit, tensor_type_name) if is_tensor: return _npi.bernoulli(logit, prob=None, logit=None, is_logit=True, - size=size, ctx=ctx, dtype=dtype, out=out) + size=size, device=device, dtype=dtype, out=out) else: return _npi.bernoulli(prob=None, logit=logit, is_logit=True, - size=size, ctx=ctx, dtype=dtype, out=out) + size=size, device=device, dtype=dtype, out=out) -def uniform_n(low=0.0, high=1.0, batch_shape=None, dtype=None, ctx=None): +@wrap_ctx_to_device_func +def uniform_n(low=0.0, high=1.0, batch_shape=None, dtype=None, device=None): r"""Draw samples from a uniform distribution. 
Samples are uniformly distributed over the half-open interval @@ -128,8 +131,8 @@ def uniform_n(low=0.0, high=1.0, batch_shape=None, dtype=None, ctx=None): ``np.broadcast(low, high).size`` samples are drawn. dtype : {'float16', 'float32', 'float64'}, optional Data type of output samples. Default is 'float32' - ctx : Context, optional - Device context of output. Default is current context. + device : Device, optional + Device context of output. Default is current device. Returns ------- @@ -161,8 +164,8 @@ def uniform_n(low=0.0, high=1.0, batch_shape=None, dtype=None, ctx=None): input_type = (isinstance(low, np_ndarray), isinstance(high, np_ndarray)) if dtype is None: dtype = 'float32' - if ctx is None: - ctx = current_context() + if device is None: + device = current_device() if batch_shape == (): batch_shape = None else: @@ -171,19 +174,20 @@ def uniform_n(low=0.0, high=1.0, batch_shape=None, dtype=None, ctx=None): batch_shape = (-2,) + batch_shape if input_type == (True, True): return _npi.uniform(low, high, low=None, high=None, size=batch_shape, - ctx=ctx, dtype=dtype) + device=device, dtype=dtype) elif input_type == (False, True): return _npi.uniform(high, low=low, high=None, size=batch_shape, - ctx=ctx, dtype=dtype) + device=device, dtype=dtype) elif input_type == (True, False): return _npi.uniform(low, low=None, high=high, size=batch_shape, - ctx=ctx, dtype=dtype) + device=device, dtype=dtype) else: return _npi.uniform(low=low, high=high, size=batch_shape, - ctx=ctx, dtype=dtype) + device=device, dtype=dtype) -def normal_n(loc=0.0, scale=1.0, batch_shape=None, dtype=None, ctx=None): +@wrap_ctx_to_device_func +def normal_n(loc=0.0, scale=1.0, batch_shape=None, dtype=None, device=None): r"""Draw random samples from a normal (Gaussian) distribution. Samples are distributed according to a normal distribution parametrized @@ -205,8 +209,8 @@ def normal_n(loc=0.0, scale=1.0, batch_shape=None, dtype=None, ctx=None): ``np.broadcast(loc, scale).size`` samples are drawn. dtype : {'float16', 'float32', 'float64'}, optional Data type of output samples. Default is 'float32' - ctx : Context, optional - Device context of output, default is current context. + device : Device, optional + Device context of output, default is current device. 
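A hypothetical call to the batched extension samplers with the new keyword; ``batch_shape`` and the parameter values below are illustrative only.

    import mxnet as mx
    mx.npx.set_np()

    u = mx.npx.random.uniform_n(low=0.0, high=1.0, batch_shape=(2, 3), device=mx.cpu(0))
    b = mx.npx.random.bernoulli(prob=0.5, size=(2, 3), device=mx.cpu(0))
    print(u.shape, b.shape)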
Returns ------- @@ -252,8 +256,8 @@ def normal_n(loc=0.0, scale=1.0, batch_shape=None, dtype=None, ctx=None): input_type = (isinstance(loc, np_ndarray), isinstance(scale, np_ndarray)) if dtype is None: dtype = 'float32' - if ctx is None: - ctx = current_context() + if device is None: + device = current_device() if batch_shape == (): batch_shape = None else: @@ -262,13 +266,13 @@ def normal_n(loc=0.0, scale=1.0, batch_shape=None, dtype=None, ctx=None): batch_shape = (-2,) + batch_shape if input_type == (True, True): return _npi.normal(loc, scale, loc=None, scale=None, size=batch_shape, - ctx=ctx, dtype=dtype) + device=device, dtype=dtype) elif input_type == (False, True): return _npi.normal(scale, loc=loc, scale=None, size=batch_shape, - ctx=ctx, dtype=dtype) + device=device, dtype=dtype) elif input_type == (True, False): return _npi.normal(loc, loc=None, scale=scale, size=batch_shape, - ctx=ctx, dtype=dtype) + device=device, dtype=dtype) else: return _npi.normal(loc=loc, scale=scale, size=batch_shape, - ctx=ctx, dtype=dtype) + device=device, dtype=dtype) diff --git a/python/mxnet/numpy/io.py b/python/mxnet/numpy/io.py index ce2606740886..4def43073083 100644 --- a/python/mxnet/numpy/io.py +++ b/python/mxnet/numpy/io.py @@ -18,7 +18,7 @@ """I/O functions for ndarrays.""" import numpy as onp -from ..context import current_context +from ..device import current_device from .multiarray import array __all__ = ['genfromtxt'] @@ -32,11 +32,11 @@ def genfromtxt(*args, **kwargs): Notes ----- - This function has added an additional parameter `ctx` which allows to create + This function has added an additional parameter `device` which allows to create ndarrays on the user-specified device. """ - ctx = kwargs.pop('ctx', current_context()) - if ctx is None: - ctx = current_context() + device = kwargs.pop('device', current_device()) + if device is None: + device = current_device() ret = onp.genfromtxt(*args, **kwargs) - return array(ret, dtype=ret.dtype, ctx=ctx) + return array(ret, dtype=ret.dtype, device=device) diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index c4249505e26f..25121abaffb6 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -1548,12 +1548,12 @@ def argmax(self, axis=None, out=None): # pylint: disable=arguments-differ def as_in_context(self, context): """This function has been deprecated. Please refer to ``ndarray.to_device``.""" warnings.warn('ndarray.as_in_context has been renamed to' - ' ndarray.as_in_ctx', DeprecationWarning) + ' ndarray.to_device', DeprecationWarning) return self.as_nd_ndarray().as_in_context(context).as_np_ndarray() - def as_in_ctx(self, ctx): + def default_device(self, ctx): """This function has been deprecated. Please refer to ``ndarray.to_device``.""" - warnings.warn('ndarray.as_in_ctx has been renamed to' + warnings.warn('ndarray.to_device has been renamed to' ' ndarray.to_device', DeprecationWarning) return self.to_device(ctx) @@ -6294,7 +6294,7 @@ def arange(start, stop=None, step=1, dtype=None, device=None): * When npx.is_np_default_dtype() returns True, default dtype is int64. device : device context, optional Device context on which the memory is allocated. Default is - `mxnet.context.current_context()`. + `mxnet.device.current_device()`. 
Returns ------- diff --git a/python/mxnet/numpy_extension/__init__.py b/python/mxnet/numpy_extension/__init__.py index ad325ff9e160..8cdd85a55fb5 100644 --- a/python/mxnet/numpy_extension/__init__.py +++ b/python/mxnet/numpy_extension/__init__.py @@ -26,7 +26,7 @@ from . import control_flow from ._op import * # pylint: disable=wildcard-import from .control_flow import * # pylint: disable=wildcard-import -from ..context import * # pylint: disable=wildcard-import +from ..device import * # pylint: disable=wildcard-import from ..util import is_np_shape, is_np_array, set_np, reset_np, get_cuda_compute_capability,\ is_np_default_dtype, set_np_default_dtype from ..ndarray import waitall diff --git a/python/mxnet/numpy_op_fallback.py b/python/mxnet/numpy_op_fallback.py index 3503c7a3a278..8804701765e8 100644 --- a/python/mxnet/numpy_op_fallback.py +++ b/python/mxnet/numpy_op_fallback.py @@ -67,7 +67,7 @@ def forward(self, is_train, req, in_data, out_data, aux): subok=self._subok) else: out = np.empty_like(in_data[0].asnumpy()) - self.assign(out_data[0], req[0], _mx_np.array(out, ctx=in_data[0].ctx)) + self.assign(out_data[0], req[0], _mx_np.array(out, ctx=in_data[0].device)) def backward(self, req, out_grad, in_data, out_data, in_grad, aux): raise NotImplementedError('Operator empty_like does not support gradient computation') @@ -108,7 +108,7 @@ def __init__(self, new_shape): def forward(self, is_train, req, in_data, out_data, aux): out = np.resize(in_data[0].asnumpy(), self._new_shape) - self.assign(out_data[0], req[0], _mx_np.array(out, dtype=out.dtype, ctx=out_data[0].ctx)) + self.assign(out_data[0], req[0], _mx_np.array(out, dtype=out.dtype, ctx=out_data[0].device)) def backward(self, req, out_grad, in_data, out_data, in_grad, aux): raise NotImplementedError('Operator resize does not support gradient computation') @@ -141,7 +141,7 @@ def __init__(self, shape): def forward(self, is_train, req, in_data, out_data, aux): out = np.unravel_index(in_data[0].asnumpy(), self._shape) - self.assign(out_data[0], req[0], _mx_np.array(out, dtype=out[0].dtype, ctx=out_data[0].ctx)) + self.assign(out_data[0], req[0], _mx_np.array(out, dtype=out[0].dtype, ctx=out_data[0].device)) def backward(self, req, out_grad, in_data, out_data, in_grad, aux): raise NotImplementedError('Operator Unravel_index does not support gradient computation') @@ -181,7 +181,7 @@ def forward(self, is_train, req, in_data, out_data, aux): else: scale = _mx_np.linalg.cholesky(cov) #set context - noise = _mx_np.random.normal(size=out_data[0].shape, dtype=loc.dtype, ctx=loc.ctx) + noise = _mx_np.random.normal(size=out_data[0].shape, dtype=loc.dtype, ctx=loc.device) out = loc + _mx_np.einsum('...jk,...j->...k', scale, noise) self.assign(out_data[0], req[0], out) diff --git a/python/mxnet/optimizer/optimizer.py b/python/mxnet/optimizer/optimizer.py index 2fef62eb8319..640fd90da0ca 100644 --- a/python/mxnet/optimizer/optimizer.py +++ b/python/mxnet/optimizer/optimizer.py @@ -431,6 +431,12 @@ def set_wd_mult(self, args_wd_mult): self.wd_mult.update(args_wd_mult) def _set_current_context(self, device_id): + """This function has been deprecated. Please refer to ``Optimizer._set_current_device``.""" + warnings.warn('Optimizer._set_current_context has been renamed to' + ' Optimizer._set_current_device', DeprecationWarning) + return self._set_current_device(device_id) + + def _set_current_device(self, device_id): """Sets the number of the currently handled device.
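The same deprecate-and-forward shape recurs across the patch (``Parameter.list_ctx``, ``ndarray.as_in_context``, ``Optimizer._set_current_context``). A generic sketch of the pattern with placeholder names (``Example``, ``old_method``, ``new_method`` are hypothetical):

    import warnings

    class Example:
        def new_method(self, device_id):
            return device_id

        def old_method(self, device_id):
            """Deprecated. Please refer to ``Example.new_method``."""
            warnings.warn('Example.old_method has been renamed to Example.new_method',
                          DeprecationWarning)
            return self.new_method(device_id)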
Parameters diff --git a/python/mxnet/optimizer/updater.py b/python/mxnet/optimizer/updater.py index 9a5b25ecc2a4..487775ca6589 100644 --- a/python/mxnet/optimizer/updater.py +++ b/python/mxnet/optimizer/updater.py @@ -48,7 +48,7 @@ def __call__(self, index, grad, weight): grads = _as_classic(grad, allow_np) weights = _as_classic(weight, allow_np) if weights: - self.optimizer._set_current_context(weights[0].context.device_id) + self.optimizer._set_current_device(weights[0].context.device_id) for i, idx in enumerate(indices): # convert ctypes.char_p.value back to python str if needed if isinstance(idx, bytes): diff --git a/python/mxnet/random.py b/python/mxnet/random.py index 2d1087a4e19a..48bef91cd97a 100644 --- a/python/mxnet/random.py +++ b/python/mxnet/random.py @@ -23,10 +23,12 @@ import ctypes from .base import _LIB, check_call, integer_types from .ndarray.random import * -from .context import Context +from .device import Device +from .util import wrap_ctx_to_device_func -def seed(seed_state, ctx="all"): +@wrap_ctx_to_device_func +def seed(seed_state, device="all"): """Seeds the random number generators in MXNet. This affects the behavior of modules in MXNet that uses random number generators, @@ -37,7 +39,7 @@ def seed(seed_state, ctx="all"): seed_state : int The random number seed. - ctx : Context + device : Device The device context of the generator. The default is "all" which means seeding random number generators of all devices. @@ -49,7 +51,7 @@ def seed(seed_state, ctx="all"): even if they are seeded using the same seed. To produce identical random number sequences independent of the device id, - set optional `ctx` argument. This produces the same sequence of random numbers independent + set optional `device` argument. This produces the same sequence of random numbers independent of the device id, but the sequence can be different on different kind of devices as MXNet's random number generators for CPU and GPU use different algorithms. 
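The ``wrap_ctx_to_device_func`` decorator applied to ``seed`` above is imported from ``mxnet.util``, whose hunk is not shown here. A minimal sketch of what such a keyword-compatibility wrapper could look like (an assumption for illustration, not the actual implementation):

    import functools
    import warnings

    def wrap_ctx_to_device_func(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Accept the legacy `ctx` keyword and forward it as `device`.
            if 'ctx' in kwargs:
                warnings.warn('`ctx` is deprecated, use `device` instead', DeprecationWarning)
                kwargs['device'] = kwargs.pop('ctx')
            return func(*args, **kwargs)
        return wrapper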
@@ -72,28 +74,28 @@ def seed(seed_state, ctx="all"): [ 0.20251541 0.95352972]] # Different results on gpu(0) and gpu(1) with the same seed >>> mx.np.random.seed(128) - >>> print(mx.np.random.normal(shape=(2,2), ctx=mx.gpu(0)).asnumpy()) + >>> print(mx.np.random.normal(shape=(2,2), device=mx.gpu(0)).asnumpy()) [[ 2.5020072 -1.6884501] [-0.7931333 -1.4218881]] >>> mx.np.random.seed(128) - >>> print(mx.np.random.normal(shape=(2,2), ctx=mx.gpu(1)).asnumpy()) + >>> print(mx.np.random.normal(shape=(2,2), device=mx.gpu(1)).asnumpy()) [[ 0.24336822 -1.664805 ] [-1.0223296 1.253198 ]] - # Seeding with `ctx` argument produces identical results on gpu(0) and gpu(1) - >>> mx.np.random.seed(128, ctx=mx.gpu(0)) - >>> print(mx.np.random.normal(shape=(2,2), ctx=mx.gpu(0)).asnumpy()) + # Seeding with `device` argument produces identical results on gpu(0) and gpu(1) + >>> mx.np.random.seed(128, device=mx.gpu(0)) + >>> print(mx.np.random.normal(shape=(2,2), device=mx.gpu(0)).asnumpy()) [[ 2.5020072 -1.6884501] [-0.7931333 -1.4218881]] - >>> mx.np.random.seed(128, ctx=mx.gpu(1)) - >>> print(mx.np.random.normal(shape=(2,2), ctx=mx.gpu(1)).asnumpy()) + >>> mx.np.random.seed(128, device=mx.gpu(1)) + >>> print(mx.np.random.normal(shape=(2,2), device=mx.gpu(1)).asnumpy()) [[ 2.5020072 -1.6884501] [-0.7931333 -1.4218881]] """ if not isinstance(seed_state, integer_types): raise ValueError('seed_state must be int') seed_state = ctypes.c_int(int(seed_state)) - if ctx == "all": + if device == "all": check_call(_LIB.MXRandomSeed(seed_state)) else: - ctx = Context(ctx) - check_call(_LIB.MXRandomSeedContext(seed_state, ctx.device_typeid, ctx.device_id)) + device = Device(device) + check_call(_LIB.MXRandomSeedContext(seed_state, device.device_typeid, device.device_id)) diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index d91fb5452a79..b18e8965196e 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -55,16 +55,16 @@ from .numpy_extension import get_cuda_compute_capability -def default_context(): - """Get default context for regression test.""" - # _TODO: get context from environment variable to support +def default_device(): + """Get default device for regression test.""" + # _TODO: get device from environment variable to support # testing with GPUs return current_device() -def set_default_context(ctx): - """Set default context.""" - mx.context._current.set(ctx) +def set_default_device(device): + """Set default device.""" + mx.device._current.set(device) def default_dtype(): @@ -116,25 +116,25 @@ def effective_dtype(dat): # inputs to be of comparable precision to a float16, so float16 becomes the # 'effective dtype' for tolerance tests involving such op outputs. 
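A hypothetical test snippet using the renamed helpers above, mirroring the old ``default_context``/``set_default_context`` pair:

    import mxnet as mx
    from mxnet.test_utils import default_device, set_default_device

    set_default_device(mx.cpu(0))   # was set_default_context(...)
    print(default_device())         # was default_context()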
- # Is TF32 enabled in the ctx (the default on arch 80 GPUs) - def is_TF32_enabled(ctx): + # Is TF32 enabled in the device (the default on arch 80 GPUs) + def is_TF32_enabled(device): try: - return (ctx.device_type == 'gpu' and - get_cuda_compute_capability(ctx) == 80 and + return (device.device_type == 'gpu' and + get_cuda_compute_capability(device) == 80 and os.environ.get('NVIDIA_TF32_OVERRIDE') != '0') except: # pylint: disable=bare-except return False - ctx = dat.ctx if hasattr(dat, 'ctx') else None + device = dat.device if hasattr(dat, 'device') else None dtype = np.dtype(dat.dtype) - if dtype == np.dtype(np.float32) and is_TF32_enabled(ctx): + if dtype == np.dtype(np.float32) and is_TF32_enabled(device): return np.dtype(np.float16) else: return dtype def get_tolerance(dat, tol, default_tol): - """ Return the tolerance to be used for dat comparisons based on the given tol, datatype and context. + """ Return the tolerance to be used for dat comparisons based on the given tol, datatype and device. Parameters ---------- dat : np.ndarray or mx.nd.array or mx.np.ndarray @@ -437,7 +437,7 @@ def rand_sparse_ndarray(shape, stype, density=None, dtype=None, distribution=Non >>> assert(row4nnz == 2*row3nnz) """ - ctx = ctx if ctx else default_context() + ctx = ctx if ctx else default_device() density = rnd.rand() if density is None else density dtype = default_dtype() if dtype is None else dtype distribution = "uniform" if distribution is None else distribution @@ -485,7 +485,7 @@ def rand_sparse_ndarray(shape, stype, density=None, dtype=None, distribution=Non def rand_ndarray(shape, stype='default', density=None, dtype=None, modifier_func=None, shuffle_csr_indices=False, distribution=None, ctx=None): """Generate a random sparse ndarray. Returns the generated ndarray.""" - ctx = ctx if ctx else default_context() + ctx = ctx if ctx else default_device() if stype == 'default': arr = mx.nd.array(random_arrays(shape), dtype=dtype, ctx=ctx) else: @@ -683,7 +683,7 @@ def assert_almost_equal(a, b, rtol=None, atol=None, names=('a', 'b'), equal_nan= b = b.asnumpy() use_np_allclose = isinstance(a, np.ndarray) and isinstance(b, np.ndarray) if not use_np_allclose: - if not (hasattr(a, 'ctx') and hasattr(b, 'ctx') and a.ctx == b.ctx and a.dtype == b.dtype): + if not (hasattr(a, 'ctx') and hasattr(b, 'ctx') and a.device == b.device and a.dtype == b.dtype): use_np_allclose = True if isinstance(a, mx.nd.NDArray): a = a.asnumpy() @@ -843,7 +843,7 @@ def assert_exception(f, exception_type, *args, **kwargs): return -def _parse_location(sym, location, ctx, dtype=default_dtype()): +def _parse_location(sym, location, device, dtype=default_dtype()): """Parses the given location to a ordered dictionary. Arguments of the provided op `sym` are used as dictionary keys @@ -862,7 +862,7 @@ def _parse_location(sym, location, ctx, dtype=default_dtype()): - if type is dict of str -> `np.ndarray` maps the name of arguments to the corresponding `np.ndarray`. *In either case, value of all the arguments must be provided.* - ctx : Context + device : Device Device context. 
dtype: "asnumpy" or np.float16 or np.float32 or np.float64 If dtype is "asnumpy" then the mx.nd.array created will have the same @@ -898,12 +898,12 @@ def _parse_location(sym, location, ctx, dtype=default_dtype()): % (str(set(sym.list_arguments())), str(set(location.keys())))) else: location = {k: v for k, v in zip(sym.list_arguments(), location)} - location = {k: mx.nd.array(v, ctx=ctx, dtype=v.dtype if dtype == "asnumpy" else dtype) \ + location = {k: mx.nd.array(v, ctx=device, dtype=v.dtype if dtype == "asnumpy" else dtype) \ if isinstance(v, np.ndarray) else v for k, v in location.items()} return _sorted_dict(location) -def _parse_aux_states(sym, aux_states, ctx, dtype=default_dtype()): +def _parse_aux_states(sym, aux_states, device, dtype=default_dtype()): """Parses the given auxiliary states to a dictionary. Auxiliary states of the provided op `sym` are used as dictionary @@ -922,7 +922,7 @@ def _parse_aux_states(sym, aux_states, ctx, dtype=default_dtype()): - if type is dict of str -> `np.ndarray` maps the name of arguments to the corresponding `np.ndarray`. *In either case, all aux states of `sym` must be provided.* - ctx : Context + device : Device Device context. dtype: "asnumpy" or np.float16 or np.float32 or np.float64 If dtype is "asnumpy" then the mx.nd.array created will have the same @@ -963,7 +963,7 @@ def _parse_aux_states(sym, aux_states, ctx, dtype=default_dtype()): elif isinstance(aux_states, (list, tuple)): aux_names = sym.list_auxiliary_states() aux_states = {k:v for k, v in zip(aux_names, aux_states)} - aux_states = {k: mx.nd.array(v, ctx=ctx, dtype=v.dtype if dtype == "asnumpy" else dtype) \ + aux_states = {k: mx.nd.array(v, ctx=device, dtype=v.dtype if dtype == "asnumpy" else dtype) \ for k, v in aux_states.items()} return aux_states @@ -1087,7 +1087,7 @@ def check_numeric_gradient(sym, location, aux_states=None, numeric_eps=None, rto """ assert dtype in (np.float16, np.float32, np.float64) if ctx is None: - ctx = default_context() + ctx = default_device() def random_projection(shape): """Get a random weight matrix with not too small elements @@ -1224,7 +1224,7 @@ def check_symbolic_forward(sym, location, expected, rtol=None, atol=None, Contains all the NumPy arrays corresponding to sym.list_auxiliary_states - if type is dict of str to np.ndarray Contains the mapping between names of auxiliary states and their values. - ctx : Context, optional + device : Device, optional running context dtype: "asnumpy" or np.float16 or np.float32 or np.float64 If dtype is "asnumpy" then the mx.nd.array created will have the same @@ -1248,7 +1248,7 @@ def check_symbolic_forward(sym, location, expected, rtol=None, atol=None, """ assert dtype == "asnumpy" or dtype in (np.float16, np.float32, np.float64) if ctx is None: - ctx = default_context() + ctx = default_device() location = _parse_location(sym=sym, location=location, ctx=ctx, dtype=dtype) aux_states = _parse_aux_states(sym=sym, aux_states=aux_states, ctx=ctx, @@ -1330,7 +1330,7 @@ def check_symbolic_backward(sym, location, out_grads, expected, rtol=None, atol= >>> mat2 = np.array([[5, 6], [7, 8]]) >>> grad1 = mx.nd.zeros(shape) >>> grad2 = mx.nd.zeros(shape) - >>> exec_add = sym_add._bind(default_context(), args={'lhs': mat1, 'rhs': mat2}, + >>> exec_add = sym_add._bind(default_device(), args={'lhs': mat1, 'rhs': mat2}, ... 
args_grad={'lhs': grad1, 'rhs': grad2}, grad_req={'lhs': 'write', 'rhs': 'write'}) >>> exec_add.forward(is_train=True) >>> ograd = mx.nd.ones(shape) @@ -1339,7 +1339,7 @@ def check_symbolic_backward(sym, location, out_grads, expected, rtol=None, atol= """ assert dtype == 'asnumpy' or dtype in (np.float16, np.float32, np.float64) if ctx is None: - ctx = default_context() + ctx = default_device() location = _parse_location(sym=sym, location=location, ctx=ctx, dtype=dtype) aux_states = _parse_aux_states(sym=sym, aux_states=aux_states, ctx=ctx, @@ -1439,7 +1439,7 @@ def check_speed(sym, location=None, ctx=None, N=20, grad_req=None, typ="whole", Only test the forward speed. """ if ctx is None: - ctx = default_context() + ctx = default_device() if grad_req is None: grad_req = 'write' @@ -1673,8 +1673,8 @@ def smaller_dtype(dt1, dt2): assert_almost_equal(arr, gtarr, rtol=rt, atol=at, equal_nan=equal_nan) except AssertionError as e: print('Train Err: {} {} ctx {} vs {} {} ctx {} at {}'.format( - np.dtype(arr.dtype).name, arr.ctx, i, - np.dtype(gtarr.dtype).name, gtarr.ctx, gt_idx, name)) + np.dtype(arr.dtype).name, arr.device, i, + np.dtype(gtarr.dtype).name, gtarr.device, gt_idx, name)) traceback.print_exc() if raise_on_err: raise e @@ -2253,19 +2253,19 @@ def compare_optimizer(opt1, opt2, shapes, dtype, w_stype='default', g_stype='def s1_list, s2_list = [], [] for i, shape in enumerate(shapes): if w_stype == 'default': - w2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) - w1 = w2.copyto(default_context()) + w2 = mx.random.uniform(shape=shape, ctx=default_device(), dtype=dtype) + w1 = w2.copyto(default_device()) elif w_stype in ('row_sparse', 'csr'): w2 = rand_ndarray(shape, w_stype, density=1, dtype=dtype) - w1 = w2.copyto(default_context()).tostype('default') + w1 = w2.copyto(default_device()).tostype('default') else: raise Exception("type not supported yet") if g_stype == 'default': - g2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) - g1 = g2.copyto(default_context()) + g2 = mx.random.uniform(shape=shape, ctx=default_device(), dtype=dtype) + g1 = g2.copyto(default_device()) elif g_stype in ('row_sparse', 'csr'): g2 = rand_ndarray(shape, g_stype, dtype=dtype) - g1 = g2.copyto(default_context()).tostype('default') + g1 = g2.copyto(default_device()).tostype('default') else: raise Exception("type not supported yet") s1 = opt1.create_state_multi_precision(i, w1) @@ -2301,19 +2301,19 @@ def compare_optimizer_noise_seeded(opt1, opt2, shapes, dtype, noise_seed, s1_list, s2_list = [], [] for i, shape in enumerate(shapes): if w_stype == 'default': - w2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) - w1 = w2.copyto(default_context()) + w2 = mx.random.uniform(shape=shape, ctx=default_device(), dtype=dtype) + w1 = w2.copyto(default_device()) elif w_stype in ('row_sparse', 'csr'): w2 = rand_ndarray(shape, w_stype, density=1, dtype=dtype) - w1 = w2.copyto(default_context()).tostype('default') + w1 = w2.copyto(default_device()).tostype('default') else: raise Exception("type not supported yet") if g_stype == 'default': - g2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) - g1 = g2.copyto(default_context()) + g2 = mx.random.uniform(shape=shape, ctx=default_device(), dtype=dtype) + g1 = g2.copyto(default_device()) elif g_stype in ('row_sparse', 'csr'): g2 = rand_ndarray(shape, g_stype, dtype=dtype) - g1 = g2.copyto(default_context()).tostype('default') + g1 = g2.copyto(default_device()).tostype('default') else: raise 
Exception("type not supported yet") s1 = opt1.create_state_multi_precision(i, w1) diff --git a/tests/nightly/dist_device_sync_kvstore_byteps.py b/tests/nightly/dist_device_sync_kvstore_byteps.py index 4ddde57de919..c41875d981c6 100644 --- a/tests/nightly/dist_device_sync_kvstore_byteps.py +++ b/tests/nightly/dist_device_sync_kvstore_byteps.py @@ -54,27 +54,27 @@ def check_diff_to_scalar(A, x, rank=None): has_gpu = mx.context.num_gpus() > 0 -def get_current_context(device=False): +def get_current_device(device=False): if has_gpu and device==True: return mx.gpu(kv.local_rank) else: - return mx.current_context() + return mx.current_device() def test_pushpull(): def check_default_keys(nrepeat=3): # init kv dns keys - kv.broadcast('3', mx.nd.ones(shape, ctx=get_current_context(device=True)), mx.nd.ones(shape, ctx=get_current_context(device=True))) - kv.broadcast('99', mx.nd.ones(big_shape, ctx=get_current_context(device=True)), mx.nd.ones(big_shape, ctx=get_current_context(device=True))) + kv.broadcast('3', mx.nd.ones(shape, ctx=get_current_device(device=True)), mx.nd.ones(shape, ctx=get_current_device(device=True))) + kv.broadcast('99', mx.nd.ones(big_shape, ctx=get_current_device(device=True)), mx.nd.ones(big_shape, ctx=get_current_device(device=True))) for _ in range(nrepeat): scale = my_rank + 1 num = (my_num_workers + 1) * my_num_workers / 2 - arr = mx.nd.ones(shape, ctx=get_current_context(device=True)) * scale + arr = mx.nd.ones(shape, ctx=get_current_device(device=True)) * scale # inplace kv.pushpull('3', arr) check_diff_to_scalar(arr, num) - big_arr = mx.nd.ones(big_shape, ctx=get_current_context(device=True)) * scale + big_arr = mx.nd.ones(big_shape, ctx=get_current_device(device=True)) * scale # inplace kv.pushpull('99', big_arr) check_diff_to_scalar(big_arr, num) @@ -85,7 +85,7 @@ def check_default_keys(nrepeat=3): def test_broadcast(): def check_broadcast(kv, cur_keys, cur_shape, device=False): print("check_broadcast: {}, {}, {}, {}".format(kv, cur_keys, cur_shape, device)) - ctx = get_current_context(device=device) + ctx = get_current_device(device=device) val = [mx.nd.zeros(cur_shape, ctx) for i in cur_keys] for i in range(len(cur_keys)): expected = i diff --git a/tests/nightly/test_large_array.py b/tests/nightly/test_large_array.py index e16255b9ad4d..fb1fad544ec7 100644 --- a/tests/nightly/test_large_array.py +++ b/tests/nightly/test_large_array.py @@ -22,7 +22,7 @@ import numpy as np import mxnet as mx -from mxnet.test_utils import rand_ndarray, assert_almost_equal, rand_coord_2d, default_context, check_symbolic_forward, create_2d_tensor +from mxnet.test_utils import rand_ndarray, assert_almost_equal, rand_coord_2d, default_device, check_symbolic_forward, create_2d_tensor from mxnet import gluon, nd from common import with_seed import pytest @@ -228,7 +228,7 @@ def check_dropout(): shape = (LARGE_X, SMALL_Y) x = mx.sym.var('data') y = mx.sym.Dropout(x, p=1, cudnn_off=True) - exe = y._simple_bind(ctx=default_context(), data=shape) + exe = y._simple_bind(ctx=default_device(), data=shape) exe.arg_arrays[0][:] = 1 out = exe.forward(is_train=True) nd.waitall() diff --git a/tests/nightly/test_np_large_array.py b/tests/nightly/test_np_large_array.py index b827546ae2ea..ba9369abd4cb 100644 --- a/tests/nightly/test_np_large_array.py +++ b/tests/nightly/test_np_large_array.py @@ -25,7 +25,7 @@ curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.append(os.path.join(curr_path, '../python/unittest/')) -from mxnet.test_utils import rand_ndarray, 
assert_almost_equal, rand_coord_2d, default_context, check_symbolic_forward, create_2d_np_tensor, use_np +from mxnet.test_utils import rand_ndarray, assert_almost_equal, rand_coord_2d, default_device, check_symbolic_forward, create_2d_np_tensor, use_np from mxnet import gluon, np, npx import pytest from tests.python.unittest.common import assertRaises diff --git a/tests/nightly/test_np_random.py b/tests/nightly/test_np_random.py index 753a23261e18..da51f61c0fb2 100644 --- a/tests/nightly/test_np_random.py +++ b/tests/nightly/test_np_random.py @@ -56,7 +56,7 @@ def test_np_exponential(): @use_np def test_np_uniform(): types = [None, "float32", "float64"] - ctx = mx.context.current_context() + device = mx.device.current_device() samples = 1000000 # Generation test trials = 8 @@ -67,7 +67,7 @@ def test_np_uniform(): buckets, probs = gen_buckets_probs_with_ppf(lambda x: ss.uniform.ppf(x, loc=low, scale=scale), num_buckets) buckets = np.array(buckets, dtype=dtype).tolist() probs = [(buckets[i][1] - buckets[i][0])/scale for i in range(num_buckets)] - generator_mx_np = lambda x: mx.np.random.uniform(low, high, size=x, ctx=ctx, dtype=dtype).asnumpy() + generator_mx_np = lambda x: mx.np.random.uniform(low, high, size=x, device=device, dtype=dtype).asnumpy() verify_generator(generator=generator_mx_np, buckets=buckets, probs=probs, nsamples=samples, nrepeat=trials) @@ -106,7 +106,7 @@ def test_np_gumbel(): @use_np def test_np_normal(): types = [None, "float32", "float64"] - ctx = mx.context.current_context() + device = mx.device.current_device() samples = 1000000 # Generation test trials = 8 @@ -117,7 +117,7 @@ def test_np_normal(): buckets = np.array(buckets, dtype=dtype).tolist() probs = [(ss.norm.cdf(buckets[i][1], loc, scale) - ss.norm.cdf(buckets[i][0], loc, scale)) for i in range(num_buckets)] - generator_mx_np = lambda x: mx.np.random.normal(loc, scale, size=x, ctx=ctx, dtype=dtype).asnumpy() + generator_mx_np = lambda x: mx.np.random.normal(loc, scale, size=x, device=device, dtype=dtype).asnumpy() verify_generator(generator=generator_mx_np, buckets=buckets, probs=probs, nsamples=samples, nrepeat=trials) @@ -125,7 +125,7 @@ def test_np_normal(): @use_np def test_np_gamma(): types = [None, "float32", "float64"] - ctx = mx.context.current_context() + device = mx.device.current_device() samples = 1000000 # Generation test trials = 8 @@ -136,12 +136,12 @@ def test_np_gamma(): lambda x: ss.gamma.ppf(x, a=alpha, loc=0, scale=beta), num_buckets) buckets = np.array(buckets).tolist() def generator_mx(x): return np.random.gamma( - alpha, beta, size=samples, ctx=ctx).asnumpy() + alpha, beta, size=samples, device=device).asnumpy() verify_generator(generator=generator_mx, buckets=buckets, probs=probs, nsamples=samples, nrepeat=trials) generator_mx_same_seed =\ lambda x: _np.concatenate( - [np.random.gamma(alpha, beta, size=(x // 10), ctx=ctx).asnumpy() + [np.random.gamma(alpha, beta, size=(x // 10), device=device).asnumpy() for _ in range(10)]) verify_generator(generator=generator_mx_same_seed, buckets=buckets, probs=probs, nsamples=samples, nrepeat=trials) @@ -151,7 +151,7 @@ def generator_mx(x): return np.random.gamma( @use_np def test_np_laplace(): types = [None, "float32", "float64"] - ctx = mx.context.current_context() + device = mx.device.current_device() samples = 1000000 # Generation test trials = 8 @@ -161,6 +161,6 @@ def test_np_laplace(): buckets, probs = gen_buckets_probs_with_ppf(lambda x: ss.laplace.ppf(x, loc=loc, scale=scale), num_buckets) buckets = np.array(buckets, dtype=dtype).tolist() 
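# --- Hedged illustration, not part of the patch: the sampling pattern test_np_random now
# --- follows, assuming the `device=` keyword that replaces `ctx=` on the mx.np.random
# --- samplers in this series. verify_generator consumes plain numpy, hence asnumpy().
import mxnet as mx

device = mx.device.current_device()
samples = mx.np.random.uniform(low=0.0, high=1.0, size=(1000,),
                               device=device, dtype='float32').asnumpy()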
probs = [(buckets[i][1] - buckets[i][0])/scale for i in range(num_buckets)] - generator_mx_np = lambda x: np.random.laplace(loc, scale, size=x, ctx=ctx, dtype=dtype).asnumpy() + generator_mx_np = lambda x: np.random.laplace(loc, scale, size=x, device=device, dtype=dtype).asnumpy() verify_generator(generator=generator_mx_np, buckets=buckets, probs=probs, nsamples=samples, nrepeat=trials) diff --git a/tests/python/dnnl/subgraphs/subgraph_common.py b/tests/python/dnnl/subgraphs/subgraph_common.py index 349cf628866e..3ed526ca56d5 100644 --- a/tests/python/dnnl/subgraphs/subgraph_common.py +++ b/tests/python/dnnl/subgraphs/subgraph_common.py @@ -122,7 +122,7 @@ def check_quantize(net_original, data_shape, out_type, name='conv', net_original.initialize(init=mx.init.Normal(0.5), force_reinit=True) min_value = -1 if out_type != 'uint8' else 0 - data = mx.np.random.uniform(min_value, 1.0, size=data_shape, dtype='float32', ctx=mx.current_context()) + data = mx.np.random.uniform(min_value, 1.0, size=data_shape, dtype='float32', ctx=mx.current_device()) outputs = net_original(data) for output in outputs: @@ -132,7 +132,7 @@ def check_quantize(net_original, data_shape, out_type, name='conv', calib_data = mx.gluon.data.DataLoader(data, batch_size=1) for quantize_granularity in quantize_granularity_list: qnet = quantization.quantize_net(net_original, - ctx=mx.current_context(), + ctx=mx.current_device(), exclude_layers=None, exclude_operators=None, quantized_dtype=out_type, @@ -159,7 +159,7 @@ def check_fusion(net_original, data_shape, attrs_dict, check_fp32_fusion=True, c out_types=['uint8', 'int8', 'auto'], dedup_subgraph=True): net_original.initialize() net_original.hybridize(static_alloc=False, static_shape=False) - data = mx.np.random.uniform(size=data_shape, dtype='float32', ctx=mx.current_context()) + data = mx.np.random.uniform(size=data_shape, dtype='float32', ctx=mx.current_device()) net_original(data) net_fusion = copy.copy(net_original) sym, params = net_original.export(None) diff --git a/tests/python/dnnl/subgraphs/test_conv_subgraph.py b/tests/python/dnnl/subgraphs/test_conv_subgraph.py index 5154d241a926..0b0840c5ee94 100644 --- a/tests/python/dnnl/subgraphs/test_conv_subgraph.py +++ b/tests/python/dnnl/subgraphs/test_conv_subgraph.py @@ -37,8 +37,8 @@ def __init__(self, dtype='float32', **kwargs): def forward(self, x): out = mx.npx.convolution(x, kernel=(1,1), num_filter=3, - weight=self.weight.data(x.ctx), no_bias=False, - bias=self.bias.data(x.ctx)) + weight=self.weight.data(x.device), no_bias=False, + bias=self.bias.data(x.device)) return out def infer_shape(self, x): @@ -300,13 +300,13 @@ def __init__(self, **kwargs): def forward(self, x): conv1 = mx.npx.convolution(x, kernel=(3,3), num_filter=64, - weight=self.shared_weight.data(x.ctx), no_bias=True) + weight=self.shared_weight.data(x.device), no_bias=True) conv2 = mx.npx.convolution(x, kernel=(3,3), num_filter=64, - weight=self.shared_weight.data(x.ctx)*2, no_bias=True) + weight=self.shared_weight.data(x.device)*2, no_bias=True) conv3 = mx.npx.convolution(x, kernel=(3,3), num_filter=64, - weight=self.shared_weight.data(x.ctx)*3, no_bias=True) + weight=self.shared_weight.data(x.device)*3, no_bias=True) conv4 = mx.npx.convolution(x, kernel=(3,3), num_filter=64, - weight=self.shared_weight.data(x.ctx)*4, no_bias=True) + weight=self.shared_weight.data(x.device)*4, no_bias=True) return mx.np.concatenate([conv1, conv2, conv3, conv4], axis=1) def infer_shape(self, x, *args): @@ -436,7 +436,7 @@ def test_mobilenetv2_struct(data_shape, 
reverse_sum_order, dedup_subgraph): @pytest.mark.parametrize('reverse_sum_order', [False, True]) @pytest.mark.parametrize('model_name', ['conv_bn_sum', 'mobilenetv2_struct']) def test_deduplication(data_shape, reverse_sum_order, model_name): - data_nd = mx.np.random.uniform(-1, 1, size=data_shape, ctx=mx.cpu()) + data_nd = mx.np.random.uniform(-1, 1, size=data_shape, device=mx.cpu()) if (model_name == 'mobilenetv2_struct'): model_dedup = MobileNetV2Struct(reverse_sum_order=reverse_sum_order) else: @@ -539,7 +539,7 @@ def __init__(self, **kwargs): def forward(self, x): conv = self.conv1(x) print(conv.shape) - sum1 = conv + self.add_value.data(x.ctx) + sum1 = conv + self.add_value.data(x.device) pool = self.pool(conv) return self.tailneg(sum1, pool) @@ -629,7 +629,7 @@ def forward(self, x): conv = self.conv1(x) bn = self.bn(conv) print(bn.shape) - sum1 = bn + self.add_value.data(x.ctx) + sum1 = bn + self.add_value.data(x.device) relu = self.act(sum1) if self.connect_mode == "conv_customop": pool = self.pool(conv) @@ -673,9 +673,9 @@ def infer_shape(self, x): ]) def test_quantized_conv_bias_overflow(data_min, data_max, weight_min, weight_max): data_shape = (1, 32, 2, 2) - data_nd = mx.np.random.uniform(data_min, data_max, size=data_shape, ctx=mx.cpu()) - weight_nd = mx.np.random.uniform(weight_min, weight_max, size=[64, 32, 1, 1], ctx=mx.cpu()) - bias_nd = mx.np.random.uniform(-1, +1, size=[64], ctx=mx.cpu()) + data_nd = mx.np.random.uniform(data_min, data_max, size=data_shape, device=mx.cpu()) + weight_nd = mx.np.random.uniform(weight_min, weight_max, size=[64, 32, 1, 1], device=mx.cpu()) + bias_nd = mx.np.random.uniform(-1, +1, size=[64], device=mx.cpu()) class ConvBiasOverflow(nn.HybridBlock): def __init__(self, dtype='float32', **kwargs): @@ -685,8 +685,8 @@ def __init__(self, dtype='float32', **kwargs): def forward(self, x): conv1 = mx.npx.convolution(x, num_filter=64, kernel=(1,1), - weight=self.weight.data(x.ctx), - no_bias=False, bias=self.bias.data(x.ctx)) + weight=self.weight.data(x.device), + no_bias=False, bias=self.bias.data(x.device)) return conv1 def infer_shape(self, x): @@ -703,7 +703,7 @@ def infer_shape(self, x): calib_data = mx.gluon.data.DataLoader(data_nd, batch_size=data_shape[0]) qnet = quantization.quantize_net(net, - ctx=mx.cpu(), + device=mx.cpu(), exclude_layers=None, exclude_operators=None, quantized_dtype='int8', @@ -748,7 +748,7 @@ def test_quantized_fc_bias_overflow(data_min, data_max, weight_min, weight_max): qsym, qarg_params, qaux_params = quantization.quantize_model(sym=sym_sg, arg_params=arg_params, aux_params={}, - ctx=mx.cpu(), + device=mx.cpu(), excluded_sym_names=None, excluded_op_names=None, quantized_dtype='int8', diff --git a/tests/python/dnnl/subgraphs/test_fc_subgraph.py b/tests/python/dnnl/subgraphs/test_fc_subgraph.py index d12a156ca1f6..afcb51605953 100644 --- a/tests/python/dnnl/subgraphs/test_fc_subgraph.py +++ b/tests/python/dnnl/subgraphs/test_fc_subgraph.py @@ -134,9 +134,9 @@ def forward(self, x): ]) def test_quantized_fc_bias_overflow(data_min, data_max, weight_min, weight_max): data_shape = (1, 32) - data_nd = mx.np.random.uniform(data_min, data_max, size=data_shape, ctx=mx.cpu()) - weight_nd = mx.np.random.uniform(weight_min, weight_max, size=[64, 32], ctx=mx.cpu()) - bias_nd = mx.np.random.uniform(-1, +1, size=[64], ctx=mx.cpu()) + data_nd = mx.np.random.uniform(data_min, data_max, size=data_shape, device=mx.cpu()) + weight_nd = mx.np.random.uniform(weight_min, weight_max, size=[64, 32], device=mx.cpu()) + bias_nd = 
mx.np.random.uniform(-1, +1, size=[64], device=mx.cpu()) class FCBiasOverflow(nn.HybridBlock): def __init__(self, dtype='float32', **kwargs): @@ -145,8 +145,8 @@ def __init__(self, dtype='float32', **kwargs): self.bias = mx.gluon.Parameter('bias', dtype=dtype, allow_deferred_init=True) def forward(self, x): - conv1 = mx.npx.fully_connected(x, num_hidden=64, weight=self.weight.data(x.ctx), - no_bias=False, bias=self.bias.data(x.ctx)) + conv1 = mx.npx.fully_connected(x, num_hidden=64, weight=self.weight.data(x.device), + no_bias=False, bias=self.bias.data(x.device)) return conv1 def infer_shape(self, x, *args): @@ -163,7 +163,7 @@ def infer_shape(self, x, *args): calib_data = mx.gluon.data.DataLoader(data_nd, batch_size=1) qnet = quantization.quantize_net(net, - ctx=mx.cpu(), + device=mx.cpu(), exclude_layers=None, exclude_operators=None, quantized_dtype='int8', diff --git a/tests/python/dnnl/test_amp.py b/tests/python/dnnl/test_amp.py index 2b67a41fc73b..9a3d7e026426 100644 --- a/tests/python/dnnl/test_amp.py +++ b/tests/python/dnnl/test_amp.py @@ -25,7 +25,7 @@ import ctypes from mxnet import amp import pytest -from mxnet.test_utils import set_default_context, same_symbol_structure, assert_almost_equal +from mxnet.test_utils import set_default_device, same_symbol_structure, assert_almost_equal from mxnet.gluon.model_zoo.vision import get_model from mxnet.gluon import SymbolBlock, nn, rnn curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) diff --git a/tests/python/dnnl/test_bf16_operator.py b/tests/python/dnnl/test_bf16_operator.py index 14eaa1a2b287..2c696ee789b1 100644 --- a/tests/python/dnnl/test_bf16_operator.py +++ b/tests/python/dnnl/test_bf16_operator.py @@ -25,7 +25,7 @@ import ctypes import itertools from mxnet import amp -from mxnet.test_utils import set_default_context, same_symbol_structure, assert_almost_equal_with_err, rand_shape_nd +from mxnet.test_utils import set_default_device, same_symbol_structure, assert_almost_equal_with_err, rand_shape_nd from mxnet.gluon.model_zoo.vision import get_model from mxnet.gluon import SymbolBlock, nn, rnn curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) diff --git a/tests/python/gpu/test_amp.py b/tests/python/gpu/test_amp.py index 89687bfcf823..ccc31783a79d 100644 --- a/tests/python/gpu/test_amp.py +++ b/tests/python/gpu/test_amp.py @@ -25,7 +25,7 @@ import ctypes from mxnet import amp import pytest -from mxnet.test_utils import set_default_context, same_symbol_structure +from mxnet.test_utils import set_default_device, same_symbol_structure from mxnet.gluon.model_zoo.vision import get_model from mxnet.gluon import SymbolBlock, nn, rnn from mxnet.operator import get_all_registered_operators_grouped @@ -33,7 +33,7 @@ sys.path.insert(0, os.path.join(curr_path, '../unittest')) from common import assert_raises_cudnn_not_satisfied sys.path.insert(0, os.path.join(curr_path, '../train')) -set_default_context(mx.gpu(0)) +set_default_device(mx.gpu(0)) @pytest.fixture() def amp_tests(request): @@ -99,7 +99,7 @@ def test_amp_coverage(amp_tests): @pytest.mark.skip(reason='Error during waitall(). 
Tracked in #18099') @assert_raises_cudnn_not_satisfied(min_version='5.1.10') def test_amp_conversion_rnn(amp_tests): - with mx.Context(mx.gpu(0)): + with mx.Device(mx.gpu(0)): model = nn.HybridSequential() model.add(rnn.LSTM(hidden_size=10, num_layers=2, bidirectional=True)) model.add(nn.Dense(2)) diff --git a/tests/python/gpu/test_deferred_compute_gpu.py b/tests/python/gpu/test_deferred_compute_gpu.py index 9802d2b57d24..8b65141e2391 100644 --- a/tests/python/gpu/test_deferred_compute_gpu.py +++ b/tests/python/gpu/test_deferred_compute_gpu.py @@ -19,7 +19,7 @@ import sys import mxnet as mx -mx.test_utils.set_default_context(mx.gpu(0)) +mx.test_utils.set_default_device(mx.gpu(0)) curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.insert(0, os.path.join(curr_path, '../unittest')) diff --git a/tests/python/gpu/test_extensions_gpu.py b/tests/python/gpu/test_extensions_gpu.py index 9d3683166053..339446e511b1 100644 --- a/tests/python/gpu/test_extensions_gpu.py +++ b/tests/python/gpu/test_extensions_gpu.py @@ -24,7 +24,7 @@ from mxnet import nd from mxnet.gluon import nn from mxnet.base import MXNetError -from mxnet.test_utils import download, is_cd_run, assert_almost_equal, default_context +from mxnet.test_utils import download, is_cd_run, assert_almost_equal, default_device import pytest base_path = os.path.join(os.path.dirname(__file__), "../../..") diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py index 134eab397640..ecc9e8dea9b6 100644 --- a/tests/python/gpu/test_gluon_gpu.py +++ b/tests/python/gpu/test_gluon_gpu.py @@ -20,7 +20,7 @@ import time import mxnet as mx import multiprocessing as mp -from mxnet.test_utils import check_consistency, set_default_context, assert_almost_equal, rand_ndarray, environment +from mxnet.test_utils import check_consistency, set_default_device, assert_almost_equal, rand_ndarray, environment import numpy as _np import math from mxnet import autograd @@ -34,11 +34,11 @@ from test_numpy_loss import * from test_gluon_rnn import * -set_default_context(mx.gpu(0)) +set_default_device(mx.gpu(0)) def check_rnn_layer(layer): - layer.initialize(ctx=[mx.cpu(0), mx.gpu(0)]) + layer.initialize(device=[mx.cpu(0), mx.gpu(0)]) with mx.gpu(0): x = mx.np.ones((10, 16, 30)) states = layer.begin_state(16) @@ -55,7 +55,7 @@ def check_rnn_layer(layer): def check_rnn_layer_w_rand_inputs(layer): - layer.initialize(ctx=[mx.cpu(0), mx.gpu(0)]) + layer.initialize(device=[mx.cpu(0), mx.gpu(0)]) x = mx.np.random.uniform(size=(10, 16, 30)) with mx.gpu(0): x = x.copyto(mx.gpu(0)) @@ -79,9 +79,9 @@ def test_lstmp(): rtol, atol = 1e-2, 1e-2 batch_size, seq_len = 7, 11 input_size = 5 - ctx = mx.gpu(0) + device = mx.gpu(0) lstm_input = mx.np.random.uniform( - size=(seq_len, batch_size, input_size), ctx=ctx) + size=(seq_len, batch_size, input_size), device=device) shapes = {'i2h_weight': (hidden_size * 4, input_size), 'h2h_weight': (hidden_size * 4, projection_size), 'i2h_bias': (hidden_size * 4,), @@ -93,8 +93,8 @@ def test_lstmp(): lstm_cell = gluon.rnn.LSTMPCell(hidden_size=hidden_size, projection_size=projection_size, input_size=input_size) - lstm_layer.initialize(ctx=ctx) - lstm_cell.initialize(ctx=ctx) + lstm_layer.initialize(device=device) + lstm_cell.initialize(device=device) layer_params = lstm_layer.collect_params() cell_params = lstm_cell.collect_params() for k, v in weights.items(): @@ -114,14 +114,14 @@ def test_lstmp(): print('checking gradient for {}'.format('lstm0_l0_' + k)) assert_almost_equal(layer_grad, 
cell_grad, rtol=rtol, atol=atol) check_rnn_layer_forward(gluon.rnn.LSTM( - 10, 2, projection_size=5), mx.np.ones((8, 3, 20)), ctx=ctx) + 10, 2, projection_size=5), mx.np.ones((8, 3, 20)), device=device) check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, projection_size=5, bidirectional=True), mx.np.ones( - (8, 3, 20)), [mx.np.ones((4, 3, 5)), mx.np.ones((4, 3, 10))], ctx=ctx) + (8, 3, 20)), [mx.np.ones((4, 3, 5)), mx.np.ones((4, 3, 10))], device=device) check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, dropout=0.5, projection_size=5), mx.np.ones((8, 3, 20)), - run_only=True, ctx=ctx) + run_only=True, device=device) check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, bidirectional=True, dropout=0.5, projection_size=5), mx.np.ones((8, 3, 20)), - [mx.np.ones((4, 3, 5)), mx.np.ones((4, 3, 10))], run_only=True, ctx=ctx) + [mx.np.ones((4, 3, 5)), mx.np.ones((4, 3, 10))], run_only=True, device=device) lstm_layer.save_parameters('gpu_tmp.params') lstm_layer.load_parameters('gpu_tmp.params') @@ -134,16 +134,16 @@ def test_lstm_clip(): input_size = 50 clip_min, clip_max, clip_nan = -5, 5, True lstm_input = mx.np.random.uniform( - size=(seq_len, batch_size, input_size), ctx=mx.gpu(0)) - lstm_states = [mx.np.random.uniform(size=(2, batch_size, projection_size), ctx=mx.gpu(0)), - mx.np.random.uniform(size=(2, batch_size, hidden_size), ctx=mx.gpu(0))] + size=(seq_len, batch_size, input_size), device=mx.gpu(0)) + lstm_states = [mx.np.random.uniform(size=(2, batch_size, projection_size), device=mx.gpu(0)), + mx.np.random.uniform(size=(2, batch_size, hidden_size), device=mx.gpu(0))] lstm_layer = gluon.rnn.LSTM(hidden_size, projection_size=projection_size, input_size=input_size, bidirectional=True, state_clip_min=clip_min, state_clip_max=clip_max, state_clip_nan=clip_nan) - lstm_layer.initialize(ctx=mx.gpu(0)) + lstm_layer.initialize(device=mx.gpu(0)) with autograd.record(): _, layer_output_states = lstm_layer(lstm_input, lstm_states) cell_states = layer_output_states[0] @@ -302,11 +302,11 @@ def test_rnn_layer_begin_state_type(): def test_gluon_ctc_consistency(): loss = mx.gluon.loss.CTCLoss() - data = mx.np.flip(mx.np.repeat(mx.np.arange(0, 4, ctx=mx.gpu(0)), 40).reshape((2, 20, 4)), axis=0) - cpu_label = mx.np.array([[2, 1, -1, -1], [3, 2, 2, -1]], ctx=mx.cpu(0)) - gpu_label = mx.np.array([[2, 1, -1, -1], [3, 2, 2, -1]], ctx=mx.gpu(0)) + data = mx.np.flip(mx.np.repeat(mx.np.arange(0, 4, device=mx.gpu(0)), 40).reshape((2, 20, 4)), axis=0) + cpu_label = mx.np.array([[2, 1, -1, -1], [3, 2, 2, -1]], device=mx.cpu(0)) + gpu_label = mx.np.array([[2, 1, -1, -1], [3, 2, 2, -1]], device=mx.gpu(0)) - cpu_data = data.copy().as_in_context(mx.cpu(0)) + cpu_data = data.copy().to_device(mx.cpu(0)) cpu_data.attach_grad() with mx.autograd.record(): l_cpu = loss(cpu_data, cpu_label) @@ -323,10 +323,10 @@ def test_gluon_ctc_consistency(): def test_global_norm_clip_multi_device(): for check_isfinite in [True, False]: - x1 = mx.np.ones((3, 3), ctx=mx.gpu(0)) - x2 = mx.np.ones((4, 4), ctx=mx.cpu(0)) - x3 = mx.np.ones((7, 4), ctx=mx.gpu(0)) - x4 = mx.np.ones((7, 4), ctx=mx.cpu(0)) + x1 = mx.np.ones((3, 3), device=mx.gpu(0)) + x2 = mx.np.ones((4, 4), device=mx.cpu(0)) + x3 = mx.np.ones((7, 4), device=mx.gpu(0)) + x4 = mx.np.ones((7, 4), device=mx.cpu(0)) norm = gluon.utils.clip_global_norm( [x1, x2, x3, x4], 1.0, check_isfinite=check_isfinite) if check_isfinite: @@ -349,34 +349,34 @@ def _find_bn(module): raise RuntimeError('BN not found') - def _syncParameters(bn1, bn2, ctx): - ctx = input.context - bn2.gamma.set_data(bn1.gamma.data(ctx)) - 
bn2.beta.set_data(bn1.beta.data(ctx)) - bn2.running_mean.set_data(bn1.running_mean.data(ctx)) - bn2.running_var.set_data(bn1.running_var.data(ctx)) + def _syncParameters(bn1, bn2, device): + device = input.context + bn2.gamma.set_data(bn1.gamma.data(device)) + bn2.beta.set_data(bn1.beta.data(device)) + bn2.running_mean.set_data(bn1.running_mean.data(device)) + bn2.running_var.set_data(bn1.running_var.data(device)) input1 = input.copy() input2 = input.copy() if cuda: - input1 = input.as_in_context(mx.gpu(0)) - ctx_list = [mx.gpu(i) for i in range(num_devices)] + input1 = input.to_device(mx.gpu(0)) + device_list = [mx.gpu(i) for i in range(num_devices)] else: - ctx_list = [mx.cpu(0) for _ in range(num_devices)] + device_list = [mx.cpu(0) for _ in range(num_devices)] nch = input.shape[1] bn1 = mx.gluon.nn.BatchNorm(in_channels=nch) bn2 = mx.gluon.nn.SyncBatchNorm(in_channels=nch, num_devices=num_devices) - bn1.initialize(ctx=ctx_list[0]) - bn2.initialize(ctx=ctx_list) + bn1.initialize(device=device_list[0]) + bn2.initialize(device=device_list) # using the same values for gamma and beta - #_syncParameters(_find_bn(bn1), _find_bn(bn2), ctx_list[0]) + #_syncParameters(_find_bn(bn1), _find_bn(bn2), device_list[0]) input1.attach_grad() - inputs2 = split_and_load(input2, ctx_list, batch_axis=0) + inputs2 = split_and_load(input2, device_list, batch_axis=0) for xi in inputs2: xi.attach_grad() @@ -388,17 +388,17 @@ def _syncParameters(bn1, bn2, ctx): mx.autograd.backward(loss1) mx.autograd.backward(loss2) - output2 = mx.np.concatenate([output.as_in_context(input.context) for output in output2], axis=0) + output2 = mx.np.concatenate([output.to_device(input.context) for output in output2], axis=0) # assert forwarding assert_almost_equal(input1, input2, atol=1e-3, rtol=1e-3) assert_almost_equal(output1, output2, atol=1e-3, rtol=1e-3) - assert_almost_equal(_find_bn(bn1).running_mean.data(ctx_list[0]), - _find_bn(bn2).running_mean.data(ctx_list[0]), + assert_almost_equal(_find_bn(bn1).running_mean.data(device_list[0]), + _find_bn(bn2).running_mean.data(device_list[0]), atol=1e-3, rtol=1e-3) - assert_almost_equal(_find_bn(bn1).running_var.data(ctx_list[0]), - _find_bn(bn2).running_var.data(ctx_list[0]), + assert_almost_equal(_find_bn(bn1).running_var.data(device_list[0]), + _find_bn(bn2).running_var.data(device_list[0]), atol=1e-3, rtol=1e-3) - input2grad = mx.np.concatenate([output.grad.as_in_context(input.context) for output in inputs2], axis=0) + input2grad = mx.np.concatenate([output.grad.to_device(input.context) for output in inputs2], axis=0) assert_almost_equal(input1.grad, input2grad, atol=1e-3, rtol=1e-3) @mx.util.use_np @@ -406,7 +406,7 @@ def test_sync_batchnorm(): def get_num_devices(): for i in range(100): try: - mx.np.zeros((1,), ctx=mx.gpu(i)) + mx.np.zeros((1,), device=mx.gpu(i)) except: return i # no need to use SyncBN with 1 gpu @@ -425,13 +425,13 @@ def test_symbol_block_fp16(tmpdir): # 1. 
Load a resnet model, cast it to fp16 and export tmp = str(tmpdir) tmpfile = os.path.join(tmp, 'resnet34_fp16') - ctx = mx.gpu(0) + device = mx.gpu(0) net_fp32 = mx.gluon.model_zoo.vision.resnet34_v2( - pretrained=True, ctx=ctx, root=tmp) + pretrained=True, device=device, root=tmp) net_fp32.cast('float16') net_fp32.hybridize() - data = mx.np.zeros((1, 3, 224, 224), dtype='float16', ctx=ctx) + data = mx.np.zeros((1, 3, 224, 224), dtype='float16', device=device) net_fp32(data) symbol_file, param_file = net_fp32.export(tmpfile, 0) @@ -440,7 +440,7 @@ def test_symbol_block_fp16(tmpdir): sm = mx.sym.load(symbol_file) inputs = mx.sym.var('data', dtype='float16') net_fp16 = mx.gluon.SymbolBlock(sm, inputs) - net_fp16.load_parameters(param_file, ctx=ctx) + net_fp16.load_parameters(param_file, device=device) # 3. Get a conv layer's weight parameter name. Conv layer's weight param is # expected to be of dtype casted, fp16. name = None @@ -453,7 +453,7 @@ def test_symbol_block_fp16(tmpdir): @pytest.mark.serial def test_large_models(): - ctx = default_context() + device = default_device() # Create model net = gluon.nn.HybridSequential() @@ -461,7 +461,7 @@ def test_large_models(): net.add(nn.Conv2D(largest_num_features, 3)) net.hybridize() - net.initialize(mx.init.Normal(sigma=0.01), ctx=ctx) + net.initialize(mx.init.Normal(sigma=0.01), device=device) # Compute the height (=width) of the square tensor of the given size in bytes def tensor_size(big_tensor_bytes): @@ -473,7 +473,7 @@ def tensor_size(big_tensor_bytes): # The idea is to create models with large tensors of (say) 20% of the total memory. # This in the past has given cudnnFind() trouble when it needed to allocate similar I/O's # from the area carved out by the MXNET_GPU_MEM_POOL_RESERVE setting (by default 5%). 
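# --- Hedged illustration, not part of the patch: the device-based Gluon setup that
# --- test_large_models uses above, shrunk to toy sizes. Assumes numpy semantics are
# --- active (mx.npx.set_np()) and the device= keywords introduced by this series;
# --- the layer and tensor sizes here are illustrative only.
import mxnet as mx
from mxnet import gluon
from mxnet.gluon import nn

mx.npx.set_np()
device = mx.cpu()                       # the real test runs on default_device(), i.e. the GPU
net = gluon.nn.HybridSequential()
net.add(nn.Conv2D(channels=4, kernel_size=3))
net.hybridize()
net.initialize(mx.init.Normal(sigma=0.01), device=device)

data_in = mx.np.random.uniform(low=0, high=255, size=(1, 3, 16, 16),
                               device=device, dtype='float32')
out = net(data_in).asnumpy()            # forces the computation to complete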
- (free_mem_bytes, total_mem_bytes) = mx.context.gpu_memory_info(ctx.device_id) + (free_mem_bytes, total_mem_bytes) = mx.context.gpu_memory_info(device.device_id) # This test needs to be 'qualified' for use with each new larger memory size largest_supported_total_mem_GB = 32 if (total_mem_bytes > largest_supported_total_mem_GB * 1024 * 1024 * 1024): @@ -492,7 +492,7 @@ def tensor_size(big_tensor_bytes): sys.stderr.write(" {}x{} ".format(height, width)) sys.stderr.flush() data_in = mx.np.random.uniform(low=0, high=255, size=(1, 3, height, width), - ctx=ctx, dtype="float32") + device=device, dtype="float32") # Evaluate model net(data_in).asnumpy() @@ -582,7 +582,7 @@ def test_bulking_gluon_gpu(): @mx.util.use_np -def test_hybridblock_mix_ctx_raise(): +def test_hybridblock_mix_device_raise(): class FooHybrid(gluon.HybridBlock): def forward(self, a, b): if isinstance(a, (list, tuple)): @@ -592,19 +592,19 @@ def forward(self, a, b): return a + b foo_hybrid = FooHybrid() foo_hybrid.hybridize() - pytest.raises(ValueError, lambda: foo_hybrid(mx.np.ones((10,), ctx=mx.gpu()), - mx.np.ones((10,), ctx=mx.cpu()))) + pytest.raises(ValueError, lambda: foo_hybrid(mx.np.ones((10,), device=mx.gpu()), + mx.np.ones((10,), device=mx.cpu()))) @mx.util.use_np def test_gemms_true_fp16(): - ctx = mx.gpu(0) - input = mx.np.random.uniform(size=(1, 512), dtype='float16', ctx=ctx) - weights = mx.np.random.uniform(size=(128, 512), ctx=ctx) + device = mx.gpu(0) + input = mx.np.random.uniform(size=(1, 512), dtype='float16', device=device) + weights = mx.np.random.uniform(size=(128, 512), device=device) net = nn.Dense(128, in_units=512, use_bias=False) net.cast('float16') - net.initialize(ctx=ctx) + net.initialize(device=device) net.weight.set_data(weights) with environment('MXNET_FC_TRUE_FP16', '0'): diff --git a/tests/python/gpu/test_gluon_transforms.py b/tests/python/gpu/test_gluon_transforms.py index b51a8fab0b45..edb195d3866f 100644 --- a/tests/python/gpu/test_gluon_transforms.py +++ b/tests/python/gpu/test_gluon_transforms.py @@ -23,14 +23,14 @@ from mxnet import gluon from mxnet.base import MXNetError from mxnet.gluon.data.vision import transforms -from mxnet.test_utils import assert_almost_equal, set_default_context +from mxnet.test_utils import assert_almost_equal, set_default_device from mxnet.test_utils import almost_equal, same curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.insert(0, os.path.join(curr_path, '../unittest')) from common import assertRaises from test_numpy_gluon_data_vision import test_to_tensor, test_normalize, test_crop_resize -set_default_context(mx.gpu(0)) +set_default_device(mx.gpu(0)) def test_normalize_gpu(): test_normalize() diff --git a/tests/python/gpu/test_kvstore_gpu.py b/tests/python/gpu/test_kvstore_gpu.py index 18f632037ec6..356977437431 100644 --- a/tests/python/gpu/test_kvstore_gpu.py +++ b/tests/python/gpu/test_kvstore_gpu.py @@ -21,7 +21,7 @@ import mxnet as mx import numpy as np import pytest -from mxnet.test_utils import assert_almost_equal, default_context, environment +from mxnet.test_utils import assert_almost_equal, default_device, environment curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.insert(0, os.path.join(curr_path, '../unittest')) @@ -42,7 +42,7 @@ def init_kv_with_str(stype='default', kv_type='local'): # 2. 
Test seed 1155716252 (module seed 1032824746) resulted in py3-dnnl-gpu have error # src/operator/nn/dnnl/dnnl_base.cc:567: Check failed: similar # Both of them are not reproducible, so this test is back on random seeds. -@pytest.mark.skipif(mx.context.num_gpus() < 2, reason="test_rsp_push_pull needs more than 1 GPU") +@pytest.mark.skipif(mx.device.num_gpus() < 2, reason="test_rsp_push_pull needs more than 1 GPU") @pytest.mark.skip("Flaky test https://github.com/apache/incubator-mxnet/issues/14189") @pytest.mark.serial def test_rsp_push_pull(): diff --git a/tests/python/gpu/test_numpy_fallback.py b/tests/python/gpu/test_numpy_fallback.py index dc367b03139c..de8bbe0618a5 100644 --- a/tests/python/gpu/test_numpy_fallback.py +++ b/tests/python/gpu/test_numpy_fallback.py @@ -28,7 +28,7 @@ import scipy.special as scipy_special from mxnet import np, npx from mxnet.base import MXNetError -from mxnet.test_utils import assert_almost_equal, use_np, set_default_context +from mxnet.test_utils import assert_almost_equal, use_np, set_default_device import os curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.insert(0, os.path.join(curr_path, '../unittest')) @@ -38,7 +38,7 @@ from mxnet.numpy_op_signature import _get_builtin_op from mxnet.util import numpy_fallback -set_default_context(mx.gpu(0)) +set_default_device(mx.gpu(0)) @use_np @pytest.mark.serial @@ -88,18 +88,18 @@ def get_indices(axis_size): for fallback_out, onp_out in zip(fallback_ret, onp_ret): if isinstance(fallback_out, (list, tuple)): for fallback_item, onp_item in zip(fallback_out, onp_out): - assert fallback_item.ctx == mx.context.current_context(), "incorrect output context %s vs desired %s" % (str(fallback_item.ctx), str(mx.context.current_context())) + assert fallback_item.device == mx.device.current_device(), "incorrect output device %s vs desired %s" % (str(fallback_item.device), str(mx.device.current_device())) assert isinstance(fallback_item, np.ndarray) assert_almost_equal(fallback_item.asnumpy(), onp_item, rtol=1e-3, atol=1e-5, equal_nan=False) else: - assert fallback_out.ctx == mx.context.current_context(), "incorrect output context %s vs desired %s" % (str(fallback_out.ctx), str(mx.context.current_context())) + assert fallback_out.device == mx.device.current_device(), "incorrect output device %s vs desired %s" % (str(fallback_out.device), str(mx.device.current_device())) assert isinstance(fallback_out, np.ndarray) assert_almost_equal(fallback_out.asnumpy(), onp_out, rtol=1e-3, atol=1e-5, equal_nan=False) - # does not support mixed-context inputs - assertRaises(AssertionError, dnp_func, mx_a.as_in_ctx(npx.cpu(0)), b=mx_b, split_inputs=(mx_c, mx_indices), ret_type=ret_type) + # does not support mixed-device inputs + assertRaises(AssertionError, dnp_func, mx_a.to_device(npx.cpu(0)), b=mx_b, split_inputs=(mx_c, mx_indices), ret_type=ret_type) assertRaises(AssertionError, dnp_func, mx_a, b=mx_b, - split_inputs=(mx_c.as_in_ctx(npx.cpu(0)), mx_indices.as_in_ctx(npx.gpu(0))), ret_type=ret_type) + split_inputs=(mx_c.to_device(npx.cpu(0)), mx_indices.to_device(npx.gpu(0))), ret_type=ret_type) @numpy_fallback def empty_ret_func(): diff --git a/tests/python/gpu/test_numpy_op.py b/tests/python/gpu/test_numpy_op.py index a72947dea1f6..08571eba2e45 100644 --- a/tests/python/gpu/test_numpy_op.py +++ b/tests/python/gpu/test_numpy_op.py @@ -21,13 +21,13 @@ import mxnet as mx from mxnet import np from mxnet.gluon import HybridBlock -from mxnet.test_utils import assert_almost_equal, use_np, set_default_context, 
environment +from mxnet.test_utils import assert_almost_equal, use_np, set_default_device, environment import os curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.insert(0, os.path.join(curr_path, '../unittest')) from common import assertRaises -set_default_context(mx.gpu(0)) +set_default_device(mx.gpu(0)) @use_np def test_np_einsum(): diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 9ce005b2f72f..066ab97f6a77 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -25,9 +25,9 @@ import itertools import scipy.sparse as sps import mxnet.ndarray.sparse as mxsps -from mxnet.test_utils import check_consistency, set_default_context, assert_almost_equal, assert_allclose +from mxnet.test_utils import check_consistency, set_default_device, assert_almost_equal, assert_allclose from mxnet.test_utils import check_symbolic_forward, check_symbolic_backward, discard_stderr -from mxnet.test_utils import default_context, rand_shape_2d, rand_ndarray, same, environment, get_rtc_compile_opts +from mxnet.test_utils import default_device, rand_shape_2d, rand_ndarray, same, environment, get_rtc_compile_opts from mxnet.base import MXNetError from mxnet import autograd @@ -53,7 +53,7 @@ from test_optimizer import test_adamW del test_custom_op_fork #noqa -set_default_context(mx.gpu(0)) +set_default_device(mx.gpu(0)) def check_countsketch(in_dim,out_dim,n): data = mx.sym.Variable("data") @@ -2059,8 +2059,8 @@ def test_bilinear_sampler_versions(): data_shape, grid_shape = item # kWriteTo exe_cpu = sym1._simple_bind(data=data_shape, grid=grid_shape, ctx=mx.cpu(), grad_req='write') - exe_gpu = sym2._simple_bind(data=data_shape, grid=grid_shape, ctx=default_context(), grad_req='write') - exe_cudnn = sym3._simple_bind(data=data_shape, grid=grid_shape, ctx=default_context(), grad_req='write') + exe_gpu = sym2._simple_bind(data=data_shape, grid=grid_shape, ctx=default_device(), grad_req='write') + exe_cudnn = sym3._simple_bind(data=data_shape, grid=grid_shape, ctx=default_device(), grad_req='write') exe_list = [exe_cpu, exe_gpu, exe_cudnn] ref_idx = 0 test_data = np.random.uniform(low=-0.1, high=0.1,size=data_shape).astype(np.float32) @@ -2082,8 +2082,8 @@ def test_bilinear_sampler_versions(): # kAddTo exe_cpu_addto = sym1._simple_bind(data=data_shape, grid=grid_shape, ctx=mx.cpu(), grad_req='add') - exe_gpu_addto = sym2._simple_bind(data=data_shape, grid=grid_shape, ctx=default_context(), grad_req='add') - exe_cudnn_addto = sym3._simple_bind(data=data_shape, grid=grid_shape, ctx=default_context(), grad_req='add') + exe_gpu_addto = sym2._simple_bind(data=data_shape, grid=grid_shape, ctx=default_device(), grad_req='add') + exe_cudnn_addto = sym3._simple_bind(data=data_shape, grid=grid_shape, ctx=default_device(), grad_req='add') exe_list = [exe_cpu_addto, exe_gpu_addto, exe_cudnn_addto] data_initial_grad = np.random.normal(size=exe_list[ref_idx].grad_dict['data'].shape).astype(np.float32) grid_initial_grad = np.random.normal(size=exe_list[ref_idx].grad_dict['grid'].shape).astype(np.float32) @@ -2102,8 +2102,8 @@ def test_bilinear_sampler_versions(): for req_dict in [{'data' : 'null', 'grid' : 'write'}, {'data' : 'write', 'grid' : 'null'}]: # Mixture of kWriteTo and kNullOp exe_cpu_mix = sym1._simple_bind(data=data_shape, grid=grid_shape, ctx=mx.cpu(), grad_req=req_dict) - exe_gpu_mix = sym2._simple_bind(data=data_shape, grid=grid_shape, ctx=default_context(), grad_req=req_dict) - exe_cudnn_mix = 
sym3._simple_bind(data=data_shape, grid=grid_shape, ctx=default_context(), grad_req=req_dict) + exe_gpu_mix = sym2._simple_bind(data=data_shape, grid=grid_shape, ctx=default_device(), grad_req=req_dict) + exe_cudnn_mix = sym3._simple_bind(data=data_shape, grid=grid_shape, ctx=default_device(), grad_req=req_dict) exe_list = [exe_cpu_mix, exe_gpu_mix, exe_cudnn_mix] for exe in exe_list: exe.arg_dict['data'][:] = test_data @@ -2122,7 +2122,7 @@ def _test_bulking_in_process(seed, time_per_iteration): num_ops = 1000 num_iterations = 20 - ctx = default_context() + ctx = default_device() # build symbol X = mx.sym.Variable('X') sym = mx.sym.flip(X, axis=0) diff --git a/tests/python/gpu/test_profiler_gpu.py b/tests/python/gpu/test_profiler_gpu.py index 5e7d2d72f010..79720897cd25 100644 --- a/tests/python/gpu/test_profiler_gpu.py +++ b/tests/python/gpu/test_profiler_gpu.py @@ -21,7 +21,7 @@ import numpy as np import mxnet as mx -mx.test_utils.set_default_context(mx.gpu(0)) +mx.test_utils.set_default_device(mx.gpu(0)) curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.insert(0, os.path.join(curr_path, '../unittest')) @@ -120,11 +120,11 @@ def test_gpu_memory_profiler_gluon(): model.add(nn.Dense(64, activation='tanh'), nn.Dense(32, in_units=64)) model.add(nn.Activation('relu')) - model.initialize(ctx=mx.gpu()) + model.initialize(device=mx.gpu()) model.hybridize() with mx.autograd.record(): - out = model(mx.np.zeros((16, 10), ctx=mx.gpu())) + out = model(mx.np.zeros((16, 10), device=mx.gpu())) out.backward() mx.npx.waitall() profiler.set_state('stop') diff --git a/tests/python/gpu/test_tvm_op_gpu.py b/tests/python/gpu/test_tvm_op_gpu.py index 16b459c7a0c8..d490b7b93850 100644 --- a/tests/python/gpu/test_tvm_op_gpu.py +++ b/tests/python/gpu/test_tvm_op_gpu.py @@ -16,11 +16,11 @@ # under the License. 
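# --- Hedged illustration, not part of the patch: the idiom the GPU test modules above
# --- share after the rename, namely switching the process-wide default device to GPU 0
# --- so that the imported CPU test bodies re-run on the GPU. Assumes the renamed
# --- test_utils helpers set_default_device/default_device from this series.
import mxnet as mx
from mxnet.test_utils import set_default_device, default_device

set_default_device(mx.gpu(0))           # formerly set_default_context(mx.gpu(0))
assert default_device() == mx.gpu(0)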
import mxnet as mx -from mxnet.test_utils import set_default_context +from mxnet.test_utils import set_default_device import os import sys curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.insert(0, os.path.join(curr_path, '../unittest')) from test_tvm_op import * -set_default_context(mx.gpu(0)) +set_default_device(mx.gpu(0)) diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py index 99d7791f8e20..236034722f7e 100644 --- a/tests/python/quantization/test_quantization.py +++ b/tests/python/quantization/test_quantization.py @@ -42,16 +42,16 @@ def collect_block_args_aux(block, sym): return arg_params, aux_params def is_test_for_gpu(): - return mx.current_context().device_type == 'gpu' + return mx.current_device().device_type == 'gpu' def is_test_for_dnnl(): - return (mx.current_context().device_type == 'cpu' + return (mx.current_device().device_type == 'cpu' and os.environ.get('ENABLE_ONEDNN_QUANTIZATION_TEST') == '1') def is_test_for_native_cpu(): - return (mx.current_context().device_type == 'cpu' + return (mx.current_device().device_type == 'cpu' and os.environ.get('ENABLE_ONEDNN_QUANTIZATION_TEST') == None) @@ -101,7 +101,7 @@ def test_symbolic_api_dequantization(qdata, min_range, max_range, expected_resul sym_max_range = mx.sym.Variable('max_range') dequant = mx.sym.contrib.dequantize(sym_data, sym_min_range, sym_max_range, out_type='float32') - out = dequant._bind(ctx=mx.current_context(), + out = dequant._bind(ctx=mx.current_device(), args={'data':qdata, 'min_range':min_range, 'max_range':max_range}) data = out.forward()[0] assert data.dtype == onp.float32 @@ -291,15 +291,15 @@ def __init__(self, channels, kernel_size, strides=(1, 1), self.max_bias = mx.gluon.Parameter('max_bias', dtype='float32', shape=(1), allow_deferred_init=True) def forward(self, x): - ctx = x.ctx - weight = self.weight.data().as_in_ctx(ctx) - bias = self.bias.data().as_in_ctx(ctx) if self.use_bias else None - min_data = self.min_data.data().as_in_ctx(ctx) - max_data = self.max_data.data().as_in_ctx(ctx) - min_weight = self.min_weight.data().as_in_ctx(ctx) - max_weight = self.max_weight.data().as_in_ctx(ctx) - min_bias = self.min_bias.data().as_in_ctx(ctx) if self.use_bias else None - max_bias = self.max_bias.data().as_in_ctx(ctx) if self.use_bias else None + device = x.device + weight = self.weight.data().to_device(device) + bias = self.bias.data().to_device(device) if self.use_bias else None + min_data = self.min_data.data().to_device(device) + max_data = self.max_data.data().to_device(device) + min_weight = self.min_weight.data().to_device(device) + max_weight = self.max_weight.data().to_device(device) + min_bias = self.min_bias.data().to_device(device) if self.use_bias else None + max_bias = self.max_bias.data().to_device(device) if self.use_bias else None out = npx.quantized_conv(data=x, weight=weight, bias=bias, min_data=min_data, max_data=max_data, min_weight=min_weight, max_weight=max_weight, @@ -655,15 +655,15 @@ def __init__(self, num_hidden, use_bias, flatten, **kwargs): self.max_bias = mx.gluon.Parameter('max_bias', dtype='float32', shape=(1), allow_deferred_init=True) def forward(self, x): - ctx = x.ctx - weight = self.weight.data().as_in_ctx(ctx) - bias = self.bias.data().as_in_ctx(ctx) if self.use_bias else None - min_data = self.min_data.data().as_in_ctx(ctx) - max_data = self.max_data.data().as_in_ctx(ctx) - min_weight = self.min_weight.data().as_in_ctx(ctx) - max_weight = 
self.max_weight.data().as_in_ctx(ctx) - min_bias = self.min_bias.data().as_in_ctx(ctx) if self.use_bias else None - max_bias = self.max_bias.data().as_in_ctx(ctx) if self.use_bias else None + device = x.device + weight = self.weight.data().to_device(device) + bias = self.bias.data().to_device(device) if self.use_bias else None + min_data = self.min_data.data().to_device(device) + max_data = self.max_data.data().to_device(device) + min_weight = self.min_weight.data().to_device(device) + max_weight = self.max_weight.data().to_device(device) + min_bias = self.min_bias.data().to_device(device) if self.use_bias else None + max_bias = self.max_bias.data().to_device(device) if self.use_bias else None out = npx.quantized_fully_connected(data=x, weight=weight, bias=bias, min_data=min_data, max_data=max_data, min_weight=min_weight, max_weight=max_weight, @@ -759,10 +759,10 @@ def __init__(self, input_dim=input_dim, output_dim=output_dim, **kwargs): self.max_weight = mx.gluon.Parameter('max_weight', dtype='float32', shape=(1), allow_deferred_init=True) def forward(self, x): - ctx = x.ctx - weight = self.weight.data().as_in_ctx(ctx) - min_weight = self.min_weight.data().as_in_ctx(ctx) - max_weight = self.max_weight.data().as_in_ctx(ctx) + device = x.device + weight = self.weight.data().to_device(device) + min_weight = self.min_weight.data().to_device(device) + max_weight = self.max_weight.data().to_device(device) out = npx.quantized_embedding(data=x, weight=weight, min_weight=min_weight, max_weight=max_weight, @@ -933,7 +933,7 @@ def check_quantized_bn(data_shape, qdtype): calib_data=calib_data, calib_mode='naive', num_calib_batches=1, - ctx=mx.current_context()) + device=mx.current_device()) output_int8_to_fp32 = quant_bn(data) @@ -958,7 +958,7 @@ def test_quantize_params(): params = {} for name in offline_params: params[name] = mx.nd.uniform(shape=(2, 2)) - qsym, _ = mx.contrib.quant._quantize_symbol(sym, ctx=mx.current_context(), + qsym, _ = mx.contrib.quant._quantize_symbol(sym, device=mx.current_device(), offline_params=offline_params, quantize_mode='full') qparams = mx.contrib.quant._quantize_params(qsym, params, min_max_dict = {}) param_names = params.keys() @@ -1096,7 +1096,7 @@ def check_quantize_model(qdtype): qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params, - ctx=mx.current_context(), + device=mx.current_device(), quantized_dtype=qdtype, calib_mode='none', quantize_mode='full') @@ -1108,7 +1108,7 @@ def check_quantize_model(qdtype): qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params, - ctx=mx.current_context(), + device=mx.current_device(), quantized_dtype=qdtype, calib_mode='naive', calib_data=calib_data, @@ -1136,7 +1136,7 @@ def check_quantize_model_multiple_inputs(qdtype): qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params, - ctx=mx.current_context(), + device=mx.current_device(), quantized_dtype=qdtype, calib_mode='none', quantize_mode='full') @@ -1149,7 +1149,7 @@ def check_quantize_model_multiple_inputs(qdtype): qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params, - ctx=mx.current_context(), + device=mx.current_device(), quantized_dtype=qdtype, calib_mode='naive', calib_data=calib_data, @@ -1180,9 +1180,9 @@ def check_quantize_net(qdtype): data_shape = (32, 3, 224, 224) batch_size = 1 resnet18_v1 = 
vision.resnet18_v1(pretrained=True) - resnet18_v1.reset_ctx(mx.current_context()) + resnet18_v1.reset_device(mx.current_device()) excluded_names_match = [] - if mx.current_context() == mx.gpu(): + if mx.current_device() == mx.gpu(): excluded_names_match += ['activation', 'relu', 'conv0'] num_calib_batches = 1 @@ -1194,7 +1194,7 @@ def check_quantize_net(qdtype): exclude_layers_match=excluded_names_match, calib_mode='none', data_shapes=[data_shape], - ctx=mx.current_context()) + device=mx.current_device()) quantized_resnet18_v1.hybridize(static_alloc=True, static_shape=True) quantized_resnet18_v1(random_data) @@ -1208,7 +1208,7 @@ def check_quantize_net(qdtype): calib_mode=mode, quantize_granularity=quantize_granularity, num_calib_batches=num_calib_batches, - ctx=mx.current_context()) + device=mx.current_device()) quantized_resnet18_v1.hybridize(static_alloc=True, static_shape=True) quantized_resnet18_v1(random_data) @@ -1236,7 +1236,7 @@ def get_fp32_sym(): sym = get_fp32_sym() offline_params = [name for name in sym.list_arguments() if not name.startswith('data') and not name.endswith('label')] - qsym, _ = mx.contrib.quant._quantize_symbol(sym, ctx=mx.current_context(), + qsym, _ = mx.contrib.quant._quantize_symbol(sym, device=mx.current_device(), offline_params=offline_params, quantize_mode='full') requantize_op_names = ['requantize_conv', 'requantize_fc'] min_max_dict = {'conv_output': (onp.random.uniform(low=100.0, high=200.0), onp.random.uniform(low=100.0, high=200.0)), @@ -1274,7 +1274,7 @@ def test_quantization_net_with_different_data_inputs_options(): quantized_net = mx.contrib.quant.quantize_net(net, quantized_dtype='auto', data_shapes=[data_shape], - ctx=mx.current_context()) + device=mx.current_device()) out = quantized_net(random_data) out.wait_to_read() @@ -1286,7 +1286,7 @@ def test_quantization_net_with_different_data_inputs_options(): quantized_net2 = mx.contrib.quant.quantize_net(net2, quantized_dtype='auto', data_shapes=[data_desc], - ctx=mx.current_context()) + device=mx.current_device()) out2 = quantized_net2(random_data) out2.wait_to_read() @@ -1298,7 +1298,7 @@ def test_quantization_net_with_different_data_inputs_options(): quantized_net3 = mx.contrib.quant.quantize_net(net3, quantized_dtype='auto', calib_data=data_loader, - ctx=mx.current_context()) + device=mx.current_device()) out3 = quantized_net3(random_data) out3.wait_to_read() diff --git a/tests/python/test_quantization_gpu.py b/tests/python/test_quantization_gpu.py index 0f14fa1ac961..a2501193475a 100644 --- a/tests/python/test_quantization_gpu.py +++ b/tests/python/test_quantization_gpu.py @@ -21,7 +21,7 @@ curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.insert(0, os.path.join(curr_path, '../quantization')) -from mxnet.test_utils import set_default_context +from mxnet.test_utils import set_default_device from test_quantization import * -set_default_context(mx.gpu(0)) +set_default_device(mx.gpu(0)) diff --git a/tests/python/unittest/common.py b/tests/python/unittest/common.py index 7566b4bbc7d2..d2dd8e378683 100644 --- a/tests/python/unittest/common.py +++ b/tests/python/unittest/common.py @@ -136,7 +136,7 @@ def test_new(*args, **kwargs): cuxx_off = os.getenv(cfg['TEST_OFF_ENV_VAR']) == 'true' cuxx_env_version = os.getenv(cfg['VERSION_ENV_VAR'], None if cuxx_off else cfg['DEFAULT_VERSION']) cuxx_test_disabled = cuxx_off or less_than(cuxx_env_version, min_version) - if not cuxx_test_disabled or mx.context.current_context().device_type == 'cpu': + if not cuxx_test_disabled or 
mx.device.current_device.device_type == 'cpu': orig_test(*args, **kwargs) else: pytest.raises((MXNetError, RuntimeError), orig_test, *args, **kwargs) diff --git a/tests/python/unittest/test_contrib_control_flow.py b/tests/python/unittest/test_contrib_control_flow.py index 20833dc115e1..7b6d7cb74849 100644 --- a/tests/python/unittest/test_contrib_control_flow.py +++ b/tests/python/unittest/test_contrib_control_flow.py @@ -101,7 +101,7 @@ def _zeros_like_dict(name_list): + ["LoopVar" + str(i) for i, _ in enumerate(loop_var_shapes) if i >= loop_var_start] args_grad = None if not is_train else _zeros_like_dict(x for x in args_names) executor = loop_result_sym._bind( - ctx=default_context(), + ctx=default_device(), args=_copy_args_dict(loop_result_sym.list_inputs()), args_grad=args_grad, ) @@ -758,7 +758,7 @@ def _get_sym_result(is_train, args, args_grad, out_grad): ] result_sym = mx.sym.Group(make_loop(i, j, x_sum, sc)) executor = result_sym._bind( - ctx=default_context(), + ctx=default_device(), args=args, args_grad=args_grad, ) @@ -840,7 +840,7 @@ def _get_symbolic_result(out_grads): outputs_sym = [x * 2 for x in outputs_sym] outputs_sym = mx.sym.Group(outputs_sym) executor = outputs_sym._bind( - ctx=default_context(), + ctx=default_device(), args={name: _args_dict[name].copy() for name in outputs_sym.list_inputs()}, args_grad=None if not is_train else _merge_dict( {"InputVar" + str(i): mx.nd.zeros(s) for i, s in enumerate(input_var_shapes)}, @@ -985,9 +985,9 @@ def verify_foreach(step, in_syms, state_syms, free_syms, i = i + 1 if is_train: - e = out._bind(ctx=default_context(), args=arg_dict, args_grad=arg_grad_dict) + e = out._bind(ctx=default_device(), args=arg_dict, args_grad=arg_grad_dict) else: - e = out._bind(ctx=default_context(), args=arg_dict) + e = out._bind(ctx=default_device(), args=arg_dict) # the inputs to forward and backward are the same so forward and backward # should always return the same outputs. 
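# --- Hedged illustration, not part of the patch: the binding pattern the control-flow
# --- tests above keep using. The legacy executor API retains its ctx= keyword, but the
# --- value handed to it is now whatever default_device() returns.
import mxnet as mx
from mxnet.test_utils import default_device

x = mx.sym.Variable('x')
y = 2 * x
exe = y._bind(ctx=default_device(), args={'x': mx.nd.ones((2, 2))})
print(exe.forward()[0])                 # a 2x2 NDArray of twos on the default device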
for _ in range(num_iters): @@ -1288,7 +1288,7 @@ def step_nd(in1, states): state = mx.nd.arange(2) data_grad = mx.nd.empty(data.shape) state_grad = mx.nd.empty(state.shape) - e = out._bind(ctx=default_context(), args={'v1':data, 'v2':state}, + e = out._bind(ctx=default_device(), args={'v1':data, 'v2':state}, args_grad={'v1':data_grad, 'v2':state_grad}) e.forward(is_train=True) out_grads = [] diff --git a/tests/python/unittest/test_contrib_operator.py b/tests/python/unittest/test_contrib_operator.py index 5f132f96c2de..970260befeec 100644 --- a/tests/python/unittest/test_contrib_operator.py +++ b/tests/python/unittest/test_contrib_operator.py @@ -44,7 +44,7 @@ def test_box_nms_backward(data, grad, expected, thresh=0.5, valid=0, topk=-1, co op = mx.contrib.sym.box_nms(in_var, overlap_thresh=thresh, valid_thresh=valid, topk=topk, coord_start=coord, score_index=score, id_index=cid, background_id=bid, force_suppress=force, in_format=in_format, out_format=out_format) - exe = op._bind(ctx=default_context(), args=[arr_data], args_grad=[arr_grad]) + exe = op._bind(ctx=default_device(), args=[arr_data], args_grad=[arr_grad]) exe.forward(is_train=True) exe.backward(mx.nd.array(grad)) assert_almost_equal(arr_grad.asnumpy(), expected) @@ -269,9 +269,9 @@ def assert_match(inputs, x, y, threshold, is_ascend=False): assert_match([[0.5, 0.6], [0.1, 0.2], [0.3, 0.4]], [-1, 0, 1], [1, 2], 100, True) def test_multibox_target_op(): - anchors = mx.nd.array([[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]], ctx=default_context()).reshape((1, -1, 4)) - cls_pred = mx.nd.array(list(range(10)), ctx=default_context()).reshape((1, -1, 2)) - label = mx.nd.array([1, 0.1, 0.1, 0.5, 0.6], ctx=default_context()).reshape((1, -1, 5)) + anchors = mx.nd.array([[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]], ctx=default_device()).reshape((1, -1, 4)) + cls_pred = mx.nd.array(list(range(10)), ctx=default_device()).reshape((1, -1, 2)) + label = mx.nd.array([1, 0.1, 0.1, 0.5, 0.6], ctx=default_device()).reshape((1, -1, 5)) loc_target, loc_mask, cls_target = \ mx.nd.contrib.MultiBoxTarget(anchors, label, cls_pred, @@ -354,7 +354,7 @@ def test_box_decode_op(): [0.13240421, 0.17859563, 0.93759584, 1.1174043 ]]]), atol=1e-5, rtol=1e-5) def test_op_mrcnn_mask_target(): - if default_context().device_type != 'gpu': + if default_device().device_type != 'gpu': return num_rois = 2 @@ -425,7 +425,7 @@ def dynamic_reshape_testcases(src_shape, shape_arg, dst_shape): args_grad = { 'data': mx.nd.empty(src_shape) } - exe = net._bind(default_context(), args, args_grad) + exe = net._bind(default_device(), args, args_grad) exe.forward(is_train=True) assert np.square(exe.outputs[0].asnumpy() - dat_npy.reshape(dst_shape)).mean() < 1E-7 exe.backward(out_grads=mx.nd.array(grad_npy)) diff --git a/tests/python/unittest/test_contrib_stes_op.py b/tests/python/unittest/test_contrib_stes_op.py index a5b38e7b5661..b3d8a143e16f 100644 --- a/tests/python/unittest/test_contrib_stes_op.py +++ b/tests/python/unittest/test_contrib_stes_op.py @@ -18,7 +18,7 @@ from common import xfail_when_nonstandard_decimal_separator import mxnet as mx from mxnet import nd, autograd, gluon -from mxnet.test_utils import default_context +from mxnet.test_utils import default_device @mx.util.use_np @@ -37,11 +37,11 @@ def expected_output(self, in_data, w_init): def forward(self, x): # Simple forward function: round_ste(w*x)*w - out = self.w.data(x.ctx) * x + out = self.w.data(x.device) * x out = mx.npx.round_ste(out) # Uncomment to see how test fails with round # out = F.round(out) - out = out 
* self.w.data(x.ctx) + out = out * self.w.data(x.device) return out @@ -61,25 +61,25 @@ def expected_output(self, in_data, w_init): def forward(self, x): # Simple forward function: sign_ste(w*x)*w - out = self.w.data(x.ctx) * x + out = self.w.data(x.device) * x out = mx.npx.sign_ste(out) # Uncomment to see how test fails with sign # out = F.sign(out) - out = out * self.w.data(x.ctx) + out = out * self.w.data(x.device) return out -def check_ste(net_type_str, w_init, hybridize, in_data, ctx=None): - ctx = ctx or default_context() +def check_ste(net_type_str, w_init, hybridize, in_data, device=None): + device = device or default_device() net = eval(net_type_str)(w_init=w_init) if hybridize: net.hybridize() # Init - net.initialize(mx.init.Constant([w_init]), ctx=ctx) + net.initialize(mx.init.Constant([w_init]), device=device) # Test: - in_data = in_data.as_in_context(ctx) + in_data = in_data.to_device(device) with mx.autograd.record(): out = net(in_data) assert all(out == net.expected_output(in_data, w_init)), net_type_str + " output is " + str(out) + ", but" + \ diff --git a/tests/python/unittest/test_deferred_compute.py b/tests/python/unittest/test_deferred_compute.py index dcdbd9bc9a86..b9d93a5e6a96 100644 --- a/tests/python/unittest/test_deferred_compute.py +++ b/tests/python/unittest/test_deferred_compute.py @@ -94,7 +94,7 @@ def _assert_dc(setup, compute, mode='all', setup_is_deterministic=True, numpy=Tr xs = setup(nd=nd) args = {name: x for name, x in zip(xs_names, xs)} - ys_sym = sym._bind(mx.context.current_context(), args=args).forward() + ys_sym = sym._bind(mx.device.current_device(), args=args).forward() ys_sym_np = [y.asnumpy() for y in ys_sym] _all_same(ys_np, ys_sym_np) @@ -272,10 +272,10 @@ def f(*, nd): a = nd.arange(10) if nd is mx.nd: b = a.as_in_context(mx.cpu(1)) - c = (b - 1).as_in_context(mx.context.current_context()) + c = (b - 1).as_in_context(mx.device.current_device()) else: - b = a.as_in_ctx(mx.cpu(1)) - c = (b - 1).as_in_ctx(mx.context.current_context()) + b = a.to_device(mx.cpu(1)) + c = (b - 1).to_device(mx.device.current_device()) return [c] _assert_dc(_dc_empty_setup, f) @@ -285,10 +285,10 @@ def test_dc_context_switch(): def f(a, *, nd): if nd is mx.nd: b = a.as_in_context(mx.cpu(1)) - c = (b - 1).as_in_context(mx.context.current_context()) + c = (b - 1).as_in_context(mx.device.current_device()) else: - b = a.as_in_ctx(mx.cpu(1)) - c = (b - 1).as_in_ctx(mx.context.current_context()) + b = a.to_device(mx.cpu(1)) + c = (b - 1).to_device(mx.device.current_device()) return [c] _assert_dc(_dc_simple_setup, f) @@ -371,7 +371,7 @@ def f(a, *, nd): ############################################################################### # Gluon ############################################################################### -def _assert_dc_gluon(setup, net, setup_is_deterministic=True, numpy=True, autograd=True, ctx=None): +def _assert_dc_gluon(setup, net, setup_is_deterministic=True, numpy=True, autograd=True, device=None): """Compare results of deferred compute and normal imperative mode. 
Parameters @@ -392,9 +392,9 @@ def _assert_dc_gluon(setup, net, setup_is_deterministic=True, numpy=True, autogr nd = mx.np if numpy else mx.nd - if ctx is None: - ctx = mx.context.current_context() - with ctx: + if device is None: + device = mx.device.current_device() + with device: xs = setup(nd=nd) ys = net(*xs) @@ -425,7 +425,7 @@ def _assert_dc_gluon(setup, net, setup_is_deterministic=True, numpy=True, autogr net.export(root) def _dc_gluon_simple_setup(shape=(8, 10), *, nd): - return [nd.ones(shape=shape, ctx=mx.context.current_context())] + return [nd.ones(shape=shape, device=mx.device.current_device())] def test_dc_hybridblock(): @@ -439,14 +439,14 @@ def forward(self, x): assert x.shape[1] == 10 # due to in_units=10 above return self.dense(x) + self.weight.data(x.context) - if mx.context.current_context() == mx.cpu(0): # CPU tests - contexts = [mx.cpu(0), mx.cpu(1)] - else: # Use default context, GPU tests - contexts = [mx.context.current_context()] - for ctx in contexts: + if mx.device.current_device() == mx.cpu(0): # CPU tests + devices = [mx.cpu(0), mx.cpu(1)] + else: # Use default device, GPU tests + devices = [mx.device.current_device()] + for device in devices: net = MyBlock() - net.initialize(ctx=contexts) - _assert_dc_gluon(_dc_gluon_simple_setup, net, numpy=True, ctx=ctx) + net.initialize(device=devices) + _assert_dc_gluon(_dc_gluon_simple_setup, net, numpy=True, device=device) def test_dc_hybridblock_wrapped(): @@ -478,7 +478,7 @@ def forward(self, x): net = MyBlock() net.initialize() - data = mx.np.ones(shape=(8, 10), ctx=mx.context.current_context()) + data = mx.np.ones(shape=(8, 10), device=mx.device.current_device()) with pytest.raises(RuntimeError): net(data) @@ -494,7 +494,7 @@ def infer_shape(self, x): self.weight.shape = (x.shape[1], ) def forward(self, x): - return self.dense(x) + self.weight.data(x.context) + return self.dense(x) + self.weight.data(x.device) net = MyBlock() net.initialize() diff --git a/tests/python/unittest/test_exc_handling.py b/tests/python/unittest/test_exc_handling.py index 9009f5f2265a..d44beb316770 100644 --- a/tests/python/unittest/test_exc_handling.py +++ b/tests/python/unittest/test_exc_handling.py @@ -21,7 +21,7 @@ from mxnet import gluon from mxnet.gluon import nn from mxnet.base import MXNetError -from mxnet.test_utils import assert_exception, default_context, set_default_context, use_np +from mxnet.test_utils import assert_exception, default_device, set_default_device, use_np import pytest mx.npx.reset_np() @@ -49,14 +49,14 @@ def symbolic(exec_backward=True, waitall=True): inputs = [x, y] out = mx.symbol.ElementWiseSum(*inputs, name="esum") out = mx.sym.dot(z, out) - out2 = mx.sym.random.normal(0, -1, x_shape, ctx=default_context()) + out2 = mx.sym.random.normal(0, -1, x_shape, ctx=default_device()) out = mx.sym.dot(out, out2) out = mx.sym.make_loss(out) - arr = {'x': mx.nd.random.normal(0, 1, x_shape, ctx=default_context()), - 'y': mx.nd.random.normal(0, 1, x_shape, ctx=default_context()), - 'z': mx.nd.random.normal(0, 1, z_shape, ctx=default_context())} + arr = {'x': mx.nd.random.normal(0, 1, x_shape, ctx=default_device()), + 'y': mx.nd.random.normal(0, 1, x_shape, ctx=default_device()), + 'z': mx.nd.random.normal(0, 1, z_shape, ctx=default_device())} arr_grad = {'x': mx.nd.empty(x_shape), 'y': mx.nd.empty(x_shape), 'z': mx.nd.empty(z_shape)} - exec1 = out._bind(ctx=default_context(), args=arr, args_grad=arr_grad) + exec1 = out._bind(ctx=default_device(), args=arr, args_grad=arr_grad) outputs = exec1.forward() if exec_backward: 
exec1.backward() @@ -84,7 +84,7 @@ def multiple_waits(waitall=False): # for vars with exceptions in same scope caught = False try: - a = mx.nd.random.normal(0, -1, (2, 2)).copyto(default_context()) + a = mx.nd.random.normal(0, -1, (2, 2)).copyto(default_device()) if waitall: mx.nd.waitall() else: @@ -93,7 +93,7 @@ def multiple_waits(waitall=False): caught = True assert caught, "No exception thrown, exception should be rethrown with wait_to_read/waitall" try: - b = mx.nd.random.normal(0, -1, (2, 2)).copyto(default_context()) + b = mx.nd.random.normal(0, -1, (2, 2)).copyto(default_device()) if waitall: mx.nd.waitall() else: @@ -111,7 +111,7 @@ def test_exc_post_fail(): def post_fail(waitall=False): caught = False try: - a, b = mx.nd.random_normal(0, -1, (2, 2)).copyto(default_context()) + a, b = mx.nd.random_normal(0, -1, (2, 2)).copyto(default_device()) if waitall: mx.nd.waitall() else: @@ -125,7 +125,7 @@ def post_fail(waitall=False): def test_exc_mutable_var_fail(): def mutable_var_check(waitall=False): - a, b = mx.nd.random_normal(0, -1, (2, 2)).copyto(default_context()) + a, b = mx.nd.random_normal(0, -1, (2, 2)).copyto(default_device()) a = mx.nd.dot(a, a) if waitall: mx.nd.waitall() @@ -137,7 +137,7 @@ def mutable_var_check(waitall=False): def test_multiple_waitalls(): caught = False try: - a = mx.nd.random.normal(0, -1, (2, 2)).copyto(default_context()) + a = mx.nd.random.normal(0, -1, (2, 2)).copyto(default_device()) mx.nd.waitall() except MXNetError: caught = True @@ -150,7 +150,7 @@ def run_training_iteration(data): net = gluon.nn.HybridSequential() net.add(gluon.nn.Dense(10)) - ctx = default_context() + ctx = default_device() net.initialize(mx.init.Xavier(), ctx=ctx) data = mx.nd.ones((3, 4)) mx.profiler.set_state("run") diff --git a/tests/python/unittest/test_extensions.py b/tests/python/unittest/test_extensions.py index c11f6f9075b0..2c4ac9d4b30c 100644 --- a/tests/python/unittest/test_extensions.py +++ b/tests/python/unittest/test_extensions.py @@ -24,7 +24,7 @@ from mxnet import nd from mxnet.gluon import nn from mxnet.base import MXNetError -from mxnet.test_utils import download, is_cd_run, assert_almost_equal, default_context +from mxnet.test_utils import download, is_cd_run, assert_almost_equal, default_device import pytest base_path = os.path.join(os.path.dirname(__file__), "../../..") diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index d34519c332cc..0d40a3196d46 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -23,7 +23,7 @@ from mxnet import init from mxnet.gluon import nn from mxnet.base import py_str, MXNetError -from mxnet.test_utils import assert_almost_equal, default_context, assert_allclose +from mxnet.test_utils import assert_almost_equal, default_device, assert_allclose from mxnet.util import is_np_array from mxnet.ndarray.ndarray import _STORAGE_TYPE_STR_TO_ID from mxnet.test_utils import use_np @@ -42,7 +42,7 @@ def test_parameter(): p = gluon.Parameter('weight', shape=(10, 10)) - p.initialize(init='xavier', ctx=[mx.cpu(0), mx.cpu(1)]) + p.initialize(init='xavier', device=[mx.cpu(0), mx.cpu(1)]) assert len(p.list_data()) == 2 assert len(p.list_grad()) == 2 assert p.data(mx.cpu(1)).context == mx.cpu(1) @@ -50,8 +50,8 @@ def test_parameter(): assert p.grad(mx.cpu(0)).stype == 'default' assert p.data(mx.cpu(0)).stype == 'default' - p.reset_ctx(ctx=[mx.cpu(1), mx.cpu(2)]) - assert p.list_ctx() == [mx.cpu(1), mx.cpu(2)] + p.reset_device(device=[mx.cpu(1), mx.cpu(2)]) + assert 
p.list_device() == [mx.cpu(1), mx.cpu(2)] def test_invalid_parameter_stype(): with pytest.raises(AssertionError): @@ -63,8 +63,8 @@ def test_invalid_parameter_grad_stype(): def test_sparse_parameter(): p = gluon.Parameter('weight', shape=(10, 10), stype='row_sparse', grad_stype='row_sparse') - p.initialize(init='xavier', ctx=[mx.cpu(0), mx.cpu(1)]) - row_id = mx.np.arange(0, 10, ctx=mx.cpu(1)) + p.initialize(init='xavier', device=[mx.cpu(0), mx.cpu(1)]) + row_id = mx.np.arange(0, 10, device=mx.cpu(1)) assert len(p.list_grad()) == 2 # getting row_sparse data without trainer throws an exception assertRaises(RuntimeError, p.list_row_sparse_data, row_id) @@ -77,19 +77,19 @@ def test_sparse_parameter(): assert p.var().attr('__storage_type__') == str(_STORAGE_TYPE_STR_TO_ID['row_sparse']) assert p.grad(mx.cpu(0)).stype == 'row_sparse' - p.reset_ctx(ctx=[mx.cpu(1), mx.cpu(2)]) - assert p.list_ctx() == [mx.cpu(1), mx.cpu(2)] + p.reset_device(device=[mx.cpu(1), mx.cpu(2)]) + assert p.list_device() == [mx.cpu(1), mx.cpu(2)] def test_parameter_invalid_access(): # cannot call data on row_sparse parameters p0 = gluon.Parameter('weight', shape=(10, 10), stype='row_sparse', grad_stype='row_sparse') - p0.initialize(init='xavier', ctx=[mx.cpu(0), mx.cpu(1)]) + p0.initialize(init='xavier', device=[mx.cpu(0), mx.cpu(1)]) assertRaises(RuntimeError, p0.data) assertRaises(RuntimeError, p0.list_data) row_id = mx.np.arange(0, 10) # cannot call row_sparse_data on dense parameters p1 = gluon.Parameter('weight', shape=(10, 10)) - p1.initialize(init='xavier', ctx=[mx.cpu(0), mx.cpu(1)]) + p1.initialize(init='xavier', device=[mx.cpu(0), mx.cpu(1)]) assertRaises(RuntimeError, p1.row_sparse_data, row_id.copyto(mx.cpu(0))) assertRaises(RuntimeError, p1.list_row_sparse_data, row_id) @@ -375,8 +375,8 @@ def forward(self, a, b): pytest.raises(TypeError, lambda: foo_hybrid(mx.np.ones((10,)), mx.sym.var('a'))) foo_hybrid = FooHybrid() foo_hybrid.hybridize() - pytest.raises(ValueError, lambda: foo_hybrid(mx.np.ones((10,), ctx=mx.cpu(1)), - mx.np.ones((10,), ctx=mx.cpu(2)))) + pytest.raises(ValueError, lambda: foo_hybrid(mx.np.ones((10,), device=mx.cpu(1)), + mx.np.ones((10,), device=mx.cpu(2)))) def check_layer_forward(layer, dshape): @@ -425,7 +425,7 @@ def test_conv(layer, shape): (nn.Conv2D(16, (3, 3), layout='NHWC', in_channels=4), (1, 10, 10, 4)), # (nn.Conv3D(16, (3, 3, 3), layout='NDHWC', in_channels=4), (1, 10, 10, 10, 4)), ]) -@pytest.mark.skipif(mx.context.current_context().device_type!='gpu' or +@pytest.mark.skipif(mx.device.current_device().device_type!='gpu' or not mx.runtime.Features().is_enabled('CUDNN'), reason='nhwc/ndhwc layout is only supported with CUDNN.') def test_conv_nhwc(layer, shape): @@ -449,7 +449,7 @@ def test_conv_nhwc(layer, shape): (nn.Conv3DTranspose(16, (3, 3, 3), padding=4, in_channels=4), (1, 4, 10, 10, 10)), ]) def test_deconv(layer, shape): - if len(shape) == 5 and mx.current_context().device_type == 'gpu': + if len(shape) == 5 and mx.current_device().device_type == 'gpu': pytest.skip('Skipping Conv3DTranspose tests for GPU') check_layer_forward(layer, shape) @@ -566,13 +566,13 @@ def test_batchnorm_backward_synchronization(variable): Tests if synchronization of BatchNorm running variables is done correctly. If not, the test sometimes fails - depending on the timing. 
""" - ctx = mx.test_utils.default_context() + device = mx.test_utils.to_device() for _ in range(20): layer = nn.BatchNorm() - layer.initialize(ctx=ctx) + layer.initialize(device=device) for _ in range(3): - data = mx.np.random.normal(loc=10, scale=2, size=(1, 3, 10, 10), ctx=ctx) + data = mx.np.random.normal(loc=10, scale=2, size=(1, 3, 10, 10), device=device) with mx.autograd.record(): out = layer(data) out.backward() @@ -604,35 +604,35 @@ def _find_bn(module): raise RuntimeError('BN not found') - def _syncParameters(bn1, bn2, ctx): - ctx = input.context - bn2.gamma.set_data(bn1.gamma.data(ctx)) - bn2.beta.set_data(bn1.beta.data(ctx)) - bn2.running_mean.set_data(bn1.running_mean.data(ctx)) - bn2.running_var.set_data(bn1.running_var.data(ctx)) + def _syncParameters(bn1, bn2, device): + device = input.context + bn2.gamma.set_data(bn1.gamma.data(device)) + bn2.beta.set_data(bn1.beta.data(device)) + bn2.running_mean.set_data(bn1.running_mean.data(device)) + bn2.running_var.set_data(bn1.running_var.data(device)) input1 = input.copy() input2 = input.copy() if cuda: input1 = input.as_in_context(mx.gpu(0)) - ctx_list = [mx.gpu(i) for i in range(num_devices)] + device_list = [mx.gpu(i) for i in range(num_devices)] else: - ctx_list = [mx.cpu(0) for _ in range(num_devices)] + device_list = [mx.cpu(0) for _ in range(num_devices)] nch = input.shape[1] if input.ndim > 1 else 1 bn1 = mx.gluon.nn.BatchNorm(in_channels=nch) bn2 = mx.gluon.nn.SyncBatchNorm( in_channels=nch, num_devices=num_devices) - bn1.initialize(ctx=ctx_list[0]) - bn2.initialize(ctx=ctx_list) + bn1.initialize(device=device_list[0]) + bn2.initialize(device=device_list) # using the same values for gamma and beta - #_syncParameters(_find_bn(bn1), _find_bn(bn2), ctx_list[0]) + #_syncParameters(_find_bn(bn1), _find_bn(bn2), device_list[0]) input1.attach_grad() - inputs2 = split_and_load(input2, ctx_list, batch_axis=0) + inputs2 = split_and_load(input2, device_list, batch_axis=0) for xi in inputs2: xi.attach_grad() @@ -652,8 +652,8 @@ def _syncParameters(bn1, bn2, ctx): epsilon = 1e-5 axis = 1 data = input1 - running_mean = mx.np.zeros(nch, ctx=data.context) - running_var = mx.np.ones(nch, ctx=data.context) + running_mean = mx.np.zeros(nch, device=data.context) + running_var = mx.np.ones(nch, device=data.context) axes = list(range(data.ndim)) del axes[axis] @@ -675,10 +675,10 @@ def _syncParameters(bn1, bn2, ctx): rtol = 1e-2 assert_almost_equal(output1.asnumpy(), target_output.asnumpy(), atol=atol, rtol=rtol) - assert_almost_equal(_find_bn(bn1).running_mean.data(ctx_list[0]).asnumpy(), + assert_almost_equal(_find_bn(bn1).running_mean.data(device_list[0]).asnumpy(), running_mean.asnumpy(), atol=atol, rtol=rtol) - assert_almost_equal(_find_bn(bn1).running_var.data(ctx_list[0]).asnumpy(), + assert_almost_equal(_find_bn(bn1).running_var.data(device_list[0]).asnumpy(), running_var.asnumpy(), atol=atol, rtol=rtol) # assert forwarding @@ -686,19 +686,19 @@ def _syncParameters(bn1, bn2, ctx): atol=atol, rtol=rtol) assert_almost_equal(output1.asnumpy(), output2.asnumpy(), atol=atol, rtol=rtol) - assert_almost_equal(_find_bn(bn1).running_mean.data(ctx_list[0]).asnumpy(), - _find_bn(bn2).running_mean.data(ctx_list[0]).asnumpy(), + assert_almost_equal(_find_bn(bn1).running_mean.data(device_list[0]).asnumpy(), + _find_bn(bn2).running_mean.data(device_list[0]).asnumpy(), atol=atol, rtol=rtol) - assert_almost_equal(_find_bn(bn1).running_var.data(ctx_list[0]).asnumpy(), - _find_bn(bn2).running_var.data(ctx_list[0]).asnumpy(), + 
assert_almost_equal(_find_bn(bn1).running_var.data(device_list[0]).asnumpy(), + _find_bn(bn2).running_var.data(device_list[0]).asnumpy(), atol=atol, rtol=rtol) input2grad = mx.np.concatenate( - [output.grad.as_in_context(input.ctx) for output in inputs2], axis=0) + [output.grad.as_in_context(input.device) for output in inputs2], axis=0) assert_almost_equal(input1.grad.asnumpy(), input2grad.asnumpy(), atol=atol, rtol=rtol) cfgs = [(1, False)] - num_gpus = 0 if default_context().device_type != 'gpu' else mx.context.num_gpus() + num_gpus = 0 if default_device().device_type != 'gpu' else mx.context.num_gpus() batch_size = 24 for i in range(1, num_gpus + 1): if batch_size % i == 0: @@ -709,7 +709,7 @@ def _syncParameters(bn1, bn2, ctx): print(str((ndev, cuda, shape))) for _ in range(10): _check_batchnorm_result(mx.np.random.uniform(size=shape, - ctx=mx.cpu(0)), + device=mx.cpu(0)), num_devices=ndev, cuda=cuda) @@ -1016,9 +1016,9 @@ def check_embedding_large_input(): def test_export(tmpdir): tmpfile = os.path.join(str(tmpdir), 'gluon') - ctx = mx.context.current_context() + device = mx.device.current_device() model = gluon.model_zoo.vision.resnet18_v1( - ctx=ctx, pretrained=False) + device=device, pretrained=False) model.initialize() model.hybridize() data = mx.np.random.normal(size=(1, 3, 32, 32)) @@ -1030,9 +1030,9 @@ def test_export(tmpdir): @use_np def test_import(): - ctx = mx.context.current_context() + device = mx.device.current_device() net1 = gluon.model_zoo.vision.resnet18_v1( - ctx=ctx, pretrained=False) + device=device, pretrained=False) net1.initialize() net1.hybridize() data = mx.np.random.normal(size=(1, 3, 32, 32)) @@ -1041,7 +1041,7 @@ def test_import(): net1.export('net1', epoch=1) net2 = gluon.SymbolBlock.imports( - 'net1-symbol.json', ['data'], 'net1-0001.params', ctx) + 'net1-symbol.json', ['data'], 'net1-0001.params', device) out2 = net2(data) lines = str(net2).splitlines() @@ -1151,15 +1151,15 @@ def forward(self, x): mx.npx.waitall() def test_fill_shape_load(): - ctx = mx.context.current_context() + device = mx.device.current_device() net1 = nn.HybridSequential() net1.add(nn.Conv2D(64, kernel_size=2, padding=1), nn.BatchNorm(), nn.Dense(10)) net1 net1.hybridize() - net1.initialize(ctx=ctx) - net1(mx.np.ones((2,3,5,7), ctx=ctx)) + net1.initialize(device=device) + net1(mx.np.ones((2,3,5,7), device=device)) net1.save_parameters('net_fill.params') net2 = nn.HybridSequential() @@ -1168,7 +1168,7 @@ def test_fill_shape_load(): nn.Dense(10)) net2.hybridize() net2.initialize() - net2.load_parameters('net_fill.params', ctx) + net2.load_parameters('net_fill.params', device) assert net2[0].weight.shape[1] == 3, net2[0].weight.shape[1] assert net2[1].gamma.shape[0] == 64, net2[1].gamma.shape[0] assert net2[2].weight.shape[1] == 3072, net2[2].weight.shape[1] @@ -1362,7 +1362,7 @@ def forward(self, x): x = self.encoders[i](x) return x net = Network() - net.initialize(mx.init.Xavier(), ctx=mx.cpu()) + net.initialize(mx.init.Xavier(), device=mx.cpu()) net.hybridize() x = onp.random.rand(32, 10, 10) x = mx.np.array(x).as_in_context(mx.cpu()) @@ -1416,17 +1416,17 @@ def __init__(self, b1, b2): def test_hybrid_multi_context(): net = mx.gluon.model_zoo.vision.get_resnet(1, 18) - net.initialize(ctx=[mx.cpu(0), mx.cpu(1)]) + net.initialize(device=[mx.cpu(0), mx.cpu(1)]) net.hybridize() - net(mx.np.zeros((1, 3, 32, 32), ctx=mx.cpu(0))).asnumpy() + net(mx.np.zeros((1, 3, 32, 32), device=mx.cpu(0))).asnumpy() def test_zero_grad(): - def _test_grad_reset(ctx, dtype='float32', sparse=False, 
embeddingType=None): - data = mx.np.random.uniform(size=(3,3), dtype=dtype, ctx=ctx) + def _test_grad_reset(device, dtype='float32', sparse=False, embeddingType=None): + data = mx.np.random.uniform(size=(3,3), dtype=dtype, device=device) if embeddingType is None: embeddingType = dtype net = nn.Embedding(3, 4, sparse_grad=sparse, dtype=embeddingType) - net.initialize(ctx=ctx) + net.initialize(device=device) with mx.autograd.record(): l = net(data) l.backward() @@ -1434,7 +1434,7 @@ def _test_grad_reset(ctx, dtype='float32', sparse=False, embeddingType=None): grad = net.collect_params()['weight'].grad() assert_almost_equal(grad.asnumpy(), grad.asnumpy() * 0) - def _test_multi_reset(nArrays, dtype, ctx): + def _test_multi_reset(nArrays, dtype, device): # Construct the list of non-zeros arrays with random shapes arr = [] for _ in range(nArrays): @@ -1442,7 +1442,7 @@ def _test_multi_reset(nArrays, dtype, ctx): shape = () for _ in range(onp.random.randint(1, 5)): shape = shape + (onp.random.randint(1, 10),) - arr.append(mx.nd.random.uniform(shape=shape, dtype=arrType, ctx=ctx)) + arr.append(mx.nd.random.uniform(shape=shape, dtype=arrType, device=device)) # Reset all arrays mx.nd.reset_arrays(*arr, num_arrays=len(arr)) @@ -1454,13 +1454,13 @@ def _test_multi_reset(nArrays, dtype, ctx): # Setting context for current test - ctx = mx.context.current_context() + device = mx.device.current_device() # Launching _test_multi_reset 10 times with different types & randomly chosen nArrays testedTypes = ['float16', 'float32', 'float64'] for _ in range(10): for type in [testedTypes] + testedTypes: - _test_multi_reset(onp.random.randint(1, 50), type, ctx) + _test_multi_reset(onp.random.randint(1, 50), type, device) with environment('MXNET_STORAGE_FALLBACK_LOG_VERBOSE', '0'): for type in ['float16', 'float32', 'float64']: @@ -1477,7 +1477,7 @@ def test_hybrid_static_memory(static_alloc, static_shape): x.attach_grad() net = gluon.model_zoo.vision.get_resnet( - 1, 18, pretrained=False, ctx=mx.context.current_context()) + 1, 18, pretrained=False, device=mx.device.current_device()) net.initialize() net(x) @@ -1505,7 +1505,7 @@ def test_hybrid_static_memory_switching(static_alloc, static_shape): if static_shape and not static_alloc: pytest.skip() net = gluon.model_zoo.vision.get_resnet( - 1, 18, pretrained=False, ctx=mx.context.current_context()) + 1, 18, pretrained=False, device=mx.device.current_device()) net.initialize() net.hybridize(static_alloc=static_alloc, static_shape=static_shape) @@ -1720,7 +1720,7 @@ def forward(self, x): def test_hybrid_static_memory_recording(): net = gluon.model_zoo.vision.get_resnet( - 1, 18, pretrained=False, ctx=mx.context.current_context()) + 1, 18, pretrained=False, device=mx.device.current_device()) net.initialize() net.hybridize(static_alloc=True) @@ -2973,8 +2973,8 @@ def test_DeformableConvolution(): currently this layer only supports gpu """ try: - ctx = mx.gpu() - _ = mx.np.array([0], ctx=ctx) + device = mx.gpu() + _ = mx.np.array([0], device=device) except mx.base.MXNetError: pytest.skip("deformable_convolution only supports GPU") net = nn.HybridSequential() @@ -2992,10 +2992,10 @@ def test_DeformableConvolution(): nn.DeformableConvolution(12, kernel_size=(3, 2), strides=1, padding=0, use_bias=False, num_deformable_group=4), ) - net.initialize(force_reinit=True, ctx=ctx) + net.initialize(force_reinit=True, device=device) net.hybridize() - x = mx.np.random.uniform(size=(8, 5, 30, 31), ctx=ctx) + x = mx.np.random.uniform(size=(8, 5, 30, 31), device=device) with 
mx.autograd.record(): y = net(x) y.backward() @@ -3023,11 +3023,11 @@ def test_ModulatedDeformableConvolution(): nn.DeformableConvolution(12, kernel_size=(3, 2), strides=1, padding=0, use_bias=False, num_deformable_group=4), ) - ctx = default_context() - net.initialize(force_reinit=True, ctx=ctx) + device = default_device() + net.initialize(force_reinit=True, device=device) net.hybridize() - x = mx.np.random.uniform(size=(8, 5, 30, 31), ctx=ctx) + x = mx.np.random.uniform(size=(8, 5, 30, 31), device=device) with mx.autograd.record(): y = net(x) diff --git a/tests/python/unittest/test_gluon_control_flow.py b/tests/python/unittest/test_gluon_control_flow.py index 9eaa33cbae6e..717dab1965a5 100644 --- a/tests/python/unittest/test_gluon_control_flow.py +++ b/tests/python/unittest/test_gluon_control_flow.py @@ -156,14 +156,14 @@ def step2(data, states): data = mx.np.random.normal(loc=0, scale=1, size=(5, 10)) states = mx.np.random.normal(loc=0, scale=1, size=(10)) layer = TestLayer() - layer.initialize(ctx=default_context()) + layer.initialize(device=default_device()) res1 = layer(data, [states]) with mx.autograd.record(): res1 = layer(data, [states]) layer = TestLayer() - layer.initialize(ctx=default_context()) + layer.initialize(device=default_device()) layer.hybridize() res2 = layer(data, [states]) @@ -243,14 +243,14 @@ def step2(state1, state2): states = mx.np.random.normal(loc=0, scale=1, size=(5)) for TestLayer in TestLayers: layer = TestLayer() - layer.initialize(ctx=default_context()) + layer.initialize(device=default_device()) res1 = layer(data, [states]) with mx.autograd.record(): res1 = layer(data, [states]) layer = TestLayer() - layer.initialize(ctx=default_context()) + layer.initialize(device=default_device()) layer.hybridize() res2 = layer(data, [states]) @@ -280,12 +280,12 @@ def forward(self, data): return data2[0] data = mx.np.random.normal(loc=0, scale=1, size=(1, )) layer = TestLayer() - layer.initialize(ctx=default_context()) + layer.initialize(device=default_device()) res1 = layer(data) with mx.autograd.record(): res1 = layer(data) layer = TestLayer() - layer.initialize(ctx=default_context()) + layer.initialize(device=default_device()) layer.hybridize() res2 = layer(data) with mx.autograd.record(): @@ -314,12 +314,12 @@ def forward(self, data): return data2 data = mx.np.random.normal(loc=0, scale=1, size=(1, )) layer = TestLayer() - layer.initialize(ctx=default_context()) + layer.initialize(device=default_device()) res1 = layer(data) with mx.autograd.record(): res1 = layer(data) layer = TestLayer() - layer.initialize(ctx=default_context()) + layer.initialize(device=default_device()) layer.hybridize() res2 = layer(data) with mx.autograd.record(): @@ -357,9 +357,9 @@ def step4(data, state): state = mx.np.random.normal(loc=0, scale=1, size=(2)) for step in steps: layer1 = TestLayer1(step) - layer1.initialize(ctx=default_context()) + layer1.initialize(device=default_device()) layer2 = TestLayer1(step) - layer2.initialize(ctx=default_context()) + layer2.initialize(device=default_device()) layer2.hybridize() out1, state1 = layer1(data, [state]) out2, state2 = layer2(data, [state]) @@ -378,9 +378,9 @@ def step4(data, state): assert_almost_equal(state1[i].asnumpy(), state2[i].asnumpy(), rtol=0.001, atol=0.0001) layer1 = TestLayer1(step) - layer1.initialize(ctx=default_context()) + layer1.initialize(device=default_device()) layer2 = TestLayer1(step) - layer2.initialize(ctx=default_context()) + layer2.initialize(device=default_device()) layer2.hybridize() out1, state1 = layer1(data, 
state) out2, state2 = layer2(data, state) @@ -401,9 +401,9 @@ def step4(data, state): if step == step3: continue layer1 = TestLayer1(step) - layer1.initialize(ctx=default_context()) + layer1.initialize(device=default_device()) layer2 = TestLayer1(step) - layer2.initialize(ctx=default_context()) + layer2.initialize(device=default_device()) layer2.hybridize() out1, state1 = layer1(data, [state, [state + 1]]) out2, state2 = layer2(data, [state, [state + 1]]) @@ -465,9 +465,9 @@ def step3(state): state = mx.np.random.normal(loc=0, scale=1, size=(2)) for step in steps: layer1 = TestLayer1(step, False) - layer1.initialize(ctx=default_context()) + layer1.initialize(device=default_device()) layer2 = TestLayer1(step, False) - layer2.initialize(ctx=default_context()) + layer2.initialize(device=default_device()) layer2.hybridize() out1, state1 = layer1(state) out2, state2 = layer2(state) @@ -483,9 +483,9 @@ def step3(state): assert_almost_equal(state1[i].asnumpy(), state2[i].asnumpy(), rtol=0.001, atol=0.0001) layer1 = TestLayer1(step, True) - layer1.initialize(ctx=default_context()) + layer1.initialize(device=default_device()) layer2 = TestLayer1(step, True) - layer2.initialize(ctx=default_context()) + layer2.initialize(device=default_device()) layer2.hybridize() out1, state1 = layer1(state) out2, state2 = layer2(state) @@ -519,9 +519,9 @@ def step6(state, state2): steps = [step4, step5, step6] for step in steps: layer1 = TestLayer1(step, False, True) - layer1.initialize(ctx=default_context()) + layer1.initialize(device=default_device()) layer2 = TestLayer1(step, False, True) - layer2.initialize(ctx=default_context()) + layer2.initialize(device=default_device()) layer2.hybridize() out1, state1 = layer1(state) out2, state2 = layer2(state) @@ -564,9 +564,9 @@ def func3(data): data = mx.np.random.normal(loc=0, scale=1, size=(2)) for func in funcs: layer1 = TestLayer1(func) - layer1.initialize(ctx=default_context()) + layer1.initialize(device=default_device()) layer2 = TestLayer1(func) - layer2.initialize(ctx=default_context()) + layer2.initialize(device=default_device()) layer2.hybridize() out1 = layer1(data) out2 = layer2(data) @@ -613,11 +613,11 @@ def forward(self, data): data = mx.np.random.normal(loc=0, scale=1, size=(1, )) with AttrScope(__subgraph_name__="my_cond"): block1 = TestBlock1() - block1.initialize(ctx=default_context()) + block1.initialize(device=default_device()) block1.hybridize() _ = block1(data) block2 = TestBlock2() - block2.initialize(ctx=default_context()) + block2.initialize(device=default_device()) block2.hybridize() _ = block2(data) assert len(AttrScope._subgraph_names) == 3 @@ -647,7 +647,7 @@ def check_rnn(cell_type, num_states): states = [mx.np.random.normal(loc=0, scale=1, size=state_shape) for i in range(num_states)] layer = RNNLayer(cell_type, hidden_size) layer.infer_shape(rnn_data) - layer.initialize(ctx=default_context()) + layer.initialize(device=default_device()) res1 = layer(rnn_data, states) params1 = layer.collect_params() orig_params1 = copy.deepcopy(params1) @@ -666,7 +666,7 @@ def check_rnn(cell_type, num_states): for config in configs: layer = RNNLayer(cell_type, hidden_size) layer.infer_shape(rnn_data) - layer.initialize(ctx=default_context()) + layer.initialize(device=default_device()) layer.hybridize(**config) res2 = layer(rnn_data, states) params2 = layer.collect_params() diff --git a/tests/python/unittest/test_gluon_rnn.py b/tests/python/unittest/test_gluon_rnn.py index 6af620969ff8..74aa7be2351b 100644 --- a/tests/python/unittest/test_gluon_rnn.py +++ 
b/tests/python/unittest/test_gluon_rnn.py @@ -22,7 +22,7 @@ from functools import partial from numpy.testing import assert_allclose import pytest -from mxnet.test_utils import almost_equal, assert_almost_equal, default_context +from mxnet.test_utils import almost_equal, assert_almost_equal, default_device from common import assert_raises_cudnn_not_satisfied, retry @@ -520,7 +520,7 @@ def infer_shape(self, input): self.cell.infer_shape(0, input, False) for hybrid in [RNNLayer(), LSTMLayer(), GRULayer()]: - input = mx.np.ones(shape=(1, 2, 1), ctx=mx.context.current_context()) + input = mx.np.ones(shape=(1, 2, 1), device=mx.device.current_device()) hybrid.infer_shape(input) hybrid.initialize() hybrid.hybridize() @@ -530,21 +530,21 @@ def infer_shape(self, input): symbol_file="./model-symbol.json", input_names=["data"], param_file="./model-0000.params", - ctx=mx.context.current_context() + device=mx.device.current_device() ) output2 = symbol(input) assert_almost_equal(output1.asnumpy(), output2.asnumpy()) -def check_rnn_layer_forward(layer, inputs, states=None, run_only=False, ctx=mx.cpu()): - layer.initialize(ctx=ctx) - inputs = inputs.as_in_context(ctx) +def check_rnn_layer_forward(layer, inputs, states=None, run_only=False, device=mx.cpu()): + layer.initialize(device=device) + inputs = inputs.to_device(device) inputs.attach_grad() if states is not None: if isinstance(states, (list, tuple)): - states = [s.as_in_context(ctx) for s in states] + states = [s.to_device(device) for s in states] else: - states = states.as_in_context(ctx) + states = states.to_device(device) with mx.autograd.record(): if states is None: out = layer(inputs) @@ -583,39 +583,39 @@ def check_rnn_layer_forward(layer, inputs, states=None, run_only=False, ctx=mx.c @mx.util.use_np -def run_rnn_layers(dtype, dtype2, ctx=mx.cpu()): +def run_rnn_layers(dtype, dtype2, device=mx.cpu()): - check_rnn_layer_forward(gluon.rnn.RNN(10, 2, dtype=dtype), mx.np.ones((8, 3, 20), dtype=dtype), ctx=ctx) - check_rnn_layer_forward(gluon.rnn.RNN(10, 2, dtype=dtype, bidirectional=True), mx.np.ones((8, 3, 20), dtype=dtype), mx.np.ones((4, 3, 10), dtype=dtype), ctx=ctx) - check_rnn_layer_forward(gluon.rnn.LSTM(10, 2,dtype=dtype), mx.np.ones((8, 3, 20), dtype=dtype), ctx=ctx) - check_rnn_layer_forward(gluon.rnn.LSTM(10, 2,dtype=dtype, bidirectional=True), mx.np.ones((8, 3, 20), dtype=dtype), [mx.np.ones((4, 3, 10), dtype=dtype), mx.np.ones((4, 3, 10), dtype=dtype)],ctx=ctx) - check_rnn_layer_forward(gluon.rnn.GRU(10, 2, dtype=dtype, ), mx.np.ones((8, 3, 20), dtype=dtype),ctx=ctx) - check_rnn_layer_forward(gluon.rnn.GRU(10, 2, dtype=dtype, bidirectional=True), mx.np.ones((8, 3, 20), dtype=dtype), mx.np.ones((4, 3, 10), dtype=dtype),ctx=ctx) + check_rnn_layer_forward(gluon.rnn.RNN(10, 2, dtype=dtype), mx.np.ones((8, 3, 20), dtype=dtype), device=device) + check_rnn_layer_forward(gluon.rnn.RNN(10, 2, dtype=dtype, bidirectional=True), mx.np.ones((8, 3, 20), dtype=dtype), mx.np.ones((4, 3, 10), dtype=dtype), device=device) + check_rnn_layer_forward(gluon.rnn.LSTM(10, 2,dtype=dtype), mx.np.ones((8, 3, 20), dtype=dtype), device=device) + check_rnn_layer_forward(gluon.rnn.LSTM(10, 2,dtype=dtype, bidirectional=True), mx.np.ones((8, 3, 20), dtype=dtype), [mx.np.ones((4, 3, 10), dtype=dtype), mx.np.ones((4, 3, 10), dtype=dtype)],device=device) + check_rnn_layer_forward(gluon.rnn.GRU(10, 2, dtype=dtype, ), mx.np.ones((8, 3, 20), dtype=dtype),device=device) + check_rnn_layer_forward(gluon.rnn.GRU(10, 2, dtype=dtype, bidirectional=True), mx.np.ones((8, 3, 20), 
dtype=dtype), mx.np.ones((4, 3, 10), dtype=dtype),device=device) check_rnn_layer_forward(gluon.rnn.RNN(10, 2, dtype=dtype, dropout=0.5), mx.np.ones((8, 3, 20), dtype=dtype), - run_only=True, ctx=ctx) + run_only=True, device=device) check_rnn_layer_forward(gluon.rnn.RNN(10, 2, bidirectional=True, dropout=0.5, dtype=dtype), - mx.np.ones((8, 3, 20), dtype=dtype), mx.np.ones((4, 3, 10), dtype=dtype), run_only=True, ctx=ctx) + mx.np.ones((8, 3, 20), dtype=dtype), mx.np.ones((4, 3, 10), dtype=dtype), run_only=True, device=device) check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, dropout=0.5, dtype=dtype), mx.np.ones((8, 3, 20), dtype=dtype), - run_only=True, ctx=ctx) + run_only=True, device=device) check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, bidirectional=True, dropout=0.5, dtype=dtype), mx.np.ones((8, 3, 20), dtype=dtype), - [mx.np.ones((4, 3, 10), dtype=dtype), mx.np.ones((4, 3, 10), dtype=dtype)], run_only=True, ctx=ctx) + [mx.np.ones((4, 3, 10), dtype=dtype), mx.np.ones((4, 3, 10), dtype=dtype)], run_only=True, device=device) check_rnn_layer_forward(gluon.rnn.GRU(10, 2, dropout=0.5, dtype=dtype), mx.np.ones((8, 3, 20), dtype=dtype), - run_only=True, ctx=ctx) + run_only=True, device=device) check_rnn_layer_forward(gluon.rnn.GRU(10, 2, bidirectional=True, dropout=0.5, dtype=dtype), - mx.np.ones((8, 3, 20), dtype=dtype), mx.np.ones((4, 3, 10), dtype=dtype), run_only=True, ctx=ctx) + mx.np.ones((8, 3, 20), dtype=dtype), mx.np.ones((4, 3, 10), dtype=dtype), run_only=True, device=device) net = gluon.nn.Sequential() net.add(gluon.rnn.LSTM(10, bidirectional=True, dtype=dtype2)) net.add(gluon.nn.BatchNorm(axis=2)) net.add(gluon.nn.Flatten()) net.add(gluon.nn.Dense(3, activation='relu')) - net.initialize(ctx=ctx) + net.initialize(device=device) net.cast(dtype) with mx.autograd.record(): - out = net(mx.np.ones((2, 3, 10), dtype=dtype, ctx=ctx)) + out = net(mx.np.ones((2, 3, 10), dtype=dtype, device=device)) out.backward() out = out.asnumpy() @@ -625,10 +625,10 @@ def run_rnn_layers(dtype, dtype2, ctx=mx.cpu()): net2.add(gluon.nn.Flatten()) net2.add(gluon.nn.Dense(3, activation='relu')) net2.hybridize() - net2.initialize(ctx=ctx) + net2.initialize(device=device) net2.cast(dtype) with mx.autograd.record(): - out = net2(mx.np.ones((2, 3, 10), dtype=dtype, ctx=ctx)) + out = net2(mx.np.ones((2, 3, 10), dtype=dtype, device=device)) out.backward() out = out.asnumpy() @@ -638,10 +638,10 @@ def run_rnn_layers(dtype, dtype2, ctx=mx.cpu()): net3.add(gluon.nn.Flatten()) net3.add(gluon.nn.Dense(3, activation='relu')) net3.hybridize() - net3.initialize(ctx=ctx) + net3.initialize(device=device) net3.cast(dtype2) with mx.autograd.record(): - out = net3(mx.np.ones((2, 3, 10), dtype=dtype2, ctx=ctx)) + out = net3(mx.np.ones((2, 3, 10), dtype=dtype2, device=device)) out.backward() out = out.asnumpy() @@ -994,7 +994,7 @@ def check_vardrop(drop_inputs, drop_states, drop_outputs): drop_states=drop_states, drop_inputs=drop_inputs) - input_data = mx.np.random.uniform(size=(10, 3, 50), ctx=mx.context.current_context()) + input_data = mx.np.random.uniform(size=(10, 3, 50), device=mx.device.current_device()) cell.infer_shape(0, input_data, False) cell.initialize(init='xavier') with mx.autograd.record(): @@ -1052,26 +1052,26 @@ def infer_shape(self, x, *args): input_size = 50 hidden_size = 30 seq_len = 10 - ctx = default_context() + device = default_device() if layout == 'TNC': - rnn_data = mx.np.random.normal(loc=0, scale=1, size=(seq_len, batch_size, input_size), ctx=ctx) + rnn_data = mx.np.random.normal(loc=0, scale=1, 
size=(seq_len, batch_size, input_size), device=device) elif layout == 'NTC': - rnn_data = mx.np.random.normal(loc=0, scale=1, size=(batch_size, seq_len, input_size), ctx=ctx) + rnn_data = mx.np.random.normal(loc=0, scale=1, size=(batch_size, seq_len, input_size), device=device) else: print("Wrong layout") return - valid_length = mx.np.round(mx.np.random.uniform(low=1, high=10, size=(batch_size), ctx=ctx)) + valid_length = mx.np.round(mx.np.random.uniform(low=1, high=10, size=(batch_size), device=device)) state_shape = (batch_size, hidden_size) - states = [mx.np.random.normal(loc=0, scale=1, size=state_shape, ctx=ctx) for i in range(num_states)] + states = [mx.np.random.normal(loc=0, scale=1, size=state_shape, device=device) for i in range(num_states)] cell = cell_type(hidden_size) if layout == 'TNC': cell.infer_shape(0, rnn_data[0], False) - cell.initialize(ctx=default_context()) + cell.initialize(device=default_device()) cell(rnn_data[0], states) else: cell.infer_shape(0, rnn_data[:,0,:], False) - cell.initialize(ctx=default_context()) + cell.initialize(device=default_device()) cell(rnn_data[:,0,:], states) params1 = cell.collect_params() orig_params1 = copy.deepcopy(params1) @@ -1096,7 +1096,7 @@ def infer_shape(self, x, *args): for config in configs: layer = RNNLayer(cell_type, hidden_size, layout) layer.infer_shape(rnn_data) - layer.initialize(ctx=default_context()) + layer.initialize(device=default_device()) config(layer) res2, states2 = layer(rnn_data, states, valid_length) params2 = layer.collect_params() diff --git a/tests/python/unittest/test_loss.py b/tests/python/unittest/test_loss.py index 8c28691c2e19..1780f40de7e7 100644 --- a/tests/python/unittest/test_loss.py +++ b/tests/python/unittest/test_loss.py @@ -18,7 +18,7 @@ import mxnet as mx import numpy as onp from mxnet import gluon, autograd -from mxnet.test_utils import assert_almost_equal, default_context +from mxnet.test_utils import assert_almost_equal, default_device from numpy.core.fromnumeric import size from common import xfail_when_nonstandard_decimal_separator import unittest @@ -123,13 +123,13 @@ def test_sdml_loss(): # Init model and trainer sdml_loss = gluon.loss.SDMLLoss() model = gluon.nn.Dense(DIM, activation='tanh') # Simple NN encoder - model.initialize(mx.init.Xavier(), ctx=mx.current_context()) + model.initialize(mx.init.Xavier(), device=mx.current_device()) trainer = gluon.Trainer(model.collect_params(), 'adam', {'learning_rate' : 0.1}) for _ in range(EPOCHS): # Training loop data_iter.reset() for iter_batch in data_iter: - batch = [datum.as_in_context(mx.current_context()) for datum in iter_batch.data] + batch = [datum.to_device(mx.current_device()) for datum in iter_batch.data] with autograd.record(): data, pos = batch z_data, z_pos = model(data), model(pos) diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py index a2aae07bd362..49ed35faefbb 100644 --- a/tests/python/unittest/test_ndarray.py +++ b/tests/python/unittest/test_ndarray.py @@ -27,7 +27,7 @@ from common import assertRaises, TemporaryDirectory from mxnet.test_utils import almost_equal from mxnet.test_utils import assert_almost_equal, assert_exception -from mxnet.test_utils import default_context +from mxnet.test_utils import default_device from mxnet.test_utils import np_reduce from mxnet.test_utils import same from mxnet.test_utils import random_sample, rand_shape_nd, random_arrays @@ -890,7 +890,7 @@ def test_linspace(): @pytest.mark.serial def test_order(): - ctx = default_context() + ctx = 
default_context() + ctx = default_device() dat_size = 5 is_large_tensor_enabled = runtime.Features().is_enabled('INT64_TENSOR_SIZE') def gt_topk(dat, axis, ret_typ, k, is_ascend): @@ -1295,7 +1295,7 @@ def test_ndarray_fluent(): 'softmin', 'reciprocal']) def check_fluent_regular(func, kwargs, shape=(5, 17, 1), equal_nan=False): with mx.name.NameManager(): - data = mx.nd.random_uniform(shape=shape, ctx=default_context()) + data = mx.nd.random_uniform(shape=shape, ctx=default_device()) regular = getattr(mx.ndarray, func)(data, **kwargs) fluent = getattr(data, func)(**kwargs) if isinstance(regular, list): @@ -1746,7 +1746,7 @@ def test_ndarray_astype(): @pytest.mark.serial -def test_norm(ctx=default_context()): +def test_norm(ctx=default_device()): try: import scipy assert LooseVersion(scipy.__version__) >= LooseVersion('0.1') diff --git a/tests/python/unittest/test_numpy_gluon.py b/tests/python/unittest/test_numpy_gluon.py index d5971dc5cb43..1241ead997d9 100644 --- a/tests/python/unittest/test_numpy_gluon.py +++ b/tests/python/unittest/test_numpy_gluon.py @@ -51,8 +51,8 @@ def __init__(self): self.w = gluon.Parameter('w', shape=(K, N), allow_deferred_init=True) def forward(self, x): - ctx = x.ctx - return np.dot(x, self.w.data(ctx)) + device = x.device + return np.dot(x, self.w.data(device)) x = mx.np.random.uniform(size=(M, K)) for initializer in [mx.initializer.Uniform, mx.initializer.Normal]: @@ -71,10 +71,10 @@ def __init__(self, num_input_dim=0, num_hidden_dim=100, num_output_dim=10): allow_deferred_init=True) def forward(self, x): - ctx = x.ctx - h = x.dot(self.w1.data(ctx)) # equivalent to np.dot(x, w1) + device = x.device + h = x.dot(self.w1.data(device)) # equivalent to np.dot(x, w1) h_relu = npx.relu(h) # equivalent to npx.relu(h) but generating np.ndarray - y_pred = h_relu.dot(self.w2.data(ctx)) # equivalent to np.dot(h_relu, w2) + y_pred = h_relu.dot(self.w2.data(device)) # equivalent to np.dot(h_relu, w2) return y_pred def infer_shape(self, x, *args): @@ -160,8 +160,8 @@ def __init__(self): self.weight = gluon.Constant(const_arr) def forward(self, x): - ctx = x.ctx - return x + self.weight.data(ctx).astype(np.float32) + device = x.device + return x + self.weight.data(device).astype(np.float32) x = np.random.uniform(size=const_arr.shape, dtype=const_arr.dtype) for hybridize in [False, True]: diff --git a/tests/python/unittest/test_numpy_loss.py b/tests/python/unittest/test_numpy_loss.py index 93fd2f9a3fc2..4f6c0bc39a51 100644 --- a/tests/python/unittest/test_numpy_loss.py +++ b/tests/python/unittest/test_numpy_loss.py @@ -18,7 +18,7 @@ import mxnet as mx import numpy as onp from mxnet import gluon, autograd -from mxnet.test_utils import assert_almost_equal, default_context, use_np +from mxnet.test_utils import assert_almost_equal, default_device, use_np from common import xfail_when_nonstandard_decimal_separator import pytest @@ -217,13 +217,13 @@ def test_sdml_loss(): # Init model and trainer sdml_loss = gluon.loss.SDMLLoss() model = gluon.nn.Dense(DIM, activation='tanh') # Simple NN encoder - model.initialize(mx.init.Xavier(), ctx=mx.current_context()) + model.initialize(mx.init.Xavier(), device=mx.current_device()) trainer = gluon.Trainer(model.collect_params(), 'adam', {'learning_rate' : 0.1}) for _ in range(EPOCHS): # Training loop data_iter.reset() for iter_batch in data_iter: - batch = [datum.as_in_ctx(mx.current_context()).as_np_ndarray() for datum in iter_batch.data] + batch = [datum.to_device(mx.current_device()).as_np_ndarray() for datum in iter_batch.data] with autograd.record(): data, pos = 
batch z_data, z_pos = model(data), model(pos) diff --git a/tests/python/unittest/test_numpy_ndarray.py b/tests/python/unittest/test_numpy_ndarray.py index c107ca0a2d83..bf54cabd8921 100644 --- a/tests/python/unittest/test_numpy_ndarray.py +++ b/tests/python/unittest/test_numpy_ndarray.py @@ -63,18 +63,18 @@ def test_np_empty(): (4, 5), (1, 1, 1, 1), ] - ctxes = [npx.current_context(), None] + devices = [npx.current_device(), None] for dtype, expected_dtype in dtype_pairs: for shape in shapes: for order in orders: - for ctx in ctxes: + for device in devices: if order == 'C': - ret = np.empty(shape, dtype=dtype, order=order, device=ctx) + ret = np.empty(shape, dtype=dtype, order=order, device=device) assert ret.dtype == expected_dtype assert ret.shape == shape if isinstance(shape, tuple) else (shape,) - assert ret.ctx == npx.current_context() + assert ret.device == npx.current_device() else: - assert_exception(np.empty, NotImplementedError, shape, dtype=dtype, order=order, device=ctx) + assert_exception(np.empty, NotImplementedError, shape, dtype=dtype, order=order, device=device) @use_np @@ -92,7 +92,7 @@ def test_np_array_creation(): for dtype in dtypes: for src in objects: mx_arr = np.array(src, dtype=dtype) - assert mx_arr.ctx == mx.current_context() + assert mx_arr.device == mx.current_device() if dtype is None: dtype = src.dtype if isinstance(src, _np.ndarray) else _np.float32 if isinstance(src, mx.nd.NDArray): @@ -609,12 +609,12 @@ def test_nd_no_format(): b = np.arange(8).reshape(2,2,2) assert '{}'.format(a) == '{}'.format(_a) - context = mx.context.current_context() - if str(context)[:3] != 'gpu': + device = mx.device.current_device() + if str(device)[:3] != 'gpu': test_0d() test_nd_format() test_nd_no_format() - # if the program is running in GPU, the formatted string would be appended with context notation + # if the program is running in GPU, the formatted string would be appended with device notation # for exmpale, if a = np.array([np.pi]), the return value of '{}'.format(a) is '[3.1415927] @gpu(0)' @@ -1321,7 +1321,7 @@ def test_np_get_dtype(): for dtype in dtypes: for src in objects: mx_arr = np.array(src, dtype=dtype) - assert mx_arr.ctx == mx.current_context() + assert mx_arr.device == mx.current_device() if isinstance(src, mx.nd.NDArray): np_arr = _np.array(src.asnumpy(), dtype=dtype if dtype is not None else _np.float32) else: @@ -1418,7 +1418,7 @@ def test_mixed_array_types_share_memory(): assert _np.may_share_memory(np_array_slice, mx_array) assert _np.shares_memory(np_array_slice, mx_array) - mx_pinned_array = mx_array.as_in_ctx(mx.cpu_pinned(0)) + mx_pinned_array = mx_array.to_device(mx.cpu_pinned(0)) assert not _np.may_share_memory(np_array, mx_pinned_array) assert not _np.shares_memory(np_array, mx_pinned_array) diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index a4fdcbf4177b..288d68c972b3 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -1664,9 +1664,9 @@ def index_update_bwd(out_grad, a_grad, ind, val_grad, ind_ndim, ind_num, grad_re @use_np def test_npx_batch_dot(): - ctx = mx.context.current_context() + device = mx.device.current_device() dtypes = ['float32', 'float64'] - if ctx.device_type == 'gpu': + if device.device_type == 'gpu': dtypes += ['float16'] eps_dict = {'float32': 1E-4, 'float64': 1E-4, 'float16': 1E-3} class TestBatchDot(HybridBlock): @@ -4284,7 +4284,7 @@ def forward(self, a): @use_np def test_np_randint(): - ctx = mx.context.current_context() + 
device = mx.device.current_device() # test shapes params = [ (0, 10), @@ -4316,13 +4316,13 @@ def test_np_randint(): # Quantize bucket boundaries to reflect the actual dtype and adjust probs accordingly buckets = onp.array(buckets, dtype=dtype).tolist() probs = [(buckets[i][1] - buckets[i][0]) / float(scale) for i in range(5)] - generator_mx = lambda x: np.random.randint(low, high, size=x, dtype=dtype, ctx=ctx).asnumpy() + generator_mx = lambda x: np.random.randint(low, high, size=x, dtype=dtype, device=device).asnumpy() verify_generator(generator=generator_mx, buckets=buckets, probs=probs, nrepeat=100) # Scipy uses alpha = 0.01 for testing discrete distribution generator but we are using default alpha=0.05 (higher threshold ensures robustness) # Refer - https://github.com/scipy/scipy/blob/9f12af697763fb5f9767d5cb1280ce62456a3974/scipy/stats/tests/test_discrete_basic.py#L45 generator_mx_same_seed = \ lambda x: onp.concatenate( - [np.random.randint(low, high, size=x // 10, dtype=dtype, ctx=ctx).asnumpy() + [np.random.randint(low, high, size=x // 10, dtype=dtype, device=device).asnumpy() for _ in range(10)]) verify_generator(generator=generator_mx_same_seed, buckets=buckets, probs=probs, nrepeat=100) @@ -4977,14 +4977,14 @@ def forward(self, a): @pytest.mark.skip(reason='https://github.com/apache/incubator-mxnet/issues/18600') def test_np_random_beta(): class TestRandomBeta(HybridBlock): - def __init__(self, size=None, dtype=None, ctx=None): + def __init__(self, size=None, dtype=None, device=None): super(TestRandomBeta, self).__init__() self._size = size self._dtype = dtype - self._ctx = ctx + self._device = device def forward(self, a, b): - return np.random.beta(a, b, size=self._size, dtype=self._dtype, ctx=self._ctx) + return np.random.beta(a, b, size=self._size, dtype=self._dtype, device=self._device) def _test_random_beta_range(output): bigger_than_zero = onp.all(output > 0) @@ -5052,14 +5052,14 @@ def forward(self, dfnum, dfden): @pytest.mark.skip(reason='https://github.com/apache/incubator-mxnet/issues/18600') def test_np_random_chisquare(): class TestRandomChisquare(HybridBlock): - def __init__(self, size=None, dtype=None, ctx=None): + def __init__(self, size=None, dtype=None, device=None): super(TestRandomChisquare, self).__init__() self._size = size self._dtype = dtype - self._ctx = ctx + self._device = device def forward(self, df): - return np.random.chisquare(df, size=self._size, dtype=self._dtype, ctx=self._ctx) + return np.random.chisquare(df, size=self._size, dtype=self._dtype, device=self._device) shape_list = [(), (1,), (2, 3), (4, 0, 5), 6, (7, 8), None] @@ -7379,14 +7379,14 @@ def forward(self, a): @pytest.mark.skip(reason='Skipped as the test is flaky and the feature causes curand error. 
Tracked in #18100') def test_np_full_like(): class TestFullLike(HybridBlock): - def __init__(self, fill_value, dtype, ctx): + def __init__(self, fill_value, dtype, device): super(TestFullLike, self).__init__() self._fill_value = fill_value self._dtype = dtype - self._ctx = ctx + self._device = device def forward(self, x, *args, **kwargs): - return np.full_like(x, self._fill_value, dtype=self._dtype, ctx=self._ctx) + return np.full_like(x, self._fill_value, dtype=self._dtype, device=self._device) if StrictVersion(platform.python_version()) < StrictVersion('3.0.0'): return @@ -7410,7 +7410,7 @@ def forward(self, x, *args, **kwargs): fill_values, dtypes, shapes, flags): param_dtype = onp.random.choice(dtypes) a = np.random.uniform(low=0, high=100, size=shape, dtype='float64').astype(dtype) - test = TestFullLike(fill_value, param_dtype, npx.current_context()) + test = TestFullLike(fill_value, param_dtype, npx.current_device()) expected_ret = onp.full_like(a.asnumpy(), fill_value=fill_value, dtype=param_dtype) if hybridize: test.hybridize() @@ -8503,22 +8503,22 @@ def forward(self, A, **kwargs): # test gradient if m == "constant": - ctx = mx.context.current_context() + device = mx.device.current_device() x = mx.np.random.uniform(-1.0, 1.0, size=shape) - x = mx.np.array(x, ctx=ctx) + x = mx.np.array(x, device=device) for grad_req in ['write', 'add']: x.attach_grad(grad_req) if grad_req == 'add': - init_grad = mx.np.random.uniform(-1.0, 1.0, size=shape, ctx=ctx) + init_grad = mx.np.random.uniform(-1.0, 1.0, size=shape, device=device) x.grad[:] = init_grad with mx.autograd.record(): mx_out = mx.np.pad(x, pad_width=pw, mode="constant") out_grad = mx.np.random.normal(0, 1, mx_out.shape) - out_grad = mx.np.array(out_grad, ctx=ctx) + out_grad = mx.np.array(out_grad, device=device) loss = mx_out * out_grad loss = loss.sum() loss.backward() - gt_in_grad = mx.np.pad(mx.np.ones_like(x.grad), pad_width=pw, mode="constant") * mx.np.array(out_grad, ctx=ctx) + gt_in_grad = mx.np.pad(mx.np.ones_like(x.grad), pad_width=pw, mode="constant") * mx.np.array(out_grad, device=device) mx_grad = x.grad if grad_req == 'add': assert_almost_equal(mx.np.pad(mx_grad - init_grad, pad_width=pw, mode="constant"), gt_in_grad.asnumpy(), rtol=rtol, atol=atol) @@ -8547,7 +8547,7 @@ def test_np_rand(): assert data_mx.shape == shape # Test random generator. 
- ctx = mx.context.current_context() + device = mx.device.current_device() samples = 1000000 trials = 8 num_buckets = 10 @@ -8564,12 +8564,12 @@ def test_np_rand(): for i in range(num_buckets)] def generator_mx(x): return np.random.rand( - samples, ctx=ctx, dtype=dtype).asnumpy() + samples, device=device, dtype=dtype).asnumpy() verify_generator(generator=generator_mx, buckets=buckets, probs=probs, nsamples=samples, nrepeat=trials) generator_mx_same_seed =\ lambda x: onp.concatenate( - [np.random.rand(x // 10, ctx=ctx, dtype=dtype).asnumpy() + [np.random.rand(x // 10, device=device, dtype=dtype).asnumpy() for _ in range(10)]) verify_generator(generator=generator_mx_same_seed, buckets=buckets, probs=probs, nsamples=samples, nrepeat=trials) @@ -10481,22 +10481,22 @@ def __init__(self): init=None, dtype=dtype, allow_deferred_init=True) def forward(self, qkv): - ctx = qkv.ctx - qkv_weight = self.convert_weight(self.q_weight.data().as_in_ctx(ctx), - self.k_weight.data().as_in_ctx(ctx), - self.v_weight.data().as_in_ctx(ctx), + device = qkv.device + qkv_weight = self.convert_weight(self.q_weight.data().to_device(device), + self.k_weight.data().to_device(device), + self.v_weight.data().to_device(device), self.num_heads) - qkv_bias = self.convert_bias(self.q_bias.data().as_in_ctx(ctx), - self.k_bias.data().as_in_ctx(ctx), - self.v_bias.data().as_in_ctx(ctx), + qkv_bias = self.convert_bias(self.q_bias.data().to_device(device), + self.k_bias.data().to_device(device), + self.v_bias.data().to_device(device), self.num_heads) qkv = np.transpose(qkv, axes=(1, 0, 2)) qkv_proj = npx.fully_connected(qkv, weight=qkv_weight, bias=qkv_bias, flatten=False, num_hidden=self.qkv_units * 3, no_bias=False) att_score = npx.interleaved_matmul_selfatt_qk(qkv_proj, heads=self.num_heads) weighted_value = npx.interleaved_matmul_selfatt_valatt(qkv_proj, att_score, heads=self.num_heads) - output = npx.fully_connected(weighted_value, weight=self.out_weight.data().as_in_ctx(ctx), - bias=self.out_bias.data().as_in_ctx(ctx), flatten=False, + output = npx.fully_connected(weighted_value, weight=self.out_weight.data().to_device(device), + bias=self.out_bias.data().to_device(device), flatten=False, num_hidden=self.out_dim, no_bias=False) return np.transpose(output, axes=(1, 0, 2)), att_score @@ -10545,15 +10545,15 @@ def __init__(self): init=None, dtype=dtype, allow_deferred_init=True) def forward(self, qkv): - ctx = qkv.ctx - q = npx.fully_connected(qkv, weight=self.q_weight.data().as_in_ctx(ctx), - bias=self.q_bias.data().as_in_ctx(ctx), flatten=False, + device = qkv.device + q = npx.fully_connected(qkv, weight=self.q_weight.data().to_device(device), + bias=self.q_bias.data().to_device(device), flatten=False, num_hidden=self.qkv_units, no_bias=False) - k = npx.fully_connected(qkv, weight=self.k_weight.data().as_in_ctx(ctx), - bias=self.k_bias.data().as_in_ctx(ctx), flatten=False, + k = npx.fully_connected(qkv, weight=self.k_weight.data().to_device(device), + bias=self.k_bias.data().to_device(device), flatten=False, num_hidden=self.qkv_units, no_bias=False) - v = npx.fully_connected(qkv, weight=self.v_weight.data().as_in_ctx(ctx), - bias=self.v_bias.data().as_in_ctx(ctx), flatten=False, + v = npx.fully_connected(qkv, weight=self.v_weight.data().to_device(device), + bias=self.v_bias.data().to_device(device), flatten=False, num_hidden=self.qkv_units, no_bias=False) q = npx.reshape(q, (-2, -2, self.num_heads, -1)) q = np.transpose(q, axes=(0, 2, 1, 3)) @@ -10573,8 +10573,8 @@ def forward(self, qkv): reverse=True) weighted_value = 
np.transpose(weighted_value, axes=(0, 2, 1, 3)) weighted_value = npx.reshape(weighted_value, (-2, -2, -1)) - output = npx.fully_connected(weighted_value, weight=self.out_weight.data().as_in_ctx(ctx), - bias=self.out_bias.data().as_in_ctx(ctx), flatten=False, + output = npx.fully_connected(weighted_value, weight=self.out_weight.data().to_device(device), + bias=self.out_bias.data().to_device(device), flatten=False, num_hidden=self.out_dim, no_bias=False) return output, att_score @@ -10612,7 +10612,7 @@ def forward(self, qkv): @pytest.mark.serial def test_multihead_attention_selfatt(): dtypes = ['float32'] - if mx.context.current_context().device_type == 'gpu': + if mx.device.current_device().device_type == 'gpu': dtypes += ['float16'] for dtype in dtypes: @@ -10649,24 +10649,24 @@ def __init__(self): init=None, dtype=dtype, allow_deferred_init=True) def forward(self, q, kv): - ctx = kv.ctx - kv_weight = self.convert_weight(self.k_weight.data().as_in_ctx(ctx), - self.v_weight.data().as_in_ctx(ctx), + device = kv.device + kv_weight = self.convert_weight(self.k_weight.data().to_device(device), + self.v_weight.data().to_device(device), self.num_heads) - kv_bias = self.convert_bias(self.k_bias.data().as_in_ctx(ctx), - self.v_bias.data().as_in_ctx(ctx), + kv_bias = self.convert_bias(self.k_bias.data().to_device(device), + self.v_bias.data().to_device(device), self.num_heads) kv = np.transpose(kv, axes=(1, 0, 2)) kv_proj = npx.fully_connected(kv, weight=kv_weight, bias=kv_bias, flatten=False, num_hidden=self.qkv_units * 2, no_bias=False) q = np.transpose(q, axes=(1, 0, 2)) - q_proj = npx.fully_connected(q, weight=self.q_weight.data().as_in_ctx(ctx), - bias=self.q_bias.data().as_in_ctx(ctx), flatten=False, + q_proj = npx.fully_connected(q, weight=self.q_weight.data().to_device(device), + bias=self.q_bias.data().to_device(device), flatten=False, num_hidden=self.qkv_units, no_bias=False) att_score = npx.interleaved_matmul_encdec_qk(q_proj, kv_proj, heads=self.num_heads) weighted_value = npx.interleaved_matmul_encdec_valatt(kv_proj, att_score, heads=self.num_heads) - output = npx.fully_connected(weighted_value, weight=self.out_weight.data().as_in_ctx(ctx), - bias=self.out_bias.data().as_in_ctx(ctx), flatten=False, + output = npx.fully_connected(weighted_value, weight=self.out_weight.data().to_device(device), + bias=self.out_bias.data().to_device(device), flatten=False, num_hidden=self.out_dim, no_bias=False) return np.transpose(output, axes=(1, 0, 2)), att_score @@ -10713,15 +10713,15 @@ def __init__(self): init=None, dtype=dtype, allow_deferred_init=True) def forward(self, q, kv): - ctx = kv.ctx - q = npx.fully_connected(q, weight=self.q_weight.data().as_in_ctx(ctx), - bias=self.q_bias.data().as_in_ctx(ctx), flatten=False, + device = kv.device + q = npx.fully_connected(q, weight=self.q_weight.data().to_device(device), + bias=self.q_bias.data().to_device(device), flatten=False, num_hidden=self.qkv_units, no_bias=False) - k = npx.fully_connected(kv, weight=self.k_weight.data().as_in_ctx(ctx), - bias=self.k_bias.data().as_in_ctx(ctx), flatten=False, + k = npx.fully_connected(kv, weight=self.k_weight.data().to_device(device), + bias=self.k_bias.data().to_device(device), flatten=False, num_hidden=self.qkv_units, no_bias=False) - v = npx.fully_connected(kv, weight=self.v_weight.data().as_in_ctx(ctx), - bias=self.v_bias.data().as_in_ctx(ctx), flatten=False, + v = npx.fully_connected(kv, weight=self.v_weight.data().to_device(device), + bias=self.v_bias.data().to_device(device), flatten=False, 
num_hidden=self.qkv_units, no_bias=False) q = npx.reshape(q, (-2, -2, self.num_heads, -1)) q = np.transpose(q, axes=(0, 2, 1, 3)) @@ -10740,8 +10740,8 @@ def forward(self, q, kv): reverse=True) weighted_value = np.transpose(weighted_value, axes=(0, 2, 1, 3)) weighted_value = npx.reshape(weighted_value, (-2, -2, -1)) - output = npx.fully_connected(weighted_value, weight=self.out_weight.data().as_in_ctx(ctx), - bias=self.out_bias.data().as_in_ctx(ctx), flatten=False, + output = npx.fully_connected(weighted_value, weight=self.out_weight.data().to_device(device), + bias=self.out_bias.data().to_device(device), flatten=False, num_hidden=self.out_dim, no_bias=False) return output, att_score @@ -10780,7 +10780,7 @@ def forward(self, q, kv): @pytest.mark.serial def test_multihead_attention_encdec(): dtypes = ['float32'] - if mx.context.current_context().device_type == 'gpu': + if mx.device.current_device().device_type == 'gpu': dtypes += ['float16'] for dtype in dtypes: @@ -10845,7 +10845,7 @@ def test_slice_like(): ((1, 4, 3, 15, 16), 16, 2, (2, 2, 2), (0, 0, 0)), ((8, 4, 3, 16, 16), 16, 1, (3, 3, 3), (1, 1, 1))]) def test_npx_deconvolution(shape, num_filter, num_group, kernel, pad): - if len(kernel) == 3 and mx.current_context().device_type == 'gpu': + if len(kernel) == 3 and mx.current_device().device_type == 'gpu': pytest.skip('Skipping deconvoluition 3D tests for GPU') class TestConv(mx.gluon.HybridBlock): @@ -10854,7 +10854,7 @@ def __init__(self, w): self.weight = w def forward(self, x, *args): - return npx.convolution(x, self.weight.data(x.ctx), no_bias=True, kernel=kernel, + return npx.convolution(x, self.weight.data(x.device), no_bias=True, kernel=kernel, pad=pad, num_filter=self.weight.shape[0], num_group=num_group) class TestDeconv(mx.gluon.HybridBlock): @@ -10865,7 +10865,7 @@ def __init__(self): self.bias = mx.gluon.Parameter('bias', shape=num_filter) def forward(self, x, *args): - return npx.deconvolution(x, self.weight.data(x.ctx), self.bias.data(x.ctx), kernel, + return npx.deconvolution(x, self.weight.data(x.device), self.bias.data(x.device), kernel, pad=pad, num_filter=num_filter, num_group=num_group) deconvNet = TestDeconv() diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index b4a6d6d40f24..d8a134d3b321 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -59,7 +59,7 @@ def test_rnn_with_new_param(): bind_dict['rnn_state_cell'] = mx.ndarray.zeros( shape=(num_layers * directions, batch_size, state_size)) - ex = sym._bind(default_context(), bind_dict) + ex = sym._bind(default_device(), bind_dict) ex.forward(is_train=True) ex01 = ex.output_dict['rnn_output'].asnumpy() ex.forward(is_train=False) @@ -124,7 +124,7 @@ def test_rnnrelu_dropout(): out[0].wait_to_read() def test_RNN_float64(): - if default_context().device_type == 'gpu': + if default_device().device_type == 'gpu': return sym = mx.sym.RNN( mx.sym.Variable('in'), @@ -145,7 +145,7 @@ def test_RNN_float64(): args_grad = explicit_grad grad_req = 'write' - ex = sym._bind(default_context(), + ex = sym._bind(default_device(), { 'in': mx.nd.ones([2, 1, 2], dtype=dtype), 'par': mx.nd.ones([12], dtype=dtype), @@ -172,7 +172,7 @@ def check_elementwise_sum_with_shape(shape, n): arr_grad = [mx.nd.empty(shape) for i in range(n)] for i in range(n): arr[i][:] = np.random.uniform(-10, 10, shape) - exec1 = out._bind(default_context(), + exec1 = out._bind(default_device(), args=arr, args_grad=arr_grad) @@ -225,7 +225,7 @@ def 
check_concat_with_shape(shapes, dimension, skip_second): args = out.list_arguments() arg_shapes, out_shapes, aux_shapes = out.infer_shape(**dict(zip(args, shapes))) out_grad = mx.nd.empty(out_shapes[0]) - exec1 = out._bind(default_context(), + exec1 = out._bind(default_device(), args=arr, args_grad=dict_grad) exec1.forward(is_train=True) @@ -317,7 +317,7 @@ def check_slice_channel(data_ndim, axis, num_outputs, squeeze_axis): out_grads_npy = [np.random.normal(size=out_ele_shape) for i in range(num_outputs)] data = mx.sym.Variable('data') sym = mx.sym.SliceChannel(data=data, num_outputs=num_outputs, axis=axis, squeeze_axis=squeeze_axis) - exe = sym._simple_bind(ctx=default_context(), data=data_npy.shape) + exe = sym._simple_bind(ctx=default_device(), data=data_npy.shape) outputs = exe.forward(is_train=True, data=data_npy) assert len(exe.outputs) == num_outputs for i in range(num_outputs): @@ -351,7 +351,7 @@ def test_python_op(): x = mx.ndarray.ones((10))*10 dx = mx.ndarray.zeros((10)) dy = mx.ndarray.ones((10)) - exec1 = s._bind(default_context(), args=[x], args_grad = {'X': dx}) + exec1 = s._bind(default_device(), args=[x], args_grad = {'X': dx}) exec1.forward(is_train=True) assert_almost_equal(x, exec1.outputs[0]) exec1.backward(dy) @@ -367,7 +367,7 @@ def test_swapaxes(): arr_data = mx.nd.array(data_tmp) swap0 = mx.symbol.SwapAxis(data=data, dim1=0, dim2=2) swap = mx.symbol.SwapAxis(data=swap0, dim1=1, dim2=2) - exe_c = swap._bind(default_context(), args=[arr_data]) + exe_c = swap._bind(default_device(), args=[arr_data]) exe_c.forward(is_train=True) out = exe_c.outputs[0] @@ -388,7 +388,7 @@ def test_swapaxes(): data_mx = mx.nd.array(data_np, dtype=data_np.dtype) ret_np = np.swapaxes(data_np, axis1=axis1, axis2=axis2) ret_mx = mx.symbol.SwapAxis(data, dim1=axis1, dim2=axis2) - exe_c = ret_mx._bind(default_context(), args=[data_mx]) + exe_c = ret_mx._bind(default_device(), args=[data_mx]) exe_c.forward(is_train=True) out = exe_c.outputs[0] assert_almost_equal(out, ret_np) @@ -719,7 +719,7 @@ def test_shape_array(): xg = mx.nd.empty(xa.shape) ya = np.shape(xa) yg = mx.nd.ones(ya) - exe = y._bind(ctx=default_context(), args={'x': xa}, + exe = y._bind(ctx=default_device(), args={'x': xa}, args_grad={'x': xg}) exe.forward(is_train=True) exe.backward([yg]) @@ -736,7 +736,7 @@ def test_size_array(): xg = mx.nd.empty(xa.shape) ya = np.size(xa) yg = mx.nd.ones(ya) - exe = y._bind(ctx=default_context(), args={'x': xa}, + exe = y._bind(ctx=default_device(), args={'x': xa}, args_grad={'x': xg}) exe.forward(is_train=True) exe.backward([yg]) @@ -797,15 +797,15 @@ def _inner_test(forward_gt, logic_sym, x_shape, y_shape, test_scalar=True): z = logic_sym(x, y) x_npy = np.random.randint(0, 4, size=x_shape).astype(np.float32) y_npy = np.random.randint(0, 4, size=y_shape).astype(np.float32) - exe = z._simple_bind(ctx=default_context(), x=x_shape, y=y_shape) + exe = z._simple_bind(ctx=default_device(), x=x_shape, y=y_shape) mx_out = exe.forward(is_train=True, x=x_npy, y=y_npy)[0] assert_almost_equal(mx_out, forward_gt(x_npy, y_npy)) exe.backward() if test_scalar: z_lscalar = logic_sym(1, y) z_rscalar = logic_sym(x, 1) - exe_lscalar = z_lscalar._simple_bind(ctx=default_context(), y=y_shape) - exe_rscalar = z_rscalar._simple_bind(ctx=default_context(), x=x_shape) + exe_lscalar = z_lscalar._simple_bind(ctx=default_device(), y=y_shape) + exe_rscalar = z_rscalar._simple_bind(ctx=default_device(), x=x_shape) mx_lscalar_out = exe_lscalar.forward(is_train=True, y=y_npy)[0] mx_rscalar_out = 
exe_rscalar.forward(is_train=True, x=x_npy)[0] assert_almost_equal(mx_lscalar_out, forward_gt(1, y_npy)) @@ -856,7 +856,7 @@ def reference(a, dtype): assert_almost_equal(mx_out, reference(xa, dtype=xa.dtype)) x = mx.sym.Variable('x') y = mx.sym.logical_not(data=x) - exe = y._simple_bind(ctx=default_context(), x=shape) + exe = y._simple_bind(ctx=default_device(), x=shape) sym_out = exe.forward(is_train=True, x=mx_xa)[0] assert_almost_equal(sym_out, reference(xa, dtype=xa.dtype)) @@ -868,7 +868,7 @@ def test_embedding(): data = mx.sym.Variable("data") embed = mx.sym.Embedding(data=data, input_dim=in_dim, output_dim=out_dim, name="embed") - exe_test = embed._simple_bind(default_context(), grad_req={'data': 'null', 'embed_weight': 'write'}, data=(batch,)) + exe_test = embed._simple_bind(default_device(), grad_req={'data': 'null', 'embed_weight': 'write'}, data=(batch,)) arg_map = dict(zip(embed.list_arguments(), exe_test.arg_arrays)) grad_map = dict(zip(embed.list_arguments(), exe_test.grad_arrays)) np_data = np.random.randint(low=0, high=in_dim, size=batch) @@ -903,7 +903,7 @@ def test_binary_op_duplicate_input(): out_grad = mx.nd.empty(shape) out_grad[:] = 1 square = data * data - exe_square = square._bind(default_context(), args=[arr_data], args_grad=[arr_grad]) + exe_square = square._bind(default_device(), args=[arr_data], args_grad=[arr_grad]) exe_square.forward(is_train=True) assert_almost_equal(exe_square.outputs[0], data_tmp * data_tmp) exe_square.backward(out_grad) @@ -920,7 +920,7 @@ def test_sign(): arr_grad[:]=3 test = mx.sym.sign(data) - exe_test = test._bind(default_context(), args=[arr_data], args_grad=[arr_grad]) + exe_test = test._bind(default_device(), args=[arr_data], args_grad=[arr_grad]) exe_test.forward(is_train=True) out = exe_test.outputs[0] npout = np.sign(data_tmp) @@ -944,7 +944,7 @@ def test_round_ceil_floor(): arr_grad[:]= 2 test = mx.sym.round(data) + mx.sym.ceil(data) + mx.sym.floor(data) - exe_test = test._bind(default_context(), args=[arr_data]) + exe_test = test._bind(default_device(), args=[arr_data]) exe_test.forward(is_train=True) out = exe_test.outputs[0] npout = np.round(data_tmp) + np.ceil(data_tmp) + np.floor(data_tmp) @@ -957,7 +957,7 @@ def test_trunc(): data = mx.symbol.Variable('data') test = mx.sym.trunc(data) - exe_test = test._bind(default_context(), args=[arr_data]) + exe_test = test._bind(default_device(), args=[arr_data]) exe_test.forward(is_train=True) out = exe_test.outputs[0] # 'trunc' is sensitive to the precision of the calculation. Force numpy to match mxnet's float32. 
@@ -977,7 +977,7 @@ def test_rsqrt_cos_sin(): arr_grad[:]=3 test = mx.sym.rsqrt(data) + mx.sym.cos(data) + mx.sym.sin(data) - exe_test = test._bind(default_context(), args=[arr_data], args_grad=[arr_grad]) + exe_test = test._bind(default_device(), args=[arr_data], args_grad=[arr_grad]) exe_test.forward(is_train=True) out = exe_test.outputs[0] npout = 1/ np.sqrt(data_tmp) + np.cos(data_tmp) + np.sin(data_tmp) @@ -1007,7 +1007,7 @@ def test_maximum_minimum(): arr_grad2 = mx.nd.empty(shape) test = mx.sym.maximum(data1,data2) + mx.sym.minimum(data1,data2) - exe_test = test._bind(default_context(), args=[arr_data1,arr_data2], args_grad=[arr_grad1,arr_grad2]) + exe_test = test._bind(default_device(), args=[arr_data1,arr_data2], args_grad=[arr_grad1,arr_grad2]) exe_test.forward(is_train=True) out = exe_test.outputs[0] npout = np.maximum(data_tmp1,data_tmp2) + np.minimum(data_tmp1,data_tmp2) @@ -1038,7 +1038,7 @@ def test_maximum_minimum_scalar(): arr_grad1 = mx.nd.empty(shape) test = mx.sym.maximum(data1,3) + mx.sym.maximum(9,data1) + mx.sym.minimum(5,data1) + mx.sym.minimum(data1,4) - exe_test = test._bind(default_context(), args=[arr_data1], args_grad=[arr_grad1]) + exe_test = test._bind(default_device(), args=[arr_data1], args_grad=[arr_grad1]) exe_test.forward(is_train=True) out = exe_test.outputs[0] npout = np.maximum(data_tmp1,3) + np.maximum(9,data_tmp1) + np.minimum(5,data_tmp1) + np.minimum(data_tmp1,4) @@ -1069,7 +1069,7 @@ def test_abs(): arr_grad[:]=3 test = mx.sym.abs(data) - exe_test = test._bind(default_context(), args=[arr_data], args_grad=[arr_grad]) + exe_test = test._bind(default_device(), args=[arr_data], args_grad=[arr_grad]) exe_test.forward(is_train=True) out = exe_test.outputs[0] npout = abs(data_tmp) @@ -1102,15 +1102,15 @@ def check_deconvolution_forward_backward(input_shape, num_filter, kernel, stride arg_names = deconv.list_arguments() arg_shapes, out_shapes, _ = deconv.infer_shape(data=input_shape) - input_data = mx.random.uniform(-5, 5, input_shape, ctx=mx.cpu()).copyto(default_context()) + input_data = mx.random.uniform(-5, 5, input_shape, ctx=mx.cpu()).copyto(default_device()) out_grad = input_data args = {} args["data"] = input_data args['conv_weight'] = args['deconv_weight'] = mx.random.normal(0, 1, - (num_filter, input_shape[1]) + kernel, ctx=mx.cpu()).copyto(default_context()) + (num_filter, input_shape[1]) + kernel, ctx=mx.cpu()).copyto(default_device()) args_grad = [mx.nd.empty(s) for s in arg_shapes] - exe = deconv._bind(default_context(), args=args, args_grad=args_grad) + exe = deconv._bind(default_device(), args=args, args_grad=args_grad) exe.forward(is_train=True) out = exe.outputs[0] exe.backward(out_grad) @@ -1118,7 +1118,7 @@ def check_deconvolution_forward_backward(input_shape, num_filter, kernel, stride args_grad_addto_npy = [np.random.normal(size=s) for s in arg_shapes] args_grad_addto = [mx.nd.array(ele) for ele in args_grad_addto_npy] - exe = deconv._bind(default_context(), args=args, args_grad=args_grad_addto, grad_req="add") + exe = deconv._bind(default_device(), args=args, args_grad=args_grad_addto, grad_req="add") exe.forward(is_train=True) out = exe.outputs[0].asnumpy() exe.backward(out_grad) @@ -1145,16 +1145,16 @@ def check_deconvolution_gradient(input_shape, num_filter, pad): data=data_deconv, kernel=kernel, stride=stride, pad=pad, num_filter=num_filter, no_bias = "true", name = "deconv") - conv_data = mx.random.uniform(-5, 5, input_shape, ctx=mx.cpu()).copyto(default_context()) + conv_data = mx.random.uniform(-5, 5, input_shape, 
ctx=mx.cpu()).copyto(default_device()) conv_args = {} conv_args["data_conv"] = conv_data conv_args['conv_weight'] = \ - mx.random.normal(0, 1,(num_filter, input_shape[1]) + kernel, ctx=mx.cpu()).copyto(default_context()) + mx.random.normal(0, 1,(num_filter, input_shape[1]) + kernel, ctx=mx.cpu()).copyto(default_device()) conv_args_grad = [mx.nd.zeros(conv_data.shape), mx.nd.zeros((num_filter, input_shape[1]) + kernel)] - exe_conv = conv._bind(default_context(), args=conv_args, args_grad=conv_args_grad) + exe_conv = conv._bind(default_device(), args=conv_args, args_grad=conv_args_grad) exe_conv.forward(is_train=True) - conv_out_grad = mx.random.normal(0, 2, exe_conv.outputs[0].shape, ctx=mx.cpu()).copyto(default_context()) + conv_out_grad = mx.random.normal(0, 2, exe_conv.outputs[0].shape, ctx=mx.cpu()).copyto(default_device()) exe_conv.backward(conv_out_grad) deconv_data = conv_out_grad @@ -1167,13 +1167,13 @@ def check_deconvolution_gradient(input_shape, num_filter, pad): np.random.normal(size=(num_filter, input_shape[1]) + kernel)] deconv_addto_args_grad = [mx.nd.array(deconv_addto_args_grad_npy[0]), mx.nd.array(deconv_addto_args_grad_npy[1])] - exe_deconv = deconv._bind(default_context(), args=deconv_args, args_grad=deconv_args_grad) + exe_deconv = deconv._bind(default_device(), args=deconv_args, args_grad=deconv_args_grad) exe_deconv.forward(is_train=True) deconv_out_grad = conv_data[:] exe_deconv.backward(deconv_out_grad) assert_almost_equal(conv_args_grad[1], deconv_args_grad[1], rtol=1e-3, atol=1e-2) # Test AddTo - exe_deconv_addto = deconv._bind(default_context(), args=deconv_args, + exe_deconv_addto = deconv._bind(default_device(), args=deconv_args, args_grad=deconv_addto_args_grad, grad_req="add") exe_deconv_addto.forward(is_train=True) @@ -1340,11 +1340,11 @@ def exe_forward(exe): def check_nearest_upsampling_with_shape(shapes, scale, root_scale): - arr = {'arg_%d'%i: mx.random.uniform(-10.0, 10.0, shape, ctx=mx.cpu()).copyto(default_context()) for i, shape in zip(range(len(shapes)), shapes)} + arr = {'arg_%d'%i: mx.random.uniform(-10.0, 10.0, shape, ctx=mx.cpu()).copyto(default_device()) for i, shape in zip(range(len(shapes)), shapes)} arr_grad = {'arg_%d'%i: mx.nd.zeros(shape) for i, shape in zip(range(len(shapes)), shapes)} up = mx.sym.UpSampling(*[mx.sym.Variable('arg_%d'%i) for i in range(len(shapes))], sample_type='nearest', scale=root_scale) - exe = up._bind(default_context(), args=arr, args_grad=arr_grad) + exe = up._bind(default_device(), args=arr, args_grad=arr_grad) exe.forward(is_train=True) exe.backward(exe.outputs) for k in range(len(shapes)): @@ -1368,11 +1368,11 @@ def _init_bilinear(arr, f): mx.sym.Variable('weight'), sample_type='bilinear', scale=root_scale, num_filter=num_filter, num_args=2) arg_shapes, out_shapes, _ = up.infer_shape(data=data_shape) - arr = {'data': mx.random.uniform(-5, 5, data_shape, ctx=mx.cpu()).copyto(default_context()), + arr = {'data': mx.random.uniform(-5, 5, data_shape, ctx=mx.cpu()).copyto(default_device()), 'weight': mx.nd.array(_init_bilinear(mx.ndarray.empty(arg_shapes[1]).asnumpy(), root_scale))} arr_grad = [mx.nd.empty(s) for s in arg_shapes] - exe = up._bind(default_context(), args=arr, args_grad=arr_grad) + exe = up._bind(default_device(), args=arr, args_grad=arr_grad) exe.forward(is_train=True) out = exe.outputs[0].asnumpy() exe.backward(exe.outputs) @@ -1742,8 +1742,8 @@ def test_convolution_grouping(): num_filter=num_filter//num_group, kernel=kernel) for i in range(num_group)]) - exe1 = y1._simple_bind(default_context(), 
x=shape) - exe2 = y2._simple_bind(default_context(), x=shape, w=(num_filter, shape[1]//num_group) + kernel, b=(num_filter,)) + exe1 = y1._simple_bind(default_device(), x=shape) + exe2 = y2._simple_bind(default_device(), x=shape, w=(num_filter, shape[1]//num_group) + kernel, b=(num_filter,)) for arr1, arr2 in zip(exe1.arg_arrays, exe2.arg_arrays): arr1[:] = np.float32(np.random.normal(size=arr1.shape)) arr2[:] = arr1 @@ -1784,7 +1784,7 @@ def test_depthwise_convolution(): stride=stride, pad=pad) for i in range(num_group)]) - dev = default_context() + dev = default_device() exe1 = y1._simple_bind(dev, x=shape) exe2 = y2._simple_bind(dev, x=shape, w=(num_filter, shape[1]//num_group)+kernel, b=(num_filter,)) @@ -1959,7 +1959,7 @@ def check_binary_op_forward(symbol, baseline, gen_data, rtol=1e-3, atol=1e-5, mx sample_num = 200 for i in range(sample_num): d = gen_data(i) - y = symbol._bind(default_context(), args={'a': mx.nd.array(d[0]), 'b': mx.nd.array(d[1])}) + y = symbol._bind(default_device(), args={'a': mx.nd.array(d[0]), 'b': mx.nd.array(d[1])}) y.forward(is_train=True) y = y.outputs[0].asnumpy() x = baseline(d[0], d[1]).astype(y.dtype) @@ -2023,7 +2023,7 @@ def reduce_op(shape, x): x_2 = reduce_op(d[1].shape, baseline_grad2) y_1 = mx.nd.empty(d[0].shape) y_2 = mx.nd.empty(d[1].shape) - y = symbol._bind(default_context(), args={'a': mx.nd.array(d[0]), 'b': mx.nd.array(d[1])}, + y = symbol._bind(default_device(), args={'a': mx.nd.array(d[0]), 'b': mx.nd.array(d[1])}, args_grad=[y_1, y_2]) o = y.forward(is_train=True) y.backward([mx.nd.array(out, dtype=o[0].dtype)]) @@ -2224,7 +2224,7 @@ def test_run_convolution_dilated_impulse_response(dil=(1,1), kernel_shape=(3,3), in_img = mx.symbol.Variable('input') net = mx.symbol.Convolution(in_img, num_filter=1,kernel=kernel_shape, dilate=dil, no_bias="true", name='test_convolution') net.list_arguments() - be = net._bind(default_context(), args={ 'input' : spike_img, 'test_convolution_weight' : kernel_weights}, + be = net._bind(default_device(), args={ 'input' : spike_img, 'test_convolution_weight' : kernel_weights}, args_grad={'input' : spike_img2, 'test_convolution_weight' : kernel_weights2 } ) be.forward(True) out_o = be.outputs[0].asnumpy() @@ -2243,7 +2243,7 @@ def test_run_convolution_dilated_impulse_response(dil=(1,1), kernel_shape=(3,3), # Now check whether the input gradient was computed correctly input_grad = mx.nd.array(vgrad) - be = net._bind(default_context(), args={ 'input' : input_grad, 'test_convolution_weight' : kernel_weights}) + be = net._bind(default_device(), args={ 'input' : input_grad, 'test_convolution_weight' : kernel_weights}) be.forward(True) out_o = be.outputs[0].asnumpy() assert_allclose(out_o[center],np.prod(kernel_shape),atol=1e-5) @@ -2256,7 +2256,7 @@ def test_run_convolution_dilated_impulse_response(dil=(1,1), kernel_shape=(3,3), white_in = mx.nd.ones(shape=data_shape) white_in2 = mx.nd.ones(shape=data_shape) - be = net._bind(default_context(), args={ 'input' : white_in, 'test_convolution_weight' : rnd_kernel}, + be = net._bind(default_device(), args={ 'input' : white_in, 'test_convolution_weight' : rnd_kernel}, args_grad={'input' : white_in2, 'test_convolution_weight' : rnd_kernel2 } ) be.forward(True) @@ -2266,7 +2266,7 @@ def test_run_convolution_dilated_impulse_response(dil=(1,1), kernel_shape=(3,3), dkernel = mx.nd.array(rnd_kernel_s + kernel_gradient) - be = net._bind(default_context(), args={ 'input' : white_in, 'test_convolution_weight' : dkernel}) + be = net._bind(default_device(), args={ 'input' : white_in, 
'test_convolution_weight' : dkernel}) be.forward(True) out = be.outputs[0].asnumpy() @@ -2330,7 +2330,7 @@ def test_reshape_new(src_shape, shape_args, reverse, dst_shape): str(dst_shape), str(output_shape[0])) dat_npy = np.random.rand(*src_shape) grad_npy = np.random.rand(*dst_shape) - exe = net._simple_bind(default_context(), data=src_shape) + exe = net._simple_bind(default_device(), data=src_shape) exe.arg_dict['data'][:] = dat_npy exe.forward(is_train=True) assert np.square(exe.outputs[0].asnumpy() - dat_npy.reshape(dst_shape)).mean() < 1E-7, \ @@ -2367,12 +2367,12 @@ def test_reshape_old(): # Test for Flatten data = mx.sym.Variable("data") net = mx.sym.Flatten(data) - exe = net._simple_bind(ctx=default_context(), data=(5, 4, 3, 7)) + exe = net._simple_bind(ctx=default_device(), data=(5, 4, 3, 7)) data_npy = np.random.normal(size=(5, 4, 3, 7)) out_grad_npy = np.random.normal(size=(5, 4 * 3 * 7)) outputs = exe.forward(is_train=True, data=data_npy)[0].asnumpy() assert_allclose(outputs, data_npy.reshape((5, 4 * 3 * 7))) - exe.backward(out_grads=[mx.nd.array(out_grad_npy, ctx=default_context())]) + exe.backward(out_grads=[mx.nd.array(out_grad_npy, ctx=default_device())]) assert_allclose(exe.grad_arrays[0].asnumpy(), out_grad_npy.reshape((5, 4, 3, 7))) @@ -2393,7 +2393,7 @@ def test_reshape_like_new(lhs_shape, rhs_shape, lbeg, lend, rbeg, rend, dst_shap rhs_npy = np.random.rand(*rhs_shape) grad_npy = np.random.rand(*dst_shape) - exe = net._simple_bind(default_context(), lhs=lhs_shape, rhs=rhs_shape) + exe = net._simple_bind(default_device(), lhs=lhs_shape, rhs=rhs_shape) exe.arg_dict['lhs'][:] = lhs_npy exe.arg_dict['rhs'][:] = rhs_npy exe.forward(is_train=True) @@ -2483,7 +2483,7 @@ def test_reduce_inner(numpy_reduce_func, numpy_reduce_grad_func, mx_reduce_sym, outdata=sum_groundtruth, axis=axes, keepdims=keepdims, keepdim_shape=keepdim_shape) - net = b._bind(default_context(), args={'a': mx.nd.array(dat_npy)}, + net = b._bind(default_device(), args={'a': mx.nd.array(dat_npy)}, args_grad={'a': grad_nd}) net.forward(is_train=True) @@ -2566,7 +2566,7 @@ def test_broadcasting_ele(sym_bcast): outgrad_npy = np.random.rand(*target_shape) grad_groundtruth = np_reduce(outgrad_npy, axis=axis, keepdims=True, numpy_reduce_func=np.sum) - net = sym_bcast._bind(default_context(), args={'a': mx.nd.array(dat_npy)}, + net = sym_bcast._bind(default_device(), args={'a': mx.nd.array(dat_npy)}, args_grad={'a': grad_nd}) net.forward(is_train=True) assert (net.outputs[0].shape == target_shape).all() @@ -2708,7 +2708,7 @@ def test_slice_axis(): Y = mx.symbol.slice_axis(data=X, axis=t, begin=b, end=e) xgrad = mx.nd.empty(x.shape) - exec1 = Y._bind(default_context(), args = [x], args_grad = {'X': xgrad}) + exec1 = Y._bind(default_device(), args = [x], args_grad = {'X': xgrad}) exec1.forward(is_train=True) y = exec1.outputs[0] assert_allclose(x.asnumpy()[idx], y.asnumpy()) @@ -2719,7 +2719,7 @@ def test_slice_axis(): assert_allclose(xx, xgrad.asnumpy()) x_grad_npy = np.random.normal(size=x.shape) xgrad = mx.nd.array(x_grad_npy) - exec2 = Y._bind(default_context(), args=[x], args_grad={'X': xgrad}, grad_req="add") + exec2 = Y._bind(default_device(), args=[x], args_grad={'X': xgrad}, grad_req="add") exec2.forward(is_train=True) exec2.backward([exec2.outputs[0]]) xx = np.zeros(shape=x.shape, dtype=np.float32) @@ -2754,7 +2754,7 @@ def test_slice_like(): xgrad = mx.nd.empty(x.shape) xgrad1 = mx.nd.empty(x1.shape) - exec1 = Y._bind(default_context(), args = [x, x1], + exec1 = Y._bind(default_device(), args = [x, x1], 
args_grad = {'X': xgrad, 'X1': xgrad1}) exec1.forward(is_train=True) y = exec1.outputs[0] @@ -2830,7 +2830,7 @@ def test_stn(): arg_shapes, out_shapes, _ = stn.infer_shape(data=data_shape) # check shape assert out_shapes[0] == (data_shape[0], data_shape[1], target_shape[0], target_shape[1]) - dev = default_context() + dev = default_device() #dev = mx.gpu(0) args = {} args['data'] = mx.random.normal(0, 1, data_shape, ctx=mx.cpu()).copyto(dev) @@ -2885,7 +2885,7 @@ def test_stn_valid_sampling(): 'loc': mx.nd.array(np.zeros_like(loc_array)) } executor = stn._bind( - ctx=default_context(), + ctx=default_device(), args={'data': mx.nd.array(data_array), 'loc': mx.nd.array(loc_array)}, grad_req=grad_req, @@ -2898,7 +2898,7 @@ def test_stn_valid_sampling(): def test_dot(): - ctx = default_context() + ctx = default_device() dtypes = ['float32', 'float64'] ndims = [2] if ctx.device_type == 'gpu': @@ -2972,7 +2972,7 @@ def dot_sym_xT_yT(data_type): def test_batch_dot(): - ctx = default_context() + ctx = default_device() dtypes = ['float32', 'float64'] if ctx.device_type == 'gpu': dtypes += ['float16'] @@ -3157,7 +3157,7 @@ def unittest_correlation(data_shape,kernel_size,max_displacement,stride1,stride2 net1 = get_correlation(img1,img2,kernel_size,max_displacement,stride1,stride2,pad_size,is_multiply) net2 = get_correlation(img1,img2,kernel_size,max_displacement,stride1,stride2,pad_size,is_multiply ) - exe1 = net1._simple_bind(default_context(),img1=img1.shape,img2=img1.shape) + exe1 = net1._simple_bind(default_device(),img1=img1.shape,img2=img1.shape) exe1.arg_dict['img1'][:] = img1 exe1.arg_dict['img2'][:] = img2 @@ -3171,7 +3171,7 @@ def unittest_correlation(data_shape,kernel_size,max_displacement,stride1,stride2 # out_grad a = np.ones(forward_result.shape) - out_grad1 = mx.nd.array(a,default_context()) + out_grad1 = mx.nd.array(a,default_device()) # cpu backward exe1.backward(out_grads=out_grad1) # python backward @@ -3264,7 +3264,7 @@ def check_pad_with_shape(shape, xpu, pad_width, mode, dtype="float64"): def test_pad(): - ctx = default_context() + ctx = default_device() shape1 = (2, 3, 3, 5) pad1 = (0, 0, 0, 0, 1, 2, 3, 4) shape2 = (2, 3, 3, 5, 4) @@ -3319,14 +3319,14 @@ def check_instance_norm_with_shape(shape, xpu): def test_instance_normalization(): - check_instance_norm_with_shape((1, 1, 1), default_context()) - check_instance_norm_with_shape((2, 1, 2), default_context()) - check_instance_norm_with_shape((2,4,5,6), default_context()) - check_instance_norm_with_shape((3,3,2,3,2,1,1), default_context()) + check_instance_norm_with_shape((1, 1, 1), default_device()) + check_instance_norm_with_shape((2, 1, 2), default_device()) + check_instance_norm_with_shape((2,4,5,6), default_device()) + check_instance_norm_with_shape((3,3,2,3,2,1,1), default_device()) def check_l2_normalization(in_shape, mode, dtype, norm_eps=1e-10): - ctx = default_context() + ctx = default_device() data = mx.symbol.Variable('data') out = mx.symbol.L2Normalization(data=data, mode=mode, eps=norm_eps) in_data = np.random.uniform(-1, 1, in_shape).astype(dtype) @@ -3402,7 +3402,7 @@ def npy_layer_norm_grad(data, gamma, out_grad, axis, eps): beta_grad = beta_grad.reshape((-1,)) return data_grad, gamma_grad, beta_grad - ctx = default_context() + ctx = default_device() data = np.random.normal(0, 1, in_shape).astype(dtype) gamma = np.random.normal(0, 1, (in_shape[axis],)).astype(dtype) beta = np.random.normal(0, 1, (in_shape[axis],)).astype(dtype) @@ -3479,7 +3479,7 @@ def l1norm(input_data, axis=0, keepdims=True): def 
l2norm(input_data, axis=0, keepdims=True): return sp_norm(input_data, axis=axis, keepdims=keepdims) - ctx = default_context() + ctx = default_device() data = mx.symbol.Variable('data') in_data_dim = random_sample([2,3,4], 1)[0] in_shape = rand_shape_nd(in_data_dim, dim=5) @@ -3600,7 +3600,7 @@ def sequence_reverse_numpy(array, lengths, axis): def check_sequence_func(ftype, mask_value=0, axis=0): # bind with label - xpu = default_context() + xpu = default_device() X = mx.symbol.Variable('X') L = mx.symbol.Variable('L') # lengths shapes = [(3, 4), (1, 1), (3, 4, 3, 1, 1)] @@ -3743,7 +3743,7 @@ def mathematical_core_binary(name, arr_grad2 = mx.nd.empty(shape) test = forward_mxnet_call(data1, data2) - exe_test = test._bind(default_context(), args=[arr_data1, arr_data2], args_grad=[arr_grad1, arr_grad2]) + exe_test = test._bind(default_device(), args=[arr_data1, arr_data2], args_grad=[arr_grad1, arr_grad2]) exe_test.forward(is_train=True) out = exe_test.outputs[0] npout = forward_numpy_call(data_tmp1, data_tmp2) @@ -3773,7 +3773,7 @@ def mathematical_core(name, forward_mxnet_call, forward_numpy_call, backward_num arr_grad[:] = 3 test = forward_mxnet_call(data) - exe_test = test._bind(default_context(), args=[arr_data], args_grad=[arr_grad]) + exe_test = test._bind(default_device(), args=[arr_data], args_grad=[arr_grad]) exe_test.forward(is_train=True) out = exe_test.outputs[0] npout = forward_numpy_call(data_tmp) @@ -3820,7 +3820,7 @@ def rounding(name, forward_mxnet_call, forward_numpy_call, data_init=5., grad_in arr_data = mx.nd.array(data_tmp) test = forward_mxnet_call(data) - exe_test = test._bind(default_context(), args=[arr_data]) + exe_test = test._bind(default_device(), args=[arr_data]) exe_test.forward(is_train=True) out = exe_test.outputs[0] npout = forward_numpy_call(data_tmp) @@ -3943,7 +3943,7 @@ def test_clip(): def test_init(): def test_basic_val_init(sym_func, np_func, shape, dtype): x = sym_func(shape=shape, dtype=dtype) - exe = x._bind(default_context(), args=[], args_grad=[]) + exe = x._bind(default_device(), args=[], args_grad=[]) exe.forward(is_train=True) assert_almost_equal(exe.outputs[0], np_func(shape=shape, dtype=dtype)) assert exe.outputs[0].asnumpy().dtype == dtype @@ -4001,7 +4001,7 @@ def test_arange_like_without_axis(): def test_order(): - ctx = default_context() + ctx = default_device() def gt_topk(dat, axis, ret_typ, k, is_ascend): if ret_typ == "indices": @@ -4134,7 +4134,7 @@ def get_large_matrix(): def test_blockgrad(): a = mx.sym.Variable('a') b = mx.sym.BlockGrad(a) - exe = b._simple_bind(ctx=default_context(), a=(10, 10)) + exe = b._simple_bind(ctx=default_device(), a=(10, 10)) a_npy = np.random.rand(10, 10) exe.forward(is_train=True, a=a_npy) assert_almost_equal(exe.outputs[0], a_npy) @@ -4212,7 +4212,7 @@ def grad_helper(grad_in, axis, idx): idx = mx.sym.Variable('indices') idx = mx.sym.BlockGrad(idx) result = mx.sym.take(a=data, indices=idx, axis=axis, mode=mode) - exe = result._simple_bind(default_context(), a=data_shape, + exe = result._simple_bind(default_device(), a=data_shape, indices=idx_shape) data_real = np.random.normal(size=data_shape).astype('float32') if out_of_range: @@ -4257,7 +4257,7 @@ def test_grid_generator(): for target_shape in test_case: affine_matrix = mx.sym.Variable('affine') grid = mx.sym.GridGenerator(data=affine_matrix,transform_type='affine', target_shape=target_shape) - exe = grid._simple_bind(ctx=default_context(), affine=(1,6), grad_req='write') + exe = grid._simple_bind(ctx=default_device(), affine=(1,6), 
grad_req='write') # check forward exe.arg_dict['affine'][:] = np.array([[1.0,0,0,0,1.0,0]]) @@ -4279,7 +4279,7 @@ def test_grid_generator(): grad_est = np.dot(out_grad[0].reshape(2,target_shape[0]*target_shape[1]),tmp.T).reshape(1,6) assert_almost_equal(exe.grad_dict['affine'], grad_est) # check addto - exe = grid._simple_bind(ctx=default_context(), affine=(1,6), grad_req='add') + exe = grid._simple_bind(ctx=default_device(), affine=(1,6), grad_req='add') grid_grad_npy = np.random.normal(size=exe.grad_dict['affine'].shape) exe.grad_dict['affine'][:] = grid_grad_npy exe.arg_dict['affine'][:] = np.array([[1.0, 0, 0, 0, 1.0, 0]]) @@ -4292,7 +4292,7 @@ def test_grid_generator(): for target_shape in test_case: flow = mx.sym.Variable('flow') grid = mx.sym.GridGenerator(data=flow,transform_type='warp', target_shape=target_shape) - exe = grid._simple_bind(ctx=default_context(), flow=(1,2)+target_shape, grad_req='write') + exe = grid._simple_bind(ctx=default_device(), flow=(1,2)+target_shape, grad_req='write') # check forward exe.arg_dict['flow'][:] = np.ones((1,2)+target_shape) exe.forward(is_train=True) @@ -4310,7 +4310,7 @@ def test_grid_generator(): grad_est[0,1] = out_grad[0,1] / ((target_shape[0]-1.0) / 2.0) assert_almost_equal(exe.grad_dict['flow'], grad_est, rtol=1e-3) # check addto - exe_add = grid._simple_bind(ctx=default_context(), flow=(1, 2) + target_shape, grad_req='add') + exe_add = grid._simple_bind(ctx=default_device(), flow=(1, 2) + target_shape, grad_req='add') flow_grad_npy = np.random.normal(size=exe_add.grad_dict['flow'].shape) exe_add.arg_dict['flow'][:] = np.ones((1, 2) + target_shape) exe_add.grad_dict['flow'][:] = flow_grad_npy @@ -4323,8 +4323,8 @@ def test_index2d(): for _ in range(30): n = np.random.randint(1, 100) m = np.random.randint(1, 500) - data = mx.random.uniform(-1, 1, shape=(n, m), ctx=default_context()) - x = mx.nd.array(np.random.randint(0, m, size=n), ctx=default_context(), dtype='int32') + data = mx.random.uniform(-1, 1, shape=(n, m), ctx=default_device()) + x = mx.nd.array(np.random.randint(0, m, size=n), ctx=default_device(), dtype='int32') r = mx.nd.batch_take(data, x) assert_almost_equal(r, data.asnumpy()[np.arange(n), x.asnumpy()]) @@ -4334,13 +4334,13 @@ def test_cast(): for dsttype in [np.float32, np.int32, np.float16]: x = mx.sym.Variable('x', dtype=srctype) y = mx.sym.Cast(x, dtype=dsttype) - exe = y._simple_bind(ctx=default_context(), x=(10, 10)) + exe = y._simple_bind(ctx=default_device(), x=(10, 10)) assert exe.arg_arrays[0].dtype == srctype X = np.random.uniform(-10, 10, size=(10, 10)) exe.arg_arrays[0][:] = X exe.forward(is_train=True) assert exe.outputs[0].dtype == dsttype - exe.backward(mx.nd.array(X, dtype=dsttype, ctx=default_context())) + exe.backward(mx.nd.array(X, dtype=dsttype, ctx=default_device())) assert_almost_equal(exe.outputs[0], X.astype(srctype).astype(dsttype), rtol=1e-3, atol=1e-5) assert_almost_equal(exe.grad_arrays[0], X.astype(dsttype).astype(srctype), rtol=1e-3, atol=1e-5) @@ -4374,7 +4374,7 @@ def test_cast_float32_to_float16(): def check_cast(op, input_np, expected_output): x = mx.sym.Variable('x', dtype=np.float32) sym = op(x, dtype=np.float16) - ctx = default_context() + ctx = default_device() exe = sym._bind(ctx, {'x': mx.nd.array(input_np, dtype=np.float32, ctx=ctx)}) assert exe.arg_arrays[0].dtype == np.float32 exe.forward(is_train=True) @@ -4391,12 +4391,12 @@ def check_cast(op, input_np, expected_output): def test_amp_multicast(): - if default_context().device_type == 'cpu': + if default_device().device_type == 
'cpu': return x = mx.sym.Variable('x', dtype=np.float16) y = mx.sym.Variable('y', dtype=np.float32) z = mx.sym.Variable('z', dtype=np.float16) - ctx = default_context() + ctx = default_device() res = mx.sym.amp_multicast(x, y, z, num_outputs=3) exe = res._bind(ctx, {'x': mx.nd.random.uniform(shape=(3, 3), dtype=np.float16, ctx=ctx), 'y': mx.nd.random.uniform(shape=(3, 3), dtype=np.float32, ctx=ctx), @@ -4411,7 +4411,7 @@ def check_amp_multicast(input_np, expected_output): x = mx.sym.Variable('x', dtype=np.float16) y = mx.sym.Variable('y', dtype=np.float32) z = mx.sym.Variable('z', dtype=np.float16) - ctx = default_context() + ctx = default_device() res = mx.sym.amp_multicast(x, y, z, num_outputs=3) exe = res._bind(ctx, {'x': mx.nd.array(input_np, dtype=np.float16, ctx=ctx), 'y': mx.nd.array(input_np, dtype=np.float32, ctx=ctx), @@ -4435,7 +4435,7 @@ def test_all_finite(): finite_arr = mx.nd.array([[0, 0]]) inf_arr = mx.nd.array([[np.inf, np.inf]]) z = mx.sym.all_finite(data) - ctx = default_context() + ctx = default_device() exe = z._bind(ctx, {'data': inf_arr}) exe.forward(is_train=False) sym_output = exe.outputs[0].asnumpy() @@ -4467,7 +4467,7 @@ def test_repeat_forward(): shape += (np.random.randint(1, size_max+1), ) a = np.random.random_sample(size=shape) aa = np.repeat(a, repeats) - b = mx.nd.array(a, ctx=default_context()) + b = mx.nd.array(a, ctx=default_device()) bb = mx.nd.repeat(b, repeats) assert_almost_equal(aa, bb) @@ -4486,7 +4486,7 @@ def test_repeat_backward(axis): arr_grad = mx.nd.empty(shape) repeats = 2 test = mx.sym.repeat(data, repeats=repeats, axis=axis) - exe = test._bind(ctx=default_context(), args=[arr_data], args_grad=[arr_grad]) + exe = test._bind(ctx=default_device(), args=[arr_data], args_grad=[arr_grad]) npout_grad = np.random.randint(0, 10, n1 * n2 * repeats) if axis == 0: npout_grad = npout_grad.reshape(n1 * repeats, n2) @@ -4570,7 +4570,7 @@ def test_empty_tensor(): shape = (2, 3, 0, 4) with mx.np_shape(): a = np.array([], dtype=np.int32).reshape(shape) - b = mx.nd.array(a, ctx=default_context(), dtype=a.dtype) + b = mx.nd.array(a, ctx=default_device(), dtype=a.dtype) reps = (2, 4, 6) a_tiled = np.tile(a, reps) @@ -4579,7 +4579,7 @@ def test_empty_tensor(): def test_empty_reps(): a = np.array([[2, 3, 4], [5, 6, 7]], dtype=np.int32) - b = mx.nd.array(a, ctx=default_context(), dtype=a.dtype) + b = mx.nd.array(a, ctx=default_device(), dtype=a.dtype) a_tiled = np.tile(a, ()) b_tiled = mx.nd.tile(b, ()).asnumpy() assert same(a_tiled, b_tiled) @@ -4596,7 +4596,7 @@ def test_tile_backward(): reps2 = 2 reps = (reps1, reps2) test = mx.sym.tile(data, reps=reps) - exe = test._bind(ctx=default_context(), args=[arr_data], args_grad=[arr_grad]) + exe = test._bind(ctx=default_device(), args=[arr_data], args_grad=[arr_grad]) npout_grad = np.random.randint(0, 10, n1 * n2 * reps1 * reps2).reshape(n1 * reps1, n2 * reps2) out_grad = mx.nd.array(npout_grad) exe.backward(out_grad) @@ -4648,7 +4648,7 @@ def test_normal_case(index_type=np.int32): indices = np.random.randint(-dim_size_max, dim_size_max+1, size=np.prod(shape)).reshape(shape) mx_one_hot_array = mx.nd.one_hot( - mx.nd.array(indices, ctx=default_context(), dtype=index_type), + mx.nd.array(indices, ctx=default_device(), dtype=index_type), depth=depth, dtype=np.int32) expected_array = np.zeros((np.prod(shape), depth), dtype=np.int32) expected_array[:] = off_value @@ -4668,7 +4668,7 @@ def test_empty_indices(): indices = np.array([]).reshape(shape) depth = 10 mx_one_hot_array = mx.nd.one_hot( - mx.nd.array(indices, 
ctx=default_context(), dtype=np.int32), + mx.nd.array(indices, ctx=default_device(), dtype=np.int32), depth=depth, dtype=np.int32 ).asnumpy() expected_array = np.array([], dtype=np.int32).reshape(shape + (depth,)) @@ -4679,7 +4679,7 @@ def test_zero_depth(): indices = np.ones(shape) depth = 0 mx_one_hot_array = mx.nd.one_hot( - mx.nd.array(indices, ctx=default_context(), dtype=np.int32), + mx.nd.array(indices, ctx=default_device(), dtype=np.int32), depth=depth, dtype=np.int32).asnumpy() expected_array = np.array([], dtype=np.int32).reshape(shape + (depth, )) assert same(expected_array, mx_one_hot_array) @@ -4773,7 +4773,7 @@ def test_where_helper(shape, same_shape): where_sym = mx.sym.where(condition, x, y) # test req='write' - where_exe_write = where_sym._simple_bind(ctx=default_context(), + where_exe_write = where_sym._simple_bind(ctx=default_device(), condition=condition_np.shape, x=x_np.shape, y=y_np.shape, grad_req='write') @@ -4790,7 +4790,7 @@ def test_where_helper(shape, same_shape): # test req='add' x_grad_init = np.random.randint(30, 40, np.prod(shape)).reshape(shape) y_grad_init = np.random.randint(40, 50, np.prod(shape)).reshape(shape) - where_exe_add = where_sym._simple_bind(ctx=default_context(), + where_exe_add = where_sym._simple_bind(ctx=default_device(), condition=condition_np.shape, x=x_np.shape, y=y_np.shape, grad_req='add') @@ -4914,7 +4914,7 @@ def test_softmax_with_large_inputs(): def softmax_forward(input_data, true_output): data = mx.sym.Variable('data') out1 = data.softmax(axis=1) - exec1 = out1._bind(default_context(), args={'data': input_data}) + exec1 = out1._bind(default_device(), args={'data': input_data}) exec1.forward()[0].wait_to_read() ndarr = exec1.outputs[0][0][0][0] assert_almost_equal(ndarr, true_output, rtol=1e-5, atol=1e-5) @@ -5142,9 +5142,9 @@ def check_ctc_loss(acts, labels, loss_truth, contrib=False): ctc = mx.sym.contrib.ctc_loss(in_var, labels_var) else: ctc = mx.sym.ctc_loss(in_var, labels_var) - acts_nd = mx.nd.array(acts, ctx=default_context()) - labels_nd = mx.nd.array(labels, ctx=default_context()) - exe = ctc._bind(ctx=default_context(), args=[acts_nd, labels_nd]) + acts_nd = mx.nd.array(acts, ctx=default_device()) + labels_nd = mx.nd.array(labels, ctx=default_device()) + exe = ctc._bind(ctx=default_device(), args=[acts_nd, labels_nd]) # test forward with grad calc exe.forward(is_train=True) outTest = exe.outputs[0].copy() @@ -5190,7 +5190,7 @@ def test_ctc_loss(): check_ctc_loss(acts2, labels3, true_loss, contrib=contrib) def test_ctc_loss_with_large_classes(): - ctx = default_context() + ctx = default_device() num_classes = 6000 seq_len = 8 batch_size = 2 @@ -5272,7 +5272,7 @@ def check_ctc_loss_grad(blank_label, contrib=False): # from tf label_lens = np.array([5, 4], dtype=np.int32) loss_truth = np.array([-loss_log_prob_0, -loss_log_prob_1], np.float32) - with default_context(): + with default_device(): data = mx.nd.array(inputs) label = mx.nd.array(labels) data.attach_grad() @@ -5746,7 +5746,7 @@ def f(in_data, out_data): pytest.raises(MXNetError, custom_exc2) # 3. 
error in real execution - if default_context().device_type == 'cpu': + if default_device().device_type == 'cpu': def custom_exc3(): def f(in_data, out_data): dot = mx.nd.dot(in_data[0], in_data[1]) @@ -5859,7 +5859,7 @@ def test_deformable_convolution(num_batch, num_channel_data_deformable_group, in else: rtol, atol = 0.05, 1e-3 # By now we only have gpu implementation - if default_context().device_type == 'gpu': + if default_device().device_type == 'gpu': check_numeric_gradient(op, [im_data, offset_data, weight, bias], rtol=rtol, atol=atol, grad_nodes=grad_nodes, ctx=mx.gpu(0), numeric_eps=1.0/64) @@ -5948,7 +5948,7 @@ def test_deformable_psroipooling(): trans_std=0.1, no_trans=False, name='test_op') rtol, atol = 1e-2, 1e-3 # By now we only have gpu implementation - if default_context().device_type == 'gpu': + if default_device().device_type == 'gpu': check_numeric_gradient(op, [im_data, rois_data, offset_data], rtol=rtol, atol=atol, grad_nodes=grad_nodes, ctx=mx.gpu(0)) @@ -6086,7 +6086,7 @@ def test_gemm(): _gemm_test_helper(np.float64, True) with environment('MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION', '0'): _gemm_test_helper(np.float32, True) - if default_context().device_type == 'gpu': + if default_device().device_type == 'gpu': with environment('MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION', '1'): _gemm_test_helper(np.float32, True) @@ -6339,7 +6339,7 @@ def test_laop_2(): # Tests for linalg.gelqf # Currently disabled on GPU as they need cuda8 # and MxNet builds use cuda 7.5 - if not (default_context() == mx.cpu()): + if not (default_device() == mx.cpu()): return test_gelqf2 = _gelqf_combined_symbol(data1) # Outputs (dot(Q, Q.T), dot(L, Q)) @@ -6416,7 +6416,7 @@ def _syevd_backward(grad_u, grad_l, u, l): def test_laop_3(): # Currently disabled on GPU as syevd needs cuda8 # and MxNet builds use cuda 7.5 - if not (default_context() == mx.cpu()): + if not (default_device() == mx.cpu()): return dtype = np.float64 @@ -6485,7 +6485,7 @@ def test_laop_3(): def test_laop_4(): # Currently disabled on GPU as syevd needs cuda8 # and MxNet builds use cuda 7.5 - if not (default_context() == mx.cpu()): + if not (default_device() == mx.cpu()): return rtol_fw = 1e-6 @@ -6657,7 +6657,7 @@ def check_dropout_ratio(ratio, shape, cudnn_off=True): # test dropout x = mx.sym.var('data') y = mx.sym.Dropout(x, p=ratio, cudnn_off=cudnn_off) - exe = y._simple_bind(ctx=default_context(), data=shape) + exe = y._simple_bind(ctx=default_device(), data=shape) if ratio == 1: max_value = float('nan') @@ -6695,7 +6695,7 @@ def check_dropout_ratio(ratio, shape, cudnn_off=True): # test permanent dropout x = mx.sym.var('data') y = mx.sym.Dropout(x, p=ratio, mode='always', cudnn_off=cudnn_off) - exe = y._simple_bind(ctx=default_context(), data=shape) + exe = y._simple_bind(ctx=default_device(), data=shape) exe.arg_arrays[0][:] = 1 exe.forward(is_train=True) @@ -7737,7 +7737,7 @@ def getRandom(base, percent = 1.): @pytest.mark.serial def test_allclose_function(): - allclose_function([default_context()]) + allclose_function([default_device()]) def test_histogram(): def f(x, bins=10, range=None): @@ -7763,10 +7763,10 @@ def f(x, bins=10, range=None): bins = mx.sym.Variable("bins") histo1 = mx.sym.histogram(a=data, bins=bin_cnt, range=bin_range) histo2 = mx.sym.histogram(a=data, bins=bins) - executor1 = histo1._bind(ctx=default_context(), args={"data" : x}) + executor1 = histo1._bind(ctx=default_device(), args={"data" : x}) executor1.forward(is_train=False) assert_almost_equal(np_histo1, executor1.outputs[0].asnumpy(), 0, 0, 
("EXPECTED_histo1", "FORWARD_histo1"), equal_nan=False) - executor2 = histo2._bind(ctx=default_context(), args={"data" : x, "bins" : mx_bins}) + executor2 = histo2._bind(ctx=default_device(), args={"data" : x, "bins" : mx_bins}) executor2.forward(is_train=False) assert_almost_equal(np_histo2, executor2.outputs[0].asnumpy(), 0, 0, ("EXPECTED_histo2", "FORWARD_histo2"), equal_nan=False) @@ -7978,7 +7978,7 @@ def roialign_forward_backward(data, rois, pooled_size, spatial_scale, sampling_r return out, [dx, drois] def test_roi_align_value(sampling_ratio=0, position_sensitive=False): - ctx = default_context() + ctx = default_device() dtype = np.float32 dlen = 224 N, C, H, W = 5, 3, 16, 16 @@ -8014,7 +8014,7 @@ def test_roi_align_value(sampling_ratio=0, position_sensitive=False): # modified from test_roipooling() def test_roi_align_autograd(sampling_ratio=0): - ctx = default_context() + ctx = default_device() data = mx.symbol.Variable(name='data') rois = mx.symbol.Variable(name='rois') test = mx.symbol.contrib.ROIAlign(data=data, rois=rois, pooled_size=(4, 4), spatial_scale=1, @@ -8143,7 +8143,7 @@ def rroialign_forward(data, rois, pooled_size, spatial_scale, sampling_ratio): return out def test_rroi_align_value(sampling_ratio=-1): - ctx = default_context() + ctx = default_device() if ctx.device_type == 'gpu': print('skipped testing rroi align for gpu since it is not supported yet') return @@ -8904,7 +8904,7 @@ def convert_bias(F, q_bias, k_bias, v_bias, num_heads): num_hidden=out_dim, no_bias=False) output = mx.sym.transpose(output, axes=(1, 0, 2)) output = mx.sym.Group([output, att_score]) - executor = output._simple_bind(ctx=default_context(), + executor = output._simple_bind(ctx=default_device(), qkv=(batch_size, qkv_length, qkv_dim), q_weight=(qkv_units, qkv_dim), q_bias=(qkv_units,), @@ -8969,7 +8969,7 @@ def convert_bias(F, q_bias, k_bias, v_bias, num_heads): output = mx.sym.FullyConnected(weighted_value, weight=out_weight, bias=out_bias, flatten=False, num_hidden=out_dim, no_bias=False) output = mx.sym.Group([output, att_score]) - executor = output._simple_bind(ctx=default_context(), + executor = output._simple_bind(ctx=default_device(), qkv=(batch_size, qkv_length, qkv_dim), type_dict={'qkv': dtype}, grad_req='write') @@ -8995,7 +8995,7 @@ def convert_bias(F, q_bias, k_bias, v_bias, num_heads): @pytest.mark.serial def test_multihead_attention_selfatt(): dtypes = ['float32'] - if default_context().device_type == 'gpu': + if default_device().device_type == 'gpu': dtypes += ['float16'] for dtype in dtypes: @@ -9065,7 +9065,7 @@ def convert_bias(F, k_bias, v_bias, num_heads): num_hidden=out_dim, no_bias=False) output = mx.sym.transpose(output, axes=(1, 0, 2)) output = mx.sym.Group([output, att_score]) - executor = output._simple_bind(ctx=default_context(), + executor = output._simple_bind(ctx=default_device(), q=(batch_size, qkv_length, qkv_dim), kv=(batch_size, qkv_length, qkv_dim), q_weight=(qkv_units, qkv_dim), @@ -9138,7 +9138,7 @@ def convert_bias(F, k_bias, v_bias, num_heads): output = mx.sym.FullyConnected(weighted_value, weight=out_weight, bias=out_bias, flatten=False, num_hidden=out_dim, no_bias=False) output = mx.sym.Group([output, att_score]) - executor = output._simple_bind(ctx=default_context(), + executor = output._simple_bind(ctx=default_device(), q=(batch_size, qkv_length, qkv_dim), kv=(batch_size, qkv_length, qkv_dim), type_dict={'q': dtype, @@ -9164,7 +9164,7 @@ def convert_bias(F, k_bias, v_bias, num_heads): @pytest.mark.serial def test_multihead_attention_encdec(): dtypes 
= ['float32'] - if default_context().device_type == 'gpu': + if default_device().device_type == 'gpu': dtypes += ['float16'] for dtype in dtypes: diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py index 7ccb8f18242c..869de97a3d14 100644 --- a/tests/python/unittest/test_optimizer.py +++ b/tests/python/unittest/test_optimizer.py @@ -587,7 +587,7 @@ def test_rms(): compare_optimizer(opt1(use_fused_step=False, **kwarg), opt2(use_fused_step=True, **kwarg), shapes, dtype, rtol=rtol, atol=atol) - if default_context() == mx.cpu(): + if default_device() == mx.cpu(): compare_optimizer(opt1(use_fused_step=False, **kwarg), opt2(use_fused_step=True, **kwarg), shapes, dtype, g_stype='row_sparse', rtol=rtol, atol=atol) diff --git a/tests/python/unittest/test_sparse_ndarray.py b/tests/python/unittest/test_sparse_ndarray.py index 160cb605aad1..6a338ec3b1b9 100644 --- a/tests/python/unittest/test_sparse_ndarray.py +++ b/tests/python/unittest/test_sparse_ndarray.py @@ -67,7 +67,7 @@ def test_sparse_nd_copy(): def check_sparse_nd_copy(from_stype, to_stype, shape): from_nd = rand_ndarray(shape, from_stype) # copy to ctx - to_ctx = from_nd.copyto(default_context()) + to_ctx = from_nd.copyto(default_device()) # copy to stype to_nd = rand_ndarray(shape, to_stype) to_nd = from_nd.copyto(to_nd) @@ -471,7 +471,7 @@ def check_fallback_with_temp_resource(shape): def test_sparse_nd_random(): """ test sparse random operator on cpu """ # gpu random operator doesn't use fixed seed - if default_context().device_type is 'gpu': + if default_device().device_type is 'gpu': return shape = (100, 100) fns = [mx.nd.random.uniform, mx.nd.random.normal, mx.nd.random.gamma] @@ -881,7 +881,7 @@ def test_powerlaw_generator(csr_arr, final_row=1): def test_sparse_nd_fluent(): def check_fluent_regular(stype, func, kwargs, shape=(5, 17), equal_nan=False): with mx.name.NameManager(): - data = mx.nd.random_uniform(shape=shape, ctx=default_context()).tostype(stype) + data = mx.nd.random_uniform(shape=shape, ctx=default_device()).tostype(stype) regular = getattr(mx.ndarray, func)(data, **kwargs) fluent = getattr(data, func)(**kwargs) if isinstance(regular, list): @@ -1023,7 +1023,7 @@ def check_sparse_take(density, mode): check_sparse_take(d, m) def test_sparse_getnnz(): - if default_context().device_type is 'gpu': + if default_device().device_type is 'gpu': return def check_sparse_getnnz(density, axis): shape = rand_shape_2d() diff --git a/tests/python/unittest/test_sparse_operator.py b/tests/python/unittest/test_sparse_operator.py index 1d9b78870ef0..d82d1925ca1a 100644 --- a/tests/python/unittest/test_sparse_operator.py +++ b/tests/python/unittest/test_sparse_operator.py @@ -157,7 +157,7 @@ def all_zero(var): @pytest.mark.skip(reason="https://github.com/apache/incubator-mxnet/issues/18740") def test_elemwise_binary_ops(): # skip testing on GPU because only CPU ops are implemented - if default_context().device_type is 'gpu': + if default_device().device_type is 'gpu': return def test_elemwise_binary_op(name, lhs_stype, rhs_stype, shape, @@ -633,9 +633,9 @@ def check_sparse_mathematical_core(name, stype, args.append(arr_data) if arr_grad is not None: - exe_test = test._bind(default_context(), args=args, args_grad=[arr_grad]) + exe_test = test._bind(default_device(), args=args, args_grad=[arr_grad]) else: - exe_test = test._bind(default_context(), args=args) + exe_test = test._bind(default_device(), args=args) exe_test.forward(is_train=True) assert exe_test.outputs[0].stype == expected_result_type @@ 
-1224,7 +1224,7 @@ def check_cast_storage(shape, density, from_stype, to_stype, check_numeric_grad= check_cast_storage(shape, d, 'default', 'row_sparse') check_cast_storage(shape, d, 'row_sparse', 'default') # Test specific gpu kernels - if default_context().device_type is 'gpu': + if default_device().device_type is 'gpu': dim0 = rnd.randint(1, 10) # test gpu thread kernel check_cast_storage((dim0, rnd.randint( 1, 32)), d, 'default', 'csr') @@ -1299,7 +1299,7 @@ def test_dot_dns_csr(lhs_shape, rhs_shape, lhs_density, rhs_density, trans_lhs=F rhs_nd = rand_ndarray(rhs_shape, stype='csr', density=rhs_density) rhs_dns = rhs_nd.tostype('default') - if default_context() == mx.cpu(): + if default_device() == mx.cpu(): forward_stype = 'csr' else: forward_stype = 'default' @@ -1315,7 +1315,7 @@ def test_dot_dns_csr(lhs_shape, rhs_shape, lhs_density, rhs_density, trans_lhs=F location = {'lhs': lhs_nd, 'rhs': rhs_nd} check_symbolic_forward(out, location, [out_np], rtol=1e-3, atol=1e-4) - if default_context() == mx.cpu(): + if default_device() == mx.cpu(): # test symbolic backward backward_trans = not trans_lhs rhs_backward_grad = mx.nd.dot(lhs_nd, out_dns, transpose_a=backward_trans).asnumpy() @@ -1398,7 +1398,7 @@ def check_dot_determinism(lhs_stype, rhs_stype, lhs_density, rhs_density, transp assert_almost_equal(res1.asnumpy(), res2.asnumpy(), rtol=0.0, atol=0.0) check_dot_determinism('csr', 'default', 0.1, 1.0, True, False, 'row_sparse') - forward_stype = 'csr' if default_context() == mx.cpu() else 'default' + forward_stype = 'csr' if default_device() == mx.cpu() else 'default' check_dot_determinism('default', 'csr', 1.0, 0.1, False, False, forward_stype) check_dot_determinism('default', 'csr', 1.0, 0.1, False, True, forward_stype) check_dot_determinism('csr', 'default', 0.1, 1.0, True, False, 'default') @@ -1618,7 +1618,7 @@ def test_sparse_square_sum(): dns_data = mx.sym.Variable('data') baseline = mx.sym.sum(mx.sym.square(dns_data), axis=axis, keepdims=keepdim) igrad_expected = mx.nd.empty(dns.shape) - baseline_exec = baseline._bind(default_context(), args=[dns], + baseline_exec = baseline._bind(default_device(), args=[dns], args_grad=[igrad_expected]) baseline_exec.forward(is_train=True) baseline_exec.backward([ret_expected]) @@ -1632,7 +1632,7 @@ def test_sparse_square_sum(): # Need to add one more layer after square_sum to trigger the kernel for ograd # with default stype in square_sum op. 
baseline1 = baseline + 1 - baseline_exec1 = baseline1._bind(default_context(), args=[dns], + baseline_exec1 = baseline1._bind(default_device(), args=[dns], args_grad=[igrad_expected]) baseline_exec1.forward(is_train=True) baseline_exec1.backward([ret_expected]) @@ -1708,7 +1708,7 @@ def check_sparse_elementwise_sum_with_shape(stypes, shape, n): for stype in stypes: arr.append(rand_ndarray(shape, stype, densities[np.random.randint(0, len(densities))])) - exec1 = out._bind(default_context(), + exec1 = out._bind(default_device(), args=arr, args_grad=arr_grad) exec1.forward(is_train=True) @@ -1756,7 +1756,7 @@ def check_sparse_embedding(in_dim, out_dim, batch, densities, sparse_grad): if sparse_grad: weight_grad = weight_grad.tostype('row_sparse') args_grad = {'embed_weight': weight_grad} - exe_test = embed._bind(default_context(), args=args, args_grad=args_grad, grad_req=grad_req) + exe_test = embed._bind(default_device(), args=args, args_grad=args_grad, grad_req=grad_req) arg_map = dict(zip(embed.list_arguments(), exe_test.arg_arrays)) grad_map = dict(zip(embed.list_arguments(), exe_test.grad_arrays)) # init data @@ -1985,7 +1985,7 @@ def test_where_helper(shape): args_grad = {'condition': mx.nd.zeros_like(cond_nd), 'x': mx.nd.array(x_np).tostype('csr'), 'y' : mx.nd.array(y_np)} # test req='write' - where_exe_write = where_sym._bind(ctx=default_context(), args=args, + where_exe_write = where_sym._bind(ctx=default_device(), args=args, args_grad=args_grad, grad_req='write') # test forward req='write' @@ -2000,7 +2000,7 @@ def test_where_helper(shape): # test req='add' x_grad_init = np.random.randint(30, 40, np.prod(shape)).reshape(shape) y_grad_init = np.random.randint(40, 50, np.prod(shape)).reshape(shape) - where_exe_add = where_sym._bind(ctx=default_context(), args=args, + where_exe_add = where_sym._bind(ctx=default_device(), args=args, args_grad=args_grad, grad_req='add') where_exe_add.grad_dict['x'][:] = x_grad_init where_exe_add.grad_dict['y'][:] = y_grad_init @@ -2055,7 +2055,7 @@ def test_reshape_backward_fallback(): - or, we can have out_y = sym.dot(sparse_y, w), then grad(w) will be inferred as sparse reshape backward (from w_x to w) needs to understand how to handle sparse inputs. 
""" - ctx = default_context() + ctx = default_device() w_shape = (12, 4) w_x_shape = (1, 48) x_nd = rand_ndarray((4, 1), 'csr') diff --git a/tests/python/unittest/test_subgraph.py b/tests/python/unittest/test_subgraph.py index c4fe6dd39afe..7b3eae10fc5f 100644 --- a/tests/python/unittest/test_subgraph.py +++ b/tests/python/unittest/test_subgraph.py @@ -43,9 +43,9 @@ def make_subgraph1(stype): s = (10, 10) a_arr = mx.nd.array(np.random.normal(-0.1, 0.1, size=s), - ctx=default_context()).tostype(stype) + ctx=default_device()).tostype(stype) b_arr = mx.nd.array(np.random.normal(-0.1, 0.1, size=s), - ctx=default_context()).tostype(stype) + ctx=default_device()).tostype(stype) return (d, y, {'a': a_arr, 'b': b_arr}, {}) def create_weights(shapes, names): @@ -54,7 +54,7 @@ def create_weights(shapes, names): assert len(shapes) == len(names) for i in range(len(shapes)): sym_dict[names[i]] = mx.symbol.Variable(names[i]) - nd_dict[names[i]] = mx.nd.array(np.ones(shapes[i]), ctx=default_context()) + nd_dict[names[i]] = mx.nd.array(np.ones(shapes[i]), ctx=default_device()) return (nd_dict, sym_dict) def make_subgraph_weight(orig, shape, stype): @@ -73,7 +73,7 @@ def make_subgraph_weight(orig, shape, stype): input_list.append(input_dict[name]) subg = make_subgraph(orig, *input_list) - arr = mx.nd.random.uniform(-1, 1, shape=shape, ctx=default_context()).tostype(stype) + arr = mx.nd.random.uniform(-1, 1, shape=shape, ctx=default_device()).tostype(stype) arg_dict = weight_dict arg_dict['data'] = arr return (orig, subg, arg_dict, aux_dict) @@ -117,10 +117,10 @@ def make_subgraph4(stype): all_inputs = copy.deepcopy(inputs) all_inputs.update(aux_states) args_grad = {key : mx.nd.empty(shape=all_inputs[key].shape) for key in all_inputs.keys()} - e1 = orig._bind(ctx=default_context(), args=all_inputs, args_grad=args_grad, + e1 = orig._bind(ctx=default_device(), args=all_inputs, args_grad=args_grad, aux_states=all_inputs) args_grad = {key : mx.nd.empty(shape=all_inputs[key].shape) for key in all_inputs.keys()} - e2 = subg._bind(ctx=default_context(), args=all_inputs, args_grad=args_grad, + e2 = subg._bind(ctx=default_device(), args=all_inputs, args_grad=args_grad, aux_states=all_inputs) e1.forward(is_train=True) e2.forward(is_train=True) @@ -128,7 +128,7 @@ def make_subgraph4(stype): assert_almost_equal(e1.outputs[i].asnumpy(), e2.outputs[i].asnumpy(), rtol=0.001, atol=0.0001) - out_grads = [mx.nd.random.uniform(-1, 1, shape=out.shape, ctx=default_context()) + out_grads = [mx.nd.random.uniform(-1, 1, shape=out.shape, ctx=default_device()) for out in e1.outputs] e1.backward(out_grads) e2.backward(out_grads) diff --git a/tests/python/unittest/test_thread_local.py b/tests/python/unittest/test_thread_local.py index 9d1e529d4142..5b3ba952aafb 100644 --- a/tests/python/unittest/test_thread_local.py +++ b/tests/python/unittest/test_thread_local.py @@ -18,26 +18,26 @@ import threading import numpy as np import mxnet as mx -from mxnet import context, attribute -from mxnet.context import Context +from mxnet import device, attribute +from mxnet.device import Device from mxnet.attribute import AttrScope -from mxnet.test_utils import assert_almost_equal, set_default_context +from mxnet.test_utils import assert_almost_equal, set_default_device from mxnet.util import _NumpyArrayScope, set_np_shape -def test_context(): - ctx_list = [] - ctx_list.append(context.current_context()) +def test_device(): + device_list = [] + device_list.append(device.current_device()) def f(): - set_default_context(mx.gpu(11)) - 
ctx_list.append(context.current_context()) + set_default_device(mx.gpu(11)) + device_list.append(device.current_device()) thread = threading.Thread(target=f) thread.start() thread.join() - assert Context.devtype2str[ctx_list[0].device_typeid] == "cpu" - assert ctx_list[0].device_id == 0 - assert Context.devtype2str[ctx_list[1].device_typeid] == "gpu" - assert ctx_list[1].device_id == 11 + assert Device.devtype2str[device_list[0].device_typeid] == "cpu" + assert device_list[0].device_id == 0 + assert Device.devtype2str[device_list[1].device_typeid] == "gpu" + assert device_list[1].device_id == 11 e1 = threading.Event() e2 = threading.Event() @@ -46,17 +46,17 @@ def g(): with mx.cpu(10): e2.set() e1.wait() - if context.current_context().device_id == 10: + if device.current_device().device_id == 10: status[0] = True thread = threading.Thread(target=g) thread.start() e2.wait() - with Context("cpu", 11): + with Device("cpu", 11): e1.set() thread.join() e1.clear() e2.clear() - assert status[0], "Spawned thread didn't set the correct context" + assert status[0], "Spawned thread didn't set the correct device" def test_attrscope(): attrscope_list = [] From 02afedf5f89aa2c8b3702d235fe9f6e77c64ffca Mon Sep 17 00:00:00 2001 From: barry-jin Date: Thu, 14 Oct 2021 10:30:20 -0700 Subject: [PATCH 25/41] fix --- python/mxnet/ndarray/numpy/random.py | 38 ++++++++++++++-------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/python/mxnet/ndarray/numpy/random.py b/python/mxnet/ndarray/numpy/random.py index e54201a26526..7e3804cf99d2 100644 --- a/python/mxnet/ndarray/numpy/random.py +++ b/python/mxnet/ndarray/numpy/random.py @@ -21,7 +21,7 @@ from ...device import current_device from . import _internal as _npi from . import _api_internal -from ...util import wrap_device_to_device_func +from ...util import wrap_ctx_to_device_func __all__ = ['randint', 'uniform', 'normal', "choice", "rand", "multinomial", "multivariate_normal", @@ -30,7 +30,7 @@ "shuffle", 'gamma', 'beta', 'chisquare', 'exponential', 'lognormal', 'weibull', 'pareto', 'power'] -@wrap_device_to_device_func +@wrap_ctx_to_device_func def randint(low, high=None, size=None, dtype=None, device=None, out=None): r"""Return random integers from `low` (inclusive) to `high` (exclusive). @@ -96,7 +96,7 @@ def randint(low, high=None, size=None, dtype=None, device=None, out=None): return _api_internal.randint(low, high, size, dtype, device, out) -@wrap_device_to_device_func +@wrap_ctx_to_device_func def uniform(low=0.0, high=1.0, size=None, dtype=None, device=None, out=None): r"""Draw samples from a uniform distribution. @@ -143,7 +143,7 @@ def uniform(low=0.0, high=1.0, size=None, dtype=None, device=None, out=None): return _api_internal.uniform(low, high, size, device, dtype, out) -@wrap_device_to_device_func +@wrap_ctx_to_device_func def normal(loc=0.0, scale=1.0, size=None, dtype=None, device=None, out=None): r"""Draw random samples from a normal (Gaussian) distribution. @@ -186,7 +186,7 @@ def normal(loc=0.0, scale=1.0, size=None, dtype=None, device=None, out=None): return _api_internal.normal(loc, scale, size, device, dtype, out) -@wrap_device_to_device_func +@wrap_ctx_to_device_func def lognormal(mean=0.0, sigma=1.0, size=None, dtype=None, device=None, out=None): r"""Draw samples from a log-normal distribution. 
@@ -223,7 +223,7 @@ def lognormal(mean=0.0, sigma=1.0, size=None, dtype=None, device=None, out=None) return _mx_np_op.exp(normal(loc=mean, scale=sigma, size=size, dtype=dtype, device=device, out=out)) -@wrap_device_to_device_func +@wrap_ctx_to_device_func def logistic(loc=0.0, scale=1.0, size=None, device=None, out=None): r"""Draw samples from a logistic distribution. @@ -261,7 +261,7 @@ def logistic(loc=0.0, scale=1.0, size=None, device=None, out=None): return _api_internal.logistic(loc, scale, size, device, out) -@wrap_device_to_device_func +@wrap_ctx_to_device_func def gumbel(loc=0.0, scale=1.0, size=None, device=None, out=None): r"""Draw samples from a Gumbel distribution. @@ -348,7 +348,7 @@ def multinomial(n, pvals, size=None): return _api_internal.multinomial(n, pvals, size) -@wrap_device_to_device_func +@wrap_ctx_to_device_func def rayleigh(scale=1.0, size=None, device=None, out=None): r"""Draw samples from a Rayleigh distribution. @@ -464,7 +464,7 @@ def multivariate_normal(mean, cov, size=None, check_valid=None, tol=None): return _npi.mvn_fallback(mean, cov, size=size) -@wrap_device_to_device_func +@wrap_ctx_to_device_func def choice(a, size=None, replace=True, p=None, device=None, out=None): r"""Generates a random sample from a given 1-D array @@ -531,7 +531,7 @@ def choice(a, size=None, replace=True, p=None, device=None, out=None): return _api_internal.choice(a, size, replace, p, device, out) -@wrap_device_to_device_func +@wrap_ctx_to_device_func def exponential(scale=1.0, size=None, device=None, out=None): r"""Draw samples from an exponential distribution. @@ -564,7 +564,7 @@ def exponential(scale=1.0, size=None, device=None, out=None): return _api_internal.exponential(scale, size, device, out) -@wrap_device_to_device_func +@wrap_ctx_to_device_func def weibull(a, size=None, device=None, out=None): r"""Draw samples from a 1-parameter Weibull distribution with given parameter a, via inversion. @@ -616,7 +616,7 @@ def weibull(a, size=None, device=None, out=None): return _api_internal.weibull(a, size, device, out) -@wrap_device_to_device_func +@wrap_ctx_to_device_func def pareto(a, size=None, device=None, out=None): r"""Draw samples from a Pareto II or Lomax distribution with specified shape a. @@ -658,7 +658,7 @@ def pareto(a, size=None, device=None, out=None): return _api_internal.pareto(a, size, device, out) -@wrap_device_to_device_func +@wrap_ctx_to_device_func def power(a, size=None, device=None, out=None): r"""Draw samples in [0, 1] from a power distribution with given parameter a. @@ -700,7 +700,7 @@ def power(a, size=None, device=None, out=None): return _api_internal.powerd(a, size, device, out) -@wrap_device_to_device_func +@wrap_ctx_to_device_func def gamma(shape, scale=1.0, size=None, dtype=None, device=None, out=None): """Draw samples from a Gamma distribution. @@ -749,7 +749,7 @@ def gamma(shape, scale=1.0, size=None, dtype=None, device=None, out=None): return _api_internal.gamma(shape, scale, size, device, dtype, out) -@wrap_device_to_device_func +@wrap_ctx_to_device_func def beta(a, b, size=None, dtype=None, device=None): r"""Draw samples from a Beta distribution. @@ -807,7 +807,7 @@ def beta(a, b, size=None, dtype=None, device=None): return out.astype(dtype) -@wrap_device_to_device_func +@wrap_ctx_to_device_func def f(dfnum, dfden, size=None, device=None): r"""Draw samples from an F distribution. 
@@ -872,7 +872,7 @@ def f(dfnum, dfden, size=None, device=None): return (X * dfden) / (Y * dfnum) -@wrap_device_to_device_func +@wrap_ctx_to_device_func def chisquare(df, size=None, dtype=None, device=None): r""" chisquare(df, size=None, dtype=None, device=None) @@ -952,7 +952,7 @@ def chisquare(df, size=None, dtype=None, device=None): return gamma(df/2, 2, size=size, dtype=dtype, device=device) -@wrap_device_to_device_func +@wrap_ctx_to_device_func def rand(*size, **kwargs): r"""Random values in a given shape. @@ -1016,7 +1016,7 @@ def shuffle(x): _api_internal.shuffle(x, x) -@wrap_device_to_device_func +@wrap_ctx_to_device_func def laplace(loc=0.0, scale=1.0, size=None, dtype=None, device=None, out=None): r"""Draw random samples from a Laplace distribution. From c17825913e9767256c5481de730fd342234f85a6 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Thu, 14 Oct 2021 11:40:09 -0700 Subject: [PATCH 26/41] fix multiarray --- python/mxnet/numpy/multiarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index cce7975dd184..37777d1abb2f 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -46,7 +46,7 @@ from ..device import Device from ..util import set_module, wrap_np_unary_func, wrap_np_binary_func,\ is_np_default_dtype, wrap_ctx_to_device_func,\ - dtype_from_number + dtype_from_number, wrap_data_api_statical_func from ..device import current_device from ..ndarray import numpy as _mx_nd_np from ..ndarray.numpy import _internal as _npi From 61cb8e59f194598058a878aeeeac35f6dbb55c46 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Thu, 14 Oct 2021 14:12:17 -0700 Subject: [PATCH 27/41] update ndarray.py --- benchmark/opperf/opperf.py | 2 +- python/mxnet/ndarray/ndarray.py | 35 ++++++++++++++++++++++++--------- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/benchmark/opperf/opperf.py b/benchmark/opperf/opperf.py index 5aa466eef20a..47bd970f930d 100644 --- a/benchmark/opperf/opperf.py +++ b/benchmark/opperf/opperf.py @@ -198,7 +198,7 @@ def main(): "Output file {output_file} already exists.".format(output_file=args.output_file) # 2. RUN BENCHMARKS - ctx = _parse_mxnet_context(args.device) + ctx = _parse_mxnet_context(args.ctx) dtype = args.dtype profiler = args.profiler int64_tensor = args.int64_tensor diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index 1f49fc566c70..70aeb4862e04 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -40,7 +40,7 @@ from ..dlpack import ndarray_to_dlpack_for_read, ndarray_to_dlpack_for_write from ..dlpack import ndarray_from_dlpack, ndarray_from_numpy from ..runtime import Features -from ..context import Context, current_context +from ..device import Device, current_device from ..util import is_np_array from . import _internal from . import op @@ -2499,7 +2499,7 @@ def context(self): >>> x.context cpu(0) >>> type(x.context) - + >>> y = mx.nd.zeros((2,3), mx.gpu(0)) >>> y.context gpu(0) @@ -2508,7 +2508,7 @@ def context(self): dev_id = ctypes.c_int() check_call(_LIB.MXNDArrayGetContext( self.handle, ctypes.byref(dev_typeid), ctypes.byref(dev_id))) - return Context(Context.devtype2str[dev_typeid.value], dev_id.value) + return Device(Device.devtype2str[dev_typeid.value], dev_id.value) @property def ctx(self): @@ -2527,6 +2527,23 @@ def ctx(self): """ return self.context + @property + def device(self): + """Device context of the array. Has the same meaning as context. 
+ + Examples + -------- + >>> x = mx.nd.array([1, 2, 3, 4]) + >>> x.device + cpu(0) + >>> type(x.device) + + >>> y = mx.nd.zeros((2,3), mx.gpu(0)) + >>> y.device + gpu(0) + """ + return self.context + @property def dtype(self): """Data-type of the array's elements. @@ -3354,7 +3371,7 @@ def ones(shape, ctx=None, dtype=None, **kwargs): """ # pylint: disable= unused-argument if ctx is None: - ctx = current_context() + ctx = current_device() dtype = mx_real_t if dtype is None else dtype # pylint: disable= no-member, protected-access return _internal._ones(shape=shape, ctx=ctx, dtype=dtype, **kwargs) @@ -3538,7 +3555,7 @@ def arange(start, stop=None, step=1.0, repeat=1, infer_range=None, ctx=None, dty warnings.warn('`infer_range` argument has been deprecated', DeprecationWarning) if ctx is None: - ctx = current_context() + ctx = current_device() return _internal._arange(start=start, stop=stop, step=step, repeat=repeat, infer_range=False, dtype=dtype, ctx=str(ctx)) # pylint: enable= no-member, protected-access, too-many-arguments @@ -3584,7 +3601,7 @@ def linspace(start, stop, num, endpoint=True, ctx=None, dtype=mx_real_t): array([ 2., 2.2., 2.4, 2.6, 2.8], dtype=float32) """ if ctx is None: - ctx = current_context() + ctx = current_device() return _internal._linspace(start=start, stop=stop, num=num, endpoint=endpoint, dtype=dtype, ctx=str(ctx)) # pylint: disable= no-member, protected-access, too-many-arguments @@ -4816,7 +4833,7 @@ def zeros(shape, ctx=None, dtype=None, **kwargs): """ # pylint: disable= unused-argument if ctx is None: - ctx = current_context() + ctx = current_device() dtype = mx_real_t if dtype is None else dtype # pylint: disable= no-member, protected-access return _internal._zeros(shape=shape, ctx=ctx, dtype=dtype, **kwargs) @@ -4858,7 +4875,7 @@ def eye(N, M=0, k=0, ctx=None, dtype=None, **kwargs): """ # pylint: disable= unused-argument if ctx is None: - ctx = current_context() + ctx = current_device() dtype = mx_real_t if dtype is None else dtype # pylint: disable= no-member, protected-access return _internal._eye(N=N, M=M, k=k, ctx=ctx, dtype=dtype, **kwargs) @@ -4886,7 +4903,7 @@ def empty(shape, ctx=None, dtype=None): if isinstance(shape, int): shape = (shape, ) if ctx is None: - ctx = current_context() + ctx = current_device() if dtype is None: dtype = mx_real_t return NDArray(handle=_new_alloc_handle(shape, ctx, False, dtype)) From 1111e98ca2716150970c3d63ba9bd3b3dbda2d12 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Fri, 15 Oct 2021 10:33:08 -0700 Subject: [PATCH 28/41] fix --- python/mxnet/executor.py | 26 ++++++++++++------------- python/mxnet/gluon/parameter.py | 4 ++-- python/mxnet/ndarray/ndarray.py | 2 +- python/mxnet/symbol/symbol.py | 8 ++++---- tests/python/unittest/test_gluon_rnn.py | 2 +- 5 files changed, 21 insertions(+), 21 deletions(-) diff --git a/python/mxnet/executor.py b/python/mxnet/executor.py index 85dba4b58858..c143a214a587 100644 --- a/python/mxnet/executor.py +++ b/python/mxnet/executor.py @@ -34,13 +34,13 @@ class Executor: >>> c = 2 * a + b >>> texec = c._bind(mx.cpu(), {'a': mx.nd.array([1,2]), 'b':mx.nd.array([2,3])}) """ - def __init__(self, sym, ctx, args, args_grad, grad_req, aux_states): + def __init__(self, sym, device, args, args_grad, grad_req, aux_states): self.outputs = None self._input_names = sym.list_inputs() self._aux_names = sym.list_auxiliary_states() self._arg_names = sym.list_arguments() self._output_names = sym.list_outputs() - self._ctx = ctx + self._device = device self._grad_req = grad_req # grad_req 
self._requires_grad = False @@ -63,7 +63,7 @@ def __init__(self, sym, ctx, args, args_grad, grad_req, aux_states): for k, v in args.items(): try: i = self._input_names.index(k) - self._args[i] = v.copyto(ctx) + self._args[i] = v.copyto(device) # ignore provided arg which is not present in # input_names except ValueError: @@ -73,7 +73,7 @@ def __init__(self, sym, ctx, args, args_grad, grad_req, aux_states): for i, arg in enumerate(args): name = self._arg_names[i] index = self._input_names.index(name) - self._args[index] = arg.copyto(ctx) + self._args[index] = arg.copyto(device) # aux states if aux_states: @@ -81,12 +81,12 @@ def __init__(self, sym, ctx, args, args_grad, grad_req, aux_states): for k, v in aux_states.items(): if k in self._aux_names: i = self._input_names.index(k) - self._args[i] = v.copyto(ctx) + self._args[i] = v.copyto(device) else: assert isinstance(aux_states, (list, tuple)) for i, v in enumerate(aux_states): index = self._input_names.index(self._aux_names[i]) - self._args[index] = v.copyto(ctx) + self._args[index] = v.copyto(device) # arg grad if self._args_grad: @@ -101,7 +101,7 @@ def __init__(self, sym, ctx, args, args_grad, grad_req, aux_states): assert isinstance(grad_req, dict) req = grad_req[k] if req != 'null': - with self._ctx: + with self._device: self._args[i].attach_grad(req, stype=g.stype) self._args[i].grad[:] = g # ignore provided arg which is not present in @@ -118,7 +118,7 @@ def __init__(self, sym, ctx, args, args_grad, grad_req, aux_states): assert isinstance(grad_req, dict) req = grad_req[self._input_names[i]] if req != 'null': - with self._ctx: + with self._device: self._args[i].attach_grad(req, stype=g.stype) self._args[i].grad[:] = g self._cached_op = ndarray.CachedOp(sym) @@ -161,7 +161,7 @@ def forward(self, is_train=False, **kwargs): for name, array in kwargs.items(): if name in self._input_names: index = self._input_names.index(name) - with self._ctx: + with self._device: arr = ndarray.array(array, dtype=array.dtype) if self._args[index] is None: self._args[index] = arr @@ -172,16 +172,16 @@ def forward(self, is_train=False, **kwargs): assert isinstance(self._grad_req, dict) req = self._grad_req[name] if req != 'null': - with self._ctx: + with self._device: self._args[index].attach_grad(req) else: self._args[index][:] = arr from . 
import autograd - default_ctx = None if self._input_names else self._ctx + default_device = None if self._input_names else self._device with autograd.record(train_mode=is_train): self.outputs = self._cached_op(*self._args, - default_ctx=default_ctx) + default_device=default_device) if not isinstance(self.outputs, (list, tuple)): self.outputs = [self.outputs] return self.outputs @@ -205,7 +205,7 @@ def backward(self, out_grads=None): if out_grads is not None: if not isinstance(out_grads, (list, tuple)): out_grads = [out_grads] - out_grads = [o.copyto(self._ctx) for o in out_grads] + out_grads = [o.copyto(self._device) for o in out_grads] if self._requires_grad: if self.outputs is None: diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 950e7c121396..96d116eaaba6 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -472,7 +472,7 @@ def initialize(self, init=None, device=None, default_init=initializer.Uniform(), return self._data = self._grad = None if device is None: - device = [device.current_device()] + device = [_device.current_device()] if isinstance(device, Device): device = [device] if init is None: @@ -497,7 +497,7 @@ def reset_device(self, device): copy will be made for each device. """ if device is None: - device = [device.current_device()] + device = [_device.current_device()] if isinstance(device, Device): device = [device] if self._data: diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index 70aeb4862e04..fd76789918a1 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -2738,7 +2738,7 @@ def copyto(self, other): warnings.warn('You are attempting to copy an array to itself', RuntimeWarning) return False return _internal._copyto(self, out=other) - elif isinstance(other, Context): + elif isinstance(other, Device): hret = NDArray(_new_alloc_handle(self.shape, other, True, self.dtype)) return _internal._copyto(self, out=hret) else: diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py index 34e53da4bc30..3ef45f3d32d4 100644 --- a/python/mxnet/symbol/symbol.py +++ b/python/mxnet/symbol/symbol.py @@ -35,7 +35,7 @@ from ..base import mx_uint, py_str, string_types, integer_types, mx_int, mx_int64 from ..base import NDArrayHandle, SymbolHandle from ..base import check_call, MXNetError, NotImplementedForSymbol -from ..context import Context, current_context +from ..device import Device, current_device from ..ndarray import NDArray, _DTYPE_NP_TO_MX, _DTYPE_MX_TO_NP from ..ndarray.ndarray import _STORAGE_TYPE_STR_TO_ID, _int64_enabled, _SIGNED_INT32_UPPER_LIMIT from ..executor import Executor @@ -1537,8 +1537,8 @@ def optimize_for(self, backend, args=None, aux=None, ctx=None, aux_handle, aux_ = self._get_ndarray_inputs('aux_states', aux, self.list_auxiliary_states(), True) if ctx is None: - ctx = current_context() - assert isinstance(ctx, Context) + ctx = current_device() + assert isinstance(ctx, Device) # parse input data shape dict @@ -1940,7 +1940,7 @@ def eval(self, ctx=None, **kwargs): the result will be a list with one element. 
""" if ctx is None: - ctx = current_context() + ctx = current_device() return self._bind(ctx, kwargs).forward() def reshape(self, *args, **kwargs): diff --git a/tests/python/unittest/test_gluon_rnn.py b/tests/python/unittest/test_gluon_rnn.py index 74aa7be2351b..d43083860fc6 100644 --- a/tests/python/unittest/test_gluon_rnn.py +++ b/tests/python/unittest/test_gluon_rnn.py @@ -650,7 +650,7 @@ def test_rnn_layers_fp32(): run_rnn_layers('float32', 'float32') @assert_raises_cudnn_not_satisfied(min_version='5.1.10') -@pytest.mark.skipif(mx.context.num_gpus() == 0, reason="RNN FP16 only implemented for GPU for now") +@pytest.mark.skipif(mx.device.num_gpus() == 0, reason="RNN FP16 only implemented for GPU for now") @pytest.mark.serial def test_rnn_layers_fp16(): run_rnn_layers('float16', 'float32', mx.gpu()) From c583c9cebc7b5151fddb0c14383199078320cfc6 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Fri, 15 Oct 2021 14:16:20 -0700 Subject: [PATCH 29/41] fix --- example/gluon/image_classification.py | 32 ++-- .../super_resolution/super_resolution.py | 36 ++--- python/mxnet/_ctypes/cached_op.py | 2 + python/mxnet/gluon/data/batchify.py | 6 +- python/mxnet/gluon/parameter.py | 7 +- python/mxnet/ndarray/sparse.py | 18 +-- python/mxnet/numpy/random.py | 143 ++++++++++-------- python/mxnet/util.py | 2 +- tests/python/unittest/test_executor.py | 2 +- tests/python/unittest/test_gluon.py | 2 +- tests/python/unittest/test_gluon_estimator.py | 100 ++++++------ 11 files changed, 186 insertions(+), 164 deletions(-) diff --git a/example/gluon/image_classification.py b/example/gluon/image_classification.py index 2b4d7e8d7216..c7bfbf06e508 100644 --- a/example/gluon/image_classification.py +++ b/example/gluon/image_classification.py @@ -107,16 +107,16 @@ model_name = opt.model dataset_classes = {'mnist': 10, 'cifar10': 10, 'caltech101':101, 'imagenet': 1000, 'dummy': 1000} batch_size, dataset, classes = opt.batch_size, opt.dataset, dataset_classes[opt.dataset] -context = [mx.gpu(int(i)) for i in opt.gpus.split(',')] if opt.gpus.strip() else [mx.cpu()] -num_gpus = len(context) +device = [mx.gpu(int(i)) for i in opt.gpus.split(',')] if opt.gpus.strip() else [mx.cpu()] +num_gpus = len(device) batch_size *= max(1, num_gpus) lr_steps = [int(x) for x in opt.lr_steps.split(',') if x.strip()] metric = CompositeEvalMetric([Accuracy(), TopKAccuracy(5)]) kv = mx.kv.create(opt.kvstore) -def get_model(model, ctx, opt): +def get_model(model, device, opt): """Model initialization.""" - kwargs = {'ctx': ctx, 'pretrained': opt.use_pretrained, 'classes': classes} + kwargs = {'device': device, 'pretrained': opt.use_pretrained, 'classes': classes} if model.startswith('resnet'): kwargs['thumbnail'] = opt.use_thumbnail elif model.startswith('vgg'): @@ -133,7 +133,7 @@ def get_model(model, ctx, opt): net.cast(opt.dtype) return net -net = get_model(opt.model, context, opt) +net = get_model(opt.model, device, opt) def get_data_iters(dataset, batch_size, opt): """get dataset iterators""" @@ -159,14 +159,14 @@ def get_data_iters(dataset, batch_size, opt): train_data, val_data = dummy_iterator(batch_size, (3, shape_dim, shape_dim)) return train_data, val_data -def test(ctx, val_data): +def test(device, val_data): metric.reset() val_data.reset() for batch in val_data: data = gluon.utils.split_and_load(batch.data[0].astype(opt.dtype, copy=False), - ctx_list=ctx, batch_axis=0) + device_list=device, batch_axis=0) label = gluon.utils.split_and_load(batch.label[0].astype(opt.dtype, copy=False), - ctx_list=ctx, batch_axis=0) + device_list=device, 
batch_axis=0) outputs = [net(X) for X in data] metric.update(label, outputs) return metric.get() @@ -188,13 +188,13 @@ def save_checkpoint(epoch, top1, best_acc): net.save_parameters(fname) logger.info('[Epoch %d] Saving checkpoint to %s with Accuracy: %.4f', epoch, fname, top1) -def train(opt, ctx): - if isinstance(ctx, mx.Context): - ctx = [ctx] +def train(opt, device): + if isinstance(device, mx.Device): + device = [device] train_data, val_data = get_data_iters(dataset, batch_size, opt) for p in net.collect_params().values(): - p.reset_ctx(ctx) + p.reset_device(device) trainer = gluon.Trainer(net.collect_params(), 'sgd', optimizer_params={'learning_rate': opt.lr, 'wd': opt.wd, @@ -213,8 +213,8 @@ def train(opt, ctx): metric.reset() btic = time.time() for i, batch in enumerate(train_data): - data = gluon.utils.split_and_load(batch.data[0].astype(opt.dtype), ctx_list=ctx, batch_axis=0) - label = gluon.utils.split_and_load(batch.label[0].astype(opt.dtype), ctx_list=ctx, batch_axis=0) + data = gluon.utils.split_and_load(batch.data[0].astype(opt.dtype), device_list=device, batch_axis=0) + label = gluon.utils.split_and_load(batch.label[0].astype(opt.dtype), device_list=device, batch_axis=0) outputs = [] Ls = [] with ag.record(): @@ -245,7 +245,7 @@ def train(opt, ctx): name, acc = metric.get() logger.info('[Epoch %d] training: %s=%f, %s=%f'%(epoch, name[0], acc[0], name[1], acc[1])) logger.info('[Epoch %d] time cost: %f'%(epoch, epoch_time)) - name, val_acc = test(ctx, val_data) + name, val_acc = test(device, val_data) logger.info('[Epoch %d] validation: %s=%f, %s=%f'%(epoch, name[0], val_acc[0], name[1], val_acc[1])) # save model if meet requirements @@ -259,7 +259,7 @@ def main(): profiler.set_state('run') if opt.mode == 'hybrid': net.hybridize() - train(opt, context) + train(opt, device) if opt.builtin_profiler > 0: profiler.set_state('stop') print(profiler.dumps()) diff --git a/example/gluon/super_resolution/super_resolution.py b/example/gluon/super_resolution/super_resolution.py index 75535168cf88..f9789c1c9da0 100644 --- a/example/gluon/super_resolution/super_resolution.py +++ b/example/gluon/super_resolution/super_resolution.py @@ -133,7 +133,7 @@ def get_dataset(prefetch=False): train_data, val_data = get_dataset() mx.np.random.seed(opt.seed) -ctx = [mx.gpu(0)] if opt.use_gpu else [mx.cpu()] +device = [mx.gpu(0)] if opt.use_gpu else [mx.cpu()] class SuperResolutionNet(gluon.HybridBlock): @@ -156,14 +156,14 @@ def forward(self, x): net = SuperResolutionNet(upscale_factor) metric = mx.gluon.metric.MSE() -def test(ctx): +def test(device): val_data.reset() avg_psnr = 0 batches = 0 for batch in val_data: batches += 1 - data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0) - label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0) + data = gluon.utils.split_and_load(batch.data[0], device_list=device, batch_axis=0) + label = gluon.utils.split_and_load(batch.label[0], device_list=device, batch_axis=0) outputs = [] for x in data: outputs.append(net(x)) @@ -174,20 +174,20 @@ def test(ctx): print('validation avg psnr: %f' % avg_psnr) -def train(epoch, ctx): - if isinstance(ctx, mx.Context): - ctx = [ctx] - net.initialize(mx.init.Orthogonal(), ctx=ctx) +def train(epoch, device): + if isinstance(device, mx.Device): + device = [device] + net.initialize(mx.init.Orthogonal(), device=device) # re-initialize conv4's weight to be Orthogonal - net.conv4.initialize(mx.init.Orthogonal(scale=1), force_reinit=True, ctx=ctx) + 
net.conv4.initialize(mx.init.Orthogonal(scale=1), force_reinit=True, device=device) trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': opt.lr}) loss = gluon.loss.L2Loss() for i in range(epoch): train_data.reset() for batch in train_data: - data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0) - label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0) + data = gluon.utils.split_and_load(batch.data[0], device_list=device, batch_axis=0) + label = gluon.utils.split_and_load(batch.label[0], device_list=device, batch_axis=0) outputs = [] with ag.record(): for x, y in zip(data, label): @@ -201,20 +201,20 @@ def train(epoch, ctx): name, acc = metric.get() metric.reset() print('training mse at epoch %d: %s=%f'%(i, name, acc)) - test(ctx) + test(device) net.save_parameters(path.join(this_dir, 'superres.params')) -def resolve(ctx): +def resolve(device): from PIL import Image - if isinstance(ctx, list): - ctx = [ctx[0]] + if isinstance(device, list): + device = [device[0]] img_basename = path.splitext(path.basename(opt.resolve_img))[0] img_dirname = path.dirname(opt.resolve_img) - net.load_parameters(path.join(this_dir, 'superres.params'), ctx=ctx) + net.load_parameters(path.join(this_dir, 'superres.params'), device=device) img = Image.open(opt.resolve_img).convert('YCbCr') y, cb, cr = img.split() data = mx.np.expand_dims(mx.np.expand_dims(mx.np.array(y), axis=0), axis=0) @@ -229,6 +229,6 @@ def resolve(ctx): out_img.save(path.join(img_dirname, '{}-resolved.png'.format(img_basename))) if opt.resolve_img: - resolve(ctx) + resolve(device) else: - train(opt.epochs, ctx) + train(opt.epochs, device) diff --git a/python/mxnet/_ctypes/cached_op.py b/python/mxnet/_ctypes/cached_op.py index 03715ef612a5..509484b7c3e4 100644 --- a/python/mxnet/_ctypes/cached_op.py +++ b/python/mxnet/_ctypes/cached_op.py @@ -74,6 +74,8 @@ def __call__(self, *args, **kwargs): """ctypes implementation of imperative invoke wrapper""" # New FFI only supports numpy ndarray default_device = kwargs.pop('default_device', None) + if not default_device: + default_device = kwargs.pop('default_ctx', None) out = kwargs.pop('out', None) if kwargs: raise TypeError( diff --git a/python/mxnet/gluon/data/batchify.py b/python/mxnet/gluon/data/batchify.py index 2d5ae0cb3a63..ec6b3d44bc08 100644 --- a/python/mxnet/gluon/data/batchify.py +++ b/python/mxnet/gluon/data/batchify.py @@ -93,7 +93,7 @@ def __call__(self, data): out = np.asarray(data) dtype = out.dtype if self._use_shared_mem: - return _arr.array(out, device=Device('cpu_shared', 0), dtype=dtype) + return _arr.array(out, ctx=Device('cpu_shared', 0), dtype=dtype) else: return _arr.array(out, dtype=dtype) @@ -149,7 +149,7 @@ def _pad_arrs_to_max_length(arrs, pad_val, use_shared_mem, dtype, round_to=None) device = Device('cpu_shared', 0) if use_shared_mem else cpu() - ret = _arr.array(ret, device=device, dtype=dtype) + ret = _arr.array(ret, ctx=device, dtype=dtype) return ret @@ -266,7 +266,7 @@ def _append_arrs(arrs, use_shared_mem=False, expand=False, batch_axis=0): out = arrs else: if use_shared_mem: - out = [_arr.array(x, device=Device('cpu_shared', 0)) for x in arrs] + out = [_arr.array(x, ctx=Device('cpu_shared', 0)) for x in arrs] else: out = [_arr.array(x) for x in arrs] diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 96d116eaaba6..3e72fbab6c72 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -350,13 +350,14 @@ def _finish_deferred_init(self): with 
autograd.pause(), dc.context(False): if data is None: - kwargs = {'shape': self.shape, 'dtype': self.dtype, 'device': cpu()} if is_np_array(): + kwargs = {'shape': self.shape, 'dtype': self.dtype, 'device': cpu()} if self._stype != 'default': raise ValueError("mxnet.numpy.zeros does not support stype = {}" .format(self._stype)) zeros_fn = _mx_np.zeros else: + kwargs = {'shape': self.shape, 'dtype': self.dtype, 'ctx': cpu()} kwargs['stype'] = self._stype zeros_fn = ndarray.zeros data = zeros_fn(**kwargs) @@ -391,7 +392,7 @@ def _init_grad(self): self._grad = [_mx_np.zeros(shape=i.shape, dtype=i.dtype, device=i.device) for i in self._data] else: - self._grad = [ndarray.zeros(shape=i.shape, dtype=i.dtype, device=i.device, + self._grad = [ndarray.zeros(shape=i.shape, dtype=i.dtype, ctx=i.context, stype=self._grad_stype) for i in self._data] autograd.mark_variables(self._check_and_get(self._data, list), @@ -412,7 +413,7 @@ def _reduce(self): else: # fetch all rows for 'row_sparse' param all_row_ids = ndarray.arange(0, self.shape[0], dtype='int64', device=device) - data = ndarray.zeros(self.shape, stype='row_sparse', device=device) + data = ndarray.zeros(self.shape, stype='row_sparse', ctx=device) trainer = self._trainer() if self._trainer else None if not trainer: raise RuntimeError("Cannot reduce row_sparse data for Parameter '%s' when no " \ diff --git a/python/mxnet/ndarray/sparse.py b/python/mxnet/ndarray/sparse.py index b5a25b4c07f4..f6a63b85f474 100644 --- a/python/mxnet/ndarray/sparse.py +++ b/python/mxnet/ndarray/sparse.py @@ -40,7 +40,7 @@ from ..base import _LIB, numeric_types from ..base import c_array_buf, mx_real_t, integer_types from ..base import NDArrayHandle, check_call -from ..context import Context, current_context +from ..device import Device, current_device from . import _internal from . import op try: @@ -255,7 +255,7 @@ def copyto(self, other): warnings.warn('You are attempting to copy an array to itself', RuntimeWarning) return False return _internal._copyto(self, out=other) - elif isinstance(other, Context): + elif isinstance(other, Device): hret = _ndarray_cls(_new_alloc_handle(self.stype, self.shape, other, True, self.dtype, self._aux_types)) return _internal._copyto(self, out=hret) @@ -539,7 +539,7 @@ def copyto(self, other): The copied array. If ``other`` is an ``NDArray`` or ``CSRNDArray``, then the return value and ``other`` will point to the same ``NDArray`` or ``CSRNDArray``. """ - if isinstance(other, Context): + if isinstance(other, Device): return super(CSRNDArray, self).copyto(other) elif isinstance(other, NDArray): stype = other.stype @@ -786,7 +786,7 @@ def copyto(self, other): The copied array. If ``other`` is an ``NDArray`` or ``RowSparseNDArray``, then the return value and ``other`` will point to the same ``NDArray`` or ``RowSparseNDArray``. 
""" - if isinstance(other, Context): + if isinstance(other, Device): return super(RowSparseNDArray, self).copyto(other) elif isinstance(other, NDArray): stype = other.stype @@ -998,7 +998,7 @@ def _csr_matrix_from_definition(data, indices, indptr, shape=None, ctx=None, # pylint: disable= no-member, protected-access storage_type = 'csr' # context - ctx = current_context() if ctx is None else ctx + ctx = current_device() if ctx is None else ctx # types dtype = _prepare_default_dtype(data, dtype) indptr_type = _STORAGE_AUX_TYPES[storage_type][0] if indptr_type is None else indptr_type @@ -1161,7 +1161,7 @@ def _row_sparse_ndarray_from_definition(data, indices, shape=None, ctx=None, """Create a `RowSparseNDArray` based on data and indices""" storage_type = 'row_sparse' # context - ctx = current_context() if ctx is None else ctx + ctx = current_device() if ctx is None else ctx # types dtype = _prepare_default_dtype(data, dtype) indices_type = _STORAGE_AUX_TYPES[storage_type][0] if indices_type is None else indices_type @@ -1550,7 +1550,7 @@ def zeros(stype, shape, ctx=None, dtype=None, **kwargs): if stype == 'default': return _zeros_ndarray(shape, ctx=ctx, dtype=dtype, **kwargs) if ctx is None: - ctx = current_context() + ctx = current_device() dtype = mx_real_t if dtype is None else dtype if stype in ('row_sparse', 'csr'): aux_types = _STORAGE_AUX_TYPES[stype] @@ -1583,7 +1583,7 @@ def empty(stype, shape, ctx=None, dtype=None): if isinstance(shape, int): shape = (shape, ) if ctx is None: - ctx = current_context() + ctx = current_device() if dtype is None: dtype = mx_real_t assert(stype is not None) @@ -1624,7 +1624,7 @@ def array(source_array, ctx=None, dtype=None): >>> mx.nd.sparse.array(mx.nd.sparse.zeros('row_sparse', (3, 2))) """ - ctx = current_context() if ctx is None else ctx + ctx = current_device() if ctx is None else ctx if isinstance(source_array, NDArray): assert(source_array.stype != 'default'), \ "Please use `tostype` to create RowSparseNDArray or CSRNDArray from an NDArray" diff --git a/python/mxnet/numpy/random.py b/python/mxnet/numpy/random.py index 3f10902e33d3..659f710af536 100644 --- a/python/mxnet/numpy/random.py +++ b/python/mxnet/numpy/random.py @@ -19,6 +19,7 @@ from ..ndarray import numpy as _mx_nd_np from ..random import seed +from ..util import wrap_ctx_to_device_func __all__ = ["randint", "uniform", "normal", "choice", "rand", "multinomial", "multivariate_normal", @@ -29,7 +30,8 @@ "seed"] -def randint(low, high=None, size=None, dtype=None, ctx=None, out=None): +@wrap_ctx_to_device_func +def randint(low, high=None, size=None, dtype=None, device=None, out=None): r"""Return random integers from `low` (inclusive) to `high` (exclusive). Return random integers from the "discrete uniform" distribution of @@ -54,8 +56,8 @@ def randint(low, high=None, size=None, dtype=None, ctx=None, out=None): name, i.e., 'int64', 'int', etc, so byteorder is not available and a specific precision may have different C types depending on the platform. The default value is 'np.int'. - ctx : Context, optional - Device context of output. Default is current context. + device : Device, optional + Device context of output. Default is current device. out : ndarray, optional The output ndarray (default is `None`). 
@@ -78,10 +80,11 @@ def randint(low, high=None, size=None, dtype=None, ctx=None, out=None): array([[4, 0, 2, 1], [3, 2, 2, 0]]) """ - return _mx_nd_np.random.randint(low, high, size, dtype, ctx, out) + return _mx_nd_np.random.randint(low, high, size, dtype, device, out) -def uniform(low=0.0, high=1.0, size=None, dtype=None, ctx=None, out=None): +@wrap_ctx_to_device_func +def uniform(low=0.0, high=1.0, size=None, dtype=None, device=None, out=None): r"""Draw samples from a uniform distribution. Samples are uniformly distributed over the half-open interval @@ -107,8 +110,8 @@ def uniform(low=0.0, high=1.0, size=None, dtype=None, ctx=None, out=None): Data type of output samples. When npx.is_np_default_dtype() returns False, default dtype is float32; When npx.is_np_default_dtype() returns True, default dtype is float64. - ctx : Context, optional - Device context of output. Default is current context. + device : Device, optional + Device context of output. Default is current device. Returns ------- @@ -136,10 +139,11 @@ def uniform(low=0.0, high=1.0, size=None, dtype=None, ctx=None, out=None): function to behave when passed arguments satisfying that inequality condition. """ - return _mx_nd_np.random.uniform(low, high, size=size, ctx=ctx, dtype=dtype, out=out) + return _mx_nd_np.random.uniform(low, high, size=size, device=device, dtype=dtype, out=out) -def normal(loc=0.0, scale=1.0, size=None, dtype=None, ctx=None, out=None): +@wrap_ctx_to_device_func +def normal(loc=0.0, scale=1.0, size=None, dtype=None, device=None, out=None): r"""Draw random samples from a normal (Gaussian) distribution. Samples are distributed according to a normal distribution parametrized @@ -160,8 +164,8 @@ def normal(loc=0.0, scale=1.0, size=None, dtype=None, ctx=None, out=None): Data type of output samples. When npx.is_np_default_dtype() returns False, default dtype is float32; When npx.is_np_default_dtype() returns True, default dtype is float64. - ctx : Context, optional - Device context of output, default is current context. + device : Device, optional + Device context of output, default is current device. out : ``ndarray``, optional Store output to an existing ``ndarray``. @@ -205,10 +209,11 @@ def normal(loc=0.0, scale=1.0, size=None, dtype=None, ctx=None, out=None): >>> np.abs(mu - np.mean(s)) < 0.01 array(True) """ - return _mx_nd_np.random.normal(loc, scale, size, dtype, ctx, out) + return _mx_nd_np.random.normal(loc, scale, size, dtype, device, out) -def lognormal(mean=0.0, sigma=1.0, size=None, dtype=None, ctx=None, out=None): +@wrap_ctx_to_device_func +def lognormal(mean=0.0, sigma=1.0, size=None, dtype=None, device=None, out=None): r"""Draw samples from a log-normal distribution. Draw samples from a `log-normal distribution` [1]_ with specified mean, @@ -230,8 +235,8 @@ def lognormal(mean=0.0, sigma=1.0, size=None, dtype=None, ctx=None, out=None): Otherwise, ``np.broadcast(mean, sigma).size`` samples are drawn. dtype : {'float16', 'float32', 'float64'}, optional Data type of output samples. Default is 'float32' - ctx : Context, optional - Device context of output. Default is current context. + device : Device, optional + Device context of output. Default is current device. out : ``ndarray``, optional Store output to an existing ``ndarray``. @@ -272,10 +277,11 @@ def lognormal(mean=0.0, sigma=1.0, size=None, dtype=None, ctx=None, out=None): >>> mu, sigma = 3., 1. 
# mean and standard deviation >>> s = np.random.lognormal(mu, sigma, 1000) """ - return _mx_nd_np.random.lognormal(mean, sigma, size, dtype, ctx, out) + return _mx_nd_np.random.lognormal(mean, sigma, size, dtype, device, out) -def logistic(loc=0.0, scale=1.0, size=None, ctx=None, out=None): +@wrap_ctx_to_device_func +def logistic(loc=0.0, scale=1.0, size=None, device=None, out=None): r"""Draw samples from a logistic distribution. Samples are drawn from a logistic distribution with specified @@ -293,8 +299,8 @@ def logistic(loc=0.0, scale=1.0, size=None, ctx=None, out=None): ``m * n * k`` samples are drawn. If size is ``None`` (default), a single value is returned if ``loc`` and ``scale`` are both scalars. Otherwise, ``np.broadcast(loc, scale).size`` samples are drawn. - ctx : Context, optional - Device context of output, default is current context. + device : Device, optional + Device context of output, default is current device. out : ``ndarray``, optional Store output to an existing ``ndarray``. @@ -317,10 +323,11 @@ def logistic(loc=0.0, scale=1.0, size=None, ctx=None, out=None): >>> plt.plot(bins, lgst_val * count.max() / lgst_val.max()) >>> plt.show() """ - return _mx_nd_np.random.logistic(loc, scale, size, ctx, out) + return _mx_nd_np.random.logistic(loc, scale, size, device, out) -def gumbel(loc=0.0, scale=1.0, size=None, ctx=None, out=None): +@wrap_ctx_to_device_func +def gumbel(loc=0.0, scale=1.0, size=None, device=None, out=None): r"""Draw samples from a Gumbel distribution. Draw samples from a Gumbel distribution with specified location and @@ -338,8 +345,8 @@ def gumbel(loc=0.0, scale=1.0, size=None, ctx=None, out=None): ``m * n * k`` samples are drawn. If size is ``None`` (default), a single value is returned if ``loc`` and ``scale`` are both scalars. Otherwise, ``np.broadcast(loc, scale).size`` samples are drawn. - ctx : Context, optional - Device context of output, default is current context. + device : Device, optional + Device context of output, default is current device. out : ``ndarray``, optional Store output to an existing ``ndarray``. @@ -380,7 +387,7 @@ def gumbel(loc=0.0, scale=1.0, size=None, ctx=None, out=None): ... linewidth=2, color='g') >>> plt.show() """ - return _mx_nd_np.random.gumbel(loc, scale, size, ctx, out) + return _mx_nd_np.random.gumbel(loc, scale, size, device, out) def multinomial(n, pvals, size=None, **kwargs): @@ -503,7 +510,8 @@ def multivariate_normal(mean, cov, size=None, check_valid=None, tol=None): return _mx_nd_np.random.multivariate_normal(mean, cov, size=size, check_valid=None, tol=None) -def choice(a, size=None, replace=True, p=None, ctx=None, out=None): +@wrap_ctx_to_device_func +def choice(a, size=None, replace=True, p=None, device=None, out=None): r"""Generates a random sample from a given 1-D array Parameters @@ -521,8 +529,8 @@ def choice(a, size=None, replace=True, p=None, ctx=None, out=None): The probabilities associated with each entry in a. If not given the sample assumes a uniform distribution over all entries in a. - ctx : Context, optional - Device context of output. Default is current context. + device : Device, optional + Device context of output. Default is current device. 
Returns -------- @@ -555,10 +563,11 @@ def choice(a, size=None, replace=True, p=None, ctx=None, out=None): >>> np.random.choice(5, 3, replace=False, p=[0.1, 0, 0.3, 0.6, 0]) array([2, 3, 0]) """ - return _mx_nd_np.random.choice(a, size, replace, p, ctx, out) + return _mx_nd_np.random.choice(a, size, replace, p, device, out) -def rayleigh(scale=1.0, size=None, ctx=None, out=None): +@wrap_ctx_to_device_func +def rayleigh(scale=1.0, size=None, device=None, out=None): r"""Draw samples from a Rayleigh distribution. The :math:`\chi` and Weibull distributions are generalizations of the @@ -573,8 +582,8 @@ def rayleigh(scale=1.0, size=None, ctx=None, out=None): ``m * n * k`` samples are drawn. If size is ``None`` (default), a single value is returned if ``scale`` is a scalar. Otherwise, ``np.array(scale).size`` samples are drawn. - ctx : Context, optional - Device context of output, default is current context. + device : Device, optional + Device context of output, default is current device. out : ``ndarray``, optional Store output to an existing ``ndarray``. @@ -583,7 +592,7 @@ def rayleigh(scale=1.0, size=None, ctx=None, out=None): out : ndarray or scalar Drawn samples from the parameterized Rayleigh distribution. """ - return _mx_nd_np.random.rayleigh(scale, size, ctx, out) + return _mx_nd_np.random.rayleigh(scale, size, device, out) def rand(*size, **kwargs): @@ -616,7 +625,8 @@ def rand(*size, **kwargs): return _mx_nd_np.random.uniform(0, 1, size=output_shape, **kwargs) -def exponential(scale=1.0, size=None, ctx=None, out=None): +@wrap_ctx_to_device_func +def exponential(scale=1.0, size=None, device=None, out=None): r"""Draw samples from an exponential distribution. Parameters @@ -629,8 +639,8 @@ def exponential(scale=1.0, size=None, ctx=None, out=None): ``m * n * k`` samples are drawn. If size is ``None`` (default), a single value is returned if ``scale`` is a scalar. Otherwise, ``np.array(scale).size`` samples are drawn. - ctx : Context, optional - Device context of output, default is current context. + device : Device, optional + Device context of output, default is current device. out : ``ndarray``, optional Store output to an existing ``ndarray``. @@ -639,10 +649,11 @@ def exponential(scale=1.0, size=None, ctx=None, out=None): out : ndarray or scalar Drawn samples from the parameterized exponential distribution. """ - return _mx_nd_np.random.exponential(scale, size=size, ctx=ctx, out=out) + return _mx_nd_np.random.exponential(scale, size=size, device=device, out=out) -def weibull(a, size=None, ctx=None, out=None): +@wrap_ctx_to_device_func +def weibull(a, size=None, device=None, out=None): r"""Draw samples from a 1-parameter Weibull distribution with given parameter a via inversion. @@ -681,10 +692,11 @@ def weibull(a, size=None, ctx=None, out=None): model time to failure, in modeling particle sizes, in information retrieval to model dwell time on pages, in quantitative finance to model risk etc. """ - return _mx_nd_np.random.weibull(a, size=size, ctx=ctx, out=out) + return _mx_nd_np.random.weibull(a, size=size, device=device, out=out) -def pareto(a, size=None, ctx=None, out=None): +@wrap_ctx_to_device_func +def pareto(a, size=None, device=None, out=None): r"""Draw samples from a Pareto II or Lomax distribution with specified shape a. Parameters @@ -715,10 +727,11 @@ def pareto(a, size=None, ctx=None, out=None): where a is the shape and m the scale. Here m is assumed 1. The Pareto distribution is a power law distribution. Pareto created it to describe the wealth in the economy. 
""" - return _mx_nd_np.random.pareto(a, size=size, ctx=ctx, out=out) + return _mx_nd_np.random.pareto(a, size=size, device=device, out=out) -def power(a, size=None, ctx=None, out=None): +@wrap_ctx_to_device_func +def power(a, size=None, device=None, out=None): r"""Draw samples in [0, 1] from a power distribution with given parameter a. Parameters @@ -749,7 +762,7 @@ def power(a, size=None, ctx=None, out=None): The power distribution is just the inverse of the Pareto distribution and a special case of the Beta distribution. """ - return _mx_nd_np.random.power(a, size=size, ctx=ctx, out=out) + return _mx_nd_np.random.power(a, size=size, device=device, out=out) def shuffle(x): @@ -784,7 +797,8 @@ def shuffle(x): _mx_nd_np.random.shuffle(x) -def gamma(shape, scale=1.0, size=None, dtype=None, ctx=None, out=None): +@wrap_ctx_to_device_func +def gamma(shape, scale=1.0, size=None, dtype=None, device=None, out=None): """Draw samples from a Gamma distribution. Samples are drawn from a Gamma distribution with specified parameters, @@ -809,18 +823,19 @@ def gamma(shape, scale=1.0, size=None, dtype=None, ctx=None, out=None): ``m * n * k`` samples are drawn. If size is ``None`` (default), a single value is returned if ``shape`` and ``scale`` are both scalars. Otherwise, ``np.broadcast(shape, scale).size`` samples are drawn. - ctx : Context, optional - Device context of output. Default is current context. + device : Device, optional + Device context of output. Default is current device. Returns ------- out : ndarray or scalar Drawn samples from the parameterized gamma distribution. """ - return _mx_nd_np.random.gamma(shape, scale, size, dtype, ctx, out) + return _mx_nd_np.random.gamma(shape, scale, size, dtype, device, out) -def beta(a, b, size=None, dtype=None, ctx=None): +@wrap_ctx_to_device_func +def beta(a, b, size=None, dtype=None, device=None): r"""Draw samples from a Beta distribution. The Beta distribution is a special case of the Dirichlet distribution, @@ -852,8 +867,8 @@ def beta(a, b, size=None, dtype=None, ctx=None): Data type of output samples. Default is 'float32'. Dtype 'float32' or 'float64' is strongly recommended, since lower precision might lead to out of range issue. - ctx : Context, optional - Device context of output. Default is current context. + device : Device, optional + Device context of output. Default is current device. Notes ----- @@ -865,10 +880,11 @@ def beta(a, b, size=None, dtype=None, ctx=None): out : ndarray or scalar Drawn samples from the parameterized beta distribution. """ - return _mx_nd_np.random.beta(a, b, size=size, dtype=dtype, ctx=ctx) + return _mx_nd_np.random.beta(a, b, size=size, dtype=dtype, device=device) -def f(dfnum, dfden, size=None, ctx=None): +@wrap_ctx_to_device_func +def f(dfnum, dfden, size=None, device=None): r"""Draw samples from an F distribution. Samples are drawn from an F distribution with specified parameters, @@ -892,8 +908,8 @@ def f(dfnum, dfden, size=None, ctx=None): ``m * n * k`` samples are drawn. If size is ``None`` (default), a single value is returned if ``dfnum`` and ``dfden`` are both scalars. Otherwise, ``np.broadcast(dfnum, dfden).size`` samples are drawn. - ctx : Context, optional - Device context of output. Default is current context. + device : Device, optional + Device context of output. Default is current device. Returns ------- @@ -927,10 +943,11 @@ def f(dfnum, dfden, size=None, ctx=None): the measured value is 36, so the null hypothesis is rejected at the 1% level. 
""" - return _mx_nd_np.random.f(dfnum, dfden, size=size, ctx=ctx) + return _mx_nd_np.random.f(dfnum, dfden, size=size, device=device) -def chisquare(df, size=None, dtype=None, ctx=None): +@wrap_ctx_to_device_func +def chisquare(df, size=None, dtype=None, device=None): r"""Draw samples from a chi-square distribution. When `df` independent random variables, each with standard normal @@ -949,8 +966,8 @@ def chisquare(df, size=None, dtype=None, ctx=None): ``np.array(df).size`` samples are drawn. dtype : {'float16', 'float32', 'float64'}, optional Data type of output samples. Default is 'float32'. - ctx : Context, optional - Device context of output. Default is current context. + device : Device, optional + Device context of output. Default is current device. Returns ------- @@ -993,7 +1010,7 @@ def chisquare(df, size=None, dtype=None, ctx=None): >>> np.random.chisquare(2,4) array([ 1.89920014, 9.00867716, 3.13710533, 5.62318272]) # random """ - return _mx_nd_np.random.chisquare(df, size=size, dtype=dtype, ctx=ctx) + return _mx_nd_np.random.chisquare(df, size=size, dtype=dtype, device=device) def randn(*size, **kwargs): @@ -1036,7 +1053,9 @@ def randn(*size, **kwargs): output_shape += (s,) return _mx_nd_np.random.normal(0, 1, size=output_shape, **kwargs) -def laplace(loc=0.0, scale=1.0, size=None, dtype=None, ctx=None, out=None): + +@wrap_ctx_to_device_func +def laplace(loc=0.0, scale=1.0, size=None, dtype=None, device=None, out=None): r"""Draw random samples from a Laplace distribution. Samples are distributed according to a Laplace distribution parametrized @@ -1054,8 +1073,8 @@ def laplace(loc=0.0, scale=1.0, size=None, dtype=None, ctx=None, out=None): dtype : {'float16', 'float32', 'float64'}, optional Data type of output samples. Default is 'float32' - ctx : Context, optional - Device context of output. Default is current context. + device : Device, optional + Device context of output. Default is current device. out : ``ndarray``, optional Store output to an existing ``ndarray``. @@ -1064,4 +1083,4 @@ def laplace(loc=0.0, scale=1.0, size=None, dtype=None, ctx=None, out=None): out : ndarray Drawn samples from the parameterized Laplace distribution. """ - return _mx_nd_np.random.laplace(loc, scale, size, dtype, ctx, out) + return _mx_nd_np.random.laplace(loc, scale, size, dtype, device, out) diff --git a/python/mxnet/util.py b/python/mxnet/util.py index 233fbea033ed..0976b4b1b3d5 100644 --- a/python/mxnet/util.py +++ b/python/mxnet/util.py @@ -1011,7 +1011,7 @@ def default_array(source_array, device=None, dtype=None): if is_np_array(): return _mx_np.array(source_array, device=device, dtype=dtype) else: - return _mx_nd.array(source_array, device=device, dtype=dtype) + return _mx_nd.array(source_array, ctx=device, dtype=dtype) class _NumpyDefaultDtypeScope(object): """Scope for managing NumPy default dtype semantics. 
diff --git a/tests/python/unittest/test_executor.py b/tests/python/unittest/test_executor.py index b735c8322b27..24959a8f2244 100644 --- a/tests/python/unittest/test_executor.py +++ b/tests/python/unittest/test_executor.py @@ -153,7 +153,7 @@ def check_init(static_alloc, static_shape): out = mx.sym.zeros((3,3)) flags = [('static_alloc', static_alloc), ('static_shape', static_shape)] exe = mx.ndarray.CachedOp(out, flags) - z = exe(None, default_ctx=mx.cpu()) + z = exe(None, default_device=mx.cpu()) assert np.all(z.asnumpy() == 0) check_init(False, False) diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 0d40a3196d46..ca769b7edd26 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -566,7 +566,7 @@ def test_batchnorm_backward_synchronization(variable): Tests if synchronization of BatchNorm running variables is done correctly. If not, the test sometimes fails - depending on the timing. """ - device = mx.test_utils.to_device() + device = mx.test_utils.default_device() for _ in range(20): layer = nn.BatchNorm() diff --git a/tests/python/unittest/test_gluon_estimator.py b/tests/python/unittest/test_gluon_estimator.py index 713a242687f6..9b5a9f3d62e6 100644 --- a/tests/python/unittest/test_gluon_estimator.py +++ b/tests/python/unittest/test_gluon_estimator.py @@ -53,16 +53,16 @@ def test_fit(): net = _get_test_network() dataloader, dataiter = _get_test_data() num_epochs = 1 - ctx = mx.cpu() + device = mx.cpu() loss = gluon.loss.L2Loss() acc = mx.gluon.metric.Accuracy() - net.initialize(ctx=ctx) + net.initialize(device=device) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) est = Estimator(net=net, loss=loss, train_metrics=acc, trainer=trainer, - context=ctx) + device=device) est.fit(train_data=dataloader, epochs=num_epochs) @@ -83,17 +83,17 @@ def test_validation(): net = _get_test_network() dataloader, dataiter = _get_test_data() num_epochs = 1 - ctx = mx.cpu() + device = mx.cpu() loss = gluon.loss.L2Loss() acc = mx.gluon.metric.Accuracy() val_loss = gluon.loss.L1Loss() - net.initialize(ctx=ctx) + net.initialize(device=device) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) est = Estimator(net=net, loss=loss, train_metrics=acc, trainer=trainer, - context=ctx, + device=device, val_loss=val_loss) # Input dataloader est.fit(train_data=dataloader, @@ -122,7 +122,7 @@ def test_initializer(): net = _get_test_network() train_data, _ = _get_test_data() num_epochs = 1 - ctx = mx.cpu() + device = mx.cpu() loss = gluon.loss.L2Loss() acc = mx.gluon.metric.Accuracy() @@ -130,13 +130,13 @@ def test_initializer(): est = Estimator(net=net, loss=loss, train_metrics=acc, - context=ctx) + device=device) est.fit(train_data=train_data, epochs=num_epochs) # different initializer for net and estimator net = _get_test_network() - net.initialize(mx.init.Xavier(), ctx=ctx) + net.initialize(mx.init.Xavier(), device=device) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) # catch reinit warning with warnings.catch_warnings(record=True) as w: @@ -145,14 +145,14 @@ def test_initializer(): train_metrics=acc, initializer=mx.init.MSRAPrelu(), trainer=trainer, - context=ctx) + device=device) assert 'Network already fully initialized' in str(w[-1].message) # net partially initialized, fine tuning use case - net = gluon.model_zoo.vision.resnet18_v1(pretrained=False, ctx=ctx) - net.features.initialize(ctx=ctx) + net = 
gluon.model_zoo.vision.resnet18_v1(pretrained=False, device=device) + net.features.initialize(device=device) net.features(mx.np.zeros((1, 3, 224, 224))) net.output = gluon.nn.Dense(10) #last layer not initialized - est = Estimator(net, loss=loss, train_metrics=acc, context=ctx) + est = Estimator(net, loss=loss, train_metrics=acc, device=device) dataset = gluon.data.ArrayDataset(mx.np.zeros((10, 3, 224, 224)), mx.np.zeros((10, 10))) train_data = gluon.data.DataLoader(dataset=dataset, batch_size=5) est.fit(train_data=train_data, @@ -165,17 +165,17 @@ def test_trainer(): net = _get_test_network() train_data, _ = _get_test_data() num_epochs = 1 - ctx = mx.cpu() + device = mx.cpu() loss = gluon.loss.L2Loss() acc = mx.gluon.metric.Accuracy() - net.initialize(ctx=ctx) + net.initialize(device=device) # input no trainer with warnings.catch_warnings(record=True) as w: est = Estimator(net=net, loss=loss, train_metrics=acc, - context=ctx) + device=device) assert 'No trainer specified' in str(w[-1].message) est.fit(train_data=train_data, epochs=num_epochs) @@ -187,7 +187,7 @@ def test_trainer(): loss=loss, train_metrics=acc, trainer=trainer, - context=ctx) + device=device) @mx.util.use_np @@ -196,16 +196,16 @@ def test_metric(): net = _get_test_network() train_data, _ = _get_test_data() num_epochs = 1 - ctx = mx.cpu() + device = mx.cpu() loss = gluon.loss.L2Loss() - net.initialize(ctx=ctx) + net.initialize(device=device) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) # input no metric est = Estimator(net=net, loss=loss, trainer=trainer, - context=ctx) + device=device) est.fit(train_data=train_data, epochs=num_epochs) # input list of metrics @@ -214,7 +214,7 @@ def test_metric(): loss=loss, train_metrics=metrics, trainer=trainer, - context=ctx) + device=device) est.fit(train_data=train_data, epochs=num_epochs) # input invalid metric @@ -223,13 +223,13 @@ def test_metric(): loss=loss, train_metrics='acc', trainer=trainer, - context=ctx) + device=device) # test default metric loss = gluon.loss.SoftmaxCrossEntropyLoss() est = Estimator(net=net, loss=loss, trainer=trainer, - context=ctx) + device=device) assert isinstance(est.train_metrics[0], mx.gluon.metric.Accuracy) @@ -237,9 +237,9 @@ def test_metric(): def test_loss(): ''' test with invalid loss ''' net = _get_test_network() - ctx = mx.cpu() + device = mx.cpu() acc = mx.gluon.metric.Accuracy() - net.initialize(ctx=ctx) + net.initialize(device=device) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) # input invalid loss with pytest.raises(ValueError): @@ -247,39 +247,39 @@ def test_loss(): loss='mse', train_metrics=acc, trainer=trainer, - context=ctx) + device=device) @mx.util.use_np -def test_context(): - ''' test with no context, list of context, invalid context ''' +def test_device(): + ''' test with no device, list of device, invalid device ''' net = _get_test_network() loss = gluon.loss.L2Loss() metrics = mx.gluon.metric.Accuracy() - # input no context + # input no device est = Estimator(net=net, loss=loss, train_metrics=metrics) - # input list of context - gpus = mx.context.num_gpus() - ctx = [mx.gpu(i) for i in range(gpus)] if gpus > 0 else [mx.cpu()] + # input list of device + gpus = mx.device.num_gpus() + device = [mx.gpu(i) for i in range(gpus)] if gpus > 0 else [mx.cpu()] net = _get_test_network() est = Estimator(net=net, loss=loss, train_metrics=metrics, - context=ctx) - # input invalid context + device=device) + # input invalid device with pytest.raises(ValueError): est = 
Estimator(net=net, loss=loss, train_metrics=metrics, - context='cpu') + device='cpu') with pytest.raises(AssertionError): est = Estimator(net=net, loss=loss, train_metrics=metrics, - context=[mx.gpu(0), mx.gpu(100)]) + device=[mx.gpu(0), mx.gpu(100)]) @mx.util.use_np @@ -334,9 +334,9 @@ def test_default_handlers(): train_data, _ = _get_test_data() num_epochs = 1 - ctx = mx.cpu() + device = mx.cpu() - net.initialize(ctx=ctx) + net.initialize(device=device) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) train_acc = mx.gluon.metric.RMSE() @@ -346,7 +346,7 @@ def test_default_handlers(): loss=loss, train_metrics=train_acc, trainer=trainer, - context=ctx) + device=device) # no handler(all default handlers), no warning with warnings.catch_warnings(record=True) as w: est.fit(train_data=train_data, epochs=num_epochs) @@ -387,17 +387,17 @@ def test_val_net(): val_net = _get_test_network(params=net.collect_params()) dataloader, dataiter = _get_test_data() num_epochs = 1 - ctx = mx.cpu() + device = mx.cpu() loss = gluon.loss.L2Loss() val_loss = gluon.loss.L2Loss() acc = mx.gluon.metric.Accuracy() - net.initialize(ctx=ctx) + net.initialize(device=device) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) est = Estimator(net=net, loss=loss, train_metrics=acc, trainer=trainer, - context=ctx, + device=device, val_loss=val_loss, val_net=val_net) @@ -406,20 +406,20 @@ def test_val_net(): epochs=num_epochs) ''' test partial weight sharing of two resnets ''' - net = gluon.model_zoo.vision.resnet18_v1(pretrained=False, ctx=ctx) + net = gluon.model_zoo.vision.resnet18_v1(pretrained=False, device=device) net.output = gluon.nn.Dense(10) - val_net = gluon.model_zoo.vision.resnet18_v1(pretrained=False, ctx=ctx) + val_net = gluon.model_zoo.vision.resnet18_v1(pretrained=False, device=device) val_net.output = net.output dataset = gluon.data.ArrayDataset(mx.np.zeros((10, 3, 224, 224)), mx.np.zeros((10, 10))) dataloader = gluon.data.DataLoader(dataset=dataset, batch_size=5) - net.initialize(ctx=ctx) - val_net.initialize(ctx=ctx) + net.initialize(device=device) + val_net.initialize(device=device) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) est = Estimator(net=net, loss=loss, train_metrics=acc, trainer=trainer, - context=ctx, + device=device, val_loss=val_loss, val_net=val_net) @@ -434,8 +434,8 @@ def test_val_handlers(): val_data, _ = _get_test_data() num_epochs = 1 - ctx = mx.cpu() - net.initialize(ctx=ctx) + device = mx.cpu() + net.initialize(device=device) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) train_acc = mx.gluon.metric.RMSE() @@ -445,7 +445,7 @@ def test_val_handlers(): loss=loss, train_metrics=train_acc, trainer=trainer, - context=ctx) + device=device) with warnings.catch_warnings(record=True) as w: est.fit(train_data=train_data, epochs=num_epochs) From 558d7a9f85c4afbbcdef1bd71f1a6f36cccb4d88 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Fri, 15 Oct 2021 18:43:40 -0700 Subject: [PATCH 30/41] fix tests --- .../gluon_from_experiment_to_deployment.md | 2 +- .../getting-started/gluon_migration_guide.md | 2 +- .../packages/gluon/blocks/save_load_params.md | 2 +- .../packages/gluon/image/info_gan.md | 2 +- .../gluon/training/fit_api_tutorial.md | 2 +- .../learning_rates/learning_rate_finder.md | 2 +- .../learning_rates/learning_rate_schedules.md | 2 +- .../packages/onnx/fine_tuning_gluon.md | 2 +- .../tutorials/performance/backend/profiler.md | 2 +- 
example/adversary/adversary_generation.ipynb | 2 +- example/bi-lstm-sort/bi-lstm-sort.ipynb | 2 +- .../gluon_mnist.py | 2 +- .../cifar10_kvstore_hvd.py | 2 +- example/multi-task/multi-task-learning.ipynb | 2 +- example/recommenders/demo1-MF.ipynb | 2 +- example/recommenders/demo2-dssm.ipynb | 2 +- .../contrib/estimator/batch_processor.py | 4 +- .../gluon/contrib/estimator/event_handler.py | 2 +- python/mxnet/gluon/parameter.py | 2 +- python/mxnet/gluon/trainer.py | 39 +++++++++++-------- python/mxnet/test_utils.py | 6 +-- .../dist_device_sync_kvstore_byteps.py | 2 +- .../dist_device_sync_kvstore_horovod.py | 4 +- tests/nightly/estimator/test_estimator_cnn.py | 2 +- tests/nightly/estimator/test_sentiment_rnn.py | 2 +- tests/python/gpu/test_device.py | 4 +- tests/python/gpu/test_fusion.py | 2 +- tests/python/gpu/test_nccl.py | 2 +- tests/python/gpu/test_operator_gpu.py | 2 +- tests/python/profiling/test_nvtx.py | 2 +- tests/python/unittest/test_gluon.py | 6 +-- .../unittest/test_gluon_batch_processor.py | 12 +++--- tests/python/unittest/test_gluon_data.py | 2 +- tests/python/unittest/test_operator.py | 2 +- tests/python/unittest/test_random.py | 4 +- 35 files changed, 70 insertions(+), 63 deletions(-) diff --git a/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md b/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md index 39f6d371bbb0..391a41899875 100644 --- a/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md +++ b/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md @@ -91,7 +91,7 @@ lr_factor = 0.75 # learning rate change at following epochs lr_epochs = [10, 20, 30] -num_gpus = mx.context.num_gpus() +num_gpus = mx.device.num_gpus() # you can replace num_workers with the number of cores on you device num_workers = 8 ctx = [mx.gpu(i) for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()] diff --git a/docs/python_docs/python/tutorials/getting-started/gluon_migration_guide.md b/docs/python_docs/python/tutorials/getting-started/gluon_migration_guide.md index 54a3cfd41bed..c65a7f2a6556 100644 --- a/docs/python_docs/python/tutorials/getting-started/gluon_migration_guide.md +++ b/docs/python_docs/python/tutorials/getting-started/gluon_migration_guide.md @@ -342,7 +342,7 @@ Example: import mxnet as mx # create key-value store with horovod backend kv = mx.kv.create('horovod') # or choose 'kvstore', 'byteps' as backend -ctx = mx.gpu(kv.local_rank) if mx.context.num_gpus() > 0 else mx.cpu(kv.local_rank) +ctx = mx.gpu(kv.local_rank) if mx.device.num_gpus() > 0 else mx.cpu(kv.local_rank) val = mx.np.zeros((2, 3), ctx=ctx) # broadcast the value at rank 0 to all ranks kv.broadcast('0', mx.np.zeros((2, 3), ctx=ctx), out=val) diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/save_load_params.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/save_load_params.md index 738f128bb6d8..4e98d90ee88b 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/save_load_params.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/save_load_params.md @@ -49,7 +49,7 @@ Let's define a helper function to build a LeNet model and another helper to trai ```{.python .input} # Use GPU if one exists, else use CPU -ctx = mx.gpu() if mx.context.num_gpus() else mx.cpu() +ctx = mx.gpu() if mx.device.num_gpus() else mx.cpu() # MNIST images are 28x28. 
Total pixels in input layer is 28x28 = 784 num_inputs = 784 diff --git a/docs/python_docs/python/tutorials/packages/gluon/image/info_gan.md b/docs/python_docs/python/tutorials/packages/gluon/image/info_gan.md index 34c6a3d53129..5bb842bcf30a 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/image/info_gan.md +++ b/docs/python_docs/python/tutorials/packages/gluon/image/info_gan.md @@ -50,7 +50,7 @@ batch_size = 64 z_dim = 100 n_continuous = 2 n_categories = 10 -ctx = mx.gpu() if mx.context.num_gpus() else mx.cpu() +ctx = mx.gpu() if mx.device.num_gpus() else mx.cpu() ``` Some functions to load and normalize images. diff --git a/docs/python_docs/python/tutorials/packages/gluon/training/fit_api_tutorial.md b/docs/python_docs/python/tutorials/packages/gluon/training/fit_api_tutorial.md index 9abe52388349..54d8ba4c3b7a 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/training/fit_api_tutorial.md +++ b/docs/python_docs/python/tutorials/packages/gluon/training/fit_api_tutorial.md @@ -38,7 +38,7 @@ from mxnet.gluon.model_zoo import vision from mxnet.gluon.contrib.estimator import estimator from mxnet.gluon.contrib.estimator.event_handler import TrainBegin, TrainEnd, EpochEnd, CheckpointHandler -gpu_count = mx.context.num_gpus() +gpu_count = mx.device.num_gpus() ctx = [mx.gpu(i) for i in range(gpu_count)] if gpu_count > 0 else mx.cpu() ``` diff --git a/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_finder.md b/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_finder.md index d446e60e6218..a64f3c4da9b6 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_finder.md +++ b/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_finder.md @@ -231,7 +231,7 @@ Using a Pre-activation ResNet-18 from the Gluon model zoo, we instantiate our Le ```{.python .input} -ctx = mx.gpu() if mx.context.num_gpus() else mx.cpu() +ctx = mx.gpu() if mx.device.num_gpus() else mx.cpu() net = mx.gluon.model_zoo.vision.resnet18_v2(classes=10) learner = Learner(net=net, data_loader=data_loader, ctx=ctx) lr_finder = LRFinder(learner) diff --git a/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules.md b/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules.md index 1a5f6f4516e3..c1f2fd812775 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules.md +++ b/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules.md @@ -140,7 +140,7 @@ As discussed above, the schedule should return a learning rate given an (1-based ```{.python .input} # Use GPU if one exists, else use CPU -ctx = mx.gpu() if mx.context.num_gpus() else mx.cpu() +ctx = mx.gpu() if mx.device.num_gpus() else mx.cpu() # MNIST images are 28x28. Total pixels in input layer is 28x28 = 784 num_inputs = 784 diff --git a/docs/python_docs/python/tutorials/packages/onnx/fine_tuning_gluon.md b/docs/python_docs/python/tutorials/packages/onnx/fine_tuning_gluon.md index 2c6315b6fc76..ba75c728ca59 100644 --- a/docs/python_docs/python/tutorials/packages/onnx/fine_tuning_gluon.md +++ b/docs/python_docs/python/tutorials/packages/onnx/fine_tuning_gluon.md @@ -272,7 +272,7 @@ We pick a context, fine-tuning on CPU will be **WAY** slower. 
```{.python .input} -ctx = mx.gpu() if mx.context.num_gpus() > 0 else mx.cpu() +ctx = mx.gpu() if mx.device.num_gpus() > 0 else mx.cpu() ``` We create a symbol block that is going to hold all our pre-trained layers, and assign the weights of the different pre-trained layers to the newly created SymbolBlock diff --git a/docs/python_docs/python/tutorials/performance/backend/profiler.md b/docs/python_docs/python/tutorials/performance/backend/profiler.md index e56019405287..e769a040155e 100644 --- a/docs/python_docs/python/tutorials/performance/backend/profiler.md +++ b/docs/python_docs/python/tutorials/performance/backend/profiler.md @@ -101,7 +101,7 @@ Let's define a function that will run a single training iteration given `data` a ```{.python .input} # Use GPU if available -if mx.context.num_gpus(): +if mx.device.num_gpus(): ctx=mx.gpu() else: ctx=mx.cpu() diff --git a/example/adversary/adversary_generation.ipynb b/example/adversary/adversary_generation.ipynb index bab5f81cf3c8..9a8cc53373df 100644 --- a/example/adversary/adversary_generation.ipynb +++ b/example/adversary/adversary_generation.ipynb @@ -63,7 +63,7 @@ "cell_type": "code", "execution_count": 17, "source": [ - "ctx = mx.gpu() if mx.context.num_gpus() else mx.cpu()\n", + "ctx = mx.gpu() if mx.device.num_gpus() else mx.cpu()\n", "batch_size = 128" ], "outputs": [], diff --git a/example/bi-lstm-sort/bi-lstm-sort.ipynb b/example/bi-lstm-sort/bi-lstm-sort.ipynb index 3eb08f1508d2..cc0e43b99b66 100644 --- a/example/bi-lstm-sort/bi-lstm-sort.ipynb +++ b/example/bi-lstm-sort/bi-lstm-sort.ipynb @@ -59,7 +59,7 @@ "seq_len = 5\n", "split = 0.8\n", "batch_size = 512\n", - "ctx = mx.gpu() if mx.context.num_gpus() > 0 else mx.cpu()" + "ctx = mx.gpu() if mx.device.num_gpus() > 0 else mx.cpu()" ], "outputs": [], "metadata": {} diff --git a/example/distributed_training-horovod/gluon_mnist.py b/example/distributed_training-horovod/gluon_mnist.py index c2e6f0bdc533..324b98520669 100644 --- a/example/distributed_training-horovod/gluon_mnist.py +++ b/example/distributed_training-horovod/gluon_mnist.py @@ -45,7 +45,7 @@ if not args.no_cuda: # Disable CUDA if there are no GPUs. - if mx.context.num_gpus() == 0: + if mx.device.num_gpus() == 0: args.no_cuda = True logging.basicConfig(level=logging.INFO) diff --git a/example/distributed_training/cifar10_kvstore_hvd.py b/example/distributed_training/cifar10_kvstore_hvd.py index ff679864f7c3..d60507842729 100644 --- a/example/distributed_training/cifar10_kvstore_hvd.py +++ b/example/distributed_training/cifar10_kvstore_hvd.py @@ -49,7 +49,7 @@ if not args.no_cuda: # Disable CUDA if there are no GPUs. 
- if mx.context.num_gpus() == 0: + if mx.device.num_gpus() == 0: args.no_cuda = True diff --git a/example/multi-task/multi-task-learning.ipynb b/example/multi-task/multi-task-learning.ipynb index 460e56ee7c3d..62da7aee21b7 100644 --- a/example/multi-task/multi-task-learning.ipynb +++ b/example/multi-task/multi-task-learning.ipynb @@ -78,7 +78,7 @@ "source": [ "batch_size = 128\n", "epochs = 5\n", - "ctx = mx.gpu() if mx.context.num_gpus() > 0 else mx.cpu()\n", + "ctx = mx.gpu() if mx.device.num_gpus() > 0 else mx.cpu()\n", "lr = 0.01" ], "outputs": [], diff --git a/example/recommenders/demo1-MF.ipynb b/example/recommenders/demo1-MF.ipynb index 71ea91801d71..6d37995f5c46 100644 --- a/example/recommenders/demo1-MF.ipynb +++ b/example/recommenders/demo1-MF.ipynb @@ -73,7 +73,7 @@ "cell_type": "code", "execution_count": 2, "source": [ - "ctx = [mx.gpu(0)] if mx.context.num_gpus() > 0 else [mx.cpu()]\n", + "ctx = [mx.gpu(0)] if mx.device.num_gpus() > 0 else [mx.cpu()]\n", "batch_size = 128" ], "outputs": [], diff --git a/example/recommenders/demo2-dssm.ipynb b/example/recommenders/demo2-dssm.ipynb index 9f4080a24a09..be3ad541308d 100644 --- a/example/recommenders/demo2-dssm.ipynb +++ b/example/recommenders/demo2-dssm.ipynb @@ -59,7 +59,7 @@ "hidden_units = 128\n", "epsilon_proj = 0.25\n", "\n", - "ctx = mx.gpu() if mx.context.num_gpus() > 0 else mx.cpu()" + "ctx = mx.gpu() if mx.device.num_gpus() > 0 else mx.cpu()" ], "outputs": [], "metadata": { diff --git a/python/mxnet/gluon/contrib/estimator/batch_processor.py b/python/mxnet/gluon/contrib/estimator/batch_processor.py index 5545a7678bcf..0d69004df5b7 100644 --- a/python/mxnet/gluon/contrib/estimator/batch_processor.py +++ b/python/mxnet/gluon/contrib/estimator/batch_processor.py @@ -61,7 +61,7 @@ def evaluate_batch(self, estimator, batch_axis : int, default 0 Batch axis to split the validation data into devices. """ - data, label = self._get_data_and_label(val_batch, estimator.context, batch_axis) + data, label = self._get_data_and_label(val_batch, estimator.device, batch_axis) pred = [estimator.val_net(x) for x in data] loss = [estimator.val_loss(y_hat, y) for y_hat, y in zip(pred, label)] @@ -94,7 +94,7 @@ def fit_batch(self, estimator, loss: List of NDArray Loss on each of the sharded inputs. 
""" - data, label = self._get_data_and_label(train_batch, estimator.context, batch_axis) + data, label = self._get_data_and_label(train_batch, estimator.device, batch_axis) with autograd.record(): pred = [estimator.net(x) for x in data] diff --git a/python/mxnet/gluon/contrib/estimator/event_handler.py b/python/mxnet/gluon/contrib/estimator/event_handler.py index 40620556b72f..99a50d567ffe 100644 --- a/python/mxnet/gluon/contrib/estimator/event_handler.py +++ b/python/mxnet/gluon/contrib/estimator/event_handler.py @@ -586,7 +586,7 @@ def _resume_from_checkpoint(self, estimator): trainer_file = os.path.join(self.model_dir, trainer_file) assert os.path.exists(param_file), "Failed to load checkpoint, %s does not exist" % param_file assert os.path.exists(trainer_file), "Failed to load checkpoint, %s does not exist" % trainer_file - estimator.net.load_parameters(param_file, ctx=estimator.context) + estimator.net.load_parameters(param_file, ctx=estimator.device) estimator.trainer.load_states(trainer_file) estimator.logger.warning(msg) diff --git a/python/mxnet/gluon/parameter.py b/python/mxnet/gluon/parameter.py index 3e72fbab6c72..5565452a6a94 100644 --- a/python/mxnet/gluon/parameter.py +++ b/python/mxnet/gluon/parameter.py @@ -412,7 +412,7 @@ def _reduce(self): data = self.data().copyto(device) else: # fetch all rows for 'row_sparse' param - all_row_ids = ndarray.arange(0, self.shape[0], dtype='int64', device=device) + all_row_ids = ndarray.arange(0, self.shape[0], dtype='int64', ctx=device) data = ndarray.zeros(self.shape, stype='row_sparse', ctx=device) trainer = self._trainer() if self._trainer else None if not trainer: diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index 93fe9eca4f23..0566a734d1ab 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -20,6 +20,7 @@ """Parameter optimizer.""" __all__ = ['Trainer'] +import warnings from collections import OrderedDict from .. import optimizer as opt @@ -107,7 +108,7 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device', if param._grad_stype != 'default': self._contains_sparse_grad = True self._compression_params = compression_params - self._contexts = self._check_contexts() + self._devices = self._check_devices() optimizer_params = optimizer_params if optimizer_params else {} self._init_optimizer(optimizer, optimizer_params) self._scale = self._optimizer.rescale_grad @@ -126,15 +127,21 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device', self._reset_kvstore() def _check_contexts(self): - contexts = None + """This function has been deprecated. 
Please refer to ``Trainer._check_devices``.""" + warnings.warn('Trainer._check_contexts has been renamed to' + ' Trainer._check_devices', DeprecationWarning) + return self._check_devices() + + def _check_devices(self): + devices = None for param in self._params: - ctx = param.list_ctx() - assert contexts is None or contexts == ctx, \ - "All Parameters must be initialized on the same set of contexts, " \ + device = param.list_device() + assert devices is None or devices == device, \ + "All Parameters must be initialized on the same set of devices, " \ "but Parameter %s is initialized on %s while previous Parameters " \ - "are initialized on %s."%(param.name, str(ctx), str(contexts)) - contexts = ctx - return contexts + "are initialized on %s."%(param.name, str(device), str(devices)) + devices = device + return devices def _init_optimizer(self, optimizer, optimizer_params): param_dict = {i: param for i, param in enumerate(self._params)} @@ -150,7 +157,7 @@ def _init_optimizer(self, optimizer, optimizer_params): self._optimizer = opt.create(optimizer, param_dict=param_dict, **optimizer_params) self._updaters = [opt.get_updater(self._optimizer) \ - for _ in self._contexts] + for _ in self._devices] def _init_params(self): """Initialize parameters in the KVStore. @@ -224,8 +231,8 @@ def _init_kvstore(self): # - backward() # - push_and_update(grad) # - pull(weight) - arg_arrays = {param._uuid: param.data(self._contexts[0]) for param in self._params} - kvstore, _ = _create_kvstore(config['kvstore'], len(self._contexts), arg_arrays) + arg_arrays = {param._uuid: param.data(self._devices[0]) for param in self._params} + kvstore, _ = _create_kvstore(config['kvstore'], len(self._devices), arg_arrays) self._distributed = 'dist' in kvstore.type if kvstore else False update_on_kvstore = self._distributed # raise err if user provides unsupported configs @@ -242,8 +249,8 @@ def _init_kvstore(self): else: # Training with dense weight and dense gradients. # The only unsupported mode is async with update_on_kvstore=False - arg_arrays = {param._uuid: param.data(self._contexts[0]) for param in self._params} - kvstore, update_on_kvstore = _create_kvstore(config['kvstore'], len(self._contexts), + arg_arrays = {param._uuid: param.data(self._devices[0]) for param in self._params} + kvstore, update_on_kvstore = _create_kvstore(config['kvstore'], len(self._devices), arg_arrays) self._distributed = 'dist' in kvstore.type if kvstore else False if self._distributed and 'async' in kvstore.type: @@ -361,7 +368,7 @@ def step(self, batch_size, ignore_stale_grad=False): self._update(ignore_stale_grad) def allreduce_grads(self): - """For each parameter, reduce the gradients from different contexts. + """For each parameter, reduce the gradients from different devices. Should be called after `autograd.backward()`, outside of `record()` scope, and before `trainer.update()`. @@ -457,13 +464,13 @@ def _update(self, ignore_stale_grad=False): for data in param._check_and_get(param._data, list): if not data._fresh_grad: raise UserWarning( - "Gradient of Parameter `%s` on context %s has not been updated " + "Gradient of Parameter `%s` on device %s has not been updated " "by backward since last `step`. This could mean a bug in your " "model that made it only use a subset of the Parameters (Blocks) " "for this iteration. 
If you are intentionally only using a subset, " "call step with ignore_stale_grad=True to suppress this " "warning and skip updating of Parameters with stale gradient" \ - %(param.name, str(data.context))) + %(param.name, str(data.device))) if self._kvstore and self._update_on_kvstore: continue diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index b18e8965196e..07b738cbe58d 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -843,7 +843,7 @@ def assert_exception(f, exception_type, *args, **kwargs): return -def _parse_location(sym, location, device, dtype=default_dtype()): +def _parse_location(sym, location, ctx, dtype=default_dtype()): """Parses the given location to a ordered dictionary. Arguments of the provided op `sym` are used as dictionary keys @@ -862,7 +862,7 @@ def _parse_location(sym, location, device, dtype=default_dtype()): - if type is dict of str -> `np.ndarray` maps the name of arguments to the corresponding `np.ndarray`. *In either case, value of all the arguments must be provided.* - device : Device + ctx : Device Device context. dtype: "asnumpy" or np.float16 or np.float32 or np.float64 If dtype is "asnumpy" then the mx.nd.array created will have the same @@ -898,7 +898,7 @@ def _parse_location(sym, location, device, dtype=default_dtype()): % (str(set(sym.list_arguments())), str(set(location.keys())))) else: location = {k: v for k, v in zip(sym.list_arguments(), location)} - location = {k: mx.nd.array(v, ctx=device, dtype=v.dtype if dtype == "asnumpy" else dtype) \ + location = {k: mx.nd.array(v, ctx=ctx, dtype=v.dtype if dtype == "asnumpy" else dtype) \ if isinstance(v, np.ndarray) else v for k, v in location.items()} return _sorted_dict(location) diff --git a/tests/nightly/dist_device_sync_kvstore_byteps.py b/tests/nightly/dist_device_sync_kvstore_byteps.py index c41875d981c6..43df9a501752 100644 --- a/tests/nightly/dist_device_sync_kvstore_byteps.py +++ b/tests/nightly/dist_device_sync_kvstore_byteps.py @@ -52,7 +52,7 @@ def check_diff_to_scalar(A, x, rank=None): my_rank = kv.rank my_num_workers = kv.num_workers -has_gpu = mx.context.num_gpus() > 0 +has_gpu = mx.device.num_gpus() > 0 def get_current_device(device=False): if has_gpu and device==True: diff --git a/tests/nightly/dist_device_sync_kvstore_horovod.py b/tests/nightly/dist_device_sync_kvstore_horovod.py index b5dfcafc8af1..3e2c2b98e270 100644 --- a/tests/nightly/dist_device_sync_kvstore_horovod.py +++ b/tests/nightly/dist_device_sync_kvstore_horovod.py @@ -51,7 +51,7 @@ def check_diff_to_scalar(A, x, rank=None): def test_pushpull(): - ctx = mx.gpu(kv.local_rank) if mx.context.num_gpus() > 0 else mx.cpu(kv.local_rank) + ctx = mx.gpu(kv.local_rank) if mx.device.num_gpus() > 0 else mx.cpu(kv.local_rank) scale = kv.rank + 1 tensor = mx.nd.ones(shape, ctx) * scale kv.pushpull('3', tensor) @@ -62,7 +62,7 @@ def test_pushpull(): def test_broadcast(): - ctx = mx.gpu(kv.local_rank) if mx.context.num_gpus() > 0 else mx.cpu(kv.local_rank) + ctx = mx.gpu(kv.local_rank) if mx.device.num_gpus() > 0 else mx.cpu(kv.local_rank) val = mx.nd.zeros(shape, ctx) kv.broadcast('0', mx.nd.ones(shape), out=val) expected = 1 diff --git a/tests/nightly/estimator/test_estimator_cnn.py b/tests/nightly/estimator/test_estimator_cnn.py index ef2fd566a0c6..7b6812a70d68 100644 --- a/tests/nightly/estimator/test_estimator_cnn.py +++ b/tests/nightly/estimator/test_estimator_cnn.py @@ -122,7 +122,7 @@ def test_estimator_cpu(): @pytest.mark.seed(7) # using fixed seed to reduce flakiness in accuracy 
assertion -@pytest.mark.skipif(mx.context.num_gpus() < 1, reason="skip if no GPU") +@pytest.mark.skipif(mx.device.num_gpus() < 1, reason="skip if no GPU") def test_estimator_gpu(): ''' Test estimator by training resnet18_v1 for 5 epochs on MNIST and verify accuracy diff --git a/tests/nightly/estimator/test_sentiment_rnn.py b/tests/nightly/estimator/test_sentiment_rnn.py index 311e4277179e..e481f70cda49 100644 --- a/tests/nightly/estimator/test_sentiment_rnn.py +++ b/tests/nightly/estimator/test_sentiment_rnn.py @@ -238,7 +238,7 @@ def test_estimator_cpu(): @pytest.mark.seed(7) # using fixed seed to reduce flakiness in accuracy assertion -@pytest.mark.skipif(mx.context.num_gpus() < 1, reason="skip if no GPU") +@pytest.mark.skipif(mx.device.num_gpus() < 1, reason="skip if no GPU") def test_estimator_gpu(): ''' Test estimator by training Bidirectional RNN for 5 epochs on the IMDB dataset diff --git a/tests/python/gpu/test_device.py b/tests/python/gpu/test_device.py index 76a32def33f5..3c446dfb4fe6 100644 --- a/tests/python/gpu/test_device.py +++ b/tests/python/gpu/test_device.py @@ -24,7 +24,7 @@ shapes = [(10), (100), (1000), (10000), (100000), (2,2), (2,3,4,5,6,7,8)] keys = [1,2,3,4,5,6,7] -num_gpus = mx.context.num_gpus() +num_gpus = mx.device.num_gpus() if num_gpus > 8 : @@ -34,7 +34,7 @@ gpus = range(1, 1+num_gpus) -@pytest.mark.skipif(mx.context.num_gpus() < 1, reason="test_device_pushpull needs at least 1 GPU") +@pytest.mark.skipif(mx.device.num_gpus() < 1, reason="test_device_pushpull needs at least 1 GPU") def test_device_pushpull(): def check_dense_pushpull(kv_type): for shape, key in zip(shapes, keys): diff --git a/tests/python/gpu/test_fusion.py b/tests/python/gpu/test_fusion.py index a18be14460d0..6e72a38b5ecb 100644 --- a/tests/python/gpu/test_fusion.py +++ b/tests/python/gpu/test_fusion.py @@ -269,7 +269,7 @@ def test_fusion_compiler_cache(): check_fused_symbol(a+b, ctx=mx.gpu(0), a=arr1, b=arr2) # On multi-GPU systems, invoke the same model on other GPUs - num_gpus = mx.context.num_gpus() + num_gpus = mx.device.num_gpus() if num_gpus > 1: check_fused_symbol(a+b, ctx=mx.gpu(1), a=arr1, b=arr2) diff --git a/tests/python/gpu/test_nccl.py b/tests/python/gpu/test_nccl.py index 136ced2aac73..6452ea867bbe 100644 --- a/tests/python/gpu/test_nccl.py +++ b/tests/python/gpu/test_nccl.py @@ -22,7 +22,7 @@ shapes = [(10), (100), (1000), (10000), (100000), (2,2), (2,3,4,5,6,7,8)] keys = [1,2,3,4,5,6,7] -num_gpus = mx.context.num_gpus() +num_gpus = mx.device.num_gpus() if num_gpus > 8 : diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 066ab97f6a77..43560749e6f6 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -2193,7 +2193,7 @@ def test_allclose_function_gpu(): def test_context_num_gpus(): # Test that num_gpus reports at least one GPU, as the test is run on a GPU host. 
- assert mx.context.num_gpus() > 0 + assert mx.device.num_gpus() > 0 def math_log(shape, dtype, check_value): np_x = np.random.rand(*tuple(shape)) diff --git a/tests/python/profiling/test_nvtx.py b/tests/python/profiling/test_nvtx.py index 5cf708b7a700..92db2ff10a82 100644 --- a/tests/python/profiling/test_nvtx.py +++ b/tests/python/profiling/test_nvtx.py @@ -22,7 +22,7 @@ from subprocess import Popen, PIPE -@pytest.mark.skipif(not mx.context.num_gpus(), reason='Test only applicable to machines with GPUs') +@pytest.mark.skipif(not mx.device.num_gpus(), reason='Test only applicable to machines with GPUs') def test_nvtx_ranges_present_in_profile(): # Build a system independent wrapper to execute simple_forward with nvprof diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index ca769b7edd26..88af86ff616a 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -698,7 +698,7 @@ def _syncParameters(bn1, bn2, device): input2grad.asnumpy(), atol=atol, rtol=rtol) cfgs = [(1, False)] - num_gpus = 0 if default_device().device_type != 'gpu' else mx.context.num_gpus() + num_gpus = 0 if default_device().device_type != 'gpu' else mx.device.num_gpus() batch_size = 24 for i in range(1, num_gpus + 1): if batch_size % i == 0: @@ -1442,7 +1442,7 @@ def _test_multi_reset(nArrays, dtype, device): shape = () for _ in range(onp.random.randint(1, 5)): shape = shape + (onp.random.randint(1, 10),) - arr.append(mx.nd.random.uniform(shape=shape, dtype=arrType, device=device)) + arr.append(mx.nd.random.uniform(shape=shape, dtype=arrType, ctx=device)) # Reset all arrays mx.nd.reset_arrays(*arr, num_arrays=len(arr)) @@ -1465,7 +1465,7 @@ def _test_multi_reset(nArrays, dtype, device): with environment('MXNET_STORAGE_FALLBACK_LOG_VERBOSE', '0'): for type in ['float16', 'float32', 'float64']: for embType in ['float32', 'float64']: - _test_grad_reset(ctx, dtype=type, sparse=False, embeddingType=embType) + _test_grad_reset(device, dtype=type, sparse=False, embeddingType=embType) @pytest.mark.parametrize('static_alloc', [False, True]) diff --git a/tests/python/unittest/test_gluon_batch_processor.py b/tests/python/unittest/test_gluon_batch_processor.py index 0f0aa222658c..6a2ab2ff3c3c 100644 --- a/tests/python/unittest/test_gluon_batch_processor.py +++ b/tests/python/unittest/test_gluon_batch_processor.py @@ -53,17 +53,17 @@ def test_batch_processor_fit(): net = _get_test_network() dataloader, dataiter = _get_test_data() num_epochs = 1 - ctx = mx.cpu() + device = mx.cpu() loss = gluon.loss.L2Loss() acc = mx.gluon.metric.Accuracy() - net.initialize(ctx=ctx) + net.initialize(device=device) processor = BatchProcessor() trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) est = Estimator(net=net, loss=loss, train_metrics=acc, trainer=trainer, - context=ctx, + device=device, batch_processor=processor) est.fit(train_data=dataloader, @@ -85,18 +85,18 @@ def test_batch_processor_validation(): net = _get_test_network() dataloader, dataiter = _get_test_data() num_epochs = 1 - ctx = mx.cpu() + device = mx.cpu() loss = gluon.loss.L2Loss() acc = mx.gluon.metric.Accuracy() val_loss = gluon.loss.L1Loss() - net.initialize(ctx=ctx) + net.initialize(device=device) processor = BatchProcessor() trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) est = Estimator(net=net, loss=loss, train_metrics=acc, trainer=trainer, - context=ctx, + device=device, val_loss=val_loss, batch_processor=processor) # Input dataloader diff --git 
a/tests/python/unittest/test_gluon_data.py b/tests/python/unittest/test_gluon_data.py index aadc372c9160..f1643ea6d30f 100644 --- a/tests/python/unittest/test_gluon_data.py +++ b/tests/python/unittest/test_gluon_data.py @@ -365,7 +365,7 @@ def test_dataloader_context(): for _, x in enumerate(loader2): assert x.context == context.cpu_pinned(default_dev_id) - if mx.context.num_gpus() <= 1: + if mx.device.num_gpus() <= 1: print('Bypassing custom_dev_id pinned mem test on system with < 2 gpus.') else: # use pinned memory with custom device id diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index d8a134d3b321..d9396bdba8a6 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -7857,7 +7857,7 @@ def test_context_num_gpus(): try: # Note: the test is run both on GPU and CPU hosts, so that we can not assert # on a specific number here. - assert mx.context.num_gpus() >= 0 + assert mx.device.num_gpus() >= 0 except mx.MXNetError as e: # Note: On a CPU only host CUDA sometimes is not able to determine the number # of GPUs diff --git a/tests/python/unittest/test_random.py b/tests/python/unittest/test_random.py index e03d6d015006..3fc061bd13b6 100644 --- a/tests/python/unittest/test_random.py +++ b/tests/python/unittest/test_random.py @@ -499,7 +499,7 @@ def test_random_seed_setting_for_context(): samples_imp = [] samples_sym = [] # Collect random number samples from the generators of all devices, each seeded with the same number. - for dev_id in range(0, mx.context.num_gpus() if dev_type == 'gpu' else 1): + for dev_id in range(0, mx.device.num_gpus() if dev_type == 'gpu' else 1): with mx.Context(dev_type, dev_id): ctx = mx.context.current_context() seed = set_seed_variously_for_context(ctx, 1, num_temp_seeds, seed_to_test) @@ -530,7 +530,7 @@ def test_parallel_random_seed_setting_for_context(): samples_imp = [] samples_sym = [] # Collect random number samples from the generators of all devices, each seeded with the same number. - for dev_id in range(0, mx.context.num_gpus() if dev_type == 'gpu' else 1): + for dev_id in range(0, mx.device.num_gpus() if dev_type == 'gpu' else 1): with mx.Context(dev_type, dev_id): ctx = mx.context.current_context() # Avoid excessive test cpu runtimes. From ccfbc28ac619315fc6eab840f4b4a2416b6e9148 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Fri, 15 Oct 2021 22:48:23 -0700 Subject: [PATCH 31/41] update --- python/mxnet/numpy_extension/random.py | 45 ++++++++++++++------------ python/mxnet/test_utils.py | 6 ++-- 2 files changed, 28 insertions(+), 23 deletions(-) diff --git a/python/mxnet/numpy_extension/random.py b/python/mxnet/numpy_extension/random.py index 35e6eb1f4d19..25ea4692e1f2 100644 --- a/python/mxnet/numpy_extension/random.py +++ b/python/mxnet/numpy_extension/random.py @@ -19,12 +19,14 @@ from .. import random as _mx_rand from ..ndarray import numpy_extension as _mx_nd_npx +from ..util import wrap_ctx_to_device_func __all__ = ['seed', 'bernoulli', 'normal_n', 'uniform_n'] -def seed(seed, ctx='all'): # pylint: disable=redefined-outer-name +@wrap_ctx_to_device_func +def seed(seed, device='all'): # pylint: disable=redefined-outer-name r"""Seeds the random number generators in MXNet. This affects the behavior of modules in MXNet that uses random number generators, @@ -35,7 +37,7 @@ def seed(seed, ctx='all'): # pylint: disable=redefined-outer-name seed : int The random number seed. - ctx : Context + device : Device The device context of the generator. 
The default is "all" which means seeding random number generators of all devices. @@ -47,7 +49,7 @@ def seed(seed, ctx='all'): # pylint: disable=redefined-outer-name even if they are seeded using the same seed. To produce identical random number sequences independent of the device id, - set optional `ctx` argument. This produces the same sequence of random numbers independent + set optional `device` argument. This produces the same sequence of random numbers independent of the device id, but the sequence can be different on different kind of devices as MXNet's random number generators for CPU and GPU use different algorithms. @@ -65,16 +67,17 @@ def seed(seed, ctx='all'): # pylint: disable=redefined-outer-name >>> np.random.uniform() array(0.03812965) >>> npx.random.seed(128) - >>> np.random.uniform(ctx=npx.gpu(0)) - array(0.9894903, ctx=gpu(0)) + >>> np.random.uniform(device=npx.gpu(0)) + array(0.9894903, device=gpu(0)) >>> npx.random.seed(128) - >>> np.random.uniform(ctx=npx.gpu(0)) - array(0.9894903, ctx=gpu(0)) + >>> np.random.uniform(device=npx.gpu(0)) + array(0.9894903, device=gpu(0)) """ - _mx_rand.seed(seed_state=seed, ctx=ctx) + _mx_rand.seed(seed_state=seed, device=device) -def bernoulli(prob=None, logit=None, size=None, dtype=None, ctx=None, out=None): +@wrap_ctx_to_device_func +def bernoulli(prob=None, logit=None, size=None, dtype=None, device=None, out=None): """Creates a Bernoulli distribution parameterized by :attr:`prob` or :attr:`logit` (but not both). @@ -98,8 +101,8 @@ def bernoulli(prob=None, logit=None, size=None, dtype=None, ctx=None, out=None): name, i.e., 'int64', 'int', etc, so byteorder is not available and a specific precision may have different C types depending on the platform. The default value is 'np.float32'. - ctx : Context, optional - Device context of output. Default is current context. + device : Device, optional + Device context of output. Default is current device. out : symbol, optional The output symbol (default is `None`). @@ -124,10 +127,11 @@ def bernoulli(prob=None, logit=None, size=None, dtype=None, ctx=None, out=None): [1., 1., 1., 0.], [1., 0., 1., 0.]]) """ - return _mx_nd_npx.random.bernoulli(prob, logit, size, dtype, ctx, out) + return _mx_nd_npx.random.bernoulli(prob, logit, size, dtype, device, out) -def uniform_n(low=0.0, high=1.0, batch_shape=None, dtype=None, ctx=None): +@wrap_ctx_to_device_func +def uniform_n(low=0.0, high=1.0, batch_shape=None, dtype=None, device=None): r"""Draw samples from a uniform distribution. Samples are uniformly distributed over the half-open interval @@ -152,8 +156,8 @@ def uniform_n(low=0.0, high=1.0, batch_shape=None, dtype=None, ctx=None): ``np.broadcast(low, high).size`` samples are drawn. dtype : {'float16', 'float32', 'float64'}, optional Data type of output samples. Default is 'float32' - ctx : Context, optional - Device context of output. Default is current context. + device : Device, optional + Device context of output. Default is current device. Returns ------- @@ -181,10 +185,11 @@ def uniform_n(low=0.0, high=1.0, batch_shape=None, dtype=None, ctx=None): function to behave when passed arguments satisfying that inequality condition. 
""" - return _mx_nd_npx.random.uniform_n(low, high, batch_shape=batch_shape, ctx=ctx, dtype=dtype) + return _mx_nd_npx.random.uniform_n(low, high, batch_shape=batch_shape, device=device, dtype=dtype) -def normal_n(loc=0.0, scale=1.0, batch_shape=None, dtype=None, ctx=None): +@wrap_ctx_to_device_func +def normal_n(loc=0.0, scale=1.0, batch_shape=None, dtype=None, device=None): r"""Draw random samples from a normal (Gaussian) distribution. Samples are distributed according to a normal distribution parametrized @@ -206,8 +211,8 @@ def normal_n(loc=0.0, scale=1.0, batch_shape=None, dtype=None, ctx=None): ``np.broadcast(loc, scale).size`` samples are drawn. dtype : {'float16', 'float32', 'float64'}, optional Data type of output samples. Default is 'float32' - ctx : Context, optional - Device context of output, default is current context. + device : Device, optional + Device context of output, default is current device. Returns ------- @@ -249,4 +254,4 @@ def normal_n(loc=0.0, scale=1.0, batch_shape=None, dtype=None, ctx=None): >>> np.abs(mu - np.mean(s)) < 0.01 array(True) """ - return _mx_nd_npx.random.normal_n(loc, scale, batch_shape, dtype, ctx) + return _mx_nd_npx.random.normal_n(loc, scale, batch_shape, dtype, device) diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index 07b738cbe58d..7be949deeb51 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -903,7 +903,7 @@ def _parse_location(sym, location, ctx, dtype=default_dtype()): return _sorted_dict(location) -def _parse_aux_states(sym, aux_states, device, dtype=default_dtype()): +def _parse_aux_states(sym, aux_states, ctx, dtype=default_dtype()): """Parses the given auxiliary states to a dictionary. Auxiliary states of the provided op `sym` are used as dictionary @@ -922,7 +922,7 @@ def _parse_aux_states(sym, aux_states, device, dtype=default_dtype()): - if type is dict of str -> `np.ndarray` maps the name of arguments to the corresponding `np.ndarray`. *In either case, all aux states of `sym` must be provided.* - device : Device + ctx : Device Device context. 
dtype: "asnumpy" or np.float16 or np.float32 or np.float64 If dtype is "asnumpy" then the mx.nd.array created will have the same @@ -963,7 +963,7 @@ def _parse_aux_states(sym, aux_states, device, dtype=default_dtype()): elif isinstance(aux_states, (list, tuple)): aux_names = sym.list_auxiliary_states() aux_states = {k:v for k, v in zip(aux_names, aux_states)} - aux_states = {k: mx.nd.array(v, ctx=device, dtype=v.dtype if dtype == "asnumpy" else dtype) \ + aux_states = {k: mx.nd.array(v, ctx=ctx, dtype=v.dtype if dtype == "asnumpy" else dtype) \ for k, v in aux_states.items()} return aux_states From 2bf9efc79d84c05c0ca141791a04e65de0eee5a9 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Sun, 17 Oct 2021 14:23:46 -0700 Subject: [PATCH 32/41] fix tests --- .../mxnet/ndarray/numpy_extension/random.py | 24 +++++++++---------- python/mxnet/test_utils.py | 2 +- tests/python/unittest/test_operator.py | 4 ++-- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/python/mxnet/ndarray/numpy_extension/random.py b/python/mxnet/ndarray/numpy_extension/random.py index 3ed627661126..05224f649fd0 100644 --- a/python/mxnet/ndarray/numpy_extension/random.py +++ b/python/mxnet/ndarray/numpy_extension/random.py @@ -91,18 +91,18 @@ def bernoulli(prob=None, logit=None, size=None, dtype=None, device=None, out=Non is_tensor = isinstance(prob, tensor_type_name) if is_tensor: return _npi.bernoulli(prob, prob=None, logit=None, is_logit=False, - size=size, device=device, dtype=dtype, out=out) + size=size, ctx=device, dtype=dtype, out=out) else: return _npi.bernoulli(prob=prob, logit=None, is_logit=False, - size=size, device=device, dtype=dtype, out=out) + size=size, ctx=device, dtype=dtype, out=out) else: is_tensor = isinstance(logit, tensor_type_name) if is_tensor: return _npi.bernoulli(logit, prob=None, logit=None, is_logit=True, - size=size, device=device, dtype=dtype, out=out) + size=size, ctx=device, dtype=dtype, out=out) else: return _npi.bernoulli(prob=None, logit=logit, is_logit=True, - size=size, device=device, dtype=dtype, out=out) + size=size, ctx=device, dtype=dtype, out=out) @wrap_ctx_to_device_func @@ -174,16 +174,16 @@ def uniform_n(low=0.0, high=1.0, batch_shape=None, dtype=None, device=None): batch_shape = (-2,) + batch_shape if input_type == (True, True): return _npi.uniform(low, high, low=None, high=None, size=batch_shape, - device=device, dtype=dtype) + ctx=device, dtype=dtype) elif input_type == (False, True): return _npi.uniform(high, low=low, high=None, size=batch_shape, - device=device, dtype=dtype) + ctx=device, dtype=dtype) elif input_type == (True, False): return _npi.uniform(low, low=None, high=high, size=batch_shape, - device=device, dtype=dtype) + ctx=device, dtype=dtype) else: return _npi.uniform(low=low, high=high, size=batch_shape, - device=device, dtype=dtype) + ctx=device, dtype=dtype) @wrap_ctx_to_device_func @@ -266,13 +266,13 @@ def normal_n(loc=0.0, scale=1.0, batch_shape=None, dtype=None, device=None): batch_shape = (-2,) + batch_shape if input_type == (True, True): return _npi.normal(loc, scale, loc=None, scale=None, size=batch_shape, - device=device, dtype=dtype) + ctx=device, dtype=dtype) elif input_type == (False, True): return _npi.normal(scale, loc=loc, scale=None, size=batch_shape, - device=device, dtype=dtype) + ctx=device, dtype=dtype) elif input_type == (True, False): return _npi.normal(loc, loc=None, scale=scale, size=batch_shape, - device=device, dtype=dtype) + ctx=device, dtype=dtype) else: return _npi.normal(loc=loc, scale=scale, size=batch_shape, - 
device=device, dtype=dtype) + ctx=device, dtype=dtype) diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index 7be949deeb51..c80417347d77 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -1639,7 +1639,7 @@ def smaller_dtype(dt1, dt2): for (out, dt) in zip(exe_list[0].outputs, least_precise_dtype)] # Perform backward() for exe in exe_list: - out_grads = [mx.nd.array(golden_np, ctx=exe._ctx, + out_grads = [mx.nd.array(golden_np, ctx=exe._device, dtype=out.dtype).tostype(out.stype) for (golden_np, out) in zip(golden_data_np, exe.outputs)] exe.backward(out_grads) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index d9396bdba8a6..dfb012e4c538 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -3022,7 +3022,7 @@ def test_batch_dot(): assert_almost_equal(outputs[0], c_npy, rtol=1e-2 if data_type == 'float16' else 1e-3, atol=1e-2 if data_type == 'float16' else 1e-4) - exe.backward(out_grads=[mx.nd.array(ograd_npy, dtype=outputs[0].dtype, ctx=exe._ctx)]) + exe.backward(out_grads=[mx.nd.array(ograd_npy, dtype=outputs[0].dtype, ctx=exe._device)]) assert_almost_equal(exe.grad_dict['a'], agrad_npy, rtol=1e-2 if data_type == 'float16' else 1e-3, atol=1e-2 if data_type == 'float16' else 1e-4) @@ -3030,7 +3030,7 @@ def test_batch_dot(): rtol=1e-2 if data_type == 'float16' else 1e-3, atol=1e-2 if data_type == 'float16' else 1e-4) exe_add.forward(is_train=True, a=a_npy, b=b_npy) - exe_add.backward(out_grads=[mx.nd.array(ograd_npy, dtype=exe_add.outputs[0].dtype, ctx=exe._ctx)]) + exe_add.backward(out_grads=[mx.nd.array(ograd_npy, dtype=exe_add.outputs[0].dtype, ctx=exe._device)]) assert_almost_equal(exe_add.grad_dict['a'], agrad_npy + a_init_grad_npy, rtol=1e-2 if data_type == 'float16' else 1e-3, From b77c6f947ab2bd642130a84c1edac2ce9fa4b2da Mon Sep 17 00:00:00 2001 From: barry-jin Date: Sun, 17 Oct 2021 19:00:48 -0700 Subject: [PATCH 33/41] update rand_zipfian --- python/mxnet/ndarray/contrib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/mxnet/ndarray/contrib.py b/python/mxnet/ndarray/contrib.py index 5188628a1b7a..5e1de94c0db9 100644 --- a/python/mxnet/ndarray/contrib.py +++ b/python/mxnet/ndarray/contrib.py @@ -88,7 +88,7 @@ def rand_zipfian(true_classes, num_sampled, range_max, ctx=None): """ if ctx is None: - ctx = current_context() + ctx = current_device() log_range = math.log(range_max + 1) rand = uniform(0, log_range, shape=(num_sampled,), dtype='float64', ctx=ctx) # make sure sampled_classes are in the range of [0, range_max) From 1519650d1896e5f88e9527edc182508bf53a93c4 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Sun, 17 Oct 2021 22:24:22 -0700 Subject: [PATCH 34/41] update --- python/mxnet/contrib/quantization.py | 4 ++-- python/mxnet/gluon/block.py | 1 + python/mxnet/gluon/data/batchify.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/python/mxnet/contrib/quantization.py b/python/mxnet/contrib/quantization.py index 972bdac04518..be0282fe8a81 100644 --- a/python/mxnet/contrib/quantization.py +++ b/python/mxnet/contrib/quantization.py @@ -259,8 +259,8 @@ def get_optimal_threshold(hist_data, quantized_dtype, num_quantized_bins=255): if min_val >= 0 and quantized_dtype in ['auto', 'uint8']: # We need to move negative bins to positive bins to fit uint8 range. 
num_quantized_bins = num_quantized_bins * 2 + 1 - hist = ndarray.array(hist, device=cpu()) - hist_edges = ndarray.array(hist_edges, device=cpu()) + hist = ndarray.array(hist, ctx=cpu()) + hist_edges = ndarray.array(hist_edges, ctx=cpu()) threshold, divergence = ndarray.contrib.calibrate_entropy(hist=hist, hist_edges=hist_edges, num_quantized_bins=num_quantized_bins) diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 8c0b416c8366..1a2535c7e71c 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -1683,6 +1683,7 @@ class SymbolBlock(HybridBlock): >>> print(feat_model(x)) """ @staticmethod + @wrap_ctx_to_device_func def imports(symbol_file, input_names, param_file=None, device=None, allow_missing=False, ignore_extra=False): """Import model previously saved by `gluon.HybridBlock.export` diff --git a/python/mxnet/gluon/data/batchify.py b/python/mxnet/gluon/data/batchify.py index ec6b3d44bc08..de1f52d8e767 100644 --- a/python/mxnet/gluon/data/batchify.py +++ b/python/mxnet/gluon/data/batchify.py @@ -82,7 +82,7 @@ def __call__(self, data): dtype = data[0].dtype if self._use_shared_mem: out = _arr.empty((len(data),) + data[0].shape, dtype=dtype, - device=Device('cpu_shared', 0)) + ctx=Device('cpu_shared', 0)) return _arr.stack(data, out=out) if is_np_array() else _arr.stack(*data, out=out) else: return _arr.stack(data) if is_np_array() else _arr.stack(*data) From dc0e3b95fbceaf2b17e0c4c70b56161937af6e18 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Mon, 18 Oct 2021 09:56:04 -0700 Subject: [PATCH 35/41] device => cuda_device in util.py --- python/mxnet/util.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/mxnet/util.py b/python/mxnet/util.py index 0976b4b1b3d5..58bfe6450039 100644 --- a/python/mxnet/util.py +++ b/python/mxnet/util.py @@ -965,7 +965,7 @@ def get_cuda_compute_capability(device): cc_major = ctypes.c_int() cc_minor = ctypes.c_int() - device = ctypes.c_int() + cuda_device = ctypes.c_int() error_str = ctypes.c_char_p() ret = cuda.cuInit(0) @@ -974,12 +974,12 @@ def get_cuda_compute_capability(device): raise RuntimeError('cuInit failed with erro code {}: {}' .format(ret, error_str.value.decode())) - ret = cuda.cuDeviceGet(ctypes.byref(device), device.device_id) + ret = cuda.cuDeviceGet(ctypes.byref(cuda_device), device.device_id) if ret != _CUDA_SUCCESS: cuda.cuGetErrorString(ret, ctypes.byref(error_str)) raise RuntimeError('cuDeviceGet failed with error code {}: {}' .format(ret, error_str.value.decode())) - ret = cuda.cuDeviceComputeCapability(ctypes.byref(cc_major), ctypes.byref(cc_minor), device) + ret = cuda.cuDeviceComputeCapability(ctypes.byref(cc_major), ctypes.byref(cc_minor), cuda_device) if ret != _CUDA_SUCCESS: cuda.cuGetErrorString(ret, ctypes.byref(error_str)) raise RuntimeError('cuDeviceComputeCapability failed with error code {}: {}' From d96a824870a34cabafbade0065340b2773f4f811 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Mon, 18 Oct 2021 15:41:03 -0700 Subject: [PATCH 36/41] context.gpu_memory_info => device.gpu_memory_info --- docs/python_docs/python/api/{context => device}/index.rst | 4 ++-- docs/python_docs/python/api/index.rst | 6 +++--- tests/python/gpu/test_gluon_gpu.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) rename docs/python_docs/python/api/{context => device}/index.rst (95%) diff --git a/docs/python_docs/python/api/context/index.rst b/docs/python_docs/python/api/device/index.rst similarity index 95% rename from docs/python_docs/python/api/context/index.rst 
rename to docs/python_docs/python/api/device/index.rst index 135a8440bb06..f939aab59047 100644 --- a/docs/python_docs/python/api/context/index.rst +++ b/docs/python_docs/python/api/device/index.rst @@ -15,9 +15,9 @@ specific language governing permissions and limitations under the License. -mxnet.context +mxnet.device ============= -.. automodule:: mxnet.context +.. automodule:: mxnet.device :members: :autosummary: diff --git a/docs/python_docs/python/api/index.rst b/docs/python_docs/python/api/index.rst index 91837435601a..149570d3c488 100644 --- a/docs/python_docs/python/api/index.rst +++ b/docs/python_docs/python/api/index.rst @@ -86,7 +86,7 @@ Gluon related modules Key value store interface of MXNet for parameter synchronization. .. card:: - :title: mxnet.context + :title: mxnet.device :link: mxnet/context/index.html CPU and GPU context information. @@ -116,8 +116,8 @@ Advanced modules API for querying MXNet enabled features. .. card:: - :title: mxnet.context - :link: context/index.html + :title: mxnet.device + :link: device/index.html MXNet array context for specifying in-memory storage device. diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py index ecc9e8dea9b6..266679c9164c 100644 --- a/tests/python/gpu/test_gluon_gpu.py +++ b/tests/python/gpu/test_gluon_gpu.py @@ -473,7 +473,7 @@ def tensor_size(big_tensor_bytes): # The idea is to create models with large tensors of (say) 20% of the total memory. # This in the past has given cudnnFind() trouble when it needed to allocate similar I/O's # from the area carved out by the MXNET_GPU_MEM_POOL_RESERVE setting (by default 5%). - (free_mem_bytes, total_mem_bytes) = mx.context.gpu_memory_info(device.device_id) + (free_mem_bytes, total_mem_bytes) = mx.device.gpu_memory_info(device.device_id) # This test needs to be 'qualified' for use with each new larger memory size largest_supported_total_mem_GB = 32 if (total_mem_bytes > largest_supported_total_mem_GB * 1024 * 1024 * 1024): From d126e8935d52f951c88f815a66fa251fcda47921 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Mon, 18 Oct 2021 21:53:21 -0700 Subject: [PATCH 37/41] fix docs --- docs/python_docs/python/api/context/index.rst | 23 +++++++++++ docs/python_docs/python/api/index.rst | 6 +-- .../inference/image_classification_jetson.md | 10 ++--- .../python/tutorials/extend/customop.md | 8 ++-- .../getting-started/crash-course/1-nparray.md | 2 +- .../crash-course/2-create-nn.md | 2 +- .../crash-course/4-components.md | 6 +-- .../crash-course/5-datasets.md | 2 +- .../crash-course/6-train-nn.md | 12 +++--- .../crash-course/7-use-gpus.md | 14 +++---- .../gluon_from_experiment_to_deployment.md | 20 +++++----- .../getting-started/gluon_migration_guide.md | 28 +++++++------- .../logistic_regression_explained.md | 10 ++--- .../packages/gluon/blocks/custom-layer.md | 2 +- .../tutorials/packages/gluon/blocks/nn.md | 2 +- .../packages/gluon/blocks/save_load_params.md | 14 +++---- .../tutorials/packages/gluon/data/datasets.md | 16 ++++---- .../packages/gluon/image/info_gan.md | 38 +++++++++---------- .../tutorials/packages/gluon/image/mnist.md | 22 +++++------ .../tutorials/packages/gluon/text/gnmt.rst | 24 ++++++------ .../gluon/training/fit_api_tutorial.md | 16 ++++---- .../learning_rates/learning_rate_finder.md | 28 +++++++------- .../learning_rates/learning_rate_schedules.md | 14 +++---- .../tutorials/packages/kvstore/kvstore.md | 8 ++-- .../tutorials/packages/np/cheat-sheet.md | 8 ++-- .../tutorials/packages/np/np-vs-numpy.md | 12 +++--- 
.../packages/onnx/fine_tuning_gluon.md | 22 +++++------ .../packages/onnx/inference_on_onnx_model.md | 10 ++--- .../tutorials/performance/backend/amp.md | 26 ++++++------- .../tutorials/performance/backend/profiler.md | 16 ++++---- .../pages/api/developer_guide/profiling.md | 4 +- tests/python/unittest/common.py | 2 +- 32 files changed, 225 insertions(+), 202 deletions(-) create mode 100644 docs/python_docs/python/api/context/index.rst diff --git a/docs/python_docs/python/api/context/index.rst b/docs/python_docs/python/api/context/index.rst new file mode 100644 index 000000000000..f939aab59047 --- /dev/null +++ b/docs/python_docs/python/api/context/index.rst @@ -0,0 +1,23 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +mxnet.device +============= + +.. automodule:: mxnet.device + :members: + :autosummary: diff --git a/docs/python_docs/python/api/index.rst b/docs/python_docs/python/api/index.rst index 149570d3c488..40f09bc19098 100644 --- a/docs/python_docs/python/api/index.rst +++ b/docs/python_docs/python/api/index.rst @@ -87,9 +87,9 @@ Gluon related modules .. card:: :title: mxnet.device - :link: mxnet/context/index.html + :link: mxnet/device/index.html - CPU and GPU context information. + CPU and GPU device information. .. card:: :title: mxnet.profiler @@ -119,7 +119,7 @@ Advanced modules :title: mxnet.device :link: device/index.html - MXNet array context for specifying in-memory storage device. + MXNet array device for specifying in-memory storage device. .. card:: :title: mxnet.profiler diff --git a/docs/python_docs/python/tutorials/deploy/inference/image_classification_jetson.md b/docs/python_docs/python/tutorials/deploy/inference/image_classification_jetson.md index 152dc6d80c0e..8787b2a7c53b 100644 --- a/docs/python_docs/python/tutorials/deploy/inference/image_classification_jetson.md +++ b/docs/python_docs/python/tutorials/deploy/inference/image_classification_jetson.md @@ -71,18 +71,18 @@ And we are done. You can test the installation now by importing mxnet from pytho ## Running a pre-trained ResNet-50 model on Jetson -We are now ready to run a pre-trained model and run inference on a Jetson module. In this tutorial we are using ResNet-50 model trained on Imagenet dataset. We run the following classification script with either cpu/gpu context using python3. +We are now ready to run a pre-trained model and run inference on a Jetson module. In this tutorial we are using ResNet-50 model trained on Imagenet dataset. We run the following classification script with either cpu/gpu device using python3. 
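Before launching the full script below, it can be worth confirming that MXNet actually sees the Jetson GPU. A minimal sketch of such a check with the renamed `mx.device` helpers (the `n_gpus` variable name is only illustrative and not part of the tutorial script):

```{.python .input}
import mxnet as mx

# Sanity check before launching the script: 0 visible GPUs means the
# classification below will silently fall back to CPU.
n_gpus = mx.device.num_gpus()
print('GPUs visible to MXNet:', n_gpus)
print('Selected device:', mx.gpu(0) if n_gpus > 0 else mx.cpu())
```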
```{.python .input} from mxnet import gluon import mxnet as mx -# set context +# set device gpus = mx.test_utils.list_gpus() -ctx = mx.gpu() if gpus else mx.cpu() +device = mx.gpu() if gpus else mx.cpu() # load pre-trained model -net = gluon.model_zoo.vision.resnet50_v1(pretrained=True, ctx=ctx) +net = gluon.model_zoo.vision.resnet50_v1(pretrained=True, device=device) net.hybridize(static_alloc=True, static_shape=True) # load labels @@ -99,7 +99,7 @@ img = mx.image.color_normalize(img.astype(dtype='float32')/255, std=mx.np.array([0.229, 0.224, 0.225])) # normalize img = img.transpose((2, 0, 1)) # channel first img = mx.np.expand_dims(img, axis=0) # batchify -img = img.to_device(ctx) +img = img.to_device(device) prob = mx.npx.softmax(net(img)) # predict and normalize output idx = mx.npx.topk(prob, k=5)[0] # get top 5 result diff --git a/docs/python_docs/python/tutorials/extend/customop.md b/docs/python_docs/python/tutorials/extend/customop.md index 2ee70afbec8b..fd52c3b32535 100644 --- a/docs/python_docs/python/tutorials/extend/customop.md +++ b/docs/python_docs/python/tutorials/extend/customop.md @@ -101,7 +101,7 @@ class SigmoidProp(mx.operator.CustomOpProp): # return 3 lists representing inputs shapes, outputs shapes, and aux data shapes. return (data_shape,), (output_shape,), () - def create_operator(self, ctx, in_shapes, in_dtypes): + def create_operator(self, device, in_shapes, in_dtypes): # create and return the CustomOp class. return Sigmoid() ``` @@ -183,7 +183,7 @@ class DenseProp(mx.operator.CustomOpProp): # return 3 lists representing inputs shapes, outputs shapes, and aux data shapes. return (data_shape, weight_shape), (output_shape,), () - def create_operator(self, ctx, in_shapes, in_dtypes): + def create_operator(self, device, in_shapes, in_dtypes): # create and return the CustomOp class. return Dense(self._bias) ``` @@ -201,8 +201,8 @@ class DenseBlock(mx.gluon.Block): self.weight = gluon.Parameter('weight', shape=(channels, in_channels)) def forward(self, x): - ctx = x.context - return mx.nd.Custom(x, self.weight.data(ctx), bias=self._bias, op_type='dense') + device = x.device + return mx.nd.Custom(x, self.weight.data(device), bias=self._bias, op_type='dense') ``` ### Example usage diff --git a/docs/python_docs/python/tutorials/getting-started/crash-course/1-nparray.md b/docs/python_docs/python/tutorials/getting-started/crash-course/1-nparray.md index 112e9ee5976c..06e579e866df 100644 --- a/docs/python_docs/python/tutorials/getting-started/crash-course/1-nparray.md +++ b/docs/python_docs/python/tutorials/getting-started/crash-course/1-nparray.md @@ -195,7 +195,7 @@ a = np.array(a) (type(a), a) ``` -Additionally, you can move them to different GPU contexts. You will dive more +Additionally, you can move them to different GPU devices. You will dive more into this later, but here is an example for now. 
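A minimal sketch of that kind of device move with the renamed API, guarded with `npx.num_gpus()` so it also runs on a CPU-only build (the `gpu`, `a`, `b` names here are only illustrative):

```{.python .input}
from mxnet import np, npx

gpu = npx.gpu(0) if npx.num_gpus() > 0 else npx.cpu()
a = np.array([[1, 2, 3], [4, 5, 6]])
b = a.copyto(gpu)   # copy the data onto the selected device
b.device            # the array reports the device it now lives on
```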
```{.python .input} diff --git a/docs/python_docs/python/tutorials/getting-started/crash-course/2-create-nn.md b/docs/python_docs/python/tutorials/getting-started/crash-course/2-create-nn.md index 7ab1a85cb923..284a2187db42 100644 --- a/docs/python_docs/python/tutorials/getting-started/crash-course/2-create-nn.md +++ b/docs/python_docs/python/tutorials/getting-started/crash-course/2-create-nn.md @@ -564,7 +564,7 @@ with warnings.catch_warnings(): warnings.simplefilter("ignore") net_loaded = nn.SymbolBlock.imports("MLP_hybrid-symbol.json", ['data'], "MLP_hybrid-0000.params", - ctx=None) + device=None) ``` ```{.python .input} diff --git a/docs/python_docs/python/tutorials/getting-started/crash-course/4-components.md b/docs/python_docs/python/tutorials/getting-started/crash-course/4-components.md index 135c22a85b8a..b52449899b6b 100644 --- a/docs/python_docs/python/tutorials/getting-started/crash-course/4-components.md +++ b/docs/python_docs/python/tutorials/getting-started/crash-course/4-components.md @@ -35,7 +35,7 @@ import mxnet as mx from mxnet.gluon import nn npx.set_np() -ctx = mx.cpu() +device = mx.cpu() ``` ## Initialization @@ -103,7 +103,7 @@ To initialize your network using different built-in types, you have to use the from mxnet import init # Constant init initializes the weights to be a constant value for all the params -net.initialize(init=init.Constant(3), ctx=ctx) +net.initialize(init=init.Constant(3), device=device) print(net[0].weight.data()[0]) ``` @@ -113,7 +113,7 @@ already initialized the weight but want to reinitialize the weight, set the `force_reinit` flag to `True`. ```{.python .input} -net.initialize(init=init.Normal(sigma=0.2), force_reinit=True, ctx=ctx) +net.initialize(init=init.Normal(sigma=0.2), force_reinit=True, device=device) print(net[0].weight.data()[0]) ``` diff --git a/docs/python_docs/python/tutorials/getting-started/crash-course/5-datasets.md b/docs/python_docs/python/tutorials/getting-started/crash-course/5-datasets.md index 68cdbd829729..b36948010d1d 100644 --- a/docs/python_docs/python/tutorials/getting-started/crash-course/5-datasets.md +++ b/docs/python_docs/python/tutorials/getting-started/crash-course/5-datasets.md @@ -215,7 +215,7 @@ The current data loading pipeline is the major bottleneck for many training task - `Transform.__call__()/forward()` - `Batchify` - (optional communicate through shared_mem) -- `split_and_load(ctxs)` +- `split_and_load(devices)` - training on GPUs Performance concerns include slow python dataset/transform functions, multithreading issues due to global interpreter lock, Python multiprocessing issues due to speed, and batchify issues due to poor memory management. diff --git a/docs/python_docs/python/tutorials/getting-started/crash-course/6-train-nn.md b/docs/python_docs/python/tutorials/getting-started/crash-course/6-train-nn.md index b4498853c08c..58cb638fe4c9 100644 --- a/docs/python_docs/python/tutorials/getting-started/crash-course/6-train-nn.md +++ b/docs/python_docs/python/tutorials/getting-started/crash-course/6-train-nn.md @@ -322,13 +322,13 @@ hybridize the model. 
```{.python .input} # Create the model based on the blueprint provided and initialize the parameters -ctx = mx.gpu() +device = mx.gpu() initializer = mx.initializer.Xavier() model = LeafNetwork() -model.initialize(initializer, ctx=ctx) -model.summary(mx.np.random.uniform(size=(4, 3, 128, 128), ctx=ctx)) +model.initialize(initializer, device=device) +model.summary(mx.np.random.uniform(size=(4, 3, 128, 128), device=device)) model.hybridize() ``` @@ -368,7 +368,7 @@ def test(val_data): for batch in val_data: data = batch[0] labels = batch[1] - outputs = model(data.to_device(ctx)) + outputs = model(data.to_device(device)) acc.update([labels], [outputs]) _, accuracy = acc.get() @@ -396,8 +396,8 @@ for epoch in range(epochs): data = batch[0] label = batch[1] with mx.autograd.record(): - outputs = model(data.to_device(ctx)) - loss = loss_fn(outputs, label.to_device(ctx)) + outputs = model(data.to_device(device)) + loss = loss_fn(outputs, label.to_device(device)) mx.autograd.backward(loss) trainer.step(batch_size) accuracy.update([label], [outputs]) diff --git a/docs/python_docs/python/tutorials/getting-started/crash-course/7-use-gpus.md b/docs/python_docs/python/tutorials/getting-started/crash-course/7-use-gpus.md index 33717f03373c..0922cd79d964 100644 --- a/docs/python_docs/python/tutorials/getting-started/crash-course/7-use-gpus.md +++ b/docs/python_docs/python/tutorials/getting-started/crash-course/7-use-gpus.md @@ -36,11 +36,11 @@ npx.num_gpus() #This command provides the number of GPUs MXNet can access ## Allocate data to a GPU -MXNet's ndarray is very similar to NumPy's. One major difference is that MXNet's ndarray has a `context` attribute specifieing which device an array is on. By default, arrays are stored on `npx.cpu()`. To change it to the first GPU, you can use the following code, `npx.gpu()` or `npx.gpu(0)` to indicate the first GPU. +MXNet's ndarray is very similar to NumPy's. One major difference is that MXNet's ndarray has a `device` attribute specifieing which device an array is on. By default, arrays are stored on `npx.cpu()`. To change it to the first GPU, you can use the following code, `npx.gpu()` or `npx.gpu(0)` to indicate the first GPU. ```{.python .input} gpu = npx.gpu() if npx.num_gpus() > 0 else npx.cpu() -x = np.ones((3,4), ctx=gpu) +x = np.ones((3,4), device=gpu) x ``` @@ -63,7 +63,7 @@ If you have multiple GPUs on your machine, MXNet can access each of them through To perform an operation on a particular GPU, you only need to guarantee that the input of an operation is already on that GPU. The output is allocated on the same GPU as well. Almost all operators in the `np` and `npx` module support running on a GPU. ```{.python .input} -y = np.random.uniform(size=(3,4), ctx=gpu) +y = np.random.uniform(size=(3,4), device=gpu) x + y ``` @@ -115,17 +115,17 @@ class LeafNetwork(nn.HybridBlock): return batch ``` -Load the saved parameters onto GPU 0 directly as shown below; additionally, you could use `net.collect_params().reset_ctx(gpu)` to change the device. +Load the saved parameters onto GPU 0 directly as shown below; additionally, you could use `net.collect_params().reset_device(gpu)` to change the device. ```{.python .input} net = LeafNetwork() -net.load_parameters('leaf_models.params', ctx=gpu) +net.load_parameters('leaf_models.params', device=gpu) ``` Use the following command to create input data on GPU 0. The forward function will then run on GPU 0. 
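As an aside before creating the input data: a rough sketch of the `reset_device` alternative mentioned above, assuming the same `LeafNetwork` class and `leaf_models.params` file used in this tutorial (the `net_alt` name is only illustrative):

```{.python .input}
# Load the parameters on CPU first, then move every parameter over to the GPU.
net_alt = LeafNetwork()
net_alt.load_parameters('leaf_models.params', device=npx.cpu())
net_alt.collect_params().reset_device(gpu)
```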
```{.python .input} -x = np.random.uniform(size=(1, 3, 128, 128), ctx=gpu) +x = np.random.uniform(size=(1, 3, 128, 128), device=gpu) net(x) ``` @@ -201,7 +201,7 @@ devices = available_gpus[:num_gpus] print('Using {} GPUs'.format(len(devices))) # Diff 2: reinitialize the parameters and place them on multiple GPUs -net.initialize(force_reinit=True, ctx=devices) +net.initialize(force_reinit=True, device=devices) # Loss and trainer are the same as before loss_fn = gluon.loss.SoftmaxCrossEntropyLoss() diff --git a/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md b/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md index 391a41899875..a04e028f107e 100644 --- a/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md +++ b/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md @@ -94,7 +94,7 @@ lr_epochs = [10, 20, 30] num_gpus = mx.device.num_gpus() # you can replace num_workers with the number of cores on you device num_workers = 8 -ctx = [mx.gpu(i) for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()] +device = [mx.gpu(i) for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()] batch_size = per_device_batch_size * max(num_gpus, 1) ``` @@ -166,11 +166,11 @@ Before we go to training, one unique Gluon feature you should be aware of is hyb ```{.python .input} # load pre-trained resnet50_v2 from model zoo -finetune_net = resnet50_v2(pretrained=True, ctx=ctx) +finetune_net = resnet50_v2(pretrained=True, device=device) # change last softmax layer since number of classes are different finetune_net.output = nn.Dense(classes) -finetune_net.output.initialize(init.Xavier(), ctx=ctx) +finetune_net.output.initialize(init.Xavier(), device=device) # hybridize for better performance finetune_net.hybridize() @@ -195,11 +195,11 @@ Now let's define the test metrics and start fine-tuning. 
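Both loops below scatter each batch across the device list with `gluon.utils.split_and_load`. A minimal sketch of what that call returns, using two CPU "devices" so it runs anywhere (the `sample_batch`, `two_devices` and `parts` names are only illustrative):

```{.python .input}
sample_batch = mx.np.ones((8, 3, 224, 224))
two_devices = [mx.cpu(0), mx.cpu(1)]
# One slice of the batch per device, each already copied onto that device.
parts = gluon.utils.split_and_load(sample_batch, two_devices, even_split=False)
[(p.shape, p.device) for p in parts]
```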
```{.python .input} -def test(net, val_data, ctx): +def test(net, val_data, device): metric = mx.gluon.metric.Accuracy() for i, (data, label) in enumerate(val_data): - data = gluon.utils.split_and_load(data, ctx_list=ctx, even_split=False) - label = gluon.utils.split_and_load(label, ctx_list=ctx, even_split=False) + data = gluon.utils.split_and_load(data, device, even_split=False) + label = gluon.utils.split_and_load(label, device, even_split=False) outputs = [net(x) for x in data] metric.update(label, outputs) return metric.get() @@ -215,8 +215,8 @@ for epoch in range(1, epochs + 1): for i, (data, label) in enumerate(train_data): # get the images and labels - data = gluon.utils.split_and_load(data, ctx_list=ctx, even_split=False) - label = gluon.utils.split_and_load(label, ctx_list=ctx, even_split=False) + data = gluon.utils.split_and_load(data, device, even_split=False) + label = gluon.utils.split_and_load(label, device, even_split=False) with autograd.record(): outputs = [finetune_net(x) for x in data] loss = [softmax_cross_entropy(yhat, y) for yhat, y in zip(outputs, label)] @@ -229,12 +229,12 @@ for epoch in range(1, epochs + 1): _, train_acc = metric.get() train_loss /= num_batch - _, val_acc = test(finetune_net, val_data, ctx) + _, val_acc = test(finetune_net, val_data, device) print('[Epoch %d] Train-acc: %.3f, loss: %.3f | Val-acc: %.3f | learning-rate: %.3E | time: %.1f' % (epoch, train_acc, train_loss, val_acc, trainer.learning_rate, time.time() - tic)) -_, test_acc = test(finetune_net, test_data, ctx) +_, test_acc = test(finetune_net, test_data, device) print('[Finished] Test-acc: %.3f' % (test_acc)) ``` diff --git a/docs/python_docs/python/tutorials/getting-started/gluon_migration_guide.md b/docs/python_docs/python/tutorials/getting-started/gluon_migration_guide.md index c65a7f2a6556..f362223e6680 100644 --- a/docs/python_docs/python/tutorials/getting-started/gluon_migration_guide.md +++ b/docs/python_docs/python/tutorials/getting-started/gluon_migration_guide.md @@ -170,7 +170,7 @@ class SampleBlock(HybridBlock): # Access constant parameters, which are not iterated during training self.weight = Constant('const', const_arr) ``` -Also, there will be new mechanisms for parameter loading, sharing and setting context. +Also, there will be new mechanisms for parameter loading, sharing and setting device. 1. Parameter loading in Gluon 1.x vs Gluon 2.0: ```{.python} @@ -179,7 +179,7 @@ Also, there will be new mechanisms for parameter loading, sharing and setting co net.collect_params().load_dict(arg_dict, ctx=ctx) # in Gluon 2.0 net = nn.Dense(8, activation='relu') - net.load_dict(arg_dict, ctx=ctx) + net.load_dict(arg_dict, device=device) ``` 2. Parameter sharing in Gluon 1.x vs Gluon 2.0: @@ -192,14 +192,14 @@ Also, there will be new mechanisms for parameter loading, sharing and setting co net = nn.Dense(8, activation='relu').share_parameters(shared.params) ``` -3. Parameter setting context in Gluon 1.x vs Gluon 2.0: +3. 
Parameter setting device in Gluon 1.x vs Gluon 2.0: ```{.python} # in Gluon 1.x net = nn.Dense(8, activation='relu') net.collect_params().reset_ctx(devices) # in Gluon 2.0 net = nn.Dense(8, activation='relu') - net.reset_ctx(devices) + net.reset_device(devices) ``` #### Forward Interface @@ -222,11 +222,11 @@ Now, in deferred computation mode of Gluon2.0, the divergence of NDArray and Sym ```{.python} # forward interface, no F any more def forward(self, x): - # get the context information of input array and make parameters run on the same context - ctx = x.device + # get the device information of input array and make parameters run on the same device + device = x.device # use np/npx interfaces instead of F - act = npx.fully_connected(x, self.weight.data(ctx), - self.bias.data(ctx) if self.bias is not None else None, + act = npx.fully_connected(x, self.weight.data(device), + self.bias.data(device) if self.bias is not None else None, no_bias=self.bias is None, num_hidden=self._units, flatten=self._flatten, name='fwd') if self.act is not None: @@ -276,9 +276,9 @@ class Dense(HybridBlock): self.act = None def forward(self, x): - ctx = x.device - act = npx.fully_connected(x, self.weight.data(ctx), - self.bias.data(ctx) if self.bias is not None else None, + device = x.device + act = npx.fully_connected(x, self.weight.data(device), + self.bias.data(device) if self.bias is not None else None, no_bias=self.bias is None, num_hidden=self._units, flatten=self._flatten, name='fwd') if self.act is not None: @@ -342,10 +342,10 @@ Example: import mxnet as mx # create key-value store with horovod backend kv = mx.kv.create('horovod') # or choose 'kvstore', 'byteps' as backend -ctx = mx.gpu(kv.local_rank) if mx.device.num_gpus() > 0 else mx.cpu(kv.local_rank) -val = mx.np.zeros((2, 3), ctx=ctx) +device = mx.gpu(kv.local_rank) if mx.device.num_gpus() > 0 else mx.cpu(kv.local_rank) +val = mx.np.zeros((2, 3), device=device) # broadcast the value at rank 0 to all ranks -kv.broadcast('0', mx.np.zeros((2, 3), ctx=ctx), out=val) +kv.broadcast('0', mx.np.zeros((2, 3), device=device), out=val) scale = kv.rank + 1 # performs allreduce on a single array kv.pushpull('3', val * scale) diff --git a/docs/python_docs/python/tutorials/getting-started/logistic_regression_explained.md b/docs/python_docs/python/tutorials/getting-started/logistic_regression_explained.md index ddfd9e7f32b5..caa2975c634a 100644 --- a/docs/python_docs/python/tutorials/getting-started/logistic_regression_explained.md +++ b/docs/python_docs/python/tutorials/getting-started/logistic_regression_explained.md @@ -37,8 +37,8 @@ In this tutorial we will use fake dataset, which contains 10 features drawn from ```{.python .input} -def get_random_data(size, ctx): - x = np.random.normal(0, 1, size=(size, 10), ctx=ctx) +def get_random_data(size, device): + x = np.random.normal(0, 1, size=(size, 10), device=device) y = x.sum(axis=1) > 3 return x, y ``` @@ -47,7 +47,7 @@ Also, let's define a set of hyperparameters, that we are going to use later. 
Sin ```{.python .input} -ctx = mx.cpu() +device = mx.cpu() train_data_size = 1000 val_data_size = 100 batch_size = 10 @@ -61,11 +61,11 @@ Below we define training and validation datasets, which we are going to use in t ```{.python .input} -train_x, train_ground_truth_class = get_random_data(train_data_size, ctx) +train_x, train_ground_truth_class = get_random_data(train_data_size, device) train_dataset = ArrayDataset(train_x, train_ground_truth_class) train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) -val_x, val_ground_truth_class = get_random_data(val_data_size, ctx) +val_x, val_ground_truth_class = get_random_data(val_data_size, device) val_dataset = ArrayDataset(val_x, val_ground_truth_class) val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True) ``` diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/custom-layer.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/custom-layer.md index cc98f1f42266..ff25bdda1c9f 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/custom-layer.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/custom-layer.md @@ -81,7 +81,7 @@ Thanks to inheriting from HybridBlock, one can easily do forward pass on a given ```{.python .input} layer = NormalizationHybridLayer() -layer(np.array([1, 2, 3], ctx=mx.cpu())) +layer(np.array([1, 2, 3], device=mx.cpu())) ``` Output: diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/nn.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/nn.md index 5e39f283dc0b..61f550816584 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/nn.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/nn.md @@ -304,7 +304,7 @@ deep learning library. The problems of Python's [Global Interpreter Lock](https://wiki.python.org/moin/GlobalInterpreterLock) are well known. -In the context of deep learning, we often have highly performant GPUs that +In the device of deep learning, we often have highly performant GPUs that depend on CPUs running Python to tell them what to do. This mismatch can manifest in the form of GPU starvation when the CPUs can not provide instruction fast enough. We can improve this situation by deferring to a more diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/save_load_params.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/save_load_params.md index 4e98d90ee88b..7b1ce57abadb 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/save_load_params.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/save_load_params.md @@ -49,7 +49,7 @@ Let's define a helper function to build a LeNet model and another helper to trai ```{.python .input} # Use GPU if one exists, else use CPU -ctx = mx.gpu() if mx.device.num_gpus() else mx.cpu() +device = mx.gpu() if mx.device.num_gpus() else mx.cpu() # MNIST images are 28x28. 
Total pixels in input layer is 28x28 = 784 num_inputs = 784 @@ -82,7 +82,7 @@ def build_lenet(net): # Train a given model using MNIST data def train_model(model): # Initialize the parameters with Xavier initializer - model.initialize(mx.init.Xavier(), ctx=ctx) + model.initialize(mx.init.Xavier(), device=device) # Use cross entropy loss softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss() # Use Adam optimizer @@ -93,8 +93,8 @@ def train_model(model): # Iterate through the images and labels in the training data for batch_num, (data, label) in enumerate(train_data): # get the images and labels - data = data.as_in_context(ctx) - label = label.as_in_context(ctx) + data = data.to_device(device) + label = label.to_device(device) # Ask autograd to record the forward pass with autograd.record(): # Run the forward pass @@ -156,7 +156,7 @@ Let's now create a network with the parameters we saved into the file. We build ```{.python .input} new_net = build_lenet(gluon.nn.Sequential()) -new_net.load_parameters(file_name, ctx=ctx) +new_net.load_parameters(file_name, device=device) ``` Note that to do this, we need the definition of the network as Python code. If we want to recreate this network on a different machine using the saved weights, we need the same Python code (`build_lenet`) that created the network to create the `new_net` object shown above. This means Python code needs to be copied over to any machine where we want to run this network. @@ -190,7 +190,7 @@ def verify_loaded_model(net): # Display the predictions data = np.transpose(data, (0, 3, 1, 2)) - out = net(data.to_device(ctx)) + out = net(data.to_device(device)) predictions = np.argmax(out, axis=1) print('Model predictions: ', predictions.asnumpy()) @@ -254,7 +254,7 @@ Serialized Hybrid networks (saved as .JSON and .params file) can be loaded and u import warnings with warnings.catch_warnings(): warnings.simplefilter("ignore") - deserialized_net = gluon.nn.SymbolBlock.imports("lenet-symbol.json", ['data'], "lenet-0001.params", ctx=ctx) + deserialized_net = gluon.nn.SymbolBlock.imports("lenet-symbol.json", ['data'], "lenet-0001.params", device=device) ``` `deserialized_net` now contains the network we deserialized from files. Let's test the deserialized network to make sure it works. diff --git a/docs/python_docs/python/tutorials/packages/gluon/data/datasets.md b/docs/python_docs/python/tutorials/packages/gluon/data/datasets.md index dafc36e32525..9bc7a375fd8d 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/data/datasets.md +++ b/docs/python_docs/python/tutorials/packages/gluon/data/datasets.md @@ -150,11 +150,11 @@ def construct_net(): return net # construct and initialize network. -ctx = mx.gpu() if mx.context.num_gpus() else mx.cpu() +device = mx.gpu() if mx.device.num_gpus() else mx.cpu() net = construct_net() net.hybridize() -net.initialize(mx.init.Xavier(), ctx=ctx) +net.initialize(mx.init.Xavier(), device=device) # define loss and trainer. criterion = gluon.loss.SoftmaxCrossEntropyLoss() trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.1}) @@ -166,11 +166,11 @@ trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.1}) epochs = 5 for epoch in range(epochs): # training loop (with autograd and trainer steps, etc.) 
- cumulative_train_loss = mx.np.zeros(1, ctx=ctx) + cumulative_train_loss = mx.np.zeros(1, device=device) training_samples = 0 for batch_idx, (data, label) in enumerate(train_data_loader): - data = data.as_in_ctx(ctx).reshape((-1, 784)) # 28*28=784 - label = label.as_in_ctx(ctx) + data = data.to_device(device).reshape((-1, 784)) # 28*28=784 + label = label.to_device(device) with autograd.record(): output = net(data) loss = criterion(output, label) @@ -181,11 +181,11 @@ for epoch in range(epochs): train_loss = cumulative_train_loss.item()/training_samples # validation loop - cumulative_valid_loss = mx.np.zeros(1, ctx=ctx) + cumulative_valid_loss = mx.np.zeros(1, device=device) valid_samples = 0 for batch_idx, (data, label) in enumerate(valid_data_loader): - data = data.as_in_ctx(ctx).reshape((-1, 784)) # 28*28=784 - label = label.as_in_ctx(ctx) + data = data.to_device(device).reshape((-1, 784)) # 28*28=784 + label = label.to_device(device) output = net(data) loss = criterion(output, label) cumulative_valid_loss += mx.np.sum(loss) diff --git a/docs/python_docs/python/tutorials/packages/gluon/image/info_gan.md b/docs/python_docs/python/tutorials/packages/gluon/image/info_gan.md index 5bb842bcf30a..3a82855df2d4 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/image/info_gan.md +++ b/docs/python_docs/python/tutorials/packages/gluon/image/info_gan.md @@ -50,7 +50,7 @@ batch_size = 64 z_dim = 100 n_continuous = 2 n_categories = 10 -ctx = mx.gpu() if mx.device.num_gpus() else mx.cpu() +device = mx.gpu() if mx.device.num_gpus() else mx.cpu() ``` Some functions to load and normalize images. @@ -197,11 +197,11 @@ Initialize Generator and Discriminator and define correspoing trainer function. ```{.python .input} generator = Generator() generator.hybridize() -generator.initialize(mx.init.Normal(0.002), ctx=ctx) +generator.initialize(mx.init.Normal(0.002), device=device) discriminator = Discriminator() discriminator.hybridize() -discriminator.initialize(mx.init.Normal(0.002), ctx=ctx) +discriminator.initialize(mx.init.Normal(0.002), device=device) lr = 0.0001 beta = 0.5 @@ -215,8 +215,8 @@ Create vectors with real (=1) and fake labels (=0). ```{.python .input} -real_label = np.ones((batch_size,), ctx=ctx) -fake_label = np.zeros((batch_size,),ctx=ctx) +real_label = np.ones((batch_size,), device=device) +fake_label = np.zeros((batch_size,),device=device) ``` Load a pretrained model. @@ -224,8 +224,8 @@ Load a pretrained model. ```{.python .input} if os.path.isfile('infogan_d_latest.params') and os.path.isfile('infogan_g_latest.params'): - discriminator.load_parameters('infogan_d_latest.params', ctx=ctx, allow_missing=True, ignore_extra=True) - generator.load_parameters('infogan_g_latest.params', ctx=ctx, allow_missing=True, ignore_extra=True) + discriminator.load_parameters('infogan_d_latest.params', device=device, allow_missing=True, ignore_extra=True) + generator.load_parameters('infogan_g_latest.params', device=device, allow_missing=True, ignore_extra=True) ``` There are 2 differences between InfoGAN and DCGAN: the extra latent code and the Q network to estimate the code. The latent code is part of the Generator input and it contains mutliple variables (continuous, categorical) that can represent different distributions. In order to make sure that the Generator uses the latent code, mutual information is introduced into the GAN loss term. Mutual information measures how much X is known given Y or vice versa. 
It is defined as: @@ -255,10 +255,10 @@ This function samples `c`, `z`, and concatenates them to create the generator in def create_generator_input(): #create random noise - z = np.random.normal(0, 1, size=(batch_size, z_dim), ctx=ctx) - label = np.array(onp.random.randint(n_categories, size=batch_size)).as_in_context(ctx) - c1 = npx.one_hot(label, depth=n_categories).as_in_context(ctx) - c2 = np.random.uniform(-1, 1, size=(batch_size, n_continuous)).as_in_context(ctx) + z = np.random.normal(0, 1, size=(batch_size, z_dim), device=device) + label = np.array(onp.random.randint(n_categories, size=batch_size)).to_device(device) + c1 = npx.one_hot(label, depth=n_categories).to_device(device) + c2 = np.random.uniform(-1, 1, size=(batch_size, n_continuous)).to_device(device) # concatenate random noise with c which will be the input of the generator return np.concatenate([z, c1, c2], axis=1), label, c2 @@ -279,13 +279,13 @@ for epoch in range(epochs): print("Epoch", epoch) starttime = time.time() - d_error_epoch = np.zeros((1,), ctx=ctx) - g_error_epoch = np.zeros((1,), ctx=ctx) + d_error_epoch = np.zeros((1,), device=device) + g_error_epoch = np.zeros((1,), device=device) for idx, data in enumerate(train_dataloader): #get real data and generator input - real_data = data.as_in_context(ctx) + real_data = data.to_device(device) g_input, label, c2 = create_generator_input() @@ -342,7 +342,7 @@ Load the trained discriminator and retrieve one of its last layers. ```{.python .input} discriminator = Discriminator() -discriminator.load_parameters("infogan_d_latest.params", ctx=ctx, ignore_extra=True) +discriminator.load_parameters("infogan_d_latest.params", device=device, ignore_extra=True) discriminator = discriminator.D[:11] print (discriminator) @@ -375,18 +375,18 @@ Take some images from the test data, obtain its feature vector from `discriminat ```{.python .input} feature_size = 8192 -features = np.zeros((len(test_images), feature_size), ctx=ctx) +features = np.zeros((len(test_images), feature_size), device=device) for idx, image in enumerate(test_images): - feature = discriminator(np.array(image, ctx=ctx)) + feature = discriminator(np.array(image, device=device)) feature = feature.reshape(feature_size,) - features[idx,:] = feature.copyto(ctx) + features[idx,:] = feature.copyto(device) for image in test_images[:100]: - feature = discriminator(np.array(image, ctx=ctx)) + feature = discriminator(np.array(image, device=device)) feature = feature.reshape((feature_size,)) image = image.reshape((3,64,64)) diff --git a/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md b/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md index bdd2b06deed0..677dfbcb8fae 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md +++ b/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md @@ -127,8 +127,8 @@ initialized parameters. ```{.python .input} gpus = mx.test_utils.list_gpus() -ctx = mx.gpu() if gpus else [mx.cpu(0), mx.cpu(1)] -net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx) +device = mx.gpu() if gpus else [mx.cpu(0), mx.cpu(1)] +net.initialize(mx.init.Xavier(magnitude=2.24), device=device) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.02}) ``` @@ -163,9 +163,9 @@ for i in range(epoch): # Inside training scope with ag.record(): for x, y in zip(data, label): - z = net(x.to_device(ctx)) + z = net(x.to_device(device)) # Computes softmax cross entropy loss. 
- loss = softmax_cross_entropy_loss(z, y.to_device(ctx)) + loss = softmax_cross_entropy_loss(z, y.to_device(device)) # Backpropagate the error for one iteration. loss.backward() outputs.append(z) @@ -192,7 +192,7 @@ metric = mx.gluon.metric.Accuracy() for batch_num, (data, label) in enumerate(val_data): outputs = [] for x in data: - outputs.append(net(x.to_device(ctx))) + outputs.append(net(x.to_device(device))) # Updates internal evaluation metric.update(label, outputs) print('validation acc: %s=%f'%metric.get()) @@ -259,9 +259,9 @@ Training and prediction can be done in the similar way as we did for MLP. We will initialize the network parameters as follows: ```{.python .input} -# set the context on GPU is available otherwise CPU -ctx = [mx.gpu() if mx.test_utils.list_gpus() else mx.cpu()] -net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx) +# set the device on GPU is available otherwise CPU +device = [mx.gpu() if mx.test_utils.list_gpus() else mx.cpu()] +net.initialize(mx.init.Xavier(magnitude=2.24), device=device) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.03}) ``` @@ -279,9 +279,9 @@ for i in range(epoch): # Inside training scope with ag.record(): for x, y in zip(data, label): - z = net(x.to_device(ctx)) + z = net(x.to_device(device)) # Computes softmax cross entropy loss. - loss = softmax_cross_entropy_loss(z, y.to_device(ctx)) + loss = softmax_cross_entropy_loss(z, y.to_device(device)) # Backpropogate the error for one iteration. loss.backward() outputs.append(z) @@ -308,7 +308,7 @@ metric = mx.gluon.metric.Accuracy() for batch_num, (data, label) in enumerate(val_data): outputs = [] for x in data: - outputs.append(net(x.to_device(ctx))) + outputs.append(net(x.to_device(device))) # Updates internal evaluation metric.update(label, outputs) print('validation acc: %s=%f'%metric.get()) diff --git a/docs/python_docs/python/tutorials/packages/gluon/text/gnmt.rst b/docs/python_docs/python/tutorials/packages/gluon/text/gnmt.rst index d3ac96f8a2cf..164512b13410 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/text/gnmt.rst +++ b/docs/python_docs/python/tutorials/packages/gluon/text/gnmt.rst @@ -50,7 +50,7 @@ Hyper-parameters np.random.seed(100) random.seed(100) mx.random.seed(10000) - ctx = mx.gpu(0) + device = mx.gpu(0) # parameters for dataset dataset = 'IWSLT2015' @@ -318,7 +318,7 @@ allows computation to be done using the symbolic backend. num_bi_layers=num_bi_layers) model = nmt.translation.NMTModel(src_vocab=src_vocab, tgt_vocab=tgt_vocab, encoder=encoder, decoder=decoder, embed_size=num_hidden, prefix='gnmt_') - model.initialize(init=mx.init.Uniform(0.1), ctx=ctx) + model.initialize(init=mx.init.Uniform(0.1), device=device) static_alloc = True model.hybridize(static_alloc=static_alloc) logging.info(model) @@ -363,10 +363,10 @@ testing datasets. 
avg_loss = 0.0 for _, (src_seq, tgt_seq, src_valid_length, tgt_valid_length, inst_ids) \ in enumerate(data_loader): - src_seq = src_seq.as_in_context(ctx) - tgt_seq = tgt_seq.as_in_context(ctx) - src_valid_length = src_valid_length.as_in_context(ctx) - tgt_valid_length = tgt_valid_length.as_in_context(ctx) + src_seq = src_seq.to_device(device) + tgt_seq = tgt_seq.to_device(device) + src_valid_length = src_valid_length.to_device(device) + tgt_valid_length = tgt_valid_length.to_device(device) # Calculating Loss out, _ = model(src_seq, tgt_seq[:, :-1], src_valid_length, tgt_valid_length - 1) loss = loss_function(out, tgt_seq[:, 1:], tgt_valid_length - 1).mean().asscalar() @@ -408,7 +408,7 @@ that uses ADAM optimzier. We can then write the training loop. During the training, we evaluate on the validation and testing datasets every epoch, and record the parameters that give the hightest BLEU score on the validation dataset. -Before performing forward and backward, we first use ``as_in_context`` +Before performing forward and backward, we first use ``to_device`` function to copy the mini-batch to GPU. The statement ``with mx.autograd.record()`` tells Gluon backend to compute the gradients for the part inside the block. @@ -424,16 +424,16 @@ gradients for the part inside the block. for batch_id, (src_seq, tgt_seq, src_valid_length, tgt_valid_length)\ in enumerate(train_data_loader): # logging.info(src_seq.context) Context suddenly becomes GPU. - src_seq = src_seq.as_in_context(ctx) - tgt_seq = tgt_seq.as_in_context(ctx) - src_valid_length = src_valid_length.as_in_context(ctx) - tgt_valid_length = tgt_valid_length.as_in_context(ctx) + src_seq = src_seq.to_device(device) + tgt_seq = tgt_seq.to_device(device) + src_valid_length = src_valid_length.to_device(device) + tgt_valid_length = tgt_valid_length.to_device(device) with mx.autograd.record(): out, _ = model(src_seq, tgt_seq[:, :-1], src_valid_length, tgt_valid_length - 1) loss = loss_function(out, tgt_seq[:, 1:], tgt_valid_length - 1).mean() loss = loss * (tgt_seq.shape[1] - 1) / (tgt_valid_length - 1).mean() loss.backward() - grads = [p.grad(ctx) for p in model.collect_params().values()] + grads = [p.grad(device) for p in model.collect_params().values()] gnorm = gluon.utils.clip_global_norm(grads, clip) trainer.step(1) src_wc = src_valid_length.sum().asscalar() diff --git a/docs/python_docs/python/tutorials/packages/gluon/training/fit_api_tutorial.md b/docs/python_docs/python/tutorials/packages/gluon/training/fit_api_tutorial.md index 54d8ba4c3b7a..fba454ca4074 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/training/fit_api_tutorial.md +++ b/docs/python_docs/python/tutorials/packages/gluon/training/fit_api_tutorial.md @@ -39,7 +39,7 @@ from mxnet.gluon.contrib.estimator import estimator from mxnet.gluon.contrib.estimator.event_handler import TrainBegin, TrainEnd, EpochEnd, CheckpointHandler gpu_count = mx.device.num_gpus() -ctx = [mx.gpu(i) for i in range(gpu_count)] if gpu_count > 0 else mx.cpu() +device = [mx.gpu(i) for i in range(gpu_count)] if gpu_count > 0 else mx.cpu() ``` ## Dataset @@ -94,7 +94,7 @@ Let's load the resnet-18 model architecture from [Gluon Model Zoo](../../../../a ```{.python .input} resnet_18_v1 = vision.resnet18_v1(pretrained=False, classes = 10) -resnet_18_v1.initialize(init = mx.init.Xavier(), ctx=ctx) +resnet_18_v1.initialize(init = mx.init.Xavier(), device=device) ``` We will be using `SoftmaxCrossEntropyLoss` as the loss function since this is a multi-class classification problem. 
We will be using `sgd` (Stochastic Gradient Descent) as the optimizer. @@ -127,12 +127,12 @@ In the basic usage example, with just 2 lines of code, we will set up our model ```{.python .input} train_acc = mx.gluon.metric.Accuracy() # Metric to monitor -# Define the estimator, by passing to it the model, loss function, metrics, trainer object and context +# Define the estimator, by passing to it the model, loss function, metrics, trainer object and device est = estimator.Estimator(net=resnet_18_v1, loss=loss_fn, train_metrics=train_acc, trainer=trainer, - context=ctx) + device=device) # ignore warnings for nightly test on CI only import warnings @@ -204,7 +204,7 @@ class LossRecordHandler(TrainBegin, TrainEnd, EpochEnd): ```{.python .input} # Let's reset the model, trainer and accuracy objects from above -resnet_18_v1.initialize(force_reinit=True, init = mx.init.Xavier(), ctx=ctx) +resnet_18_v1.initialize(force_reinit=True, init = mx.init.Xavier(), device=device) trainer = gluon.Trainer(resnet_18_v1.collect_params(), 'sgd', {'learning_rate': learning_rate}) train_acc = mx.gluon.metric.Accuracy() @@ -212,12 +212,12 @@ train_acc = mx.gluon.metric.Accuracy() ```{.python .input} -# Define the estimator, by passing to it the model, loss function, metrics, trainer object and context +# Define the estimator, by passing to it the model, loss function, metrics, trainer object and device est = estimator.Estimator(net=resnet_18_v1, loss=loss_fn, train_metrics=train_acc, trainer=trainer, - context=ctx) + device=device) # Define the handlers, let's say in built Checkpointhandler checkpoint_handler = CheckpointHandler(model_dir='./', @@ -257,7 +257,7 @@ You can load the saved model, by using the `load_parameters` API in Gluon. For m ```{.python .input} resnet_18_v1 = vision.resnet18_v1(pretrained=False, classes=10) -resnet_18_v1.load_parameters('./my_model-best.params', ctx=ctx) +resnet_18_v1.load_parameters('./my_model-best.params', device=device) ``` ## Next Steps diff --git a/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_finder.md b/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_finder.md index a64f3c4da9b6..77d2bd195cfa 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_finder.md +++ b/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_finder.md @@ -48,19 +48,19 @@ import mxnet as mx mx.np.random.seed(42) class Learner(): - def __init__(self, net, data_loader, ctx): + def __init__(self, net, data_loader, device): """ :param net: network (mx.gluon.Block) :param data_loader: training data loader (mx.gluon.data.DataLoader) - :param ctx: context (mx.gpu or mx.cpu) + :param device: device (mx.gpu or mx.cpu) """ self.net = net self.data_loader = data_loader - self.ctx = ctx + self.device = device # So we don't need to be in `for batch in data_loader` scope # and can call for next batch in `iteration` self.data_loader_iter = iter(self.data_loader) - self.net.initialize(mx.init.Xavier(), ctx=self.ctx) + self.net.initialize(mx.init.Xavier(), device=self.device) self.loss_fn = mx.gluon.loss.SoftmaxCrossEntropyLoss() self.trainer = mx.gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': .001}) @@ -73,10 +73,10 @@ class Learner(): # Update learning rate if different this iteration if lr and (lr != self.trainer.learning_rate): self.trainer.set_learning_rate(lr) - # Get next batch, and move context (e.g. 
to GPU if set) + # Get next batch, and move device (e.g. to GPU if set) data, label = next(self.data_loader_iter) - data = data.as_in_context(self.ctx) - label = label.as_in_context(self.ctx) + data = data.to_device(self.device) + label = label.to_device(self.device) # Standard forward and backward pass with mx.autograd.record(): output = self.net(data) @@ -172,7 +172,7 @@ class LRFinder(): break lr = lr * lr_multiplier # Restore params (as finder changed them) - self.learner.net.load_parameters("lr_finder.params", ctx=self.learner.ctx) + self.learner.net.load_parameters("lr_finder.params", device=self.learner.device) self.learner.trainer.load_states("lr_finder.state") return self.results @@ -231,9 +231,9 @@ Using a Pre-activation ResNet-18 from the Gluon model zoo, we instantiate our Le ```{.python .input} -ctx = mx.gpu() if mx.device.num_gpus() else mx.cpu() +device = mx.gpu() if mx.device.num_gpus() else mx.cpu() net = mx.gluon.model_zoo.vision.resnet18_v2(classes=10) -learner = Learner(net=net, data_loader=data_loader, ctx=ctx) +learner = Learner(net=net, data_loader=data_loader, device=device) lr_finder = LRFinder(learner) lr_finder.find(lr_start=1e-6) lr_finder.plot() @@ -274,8 +274,8 @@ And now we have a baseline, let's see what happens when we train with a learning ```{.python .input} net = mx.gluon.model_zoo.vision.resnet18_v2(classes=10) -learner = Learner(net=net, data_loader=data_loader, ctx=ctx) -learner.net.load_parameters("net.params", ctx=ctx) +learner = Learner(net=net, data_loader=data_loader, device=device) +learner.net.load_parameters("net.params", device=device) lr = 0.5 for iter_idx in range(300): @@ -302,8 +302,8 @@ And lastly, we see how the model trains with a more conservative learning rate o ```{.python .input} net = mx.gluon.model_zoo.vision.resnet18_v2(classes=10) -learner = Learner(net=net, data_loader=data_loader, ctx=ctx) -learner.net.load_parameters("net.params", ctx=ctx) +learner = Learner(net=net, data_loader=data_loader, device=device) +learner.net.load_parameters("net.params", device=device) lr = 0.005 for iter_idx in range(300): diff --git a/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules.md b/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules.md index c1f2fd812775..562e90e394d9 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules.md +++ b/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules.md @@ -140,7 +140,7 @@ As discussed above, the schedule should return a learning rate given an (1-based ```{.python .input} # Use GPU if one exists, else use CPU -ctx = mx.gpu() if mx.device.num_gpus() else mx.cpu() +device = mx.gpu() if mx.device.num_gpus() else mx.cpu() # MNIST images are 28x28. 
Total pixels in input layer is 28x28 = 784 num_inputs = 784 @@ -178,7 +178,7 @@ We then initialize our network (technically deferred until we pass the first bat ```{.python .input} # Initialize the parameters with Xavier initializer -net.initialize(mx.init.Xavier(), ctx=ctx) +net.initialize(mx.init.Xavier(), device=device) # Use cross entropy loss softmax_cross_entropy = mx.gluon.loss.SoftmaxCrossEntropyLoss() ``` @@ -227,8 +227,8 @@ for epoch in range(1, num_epochs+1): # Iterate through the images and labels in the training data for batch_num, (data, label) in enumerate(train_dataloader, start=1): # get the images and labels - data = data.as_in_context(ctx) - label = label.as_in_context(ctx) + data = data.to_device(device) + label = label.to_device(device) # Ask autograd to record the forward pass with mx.autograd.record(): # Run the forward pass @@ -279,7 +279,7 @@ We replicate the example above, but now keep track of the `iteration_idx`, call ```{.python .input} net = build_cnn() -net.initialize(mx.init.Xavier(), ctx=ctx) +net.initialize(mx.init.Xavier(), device=device) schedule = mx.lr_scheduler.MultiFactorScheduler(step=steps_iterations, factor=0.1) schedule.base_lr = 0.03 @@ -293,8 +293,8 @@ for epoch in range(1, num_epochs + 1): # Iterate through the images and labels in the training data for batch_num, (data, label) in enumerate(train_dataloader, start=1): # get the images and labels - data = data.as_in_context(ctx) - label = label.as_in_context(ctx) + data = data.to_device(device) + label = label.to_device(device) # Ask autograd to record the forward pass with mx.autograd.record(): # Run the forward pass diff --git a/docs/python_docs/python/tutorials/packages/kvstore/kvstore.md b/docs/python_docs/python/tutorials/packages/kvstore/kvstore.md index bc1b0b1211d4..49198d975482 100644 --- a/docs/python_docs/python/tutorials/packages/kvstore/kvstore.md +++ b/docs/python_docs/python/tutorials/packages/kvstore/kvstore.md @@ -57,8 +57,8 @@ values and then push the aggregated value. Here we will just demonstrate pushing Please note summation only happens if the value list is longer than one ```{.python .input} -contexts = [mx.cpu(i) for i in range(4)] -b = [mx.np.ones(shape=shape, ctx=ctx) for ctx in contexts] +devices = [mx.cpu(i) for i in range(4)] +b = [mx.np.ones(shape=shape, device=device) for device in devices] kv.push(3, b) kv.pull(3, out = a) print(a.asnumpy()) @@ -98,7 +98,7 @@ You've already seen how to pull a single key-value pair. 
Similarly, to push, you pull the value onto several devices with a single call: ```{.python .input} -b = [mx.np.ones(shape=shape, ctx=ctx) for ctx in contexts] +b = [mx.np.ones(shape=shape, device=device) for device in devices] kv.pull(3, out = b) print(b[1].asnumpy()) ``` @@ -132,7 +132,7 @@ print(b[1].asnumpy()) For multiple devices: ```{.python .input} -b = [[mx.np.ones(shape=shape, ctx=ctx) for ctx in contexts]] * len(keys) +b = [[mx.np.ones(shape=shape, device=device) for device in devices]] * len(keys) kv.push(keys, b) kv.pull(keys, out = b) print(b[1][1].asnumpy()) diff --git a/docs/python_docs/python/tutorials/packages/np/cheat-sheet.md b/docs/python_docs/python/tutorials/packages/np/cheat-sheet.md index 6536bfff2ec7..238b0b985b21 100644 --- a/docs/python_docs/python/tutorials/packages/np/cheat-sheet.md +++ b/docs/python_docs/python/tutorials/packages/np/cheat-sheet.md @@ -411,13 +411,13 @@ npx.gpu(0), npx.gpu(1) # Context for the first and second GPUs ```{.python .input} gpu_0 = npx.gpu(0) if npx.num_gpus() > 1 else npx.cpu() -g0 = np.zeros((2,3), ctx=gpu_0) # Create array on GPU 0 +g0 = np.zeros((2,3), device=gpu_0) # Create array on GPU 0 g0 ``` ```{.python .input} gpu_1 = npx.gpu(1) if npx.num_gpus() > 2 else npx.cpu() -g1 = np.random.uniform(size=(2,3), ctx=gpu_1) # Create array on GPU 1 +g1 = np.random.uniform(size=(2,3), device=gpu_1) # Create array on GPU 1 g1 ``` @@ -427,12 +427,12 @@ g1.copyto(gpu_0) ``` ```{.python .input} -# Return itself if matching the context, otherwise copy +# Return itself if matching the device, otherwise copy g1.copyto(gpu_0), g1.copyto(gpu_0) ``` ```{.python .input} -g1.context # Query the device an array is on +g1.device # Query the device an array is on ``` ```{.python .input} diff --git a/docs/python_docs/python/tutorials/packages/np/np-vs-numpy.md b/docs/python_docs/python/tutorials/packages/np/np-vs-numpy.md index c4062001bae5..e67bf7351b16 100644 --- a/docs/python_docs/python/tutorials/packages/np/np-vs-numpy.md +++ b/docs/python_docs/python/tutorials/packages/np/np-vs-numpy.md @@ -33,19 +33,19 @@ In addition, an operator might not contain all arguments available in NumPy. For ## Extra functionalities -The `mxnet.np` module aims to mimic NumPy. Most extra functionalities that enhance NumPy for deep learning use are available on other modules, such as `npx` for operators used in deep learning and `autograd` for automatic differentiation. The `np` module API is not complete. One notable change is GPU support. Creating routines accepts a `ctx` argument: +The `mxnet.np` module aims to mimic NumPy. Most extra functionalities that enhance NumPy for deep learning use are available on other modules, such as `npx` for operators used in deep learning and `autograd` for automatic differentiation. The `np` module API is not complete. One notable change is GPU support. Creating routines accepts a `device` argument: ```{.python .input} gpu = npx.gpu() if npx.num_gpus() > 0 else npx.cpu() -a = np.array(1, ctx=gpu) -b = np.random.uniform(ctx=gpu) -(a, b.context) +a = np.array(1, device=gpu) +b = np.random.uniform(device=gpu) +(a, b.device) ``` Methods to move data across devices. ```{.python .input} -a.copyto(npx.cpu()), b.as_in_context(npx.cpu()) +a.copyto(npx.cpu()), b.to_device(npx.cpu()) ``` ## Default data types @@ -99,7 +99,7 @@ format, whereas `npx.savez` can save a collection of both dense and sparse arrays to the `.npz` format. 
```{.python .input} -a = np.array(1, ctx=gpu) +a = np.array(1, device=gpu) npx.save('a', a) npx.load('a') npx.savez('a', a=a, b=a*2) diff --git a/docs/python_docs/python/tutorials/packages/onnx/fine_tuning_gluon.md b/docs/python_docs/python/tutorials/packages/onnx/fine_tuning_gluon.md index ba75c728ca59..d7c998624052 100644 --- a/docs/python_docs/python/tutorials/packages/onnx/fine_tuning_gluon.md +++ b/docs/python_docs/python/tutorials/packages/onnx/fine_tuning_gluon.md @@ -268,11 +268,11 @@ new_sym, new_arg_params, new_aux_params = get_layer_output(sym, arg_params, aux_ We can now take advantage of the features and pattern detection knowledge that our network learnt training on ImageNet, and apply that to the new Caltech101 dataset. -We pick a context, fine-tuning on CPU will be **WAY** slower. +We pick a device, fine-tuning on CPU will be **WAY** slower. ```{.python .input} -ctx = mx.gpu() if mx.device.num_gpus() > 0 else mx.cpu() +device = mx.gpu() if mx.device.num_gpus() > 0 else mx.cpu() ``` We create a symbol block that is going to hold all our pre-trained layers, and assign the weights of the different pre-trained layers to the newly created SymbolBlock @@ -286,10 +286,10 @@ with warnings.catch_warnings(): net_params = pre_trained.collect_params() for param in new_arg_params: if param in net_params: - net_params[param]._load_init(new_arg_params[param], ctx=ctx) + net_params[param]._load_init(new_arg_params[param], device=device) for param in new_aux_params: if param in net_params: - net_params[param]._load_init(new_aux_params[param], ctx=ctx) + net_params[param]._load_init(new_aux_params[param], device=device) ``` @@ -298,7 +298,7 @@ We create the new dense layer with the right new number of classes (101) and ini ```{.python .input} dense_layer = gluon.nn.Dense(NUM_CLASSES) -dense_layer.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx) +dense_layer.initialize(mx.init.Xavier(magnitude=2.24), device=device) ``` We add the SymbolBlock and the new dense layer to a HybridSequential network @@ -346,10 +346,10 @@ We measure the accuracy in a non-blocking way, using `np.array` to take care of ```{.python .input} def evaluate_accuracy_gluon(data_iterator, net): num_instance = 0 - sum_metric = np.zeros(1,ctx=ctx, dtype=np.int32) + sum_metric = np.zeros(1,device=device, dtype=np.int32) for i, (data, label) in enumerate(data_iterator): - data = data.astype(np.float32).as_in_context(ctx) - label = label.astype(np.int32).as_in_context(ctx) + data = data.astype(np.float32).to_device(device) + label = label.astype(np.int32).to_device(device) output = net(data) prediction = np.argmax(output, axis=1).astype(np.int32) num_instance += len(prediction) @@ -375,8 +375,8 @@ print("Untrained network Test Accuracy: {0:.4f}".format(evaluate_accuracy_gluon( val_accuracy = 0 for epoch in range(5): for i, (data, label) in enumerate(dataloader_train): - data = data.astype(np.float32).as_in_context(ctx) - label = label.as_in_context(ctx) + data = data.astype(np.float32).to_device(device) + label = label.to_device(device) if i%20==0 and i >0: print('Batch [{0}] loss: {1:.4f}'.format(i, loss.mean().item())) @@ -416,7 +416,7 @@ TOP_P = 3 ```{.python .input} # Convert img to format expected by the network def transform(img): - return np.array(np.expand_dims(np.transpose(img, (2,0,1)),axis=0).astype(np.float32), ctx=ctx) + return np.array(np.expand_dims(np.transpose(img, (2,0,1)),axis=0).astype(np.float32), device=device) ``` diff --git a/docs/python_docs/python/tutorials/packages/onnx/inference_on_onnx_model.md 
b/docs/python_docs/python/tutorials/packages/onnx/inference_on_onnx_model.md index 0f250db55255..f4465170e7df 100644 --- a/docs/python_docs/python/tutorials/packages/onnx/inference_on_onnx_model.md +++ b/docs/python_docs/python/tutorials/packages/onnx/inference_on_onnx_model.md @@ -114,11 +114,11 @@ We get the symbol and parameter objects sym, arg_params, aux_params = onnx_mxnet.import_model(onnx_path) ``` -We pick a context, CPU is fine for inference, switch to mx.gpu() if you want to use your GPU. +We pick a device, CPU is fine for inference, switch to mx.gpu() if you want to use your GPU. ```{.python .input} -ctx = mx.cpu() +device = mx.cpu() ``` We obtain the data names of the inputs to the model by using the model metadata API: @@ -148,10 +148,10 @@ with warnings.catch_warnings(): net_params = net.collect_params() for param in arg_params: if param in net_params: - net_params[param]._load_init(arg_params[param], ctx=ctx) + net_params[param]._load_init(arg_params[param], device=device) for param in aux_params: if param in net_params: - net_params[param]._load_init(aux_params[param], ctx=ctx) + net_params[param]._load_init(aux_params[param], device=device) ``` We can now cache the computational graph through [hybridization](https://mxnet.apache.org/versions/master/api/python/docs/tutorials/packages/gluon/blocks/hybridize.html) to gain some performance @@ -215,7 +215,7 @@ images = image_net_images + caltech101_images And run them as a batch through the network to get the predictions ```{.python .input} -batch = nd.array(np.concatenate([transform(img) for img in images], axis=0), ctx=ctx) +batch = nd.array(np.concatenate([transform(img) for img in images], axis=0), device=device) result = run_batch(net, [batch]) ``` diff --git a/docs/python_docs/python/tutorials/performance/backend/amp.md b/docs/python_docs/python/tutorials/performance/backend/amp.md index 52ec2c4d3040..ea1f4f6fa26f 100644 --- a/docs/python_docs/python/tutorials/performance/backend/amp.md +++ b/docs/python_docs/python/tutorials/performance/backend/amp.md @@ -48,8 +48,8 @@ lr = 0.001 wd = 0.0005 momentum = 0.9 -# training contexts -ctx = [mx.gpu(0)] +# training devices +device = [mx.gpu(0)] # set up logger logging.basicConfig() @@ -70,9 +70,9 @@ class SyntheticDataLoader(object): shape = (batch_size, 3, data_shape, data_shape) cls_targets_shape = (batch_size, 6132) box_targets_shape = (batch_size, 6132, 4) - self.data = mx.np.random.uniform(-1, 1, size=shape, ctx=mx.cpu_pinned()) - self.cls_targets = mx.np.random.uniform(0, 1, size=cls_targets_shape, ctx=mx.cpu_pinned()) - self.box_targets = mx.np.random.uniform(0, 1, size=box_targets_shape, ctx=mx.cpu_pinned()) + self.data = mx.np.random.uniform(-1, 1, size=shape, device=mx.cpu_pinned()) + self.cls_targets = mx.np.random.uniform(0, 1, size=cls_targets_shape, device=mx.cpu_pinned()) + self.box_targets = mx.np.random.uniform(0, 1, size=box_targets_shape, device=mx.cpu_pinned()) def next(self): if self.counter >= self.epoch_size: @@ -98,7 +98,7 @@ def get_network(): warnings.simplefilter("ignore") net = get_model(net_name, pretrained_base=True, norm_layer=gluon.nn.BatchNorm) net.initialize() - net.reset_ctx(ctx) + net.reset_device(device) return net ``` @@ -136,9 +136,9 @@ for epoch in range(1): for i, batch in enumerate(train_data): batch_size = batch[0].shape[0] - data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0) - cls_targets = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0) - box_targets = gluon.utils.split_and_load(batch[2], 
ctx_list=ctx, batch_axis=0) + data = gluon.utils.split_and_load(batch[0], ctx_list=device, batch_axis=0) + cls_targets = gluon.utils.split_and_load(batch[1], ctx_list=device, batch_axis=0) + box_targets = gluon.utils.split_and_load(batch[2], ctx_list=device, batch_axis=0) with autograd.record(): cls_preds = [] box_preds = [] @@ -223,9 +223,9 @@ for epoch in range(1): for i, batch in enumerate(train_data): batch_size = batch[0].shape[0] - data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0) - cls_targets = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0) - box_targets = gluon.utils.split_and_load(batch[2], ctx_list=ctx, batch_axis=0) + data = gluon.utils.split_and_load(batch[0], ctx_list=device, batch_axis=0) + cls_targets = gluon.utils.split_and_load(batch[1], ctx_list=device, batch_axis=0) + box_targets = gluon.utils.split_and_load(batch[2], ctx_list=device, batch_axis=0) with autograd.record(): cls_preds = [] box_preds = [] @@ -272,7 +272,7 @@ with mx.Context(mx.gpu(0)): with warnings.catch_warnings(record=True) as w: warnings.simplefilter("ignore") model = get_model("resnet50_v1") - model.initialize(ctx=mx.current_device()) + model.initialize(device=mx.current_device()) model.hybridize() model(mx.np.zeros((1, 3, 224, 224))) converted_model = amp.convert_hybrid_block(model) diff --git a/docs/python_docs/python/tutorials/performance/backend/profiler.md b/docs/python_docs/python/tutorials/performance/backend/profiler.md index e769a040155e..a54892d4cf73 100644 --- a/docs/python_docs/python/tutorials/performance/backend/profiler.md +++ b/docs/python_docs/python/tutorials/performance/backend/profiler.md @@ -102,12 +102,12 @@ Let's define a function that will run a single training iteration given `data` a ```{.python .input} # Use GPU if available if mx.device.num_gpus(): - ctx=mx.gpu() + device=mx.gpu() else: - ctx=mx.cpu() + device=mx.cpu() # Initialize the parameters with random weights -net.initialize(mx.init.Xavier(), ctx=ctx) +net.initialize(mx.init.Xavier(), device=device) # Use SGD optimizer trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.1}) @@ -117,9 +117,9 @@ softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss() # A helper function to run one training iteration def run_training_iteration(data, label): - # Load data and label is the right context - data = data.as_in_context(ctx) - label = label.as_in_context(ctx) + # Load data and label is the right device + data = data.to_device(device) + label = label.to_device(device) # Run the forward pass with autograd.record(): output = net(data) @@ -178,7 +178,7 @@ When working with networks created using the Gluon API, you will get a more gran ### Viewing profiler output -There are a few ways to view the information collected by the profiler. You can view it in the console, you can view a more graphical version in a browser, or you can use a vendor tool such as Intel VTune or Nvidia NVProf to view output. For most scenarios the information you need can be obtained with MXNet's built in profiler support, but if you want to investigate the performance of operators alongside extra context about your hardware (e.g. cache hit rates, or CUDA kernel timings) then profiling jointly with vendor tools is recommended. +There are a few ways to view the information collected by the profiler. You can view it in the console, you can view a more graphical version in a browser, or you can use a vendor tool such as Intel VTune or Nvidia NVProf to view output. 
For most scenarios the information you need can be obtained with MXNet's built in profiler support, but if you want to investigate the performance of operators alongside extra device about your hardware (e.g. cache hit rates, or CUDA kernel timings) then profiling jointly with vendor tools is recommended. #### 1. View in console @@ -261,7 +261,7 @@ class CustomAddOneProp(mx.operator.CustomOpProp): def infer_shape(self, in_shape): return [in_shape[0]], [in_shape[0]], [] - def create_operator(self, ctx, shapes, dtypes): + def create_operator(self, device, shapes, dtypes): return MyAddOne() diff --git a/docs/static_site/src/pages/api/developer_guide/profiling.md b/docs/static_site/src/pages/api/developer_guide/profiling.md index 8fad066afce8..1996f4531654 100644 --- a/docs/static_site/src/pages/api/developer_guide/profiling.md +++ b/docs/static_site/src/pages/api/developer_guide/profiling.md @@ -60,13 +60,13 @@ with model.name_scope(): model.add(nn.Dense(64, activation='tanh'), nn.Dense(32, in_units=64)) model.add(nn.Activation('relu')) -model.initialize(ctx=mx.cpu()) +model.initialize(device=mx.cpu()) model.hybridize() inputs = mx.sym.var('data') with mx.autograd.record(): - out = model(mx.nd.zeros((16, 10), ctx=mx.cpu())) + out = model(mx.nd.zeros((16, 10), device=mx.cpu())) out.backward() mx.nd.waitall() profiler.set_state('stop') diff --git a/tests/python/unittest/common.py b/tests/python/unittest/common.py index d2dd8e378683..d55bc74f2cbc 100644 --- a/tests/python/unittest/common.py +++ b/tests/python/unittest/common.py @@ -136,7 +136,7 @@ def test_new(*args, **kwargs): cuxx_off = os.getenv(cfg['TEST_OFF_ENV_VAR']) == 'true' cuxx_env_version = os.getenv(cfg['VERSION_ENV_VAR'], None if cuxx_off else cfg['DEFAULT_VERSION']) cuxx_test_disabled = cuxx_off or less_than(cuxx_env_version, min_version) - if not cuxx_test_disabled or mx.device.current_device.device_type == 'cpu': + if not cuxx_test_disabled or mx.device.current_device().device_type == 'cpu': orig_test(*args, **kwargs) else: pytest.raises((MXNetError, RuntimeError), orig_test, *args, **kwargs) From 17d98980758bb5da7ba468259e4e22ce9b0697b5 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Tue, 19 Oct 2021 16:46:44 -0700 Subject: [PATCH 38/41] rm context in doc --- docs/python_docs/python/api/context/index.rst | 23 ------------------- 1 file changed, 23 deletions(-) delete mode 100644 docs/python_docs/python/api/context/index.rst diff --git a/docs/python_docs/python/api/context/index.rst b/docs/python_docs/python/api/context/index.rst deleted file mode 100644 index f939aab59047..000000000000 --- a/docs/python_docs/python/api/context/index.rst +++ /dev/null @@ -1,23 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -mxnet.device -============= - -.. 
automodule:: mxnet.device - :members: - :autosummary: From bb1e4ad1197a934f29c41bffddada1bc8660133d Mon Sep 17 00:00:00 2001 From: barry-jin Date: Mon, 25 Oct 2021 10:07:23 -0700 Subject: [PATCH 39/41] fix lint --- python/mxnet/context.py | 6 +++--- python/mxnet/gluon/block.py | 2 +- python/mxnet/gluon/contrib/estimator/estimator.py | 6 +++--- python/mxnet/gluon/utils.py | 2 +- python/mxnet/numpy_op_fallback.py | 8 ++++---- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/python/mxnet/context.py b/python/mxnet/context.py index 668f9cc6e25c..7f8f67a1c04b 100644 --- a/python/mxnet/context.py +++ b/python/mxnet/context.py @@ -16,17 +16,17 @@ # under the License. """Context management API of mxnet.""" from warnings import warn -from .device import Device, _current, cpu, gpu, cpu_pinned +from .device import Device, _current, cpu, gpu, cpu_pinned # pylint: disable=unused-import def Context(*args, **kwargs): """This class has been deprecated. Please refer to ``device.Device``.""" warn('Directly use Context class to construct a device will be deprecated. ' - 'Please use Device class instead. ', DeprecationWarning) + 'Please use Device class instead. ', DeprecationWarning) return Device(*args, **kwargs) def current_context(): """This function has been deprecated. Please refer to ``device.current_device``.""" warn('Directly use current_context to get current device will be deprecated. ' - 'Please use current_device method instead. ', DeprecationWarning) + 'Please use current_device method instead. ', DeprecationWarning) return _current.get() diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 1a2535c7e71c..64d3d7d9f2b3 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -767,7 +767,7 @@ def reset_device(self, device): params = self.collect_params() for i in params.values(): i.reset_device(device) - + def reset_ctx(self, ctx): """This function has been deprecated. 
Please refer to ``Block.reset_device``.""" warnings.warn('Block.reset_ctx has been renamed to' diff --git a/python/mxnet/gluon/contrib/estimator/estimator.py b/python/mxnet/gluon/contrib/estimator/estimator.py index 423d198be17e..7b212c03c302 100644 --- a/python/mxnet/gluon/contrib/estimator/estimator.py +++ b/python/mxnet/gluon/contrib/estimator/estimator.py @@ -149,7 +149,7 @@ def _check_context(self, context): warnings.warn('Estimator._check_context has been renamed to' ' Estimator._check_devices', DeprecationWarning) return self._check_devices(context) - + def _check_devices(self, devices): # infer available devices gpus = num_gpus() @@ -239,8 +239,8 @@ def _is_initialized(self): def _get_data_and_label(self, batch, device, batch_axis=0): data = batch[0] label = batch[1] - data = split_and_load(data, device_list=device, batch_axis=batch_axis) - label = split_and_load(label, device_list=device, batch_axis=batch_axis) + data = split_and_load(data, device, batch_axis=batch_axis) + label = split_and_load(label, device, batch_axis=batch_axis) return data, label def _add_default_training_metrics(self): diff --git a/python/mxnet/gluon/utils.py b/python/mxnet/gluon/utils.py index 7c9557f80a07..267ed2c793a2 100644 --- a/python/mxnet/gluon/utils.py +++ b/python/mxnet/gluon/utils.py @@ -140,7 +140,7 @@ def group_by_ctx(arr_list): groups[ctx].append(arr) return groups def multi_sum_sq(*args, ctx=None): - sum = _mx_np.array([0], ctx=ctx) + sum = _mx_np.array([0], device=ctx) for arg in args: sum += _mx_np.square(arg).sum().item() return sum diff --git a/python/mxnet/numpy_op_fallback.py b/python/mxnet/numpy_op_fallback.py index 8804701765e8..17b6327bb79b 100644 --- a/python/mxnet/numpy_op_fallback.py +++ b/python/mxnet/numpy_op_fallback.py @@ -67,7 +67,7 @@ def forward(self, is_train, req, in_data, out_data, aux): subok=self._subok) else: out = np.empty_like(in_data[0].asnumpy()) - self.assign(out_data[0], req[0], _mx_np.array(out, ctx=in_data[0].device)) + self.assign(out_data[0], req[0], _mx_np.array(out, device=in_data[0].device)) def backward(self, req, out_grad, in_data, out_data, in_grad, aux): raise NotImplementedError('Operator empty_like does not support gradient computation') @@ -108,7 +108,7 @@ def __init__(self, new_shape): def forward(self, is_train, req, in_data, out_data, aux): out = np.resize(in_data[0].asnumpy(), self._new_shape) - self.assign(out_data[0], req[0], _mx_np.array(out, dtype=out.dtype, ctx=out_data[0].device)) + self.assign(out_data[0], req[0], _mx_np.array(out, dtype=out.dtype, device=out_data[0].device)) def backward(self, req, out_grad, in_data, out_data, in_grad, aux): raise NotImplementedError('Operator resize does not support gradient computation') @@ -141,7 +141,7 @@ def __init__(self, shape): def forward(self, is_train, req, in_data, out_data, aux): out = np.unravel_index(in_data[0].asnumpy(), self._shape) - self.assign(out_data[0], req[0], _mx_np.array(out, dtype=out[0].dtype, ctx=out_data[0].device)) + self.assign(out_data[0], req[0], _mx_np.array(out, dtype=out[0].dtype, device=out_data[0].device)) def backward(self, req, out_grad, in_data, out_data, in_grad, aux): raise NotImplementedError('Operator Unravel_index does not support gradient computation') @@ -181,7 +181,7 @@ def forward(self, is_train, req, in_data, out_data, aux): else: scale = _mx_np.linalg.cholesky(cov) #set context - noise = _mx_np.random.normal(size=out_data[0].shape, dtype=loc.dtype, ctx=loc.device) + noise = _mx_np.random.normal(size=out_data[0].shape, dtype=loc.dtype, 
device=loc.device) out = loc + _mx_np.einsum('...jk,...j->...k', scale, noise) self.assign(out_data[0], req[0], out) From e7758447aad0ca14427bd3800c4ad1cf7f9c3247 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Mon, 25 Oct 2021 11:47:50 -0700 Subject: [PATCH 40/41] remove npv --- python/mxnet/numpy/fallback.py | 1 - tests/python/unittest/test_numpy_interoperability.py | 5 ----- 2 files changed, 6 deletions(-) diff --git a/python/mxnet/numpy/fallback.py b/python/mxnet/numpy/fallback.py index 83bf67372517..0848a33e2352 100644 --- a/python/mxnet/numpy/fallback.py +++ b/python/mxnet/numpy/fallback.py @@ -75,7 +75,6 @@ 'nansum', 'nanvar', 'ndim', - 'npv', 'packbits', 'partition', 'piecewise', diff --git a/tests/python/unittest/test_numpy_interoperability.py b/tests/python/unittest/test_numpy_interoperability.py index ca852a85de59..64bb75e8e865 100644 --- a/tests/python/unittest/test_numpy_interoperability.py +++ b/tests/python/unittest/test_numpy_interoperability.py @@ -2714,10 +2714,6 @@ def _add_workload_ndim(): OpArgMngr.add_workload('ndim', b) -def _add_workload_npv(): - rate, cashflows = 0.281, np.array([-100, 39, 59, 55, 20]) - OpArgMngr.add_workload('npv', rate, cashflows) - def _add_workload_partition(): a = np.array([3, 4, 2, 1]) @@ -3206,7 +3202,6 @@ def _prepare_workloads(): _add_workload_nansum() _add_workload_nanvar() _add_workload_ndim() - _add_workload_npv() _add_workload_packbits() _add_workload_pad() _add_workload_partition() From f67701a5d8fd399224bf0dad7a85ba8806da6972 Mon Sep 17 00:00:00 2001 From: barry-jin Date: Mon, 25 Oct 2021 15:54:53 -0700 Subject: [PATCH 41/41] Revert "remove npv" This reverts commit e7758447aad0ca14427bd3800c4ad1cf7f9c3247. --- python/mxnet/numpy/fallback.py | 1 + tests/python/unittest/test_numpy_interoperability.py | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/python/mxnet/numpy/fallback.py b/python/mxnet/numpy/fallback.py index 0848a33e2352..83bf67372517 100644 --- a/python/mxnet/numpy/fallback.py +++ b/python/mxnet/numpy/fallback.py @@ -75,6 +75,7 @@ 'nansum', 'nanvar', 'ndim', + 'npv', 'packbits', 'partition', 'piecewise', diff --git a/tests/python/unittest/test_numpy_interoperability.py b/tests/python/unittest/test_numpy_interoperability.py index 64bb75e8e865..ca852a85de59 100644 --- a/tests/python/unittest/test_numpy_interoperability.py +++ b/tests/python/unittest/test_numpy_interoperability.py @@ -2714,6 +2714,10 @@ def _add_workload_ndim(): OpArgMngr.add_workload('ndim', b) +def _add_workload_npv(): + rate, cashflows = 0.281, np.array([-100, 39, 59, 55, 20]) + OpArgMngr.add_workload('npv', rate, cashflows) + def _add_workload_partition(): a = np.array([3, 4, 2, 1]) @@ -3202,6 +3206,7 @@ def _prepare_workloads(): _add_workload_nansum() _add_workload_nanvar() _add_workload_ndim() + _add_workload_npv() _add_workload_packbits() _add_workload_pad() _add_workload_partition()
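
The `ctx`/`context` → `device` rename applied across the documentation hunks in this series follows a single pattern. Below is a hypothetical, self-contained sketch that gathers the new spellings used in the hunks above in one place; it is not part of any patched file, and the names `device`, `x`, `net`, and `data` are illustrative only.

```python
import mxnet as mx
from mxnet import gluon

# Device selection (formerly a Context): helpers now live under mx.device.
device = mx.gpu() if mx.device.num_gpus() > 0 else mx.cpu()

# Creation routines take `device=` instead of `ctx=`.
x = mx.np.zeros((2, 3), device=device)
print(x.device)  # formerly x.context

# Parameter initialization takes `device=` instead of `ctx=`.
net = gluon.nn.Dense(10)
net.initialize(mx.init.Xavier(), device=device)

# Arrays move between devices with to_device() instead of as_in_context().
data = mx.np.random.uniform(size=(4, 8), device=mx.cpu()).to_device(device)
out = net(data)

# Parameters move with reset_device(); reset_ctx() still works but warns.
net.reset_device(device)

# The shims in python/mxnet/context.py keep old code running: mx.Context(...)
# and mx.context.current_context() emit a DeprecationWarning and forward to
# Device / mx.device.current_device().
```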