
Commit 5f0efbb

[v2.0] RNN: use rnn_params (#20384)
* use rnn_params
* add split rnn parameter in gluon.utils
* update
* update
* use zero weight
* add rnn fused parameter initializer
* fix lint
* fix tests
* update RNNFused initializer
* fix
* fix
* fix leak
* fix
* fix
* fix
* update
* update centos cu102 to use cudnn8
* fix
* fix conflict
1 parent 481eba7 commit 5f0efbb

8 files changed: 339 additions and 122 deletions
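At a glance, the commit replaces the per-layer `l0_i2h_weight` / `l0_h2h_weight` / bias parameters of the Gluon RNN layers with a single fused `rnn_param` vector laid out the way the cuDNN/`npx.rnn` kernel expects. A minimal sketch of the user-visible effect, assuming the MXNet 2.0 numpy-mode Gluon API (the sizes are illustrative):

    import mxnet as mx
    mx.npx.set_np()   # NumPy semantics, as used by the 2.0 Gluon blocks

    layer = mx.gluon.rnn.LSTM(hidden_size=20, num_layers=2)
    layer.initialize()
    out = layer(mx.np.random.uniform(size=(5, 3, 10)))  # TNC: (seq_len, batch, input_size)

    # Before this commit: separate l0_i2h_weight, l0_h2h_weight, l0_i2h_bias, ... parameters.
    # After: one flat vector holding every weight and bias back to back.
    print(layer.collect_params())   # a single 'rnn_param' entry
    print(layer.rnn_param.shape)    # (5920,) for this configuration (see infer_shape below)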

ci/docker/docker-compose.yml

Lines changed: 1 addition & 1 deletion
@@ -58,7 +58,7 @@ services:
       dockerfile: Dockerfile.build.centos7
       target: base
       args:
-        BASE_IMAGE: nvidia/cuda:10.2-cudnn7-devel-centos7
+        BASE_IMAGE: nvidia/cuda:10.2-cudnn8-devel-centos7
       cache_from:
         - ${DOCKER_CACHE_REGISTRY}/build.centos7_gpu_cu102:latest
   centos7_gpu_cu110:

python/mxnet/gluon/parameter.py

Lines changed: 3 additions & 0 deletions
@@ -472,6 +472,9 @@ def initialize(self, init=None, ctx=None, default_init=initializer.Uniform(),
             ctx = [context.current_context()]
         if isinstance(ctx, Context):
             ctx = [ctx]
+        if isinstance(self.init, initializer.RNNFused):
+            self.init.set_initializer(init if init else default_init)
+            init = default_init = self.init
         if init is None:
             init = default_init if self.init is None else self.init
         if not shape_is_known(self.shape):
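The new branch lets `Parameter.initialize()` cooperate with the fused layout: when a parameter's own `init` is an `RNNFused` instance, the user-supplied (or default) initializer is pushed into it, and the fused initializer then takes over the whole flat buffer. A hedged restatement of that dispatch with illustrative names (`self_init` stands for `Parameter.init`, `user_init` for the `init=` argument):

    from mxnet import initializer

    self_init = initializer.RNNFused('lstm', num_layers=2, state_size=20)
    user_init = initializer.Xavier()

    if isinstance(self_init, initializer.RNNFused):
        # Forward the global initializer to the parts that were not given an
        # explicit per-part initializer ...
        self_init.set_initializer(user_init)
        # ... and let RNNFused fill the entire fused vector.
        init = default_init = self_init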

python/mxnet/gluon/rnn/rnn_layer.py

Lines changed: 29 additions & 76 deletions
@@ -23,7 +23,7 @@

 __all__ = ['RNN', 'LSTM', 'GRU']

-from ... import np, npx, context
+from ... import np, npx, context, initializer
 from .. import HybridBlock, tensor_types
 from ..parameter import Parameter
 from ...util import use_np
@@ -50,11 +50,6 @@ def __init__(self, hidden_size, num_layers, layout,
         self._dropout = dropout
         self._dir = 2 if bidirectional else 1
         self._input_size = input_size
-        self._i2h_weight_initializer = i2h_weight_initializer
-        self._h2h_weight_initializer = h2h_weight_initializer
-        self._i2h_bias_initializer = i2h_bias_initializer
-        self._h2h_bias_initializer = h2h_bias_initializer
-        self._h2r_weight_initializer = h2r_weight_initializer
         self._lstm_state_clip_min = lstm_state_clip_min
         self._lstm_state_clip_max = lstm_state_clip_max
         self._lstm_state_clip_nan = lstm_state_clip_nan
@@ -64,48 +59,17 @@ def __init__(self, hidden_size, num_layers, layout,

         self._gates = {'rnn_relu': 1, 'rnn_tanh': 1, 'lstm': 4, 'gru': 3}[mode]

-        ng, ni, nh = self._gates, input_size, hidden_size
-        if not projection_size:
-            for i in range(num_layers):
-                for j in ['l', 'r'][:self._dir]:
-                    self._register_param('{}{}_i2h_weight'.format(j, i),
-                                         shape=(ng*nh, ni),
-                                         init=i2h_weight_initializer, dtype=dtype)
-                    self._register_param('{}{}_h2h_weight'.format(j, i),
-                                         shape=(ng*nh, nh),
-                                         init=h2h_weight_initializer, dtype=dtype)
-                    self._register_param('{}{}_i2h_bias'.format(j, i),
-                                         shape=(ng*nh,),
-                                         init=i2h_bias_initializer, dtype=dtype)
-                    self._register_param('{}{}_h2h_bias'.format(j, i),
-                                         shape=(ng*nh,),
-                                         init=h2h_bias_initializer, dtype=dtype)
-                ni = nh * self._dir
-        else:
-            ps = self._projection_size
-            for i in range(num_layers):
-                for j in ['l', 'r'][:self._dir]:
-                    self._register_param('{}{}_i2h_weight'.format(j, i),
-                                         shape=(ng*nh, ni),
-                                         init=i2h_weight_initializer, dtype=dtype)
-                    self._register_param('{}{}_h2h_weight'.format(j, i),
-                                         shape=(ng*nh, ps),
-                                         init=h2h_weight_initializer, dtype=dtype)
-                    self._register_param('{}{}_i2h_bias'.format(j, i),
-                                         shape=(ng*nh,),
-                                         init=i2h_bias_initializer, dtype=dtype)
-                    self._register_param('{}{}_h2h_bias'.format(j, i),
-                                         shape=(ng*nh,),
-                                         init=h2h_bias_initializer, dtype=dtype)
-                    self._register_param('{}{}_h2r_weight'.format(j, i),
-                                         shape=(ps, nh),
-                                         init=h2r_weight_initializer, dtype=dtype)
-                ni = ps * self._dir
-
-    def _register_param(self, name, shape, init, dtype):
-        p = Parameter(name, shape=shape, init=init, allow_deferred_init=True, dtype=dtype)
-        setattr(self, name, p)
-        return p
+        param_initializer = initializer.RNNFused(
+            mode, num_layers, hidden_size,
+            bidirectional, projection_size,
+            i2h_weight_initializer=i2h_weight_initializer,
+            h2h_weight_initializer=h2h_weight_initializer,
+            i2h_bias_initializer=i2h_bias_initializer,
+            h2h_bias_initializer=h2h_bias_initializer,
+            h2r_weight_initializer=h2r_weight_initializer)
+
+        self.rnn_param = Parameter('rnn_param', shape=(-1,), init=param_initializer,
+                                   allow_deferred_init=True, dtype=dtype)

     def __repr__(self):
         s = '{name}({mapping}, {_layout}'
@@ -116,8 +80,7 @@ def __repr__(self):
         if self._dir == 2:
             s += ', bidirectional'
         s += ')'
-        shape = self.l0_i2h_weight.shape
-        mapping = '{0} -> {1}'.format(shape[1] if shape[1] else None, shape[0] // self._gates)
+        mapping = '{0} -> {1}'.format(self._input_size if self._input_size else None, self._hidden_size)
         return s.format(name=self.__class__.__name__,
                         mapping=mapping,
                         **self.__dict__)
@@ -196,37 +159,26 @@ def forward(self, inputs, states, sequence_length=None):
     def infer_shape(self, inputs, *args):
         assert inputs.ndim == 3, \
             "Input data should be rank-3 tensor of dim [sequence length, batch size, input size]"
-        if not self._projection_size:
-            step = self._hidden_size
-        else:
-            step = self._projection_size
-        ni = inputs.shape[2]
-        for i in range(self._num_layers):
-            for j in ['l', 'r'][:self._dir]:
-                name = '{}{}_i2h_weight'.format(j, i)
-                getattr(self, name).shape = (self._gates*self._hidden_size, ni)
-            ni = step * self._dir
+        self._input_size = inputs.shape[2]
+        ng, ni, nh = self._gates, inputs.shape[2], self._hidden_size
+
+        size = nh * self._dir * ng
+        size1 = (ni + nh + 2) * size  # first layer size
+        size2 = (nh * self._dir + nh + 2) * size  # second layer size
+        if self._projection_size:
+            size1 = (ni + self._projection_size + 2) * size  # first layer size
+            size2 = (self._projection_size * self._dir + \
+                     self._projection_size + 2) * size  # second layer size
+        param_size = size1 + (self._num_layers - 1) * size2
+        if self._projection_size:
+            param_size += self._projection_size * nh * self._num_layers * self._dir
+        self.rnn_param.shape = (param_size, )

     def _forward_kernel(self, inputs, states, sequence_length):
         """ forward using CUDNN or CPU kenrel"""
         ctx = inputs.ctx
         if self._layout == 'NTC':
             inputs = np.swapaxes(inputs, 0, 1)
-        if self._projection_size is None:
-            params = (getattr(self, '{}{}_{}_{}'.format(d, l, g, t)).data(ctx).reshape(-1)
-                      for t in ['weight', 'bias']
-                      for l in range(self._num_layers)
-                      for d in ['l', 'r'][:self._dir]
-                      for g in ['i2h', 'h2h'])
-        else:
-            params = (getattr(self, '{}{}_{}_{}'.format(d, l, g, t)).data(ctx).reshape(-1)
-                      for t in ['weight', 'bias']
-                      for l in range(self._num_layers)
-                      for d in ['l', 'r'][:self._dir]
-                      for g in ['i2h', 'h2h', 'h2r']
-                      if g != 'h2r' or t != 'bias')
-
-        params = np.concatenate(params, axis=0)

         if self._use_sequence_length:
             rnn_args = states + [sequence_length]
@@ -238,7 +190,8 @@ def _forward_kernel(self, inputs, states, sequence_length):
             new_args = args.as_in_ctx(ctx)
             rnn_args_ctx.append(new_args)

-        rnn = npx.rnn(inputs, params, *rnn_args_ctx, use_sequence_length=self._use_sequence_length,
+        rnn = npx.rnn(inputs, self.rnn_param.data().as_in_ctx(ctx), *rnn_args_ctx,
+                      use_sequence_length=self._use_sequence_length,
                       state_size=self._hidden_size, projection_size=self._projection_size,
                       num_layers=self._num_layers, bidirectional=self._dir == 2,
                       p=self._dropout, state_outputs=True, mode=self._mode,
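The fused size computed in `infer_shape` can be checked by hand. A short worked example for a hypothetical 2-layer unidirectional LSTM with `input_size=10`, `hidden_size=20` and no projection, using the same formula as the diff (the `+ 2` term accounts for the i2h and h2h bias vectors):

    ng, ni, nh, ndir, num_layers = 4, 10, 20, 1, 2   # LSTM has 4 gates

    size = nh * ndir * ng                     # 80
    size1 = (ni + nh + 2) * size              # layer 0: (10 + 20 + 2) * 80 = 2560
    size2 = (nh * ndir + nh + 2) * size       # layers 1..n-1: (20 + 20 + 2) * 80 = 3360
    param_size = size1 + (num_layers - 1) * size2   # 5920

    # Cross-check against the unfused shapes the old code registered:
    # l0: i2h (80, 10), h2h (80, 20), two biases of length 80
    # l1: i2h (80, 20), h2h (80, 20), two biases of length 80
    expected = (80*10 + 80*20 + 80 + 80) + (80*20 + 80*20 + 80 + 80)
    assert param_size == expected == 5920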

python/mxnet/gluon/utils.py

Lines changed: 76 additions & 0 deletions
@@ -504,3 +504,79 @@ def _check_block_input_np_ndarrays(inputs):
     for i in inputs:
         _check_block_input_np_ndarrays(i)
     # pylint: enable=no-else-raise
+
+
+# pylint: disable=too-many-nested-blocks
+def split_rnn_params(param, mode, num_layers, input_size, hidden_size, bidirectional=False, projection_size=None):
+    """Split rnn layer parameter into weight and bias in different layer.
+
+    Parameters
+    ----------
+    param : ndarray
+        The parameter of rnn layer.
+    mode : str
+        Mode of rnn. Supported modes: rnn_relu, rnn_tanh, lstm, gru
+    num_layers : int, default 1
+        Number of recurrent layers.
+    input_size: int, default 0
+        The number of expected features in the input x.
+        If not specified, it will be inferred from input.
+    hidden_size: int
+        The number of features in the hidden state h.
+    bidirectional: bool, default False
+        If `True`, becomes a bidirectional RNN.
+    projection_size: int, default None
+        The number of features after projection.
+    """
+    gates = {'rnn_relu': 1, 'rnn_tanh': 1, 'lstm': 4, 'gru': 3}[mode]
+    dir = 2 if bidirectional else 1
+    param_dict = {}
+    begin = 0
+    if not projection_size:
+        for p in ['weight', 'bias']:
+            for l in range(num_layers):
+                for d in ['l', 'r'][:dir]:
+                    for g in ['i2h', 'h2h']:
+                        ni = input_size
+                        if l != 0:
+                            ni = hidden_size * dir
+                        if g == 'h2h':
+                            ni = hidden_size
+                        shape0 = gates * hidden_size
+                        if p == 'weight':
+                            cur_len = shape0 * ni
+                            param_dict['{}{}_{}_{}'.format(d, l, g, p)] = \
+                                param[begin:begin+cur_len].reshape(shape0, ni)
+                        else:
+                            cur_len = shape0
+                            param_dict['{}{}_{}_{}'.format(d, l, g, p)] = \
+                                param[begin:begin+cur_len].reshape(shape0,)
+                        begin += cur_len
+    else:
+        for p in ['weight', 'bias']:
+            for l in range(num_layers):
+                for d in ['l', 'r'][:dir]:
+                    for g in ['i2h', 'h2h', 'h2r']:
+                        if g != 'h2r' or p != 'bias':
+                            if g == 'h2r':
+                                cur_len = projection_size * hidden_size
+                                param_dict['{}{}_{}_{}'.format(d, l, g, p)] = \
+                                    param[begin:begin+cur_len]. \
+                                    reshape(projection_size, hidden_size)
+                            else:
+                                ni = input_size
+                                if l != 0:
+                                    ni = projection_size * dir
+                                if g == 'h2h':
+                                    ni = projection_size
+                                shape0 = gates * hidden_size
+                                if p == 'weight':
+                                    cur_len = shape0 * ni
+                                    param_dict['{}{}_{}_{}'.format(d, l, g, p)] = \
+                                        param[begin:begin+cur_len].reshape(shape0, ni)
+                                else:
+                                    cur_len = shape0
+                                    param_dict['{}{}_{}_{}'.format(d, l, g, p)] = \
+                                        param[begin:begin+cur_len].reshape(shape0,)
+                            begin += cur_len
+    return param_dict
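A hedged usage sketch for the new helper: once the first forward pass has fixed the fused shape, `split_rnn_params` recovers the familiar per-layer views from the flat vector. It assumes the MXNet 2.0 numpy-mode Gluon API; the concrete sizes are illustrative:

    import mxnet as mx
    from mxnet.gluon.utils import split_rnn_params
    mx.npx.set_np()

    layer = mx.gluon.rnn.LSTM(hidden_size=20, num_layers=2)
    layer.initialize()
    out = layer(mx.np.random.uniform(size=(5, 3, 10)))   # TNC: (seq_len, batch, input_size)

    parts = split_rnn_params(layer.rnn_param.data(), 'lstm',
                             num_layers=2, input_size=10, hidden_size=20)
    print(parts['l0_i2h_weight'].shape)   # (80, 10): 4 gates * hidden_size rows
    print(parts['l1_h2h_bias'].shape)     # (80,)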

python/mxnet/initializer.py

Lines changed: 119 additions & 0 deletions
@@ -711,3 +711,122 @@ def _init_weight(self, name, arr):
         # gate of the 4 LSTM gates, we modify the according values.
         num_hidden = int(arr.shape[0] / 4)
         arr[num_hidden:2*num_hidden] = self.forget_bias
+
+
+@register
+class RNNFused(Initializer):
+    """Initialize RNN fused parameter with bias part initialized to 0.0 and
+    weight initialized with random values uniformly sampled from a given range.
+
+    Parameters
+    ----------
+    mode : {'gru', 'lstm', 'rnn_relu', 'rnn_tanh'}, required
+        the type of RNN to compute
+    num_layers : int (non-negative), required
+        number of stacked layers
+    state_size : int (non-negative), required
+        size of the state for each layer
+    bidirectional : boolean, optional, default=0
+        whether to use bidirectional recurrent layers
+    projection_size : int or None, optional, default='None'
+        size of project size
+    scale : float, optional
+        The bound on the range of the generated random values for weights.
+        Values are generated from the range [-`scale`, `scale`].
+        Default scale is 0.07.
+    """
+    def __init__(self, mode, num_layers, state_size, bidirectional=False,
+                 projection_size=None, i2h_weight_initializer=None,
+                 h2h_weight_initializer=None, i2h_bias_initializer=None,
+                 h2h_bias_initializer=None, h2r_weight_initializer=None):
+        super(RNNFused, self).__init__(mode=mode, num_layers=num_layers,
+                                       state_size=state_size,
+                                       bidirectional=bidirectional,
+                                       projection_size=projection_size,
+                                       i2h_weight_initializer=i2h_weight_initializer,
+                                       h2h_weight_initializer=h2h_weight_initializer,
+                                       i2h_bias_initializer=i2h_bias_initializer,
+                                       h2h_bias_initializer=h2h_bias_initializer,
+                                       h2r_weight_initializer=h2r_weight_initializer)
+        self.gates = {'rnn_relu': 1, 'rnn_tanh': 1, 'lstm': 4, 'gru': 3}[mode]
+        self.num_layers = num_layers
+        self.num_hidden = state_size
+        self.dir = 2 if bidirectional else 1
+        self.projection_size = projection_size
+        self._i2h_weight_initializer = i2h_weight_initializer
+        self._h2h_weight_initializer = h2h_weight_initializer
+        self._i2h_bias_initializer = i2h_bias_initializer
+        self._h2h_bias_initializer = h2h_bias_initializer
+        self._h2r_weight_initializer = h2r_weight_initializer
+
+    # pylint: disable=too-many-nested-blocks
+    def _init_weight(self, name, arr):
+        arr_len = arr.shape[0]
+        size = self.num_hidden * self.dir * self.gates
+        if not self.projection_size:
+            # second layer size
+            size2 = (self.num_hidden * self.dir + self.num_hidden + 2) * size
+            input_size = (arr_len - (self.num_layers - 1) * size2) // \
+                         size - 2 - self.num_hidden
+        else:
+            # second layer size
+            size2 = (self.projection_size * self.dir + self.projection_size + 2) * size
+            size_projection = self.projection_size * self.num_hidden * self.num_layers * self.dir
+            input_size = (arr_len - size_projection - (self.num_layers - 1) * size2) // \
+                         size - 2 - self.projection_size
+        begin = 0
+        if not self.projection_size:
+            for param in ['weight', 'bias']:
+                for layer_num in range(self.num_layers):
+                    for _ in range(self.dir):
+                        for connect in ['i2h', 'h2h']:
+                            num_inputs = input_size
+                            if layer_num != 0:
+                                num_inputs = self.num_hidden * self.dir
+                            if connect == 'h2h':
+                                num_inputs = self.num_hidden
+                            shape0 = self.gates * self.num_hidden
+                            if param == 'weight':
+                                cur_len = shape0 * num_inputs
+                            else:
+                                cur_len = shape0
+                            self._init_util(param, connect, arr[begin:begin+cur_len])
+                            begin += cur_len
+        else:
+            for param in ['weight', 'bias']:
+                for layer_num in range(self.num_layers):
+                    for _ in range(self.dir):
+                        for connect in ['i2h', 'h2h', 'h2r']:
+                            if connect != 'h2r' or param != 'bias':
+                                if connect == 'h2r':
+                                    cur_len = self.projection_size * self.num_hidden
+                                else:
+                                    num_inputs = input_size
+                                    if layer_num != 0:
+                                        num_inputs = self.projection_size * self.dir
+                                    if connect == 'h2h':
+                                        num_inputs = self.projection_size
+                                    shape0 = self.gates * self.num_hidden
+                                    if param == 'weight':
+                                        cur_len = shape0 * num_inputs
+                                    else:
+                                        cur_len = shape0
+                                self._init_util(param, connect, arr[begin:begin+cur_len])
+                                begin += cur_len
+
+    def _init_util(self, param, connect, arr):
+        name = "_{}_{}_initializer".format(connect, param)
+        init = getattr(self, name)
+        create(init)(InitDesc(name, {'__init__': init}), arr)
+
+    def set_initializer(self, init):
+        self._i2h_weight_initializer = \
+            init if not self._i2h_weight_initializer else 'uniform'
+        self._h2h_weight_initializer = \
+            init if not self._h2h_weight_initializer else 'uniform'
+        self._i2h_bias_initializer = \
+            init if not self._i2h_bias_initializer else 'zero'
+        self._h2h_bias_initializer = \
+            init if not self._i2h_bias_initializer else 'zero'
+        self._h2r_weight_initializer = \
+            init if not self._h2r_weight_initializer else 'uniform'
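A hedged end-to-end sketch of how `RNNFused` is driven: the Gluon RNN layer constructs it from the per-part initializer arguments (see `rnn_layer.py` above), and `Parameter.initialize()` forwards the global initializer through `set_initializer` before the fused vector is filled slice by slice in `_init_weight`. Assuming the MXNet 2.0 Gluon API, with illustrative sizes:

    import mxnet as mx
    mx.npx.set_np()

    layer = mx.gluon.rnn.LSTM(hidden_size=20, num_layers=2)   # registers an RNNFused init
    layer.initialize(mx.init.Xavier())   # forwarded into RNNFused.set_initializer
    out = layer(mx.np.random.uniform(size=(5, 3, 10)))

    # Weight slices of the fused vector are drawn by Xavier; bias slices use the
    # layer's default 'zeros' bias initializers.
    print(layer.rnn_param.data()[:5])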
