Skip to content
This repository was archived by the owner on Nov 17, 2023. It is now read-only.
This repository was archived by the owner on Nov 17, 2023. It is now read-only.

Run rnn backward fail  #10264

@chenchu-zs

Description

@chenchu-zs

Description

I want to write a complete RNN test case that checks both forward and backward passes. However, an exception is thrown when running FusedRNNCell's backward. It is caused by an incorrect slice operation on out_grads; I suspect it is affected by the __layout__ attribute. #6747
The testcase and error message are below.

Environment info (Required)

GPU: P100, CPU: Skylake-8180, Python3

Build info (Required if built from source)

Compiler (gcc/clang/mingw/visual studio):
gcc

MXNet commit hash:
the newest master branch

Build config:
(Paste the content of config.mk, or the build command.)
make -j 10 USE_BLAS=mkl USE_CUDA=1 USE_CUDNN=1 USE_CUDA_PATH=/usr/local/cuda

Error Message:

Traceback (most recent call last):
File "test_lstm.py", line 77, in
test_lstm()
File "test_lstm.py", line 59, in test_lstm
check_rnn_consistency(fused, stack)
File "test_lstm.py", line 49, in check_rnn_consistency
mod1.backward(out_grads=[dy])
File "/home/shuzhan1/incubator-mxnet/python/mxnet/module/module.py", line 627, in backward
self._exec_group.backward(out_grads=out_grads)
File "/home/shuzhan1/incubator-mxnet/python/mxnet/module/executor_group.py", line 576, in backward
end=islice.stop)
File "", line 63, in slice_axis
File "/home/shuzhan1/incubator-mxnet/python/mxnet/_ctypes/ndarray.py", line 92, in _imperative_invoke
ctypes.byref(out_stypes)))
File "/home/shuzhan1/incubator-mxnet/python/mxnet/base.py", line 149, in check_call
raise MXNetError(py_str(_LIB.MXGetLastError()))
mxnet.base.MXNetError: [14:22:00] src/operator/tensor/./matrix_op-inl.h:1022: Check failed: (*end <= axis_size) && (*end >= 0) Invalid begin, end, get begin=0, end=5

Minimum reproducible example

My test script is below:

from __future__ import print_function
import numpy as np 
import mxnet as mx 
import math        
import random      
import itertools   
from mxnet.test_utils import * 
from numpy.testing import assert_allclose, assert_array_equal

def check_rnn_consistency(cell1, cell2):
    """Check that two RNN cells agree on forward outputs and input gradients.

    Both cells are unrolled over the same random data with layout 'TNC';
    cell1's weights are unpacked and re-packed into cell2's layout so the
    two modules are numerically equivalent.

    Parameters
    ----------
    cell1, cell2 : mx.rnn.BaseRNNCell
        Cells expected to compute the same function (e.g. a FusedRNNCell
        and the corresponding stacked LSTMCell).

    Raises
    ------
    AssertionError
        If inference outputs, training outputs, or input gradients differ
        beyond rtol=1e-2 / atol=1e-4, or if the output shape is wrong.
    """
    T, N, I, H = 5, 4, 3, 2  # seq_len, batch, input dim, hidden dim
    dshape = (T, N, I)
    data = mx.sym.Variable('data')

    Y1, _ = cell1.unroll(T, data, layout='TNC', merge_outputs=True)
    mod1 = mx.mod.Module(Y1, label_names=None, context=mx.gpu())
    mod1.bind(data_shapes=[('data', dshape)], label_shapes=None, inputs_need_grad=True)

    Y2, _ = cell2.unroll(T, data, layout='TNC', merge_outputs=True)
    mod2 = mx.mod.Module(Y2, label_names=None, context=mx.gpu())
    mod2.bind(data_shapes=[('data', dshape)], label_shapes=None, inputs_need_grad=True)

    # Share one set of parameters: unpack from cell1's fused layout and
    # re-pack into cell2's layout so both modules start identical.
    mod1.init_params()
    args, auxs = mod1.get_params()
    args = cell1.unpack_weights(args)
    args = cell2.pack_weights(args)
    mod2.set_params(args, auxs)

    x = mx.random.uniform(shape=dshape)
    dy = mx.random.uniform(shape=(T, N, H))  # head gradient for backward
    batch = mx.io.DataBatch(data=[x])

    # check inference
    mod1.forward(batch, is_train=False)
    mod2.forward(batch, is_train=False)
    assert_allclose(mod1.get_outputs()[0].asnumpy(), mod2.get_outputs()[0].asnumpy(), rtol=1e-2, atol=1e-4)

    # check training
    mod1.forward(batch, is_train=True)
    mod2.forward(batch, is_train=True)

    # Shapes are integer tuples: compare exactly rather than with a
    # float tolerance (assert_allclose on shapes was misleading).
    assert dy.shape == mod1.get_outputs()[0].shape
    assert_allclose(mod1.get_outputs()[0].asnumpy(), mod2.get_outputs()[0].asnumpy(), rtol=1e-2, atol=1e-4)
    mod1.backward(out_grads=[dy])
    mod2.backward(out_grads=[dy])
    assert_allclose(mod1.get_input_grads()[0].asnumpy(), mod2.get_input_grads()[0].asnumpy(), rtol=1e-2, atol=1e-4)

def test_lstm():
    """Reproduce the FusedRNNCell backward failure.

    Builds a single-layer fused LSTM and an equivalent stacked LSTMCell,
    then checks forward/backward consistency between them. The backward
    call on the fused module is where the reported slice error occurs.
    """
    T, N, I, H = 5, 4, 3, 2  # kept for readability; shapes live in the checker
    fused = mx.rnn.FusedRNNCell(H, num_layers=1, mode='lstm', get_next_state=True, prefix='')
    stack = mx.rnn.SequentialRNNCell()
    stack.add(mx.rnn.LSTMCell(H, prefix='l0_'))
    check_rnn_consistency(fused, stack)


# Guard the entry point so importing this module does not run the test.
if __name__ == "__main__":
    test_lstm()

What have you tried to solve it?

I checked line 261 of incubator-mxnet/python/mxnet/module/executor_group.py.
self.outputs_layouts is [0] when running mod2.backward(out_grads=[dy])
self.outputs_layouts is [1] when running mod1.backward(out_grads=[dy])

Metadata

Metadata

Assignees

No one assigned

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions