diff --git a/python/mxnet/onnx/mx2onnx/_op_translations/_op_translations_opset12.py b/python/mxnet/onnx/mx2onnx/_op_translations/_op_translations_opset12.py
index 3778e386a98a..d64061b36c81 100644
--- a/python/mxnet/onnx/mx2onnx/_op_translations/_op_translations_opset12.py
+++ b/python/mxnet/onnx/mx2onnx/_op_translations/_op_translations_opset12.py
@@ -2137,7 +2137,9 @@ def convert_square(node, **kwargs):
     )
     return [tensor_node, node]

+# sum_axis is equivalent to sum in MXNet
 @mx_op.register("sum")
+@mx_op.register("sum_axis")
 def convert_sum(node, **kwargs):
     """Map MXNet's sum operator attributes to onnx's ReduceSum operator
     and return the created node.
@@ -4476,12 +4478,12 @@ def convert_RNN(node, **kwargs):
     initial_h = input_nodes[2]

     nodes = []
+    create_tensor([0], name+'_0', kwargs['initializer'])
     mode = str(attrs.get('mode'))
     if mode == 'lstm':
         initial_c = input_nodes[3]
         if num_layers == 2:
-            create_tensor([0], name+'_0', kwargs['initializer'])
             create_tensor([8*state_size], name+'_8*state_size', kwargs['initializer'])
             create_tensor([4*state_size*state_size], name+'_4*state_size^2', kwargs['initializer'])
             create_tensor([1, 4*state_size, state_size], name+'_WR_shape', kwargs['initializer'])
@@ -4553,7 +4555,6 @@ def convert_RNN(node, **kwargs):
                 make_node('Concat', [name+'_lstm0_c', name+'_lstm1_c'], [name+'2'], axis=0),
             ]
         elif num_layers == 1:
-            create_tensor([0], name+'_0', kwargs['initializer'])
             create_tensor([1], name+'_1', kwargs['initializer'])
             create_tensor([4*state_size], name+'_4*state_size', kwargs['initializer'])
             create_tensor([8*state_size], name+'_8*state_size', kwargs['initializer'])
@@ -4598,7 +4599,6 @@ def convert_RNN(node, **kwargs):

     elif mode == 'gru':
         if num_layers == 2:
-            create_tensor([0], name+'_0', kwargs['initializer'])
             create_tensor([6*state_size], name+'_6*state_size', kwargs['initializer'])
             create_tensor([3*state_size*state_size], name+'_3*state_size^2', kwargs['initializer'])
             create_tensor([1, 3*state_size, state_size], name+'_WR_shape', kwargs['initializer'])
@@ -4669,7 +4669,7 @@ def convert_RNN(node, **kwargs):
             ]

         elif num_layers == 1:
-            create_tensor([0], name+'_0', kwargs['initializer'])
+            create_tensor([1], name+'_1', kwargs['initializer'])
             create_tensor([3*state_size], name+'_3*state_size', kwargs['initializer'])
             create_tensor([6*state_size], name+'_6*state_size', kwargs['initializer'])
@@ -4712,6 +4712,100 @@ def convert_RNN(node, **kwargs):
         else:
             raise NotImplementedError('Currently RNN onnx export only supports num_layers equals to 1 or 2')

+    elif mode in ['rnn_tanh', 'rnn_relu']:
+        activations = ['Tanh']
+        if mode == 'rnn_relu':
+            activations = ['Relu']
+        if num_layers == 2:
+
+            create_tensor([2*state_size], name+'_2*state_size', kwargs['initializer'])
+            create_tensor([state_size*state_size], name+'_state_size^2', kwargs['initializer'])
+            create_tensor([1, state_size, state_size], name+'_WR_shape', kwargs['initializer'])
+            create_tensor([1, 2*state_size], name+'_B_shape', kwargs['initializer'])
+            create_tensor([4*state_size*state_size], name+'_WR_offset', kwargs['initializer'])
+
+            nodes += [
+                make_node('Shape', [data], [name+'_data_shape']),
+                make_node('Split', [name+'_data_shape'], [name+'_seq_length', name+'_batch_size', name+'_input_size']),
+
+                # Layer 0
+                # get W
+                make_node('Slice', [param, name+'_0', name+'_state_size^2'], [name+'_W0_1d']),
+                make_node('Reshape', [name+'_W0_1d', name+'_WR_shape'], [name+'_W0']),
+                # get R
+                make_node('Add', [name+'_state_size^2', name+'_state_size^2'], [name+'_R0_offset']),
+                make_node('Slice', [param, name+'_state_size^2', name+'_R0_offset'], [name+'_R0_1d']),
+                make_node('Reshape', [name+'_R0_1d', name+'_WR_shape'], [name+'_R0']),
+                # get B
+                make_node('Add', [name+'_WR_offset', name+'_2*state_size'], [name+'_B0_offset']),
+                make_node('Slice', [param, name+'_WR_offset', name+'_B0_offset'], [name+'_B0_1d']),
+                make_node('Reshape', [name+'_B0_1d', name+'_B_shape'], [name+'_B0']),
+                # get initial states
+                make_node('Split', [initial_h], [name+'_initial_h0', name+'_initial_h1'], axis=0),
+                # get seq_len
+                make_node('Tile', [name+'_seq_length', name+'_batch_size'], [name+'_seq_len_']),
+                make_node("Cast", [name+'_seq_len_'], [name+"_seq_len"], to=int(TensorProto.INT32)),
+                # Layer 0 RNN
+                make_node('RNN', [data, name+'_W0', name+'_R0', name+'_B0', name+'_seq_len', name+'_initial_h0'],
+                          [name+'_rnn0_out_', name+'_rnn0_h'], hidden_size=state_size, activations=activations),
+                make_node('Squeeze', [name+'_rnn0_out_'], [name+'_rnn0_out'], axes=[1]),
+
+                # Layer 1
+                # get W
+                make_node('Add', [name+'_R0_offset', name+'_state_size^2'], [name+'_W1_offset']),
+                make_node('Slice', [param, name+'_R0_offset', name+'_W1_offset'], [name+'_W1_1d']),
+                make_node('Reshape', [name+'_W1_1d', name+'_WR_shape'], [name+'_W1']),
+                # get R
+                make_node('Slice', [param, name+'_W1_offset', name+'_WR_offset'], [name+'_R1_1d']),
+                make_node('Reshape', [name+'_R1_1d', name+'_WR_shape'], [name+'_R1']),
+                # get B
+                make_node('Add', [name+'_B0_offset', name+'_2*state_size'], [name+'_B1_offset']),
+                make_node('Slice', [param, name+'_B0_offset', name+'_B1_offset'], [name+'_B1_1d']),
+                make_node('Reshape', [name+'_B1_1d', name+'_B_shape'], [name+'_B1']),
+                # Layer 1 RNN
+                make_node('RNN', [name+'_rnn0_out', name+'_W1', name+'_R1', name+'_B1', name+'_seq_len',
+                          name+'_initial_h1'], [name+'_rnn1_out_', name+'_rnn1_h'],
+                          hidden_size=state_size, activations=activations),
+                make_node('Squeeze', [name+'_rnn1_out_'], [name], axes=[1]),
+                make_node('Concat', [name+'_rnn0_h', name+'_rnn1_h'], [name+'1'], axis=0)
+            ]
+
+        elif num_layers == 1:
+
+            create_tensor([1], name+'_1', kwargs['initializer'])
+            create_tensor([state_size], name+'_state_size', kwargs['initializer'])
+            create_tensor([2*state_size], name+'_2*state_size', kwargs['initializer'])
+            create_tensor([state_size*state_size], name+'_state_size^2', kwargs['initializer'])
+            create_tensor([1, state_size, state_size], name+'_R_shape', kwargs['initializer'])
+            create_tensor([1, 2*state_size], name+'_B_shape', kwargs['initializer'])
+
+            nodes += [
+                make_node('Shape', [data], [name+'_data_shape']),
+                make_node('Split', [name+'_data_shape'],
+                          [name+'_seq_length', name+'_batch_size', name+'_input_size'], name='split0'),
+                # get W
+                make_node('Mul', [name+'_state_size', name+'_input_size'], [name+'_mul0']),
+                make_node('Slice', [param, name+'_0', name+'_mul0'], [name+'_W_1d']),
+                make_node('Concat', [name+'_1', name+'_state_size', name+'_input_size'], [name+'_W_shape'], axis=0),
+                make_node('Reshape', [name+'_W_1d', name+'_W_shape'], [name+'_W']),
+                # get R
+                make_node('Add', [name+'_mul0', name+'_state_size^2'], [name+'_add0']),
+                make_node('Slice', [param, name+'_mul0', name+'_add0'], [name+'_R_1d']),
+                make_node('Reshape', [name+'_R_1d', name+'_R_shape'], [name+'_R']),
+                # get B
+                make_node('Add', [name+'_add0', name+'_2*state_size'], [name+'_add1']),
+                make_node('Slice', [param, name+'_add0', name+'_add1'], [name+'_B_1d']),
+                make_node('Reshape', [name+'_B_1d', name+'_B_shape'], [name+'_B']),
+                # get seq_len
+                make_node('Tile', [name+'_seq_length', name+'_batch_size'], [name+'_seq_len_']),
+                make_node("Cast", [name+'_seq_len_'], [name+"_seq_len"], to=int(TensorProto.INT32)),
+                # compute RNN
+                make_node('RNN', [data, name+'_W', name+'_R', name+'_B', name+'_seq_len', initial_h],
+                          [name+'0_', name+'1'], hidden_size=state_size, activations=activations),
+                make_node('Squeeze', [name+'0_'], [name], axes=[1]),
+            ]
+        else:
+            raise NotImplementedError('Currently RNN onnx export only supports num_layers equals to 1 or 2')
     else:
         raise NotImplementedError(f"Currently RNN onnx export does not support {mode} mode")
     return nodes
diff --git a/python/mxnet/onnx/mx2onnx/_op_translations/_op_translations_opset13.py b/python/mxnet/onnx/mx2onnx/_op_translations/_op_translations_opset13.py
index 0b563ededdde..62703f2ee9ae 100644
--- a/python/mxnet/onnx/mx2onnx/_op_translations/_op_translations_opset13.py
+++ b/python/mxnet/onnx/mx2onnx/_op_translations/_op_translations_opset13.py
@@ -1047,11 +1047,12 @@ def convert_RNN(node, **kwargs):
     nodes = []

     mode = str(attrs.get('mode'))
+    create_tensor([0], name+'_0', kwargs['initializer'])
     create_tensor([1], name+'_1', kwargs['initializer'])
+
     if mode == 'lstm':
         initial_c = input_nodes[3]
         if num_layers == 2:
-            create_tensor([0], name+'_0', kwargs['initializer'])
             create_tensor([8*state_size], name+'_8*state_size', kwargs['initializer'])
             create_tensor([4*state_size*state_size], name+'_4*state_size^2', kwargs['initializer'])
             create_tensor([1, 4*state_size, state_size], name+'_WR_shape', kwargs['initializer'])
@@ -1123,7 +1124,6 @@ def convert_RNN(node, **kwargs):
                 make_node('Concat', [name+'_lstm0_c', name+'_lstm1_c'], [name+'2'], axis=0),
             ]
         elif num_layers == 1:
-            create_tensor([0], name+'_0', kwargs['initializer'])
             create_tensor([4*state_size], name+'_4*state_size', kwargs['initializer'])
             create_tensor([8*state_size], name+'_8*state_size', kwargs['initializer'])
             create_tensor([4*state_size*state_size], name+'_4*state_size^2', kwargs['initializer'])
@@ -1167,7 +1167,6 @@ def convert_RNN(node, **kwargs):

     elif mode == 'gru':
         if num_layers == 2:
-            create_tensor([0], name+'_0', kwargs['initializer'])
             create_tensor([6*state_size], name+'_6*state_size', kwargs['initializer'])
             create_tensor([3*state_size*state_size], name+'_3*state_size^2', kwargs['initializer'])
             create_tensor([1, 3*state_size, state_size], name+'_WR_shape', kwargs['initializer'])
@@ -1238,7 +1237,6 @@ def convert_RNN(node, **kwargs):
             ]

         elif num_layers == 1:
-            create_tensor([0], name+'_0', kwargs['initializer'])
             create_tensor([3*state_size], name+'_3*state_size', kwargs['initializer'])
             create_tensor([6*state_size], name+'_6*state_size', kwargs['initializer'])
             create_tensor([3*state_size*state_size], name+'_3*state_size^2', kwargs['initializer'])
@@ -1272,7 +1270,7 @@ def convert_RNN(node, **kwargs):
                 # get seq_len
                 make_node('Tile', [name+'_seq_length', name+'_batch_size'], [name+'_seq_len_']),
                 make_node("Cast", [name+'_seq_len_'], [name+"_seq_len"], to=int(TensorProto.INT32)),
-                # compute LSTM
+                # compute GRU
                 make_node('GRU', [data, name+'_W', name+'_R', name+'_B', name+'_seq_len', initial_h],
                           [name+'0_', name+'1'], hidden_size=state_size, linear_before_reset=1),
                 make_node('Squeeze', [name+'0_', name+'_1'], [name]),
@@ -1280,6 +1278,98 @@ def convert_RNN(node, **kwargs):
         else:
             raise NotImplementedError('Currently RNN onnx export only supports num_layers equals to 1 or 2')

+    elif mode in ['rnn_tanh', 'rnn_relu']:
+        activations = ['Tanh']
+        if mode == 'rnn_relu':
+            activations = ['Relu']
+        if num_layers == 2:
+            create_tensor([2*state_size], name+'_2*state_size', kwargs['initializer'])
+            create_tensor([state_size*state_size], name+'_state_size^2', kwargs['initializer'])
+            create_tensor([1, state_size, state_size], name+'_WR_shape', kwargs['initializer'])
+            create_tensor([1, 2*state_size], name+'_B_shape', kwargs['initializer'])
+            create_tensor([4*state_size*state_size], name+'_WR_offset', kwargs['initializer'])
+
+            nodes += [
+                make_node('Shape', [data], [name+'_data_shape']),
+                make_node('Split', [name+'_data_shape'], [name+'_seq_length', name+'_batch_size', name+'_input_size']),
+
+                # Layer 0
+                # get W
+                make_node('Slice', [param, name+'_0', name+'_state_size^2'], [name+'_W0_1d']),
+                make_node('Reshape', [name+'_W0_1d', name+'_WR_shape'], [name+'_W0']),
+                # get R
+                make_node('Add', [name+'_state_size^2', name+'_state_size^2'], [name+'_R0_offset']),
+                make_node('Slice', [param, name+'_state_size^2', name+'_R0_offset'], [name+'_R0_1d']),
+                make_node('Reshape', [name+'_R0_1d', name+'_WR_shape'], [name+'_R0']),
+                # get B
+                make_node('Add', [name+'_WR_offset', name+'_2*state_size'], [name+'_B0_offset']),
+                make_node('Slice', [param, name+'_WR_offset', name+'_B0_offset'], [name+'_B0_1d']),
+                make_node('Reshape', [name+'_B0_1d', name+'_B_shape'], [name+'_B0']),
+                # get initial states
+                make_node('Split', [initial_h], [name+'_initial_h0', name+'_initial_h1'], axis=0),
+                # get seq_len
+                make_node('Tile', [name+'_seq_length', name+'_batch_size'], [name+'_seq_len_']),
+                make_node("Cast", [name+'_seq_len_'], [name+"_seq_len"], to=int(TensorProto.INT32)),
+                # Layer 0 RNN
+                make_node('RNN', [data, name+'_W0', name+'_R0', name+'_B0', name+'_seq_len',
+                          name+'_initial_h0'], [name+'_rnn0_out_', name+'_rnn0_h'],
+                          hidden_size=state_size, activations=activations),
+                make_node('Squeeze', [name+'_rnn0_out_', name+'_1'], [name+'_rnn0_out']),
+
+                # Layer 1
+                # get W
+                make_node('Add', [name+'_R0_offset', name+'_state_size^2'], [name+'_W1_offset']),
+                make_node('Slice', [param, name+'_R0_offset', name+'_W1_offset'], [name+'_W1_1d']),
+                make_node('Reshape', [name+'_W1_1d', name+'_WR_shape'], [name+'_W1']),
+                # get R
+                make_node('Slice', [param, name+'_W1_offset', name+'_WR_offset'], [name+'_R1_1d']),
+                make_node('Reshape', [name+'_R1_1d', name+'_WR_shape'], [name+'_R1']),
+                # get B
+                make_node('Add', [name+'_B0_offset', name+'_2*state_size'], [name+'_B1_offset']),
+                make_node('Slice', [param, name+'_B0_offset', name+'_B1_offset'], [name+'_B1_1d']),
+                make_node('Reshape', [name+'_B1_1d', name+'_B_shape'], [name+'_B1']),
+                # Layer 1 RNN
+                make_node('RNN', [name+'_rnn0_out', name+'_W1', name+'_R1', name+'_B1', name+'_seq_len',
+                          name+'_initial_h1'], [name+'_rnn1_out_', name+'_rnn1_h'],
+                          hidden_size=state_size, activations=activations),
+                make_node('Squeeze', [name+'_rnn1_out_', name+'_1'], [name]),
+                make_node('Concat', [name+'_rnn0_h', name+'_rnn1_h'], [name+'1'], axis=0)
+            ]
+
+        elif num_layers == 1:
+            create_tensor([state_size], name+'_state_size', kwargs['initializer'])
+            create_tensor([2*state_size], name+'_2*state_size', kwargs['initializer'])
+            create_tensor([state_size*state_size], name+'_state_size^2', kwargs['initializer'])
+            create_tensor([1, state_size, state_size], name+'_R_shape', kwargs['initializer'])
+            create_tensor([1, 2*state_size], name+'_B_shape', kwargs['initializer'])
+
+            nodes += [
+                make_node('Shape', [data], [name+'_data_shape']),
+                make_node('Split', [name+'_data_shape'], [name+'_seq_length', name+'_batch_size',
+                          name+'_input_size'], name='split0'),
+                # get W
+                make_node('Mul', [name+'_state_size', name+'_input_size'], [name+'_mul0']),
+                make_node('Slice', [param, name+'_0', name+'_mul0'], [name+'_W_1d']),
+                make_node('Concat', [name+'_1', name+'_state_size', name+'_input_size'], [name+'_W_shape'], axis=0),
+                make_node('Reshape', [name+'_W_1d', name+'_W_shape'], [name+'_W']),
+                # get R
+                make_node('Add', [name+'_mul0', name+'_state_size^2'], [name+'_add0']),
+                make_node('Slice', [param, name+'_mul0', name+'_add0'], [name+'_R_1d']),
+                make_node('Reshape', [name+'_R_1d', name+'_R_shape'], [name+'_R']),
+                # get B
+                make_node('Add', [name+'_add0', name+'_2*state_size'], [name+'_add1']),
+                make_node('Slice', [param, name+'_add0', name+'_add1'], [name+'_B_1d']),
+                make_node('Reshape', [name+'_B_1d', name+'_B_shape'], [name+'_B']),
+                # get seq_len
+                make_node('Tile', [name+'_seq_length', name+'_batch_size'], [name+'_seq_len_']),
+                make_node("Cast", [name+'_seq_len_'], [name+"_seq_len"], to=int(TensorProto.INT32)),
+                # compute RNN
+                make_node('RNN', [data, name+'_W', name+'_R', name+'_B', name+'_seq_len', initial_h],
+                          [name+'0_', name+'1'], hidden_size=state_size, activations=activations),
+                make_node('Squeeze', [name+'0_', name+'_1'], [name]),
+            ]
+        else:
+            raise NotImplementedError('Currently RNN onnx export only supports num_layers equals to 1 or 2')
     else:
         raise NotImplementedError(f"Currently RNN onnx export does not support {mode} mode")
     return nodes
diff --git a/tests/python-pytest/onnx/test_operators.py b/tests/python-pytest/onnx/test_operators.py
index 2cdf9f95bf6b..11d14b7596c6 100644
--- a/tests/python-pytest/onnx/test_operators.py
+++ b/tests/python-pytest/onnx/test_operators.py
@@ -51,7 +51,7 @@ def hybrid_forward(self, F, *inputs):
                 return func(*inputs, **params)
     return Model

-def op_export_test(model_name, Model, inputs, tmp_path, dummy_input=False, onnx_map=None, mx_map=None):
+def op_export_test(model_name, Model, inputs, tmp_path, dummy_input=False, onnx_map=None, mx_map=None, rtol=None, atol=None):
     def export_to_onnx(model, model_name, inputs):
         model_path = '{}/{}'.format(tmp_path, model_name)
         model.export(model_path, epoch=0)
@@ -82,11 +82,11 @@ def onnx_rt(onnx_file, inputs):
         for i in range(len(pred_mx)):
             pred_onx_i = onnx_map(pred_onx[i]) if onnx_map else pred_onx[i]
             pred_mx_i = mx_map(pred_mx[i]) if mx_map else pred_mx[i]
-            assert_almost_equal(pred_onx_i, pred_mx_i, equal_nan=True)
+            assert_almost_equal(pred_onx_i, pred_mx_i, equal_nan=True, rtol=rtol, atol=atol)
     else:
         pred_onx = onnx_map(pred_onx[0]) if onnx_map else pred_onx[0]
         pred_mx = mx_map(pred_mx) if mx_map else pred_mx
-        assert_almost_equal(pred_onx, pred_mx, equal_nan=True)
+        assert_almost_equal(pred_onx, pred_mx, equal_nan=True, rtol=rtol, atol=atol)


 def test_onnx_export_abs(tmp_path):
@@ -1047,16 +1047,17 @@ def test_onnx_export_log2(tmp_path, dtype):

 @pytest.mark.parametrize('dtype', ['int32', 'int64', 'float16', 'float32', 'float64'])
 @pytest.mark.parametrize('axis', [None, 1, [1,2], -1])
-def test_onnx_export_sum(tmp_path, dtype, axis):
+@pytest.mark.parametrize('operator', ['sum', 'sum_axis'])
+def test_onnx_export_sum(tmp_path, dtype, axis, operator):
     if 'int' in dtype:
         x = mx.nd.random.randint(0, 10, (5, 6, 7, 8), dtype=dtype)
     else:
         x = mx.nd.random.normal(0, 10, (5, 6, 7, 8), dtype=dtype)
     if axis is not None:
-        M = def_model('sum', axis=axis)
+        M = def_model(operator, axis=axis)
     else:
-        M = def_model('sum')
-    op_export_test('sum', M, [x], tmp_path)
+        M = def_model(operator)
+    op_export_test(operator, M, [x], tmp_path)


 @pytest.mark.parametrize('dtype', ['float16', 'float32', 'float64', 'int32', 'int64'])
@@ -1234,21 +1235,22 @@ def test_onnx_export_sequence_reverse(tmp_path, dtype, params):


 # onnx LSTM from opset 11 does not support float64
-@pytest.mark.parametrize('mode', ['lstm', 'gru'])
+@pytest.mark.parametrize('mode', ['lstm', 'gru', 'rnn_tanh', 'rnn_relu'])
 @pytest.mark.parametrize('dtype', ['float32'])
-@pytest.mark.parametrize('state_size', [16, 32])
+@pytest.mark.parametrize('state_size', [16, 32, 64])
 @pytest.mark.parametrize('input_size', [16, 32, 64])
 @pytest.mark.parametrize('num_layers', [1, 2])
 @pytest.mark.parametrize('batch_size', [1, 2, 4])
-@pytest.mark.parametrize('seq_length', [16, 32])
+@pytest.mark.parametrize('seq_length', [16])
 def test_onnx_export_RNN(tmp_path, mode, dtype, state_size, input_size, num_layers, batch_size, seq_length):
     # TODO: The current implementation fails assertion checks for large parm/state_size.
-    # for num_layers >= 2, input_size must equal to state_size
     if num_layers >= 2 and input_size != state_size:
         return
-    factor = 3
-    if mode == 'lstm':
+    factor = 1
+    if mode == 'gru':
+        factor = 3
+    elif mode == 'lstm':
         factor = 4

     M = def_model('RNN', mode=mode, state_size=state_size, state_outputs=True, num_layers=num_layers, p=0)
@@ -1260,8 +1262,11 @@ def test_onnx_export_RNN(tmp_path, mode, dtype, state_size, input_size, num_laye
     if mode == 'lstm':
         cell = mx.nd.random.uniform(-1, 1, [num_layers, batch_size, state_size], dtype=dtype)
         op_export_test('rnn', M, [x, param, state, cell], tmp_path)
+    elif mode == 'rnn_relu':
+        # set large atol as relu can output big numbers
+        op_export_test('rnn', M, [x, param, state], tmp_path, atol=1e20)
     else:
-        op_export_test('rnn', M, [x, param, state], tmp_path)
+        op_export_test('rnn', M, [x, param, state], tmp_path, atol=1e-2)


 @pytest.mark.parametrize('dtype', ['float16', 'float32', 'int32', 'int64'])
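
For anyone who wants to exercise the new vanilla-RNN export path outside of pytest, below is a minimal, hedged sketch. It mirrors the flow of the `op_export_test`/`export_to_onnx` helpers above (hybridize, `export`, `mx.onnx.export_model`, then onnxruntime), and the shapes follow the `factor = 1` parameter layout used for `rnn_relu`/`rnn_tanh` in `test_onnx_export_RNN`. The file names and the tolerance passed to `allclose` are illustrative only, and the `export_model` argument order is assumed from the existing test helper.

```python
import mxnet as mx
import numpy as np
import onnxruntime as rt

# Shapes follow the factor=1 layout used for rnn_relu/rnn_tanh in test_onnx_export_RNN.
state_size, input_size, seq_length, batch_size = 16, 16, 16, 2

class VanillaRNN(mx.gluon.HybridBlock):
    def hybrid_forward(self, F, data, param, state):
        # Fused RNN op in 'rnn_relu' mode; this is the op handled by the new convert_RNN branch.
        return F.RNN(data, param, state, state_size=state_size, num_layers=1,
                     mode='rnn_relu', state_outputs=True, p=0)

model = VanillaRNN()
model.initialize()
model.hybridize()

x = mx.nd.random.normal(0, 1, [seq_length, batch_size, input_size])
param = mx.nd.random.normal(0, 1, [state_size * (input_size + state_size + 2)])
state = mx.nd.random.uniform(-1, 1, [1, batch_size, state_size])
mx_out = model(x, param, state)

# Export the traced symbol/params, then convert to ONNX (hypothetical file names).
model.export('rnn_relu_example', epoch=0)
onnx_file = 'rnn_relu_example.onnx'
mx.onnx.export_model('rnn_relu_example-symbol.json', 'rnn_relu_example-0000.params',
                     [x.shape, param.shape, state.shape], [np.float32] * 3, onnx_file)

# Run the exported graph with onnxruntime and compare against MXNet.
sess = rt.InferenceSession(onnx_file)
feed = {inp.name: val.asnumpy() for inp, val in zip(sess.get_inputs(), [x, param, state])}
onnx_out = sess.run(None, feed)
print(np.allclose(onnx_out[0], mx_out[0].asnumpy(), rtol=1e-3, atol=1e-3))
```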