From b6a6475621d907f9dc82401830a874304b2fc8f1 Mon Sep 17 00:00:00 2001
From: Valery Chernov <valery.chernov@deelvin.com>
Date: Thu, 12 Aug 2021 10:03:57 +0300
Subject: [PATCH 1/9] GRU cell was implemented in common.py. GRU was supported
 on pytorch frontend side

---
 python/tvm/relay/frontend/common.py  |  90 ++++++++++++
 python/tvm/relay/frontend/pytorch.py | 201 ++++++++++++++++++++++++++-
 2 files changed, 290 insertions(+), 1 deletion(-)

diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py
index 077b942ddf01..a635dcdc4771 100755
--- a/python/tvm/relay/frontend/common.py
+++ b/python/tvm/relay/frontend/common.py
@@ -658,6 +658,96 @@ def unbind(data, axis=0):
     return _expr.TupleWrapper(_expr.Tuple(ret), selections)
 
 
+def gru_cell(
+    input_seqs,
+    hidden_state,
+    hidden_size,
+    w_inp,
+    w_hid,
+    b_inp=None,
+    b_hid=None,
+    r_act=_op.sigmoid,
+    z_act=_op.sigmoid,
+    n_act=_op.tanh,
+    backwards=False,
+):
+    """
+    Common implementation of GRU cell for all frontends of TVM
+    TODO(vvchernov): currently it is used by pytorch. Extend for other frontends
+
+    Parameters
+    ----------
+    input_seqs : List[relay.Expr]
+        The sequence of input tensors
+        Input tensor should be 2d while issue #8412 is not resolved
+        Shape = (batch, feature_size)
+    hidden_state : relay.Expr
+        Hidden state. shape = (batch_size, hidden_size)
+    hidden_size : int
+        The number of features in the hidden state. It is needed for correct and quick split of weights.
+    w_inp, w_hid : relay.Expr
+        weight matrices. wi shape = (3 * hidden_size, feature_size)
+        wh shape = (3 * hidden_size, hidden_size)
+        NOTE: wi = (w_ir|w_iz|w_in) for reset, update and new gates.
+        The order is important for correct GRU calculation!
+    b_inp, b_hid : relay.Expr
+        bias matrices. The same order of internal parts as for weights. shape = (3 * hidden_size)
+    r_act : relay.op
+        activation funtion for reset gate. it is sigmoid by default
+    z_act : relay.op
+        activation funtion for update gate. it is sigmoid by default
+    n_act : relay.op
+        activation funtion for new gate. it is tanh by default
+    backwards : bool
+        Flag for reverse pass of GRU
+
+    Returns
+    -------
+    result : List[relay.Expr], relay.Expr, relay.Expr
+        The sequence of computed result, final hidden and cell state
+    """
+
+    outputs_list = []
+    for x_t in input_seqs if not backwards else reversed(input_seqs):
+        # x_t shape = (batch, feature size), step shape = (batch, feature size + hidden_size)
+        step = _op.concatenate([x_t, hidden_state], axis=1)
+        w_irz, w_in = _op.split(w_inp, [2*hidden_size], axis=0)
+        w_hrz, w_hn = _op.split(w_hid, [2*hidden_size], axis=0)
+        cat_w = _op.concatenate([w_irz, w_hrz], axis=1)
+        # Instead of nn.dense(x_t, w_inp) + nn.dense(hidden_state, w_hid)
+        # nn.dense(step, cat_w) is used
+        # gates shape = (batch, 2 * hidden_size)
+        rz_gates = _op.nn.dense(step, cat_w)
+        # Add biases
+        if b_inp is not None:
+            b_irz, b_in = _op.split(b_inp, [2*hidden_size], axis=0)
+            rz_gates += b_irz
+        if b_hid is not None:
+            b_hrz, b_hn = _op.split(b_hid, [2*hidden_size], axis=0)
+            rz_gates += b_hrz
+        # TODO(vvchernov): check similarity of r_act and z_act and change sequence act->split
+        # any gate shape = (batch, hidden_size)
+        r_gate, z_gate = _op.split(rz_gates, 2, axis=-1)
+
+        r_gate = r_act(r_gate)
+        z_gate = z_act(z_gate)
+
+        ni_gate = _op.nn.dense(x_t, w_in)
+        if b_inp is not None:
+            ni_gate += b_in
+        nh_gate = _op.nn.dense(hidden_state, w_hn)
+        if b_hid is not None:
+            nh_gate += b_hn
+
+        n_gate = n_act(ni_gate + r_gate * nh_gate)
+
+        hidden_state = (_op.ones_like(z_gate) - z_gate) * n_gate + z_gate * hidden_state
+
+        outputs_list.append(hidden_state)  # [seq_num, (batch, hidden_size)]
+
+    return outputs_list, hidden_state
+
+
 def lstm_cell(
     input_seqs,
     hidden_state,
diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index 7c10889ce17e..741e7f0d2be8 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -39,7 +39,7 @@
 from ..prelude import Prelude, StaticTensorArrayOps
 from ..ty import Any, TensorType, TupleType
 from . import qnn_torch
-from .common import AttrCvt, get_relay_op, unbind, lstm_cell
+from .common import AttrCvt, get_relay_op, unbind, lstm_cell, gru_cell
 from .common import infer_value as _infer_value
 from .common import infer_shape as _infer_shape
 from .common import infer_value_simulated as _infer_value_simulated
@@ -2315,6 +2315,204 @@ def flip(self, inputs, input_types):
         axis = inputs[1]
         return _op.transform.reverse(data, axis=axis[0])
 
+    def bidir_gru_cell(
+        self,
+        input_seqs,
+        hidden_size,
+        weights_dicts,
+    ):
+        """
+        Bidirectional GRU cell
+        """
+        seq_len = len(input_seqs)
+        forward_outputs, fw_H_t = gru_cell(
+            input_seqs,
+            hidden_size=hidden_size,
+            **weights_dicts[0],
+        )
+
+        reverse_outputs, rev_H_t = gru_cell(
+            input_seqs,
+            hidden_size=hidden_size,
+            **weights_dicts[1],
+            backwards=True,
+        )
+
+        final_outputs = []
+        for i in range(seq_len):
+            final_outputs.append(
+                _op.concatenate([forward_outputs[i], reverse_outputs[seq_len - 1 - i]], axis=-1)
+            )
+
+        return final_outputs, _op.stack([fw_H_t, rev_H_t], axis=0)
+
+    def gru_layers(self, input_data, layer_weights_dicts, bidirectional, hidden_size, dropout_p=0.0):
+        """
+        Methods iterates layers for Stacked LSTM
+        """
+        layers_num = len(layer_weights_dicts)
+        # split input sequence to samples set
+        input_seqs = unbind(input_data, 0)  # [seq_num, (batch, feature_size)]
+        output_hiddens = []
+        for i in range(layers_num):
+            weights_dicts = layer_weights_dicts[i]
+            # input_seqs shape = [seq_num, (batch, feature_size)] or
+            # [seq_num, (batch, 2*feature_size)] for bidirectional
+            if bidirectional:
+                input_seqs, H_t = self.bidir_gru_cell(input_seqs, hidden_size, weights_dicts)
+            else:
+                input_seqs, H_t = gru_cell(input_seqs, **weights_dicts[0], hidden_size=hidden_size)
+
+            output_hiddens.append(H_t)
+
+            # TODO (vvchernov): in pytorch implementation train is also checked
+            # see https://github.com/pytorch/pytorch/blob/70c8daf43946b53af6493d058899ef952d27d339
+            # /aten/src/ATen/native/RNN.cpp#L1054
+            if dropout_p != 0 and i < layers_num - 1:
+                # for input in input_seqs:
+                #     input = _op.dropout(input, dropout_p)
+                raise NotImplementedError("Dropout for LSTM has not been supported yet!")
+        final_hiddens = []
+        if bidirectional:
+            for output_hidden in output_hiddens:
+                final_hiddens.append(output_hidden[0])
+                final_hiddens.append(output_hidden[1])
+        else:
+            final_hiddens = output_hiddens
+
+        return _op.stack(input_seqs, 0), final_hiddens
+
+    def gru(self, inputs, input_types):
+        """
+        Description of GRU in pytorch:https://pytorch.org/docs/stable/generated/torch.nn.GRU.html?highlight=gru#torch.nn.GRU
+        """
+        # TODO (vvchernov): support dropout
+        assert len(inputs) == 9, "Input of size 9 is expected"
+        # Unpack inputs, note that if optional and not provided then value will be None.
+        _X = inputs[0]
+        # _X shape (seq_num, batch, feature_size) or (batch, seq_num, feature_size)
+
+        hidden_state = inputs[1]
+        # Hidden state shape (hidden_layers_num, batch, hidden_size)
+
+        _weights = inputs[2]
+        # Wi layer[0] shape (3 * hidden_size, feature_size)
+        # Wh layer[0] shape (3 * hidden_size, hidden_size)
+        # Bi layer[0] shape (3 * hidden_size)
+        # Bh layer[0] shape (3 * hidden_size)
+
+        # Wi layer[>0] shape (3 * hidden_size, hidden_size * num_directions)
+        # Wh layer[>0] shape (3 * hidden_size, hidden_size)
+        # Bi layer[>0] shape (3 * hidden_size)
+        # Bh layer[>0] shape (3 * hidden_size)
+
+        # Scalar inputs
+        has_biases = inputs[3]
+        num_layers = inputs[4]
+        dropout_p = inputs[5]  # dropout probability, if 0.0 it means there is no dropout
+        # train = inputs[6]
+        bidirectional = inputs[7]
+        batch_first = inputs[8]
+
+        num_directions = 1
+        if bidirectional:
+            num_directions = 2
+
+        rsd = len(_weights) % num_layers
+        assert rsd == 0, "The number of weights must be a multiple of the number of layers!"
+        rsd = (len(_weights) / num_layers) % num_directions
+        assert (
+            rsd == 0
+        ), "The number of weights in layer must be a multiple of the number of directions!"
+
+        weights_num = int(len(_weights) / num_layers / num_directions)
+        if has_biases:
+            assert weights_num == 4, "The weights number in layer is expected equal to 4"
+        else:
+            assert weights_num == 2, "The weights number in layer is expected equal to 2"
+
+        X = _op.transpose(_X, (1, 0, 2)) if batch_first else _X
+        # TODO (vvchernov): Which data type should be used? from input or weights?
+        # Instead of it _infer_type(X).checked_type.dtype can be used
+        X_dtype = input_types[0]
+        X_shape = _infer_shape(X)  # (seq_num, batch, feature_size)
+
+        hidden_size = _infer_shape(_weights[0])[0] / 3
+        batch_size = X_shape[1]
+
+        # Initialize hidden states if not provided.
+        layers_h = []
+        hidden_layers_num = num_directions * num_layers
+        if hidden_state is None:
+            h_0 = _op.zeros((batch_size, hidden_size), X_dtype)
+            for i in range(hidden_layers_num):
+                layers_h.append(h_0)
+        else:
+            layers_h = unbind(hidden_state, 0)
+
+        layer_weights_dicts = []
+        k = 0  # layer counter
+        if has_biases:
+            names = ["hidden_state", "w_inp", "w_hid", "b_inp", "b_hid"]
+            if bidirectional:
+                rsd = len(_weights) % (2 * weights_num)
+                assert rsd == 0, "got an incorrect number of LSTM weights"
+                for i in range(0, len(_weights), 2 * weights_num):
+                    fw_tensors = [layers_h[2 * k], *_weights[i : i + 4]]
+                    fw_weights_dict = dict(zip(names, fw_tensors))
+                    j = i + weights_num
+                    rev_tensors = [layers_h[2 * k + 1], *_weights[j : j + 4]]
+                    rev_weights_dict = dict(zip(names, rev_tensors))
+                    layer_weights_dicts.append([fw_weights_dict, rev_weights_dict])
+                    k += 1
+            else:
+                assert len(_weights) % weights_num == 0, "got an incorrect number of LSTM weights"
+                for i in range(0, len(_weights), weights_num):
+                    fw_tensors = [layers_h[k], *_weights[i : i + 4]]
+                    fw_weights_dict = dict(zip(names, fw_tensors))
+                    layer_weights_dicts.append([fw_weights_dict])
+                    k += 1
+        else:
+            names = ["hidden_state", "w_inp", "w_hid"]
+            if bidirectional:
+                rsd = len(_weights) % (2 * weights_num)
+                assert rsd == 0, "got an incorrect number of LSTM weights"
+                for i in range(0, len(_weights), 2 * weights_num):
+                    fw_tensors = [layers_h[2 * k], *_weights[i : i + 2]]
+                    fw_weights_dict = dict(zip(names, fw_tensors))
+                    j = i + weights_num
+                    rev_tensors = [layers_h[2 * k + 1], *_weights[j : j + 2]]
+                    rev_weights_dict = dict(zip(names, rev_tensors))
+                    layer_weights_dicts.append([fw_weights_dict, rev_weights_dict])
+                    k += 1
+            else:
+                assert len(_weights) % weights_num == 0, "got an incorrect number of LSTM weights"
+                for i in range(0, len(_weights), weights_num):
+                    fw_tensors = [layers_h[k], *_weights[i : i + 2]]
+                    fw_weights_dict = dict(zip(names, fw_tensors))
+                    layer_weights_dicts.append([fw_weights_dict])
+                    k += 1
+        assert (
+            len(layer_weights_dicts) == num_layers and k == num_layers
+        ), "For stacked GRU number of weights sets should be the same as number of layers!"
+
+        outputs = self.gru_layers(
+            X,
+            layer_weights_dicts,
+            bidirectional,
+            hidden_size=hidden_size,
+            dropout_p=dropout_p,
+        )
+
+        # output shape = (seq_num, batch, hidden_size) or
+        # (seq_num, batch, 2*feature_size) for bidirectional
+        output = outputs[0]
+
+        if batch_first:
+            output = _op.transpose(output, (1, 0, 2))
+
+        return (output, _op.stack(outputs[1], 0))
+
     def bidir_lstm_cell(
         self,
         input_seqs,
@@ -2792,6 +2990,7 @@ def create_convert_map(self):
             "aten::nll_loss": self.nll_loss,
             "aten::nll_loss2d": self.nll_loss,
             "aten::flip": self.flip,
+            "aten::gru": self.gru,
             "aten::lstm": self.lstm,
         }
 

From 549ff68b91637dbb221d831ee7dcc2bcb7929fdb Mon Sep 17 00:00:00 2001
From: Valery Chernov <valery.chernov@deelvin.com>
Date: Mon, 16 Aug 2021 12:01:21 +0300
Subject: [PATCH 2/9] update GRU in common.py and onnx frontend

---
 python/tvm/relay/frontend/common.py | 87 +++++++++++++++++++----------
 python/tvm/relay/frontend/onnx.py   |  2 +-
 2 files changed, 57 insertions(+), 32 deletions(-)

diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py
index a635dcdc4771..bb703081adb7 100755
--- a/python/tvm/relay/frontend/common.py
+++ b/python/tvm/relay/frontend/common.py
@@ -666,10 +666,10 @@ def gru_cell(
     w_hid,
     b_inp=None,
     b_hid=None,
-    r_act=_op.sigmoid,
-    z_act=_op.sigmoid,
+    rz_act=_op.sigmoid,
     n_act=_op.tanh,
     backwards=False,
+    linear_before_reset=True,
 ):
     """
     Common implementation of GRU cell for all frontends of TVM
@@ -710,38 +710,63 @@ def gru_cell(
     outputs_list = []
     for x_t in input_seqs if not backwards else reversed(input_seqs):
         # x_t shape = (batch, feature size), step shape = (batch, feature size + hidden_size)
-        step = _op.concatenate([x_t, hidden_state], axis=1)
-        w_irz, w_in = _op.split(w_inp, [2*hidden_size], axis=0)
-        w_hrz, w_hn = _op.split(w_hid, [2*hidden_size], axis=0)
-        cat_w = _op.concatenate([w_irz, w_hrz], axis=1)
-        # Instead of nn.dense(x_t, w_inp) + nn.dense(hidden_state, w_hid)
-        # nn.dense(step, cat_w) is used
-        # gates shape = (batch, 2 * hidden_size)
-        rz_gates = _op.nn.dense(step, cat_w)
-        # Add biases
+        # step = _op.concatenate([x_t, hidden_state], axis=1)
+        # w_irz, w_in = _op.split(w_inp, [2*hidden_size], axis=0)
+        # w_hrz, w_hn = _op.split(w_hid, [2*hidden_size], axis=0)
+        # cat_w = _op.concatenate([w_irz, w_hrz], axis=1)
+        # # Instead of nn.dense(x_t, w_inp) + nn.dense(hidden_state, w_hid)
+        # # nn.dense(step, cat_w) is used
+        # # gates shape = (batch, 2 * hidden_size)
+        # rz_gates = _op.nn.dense(step, cat_w)
+        # # Add biases
+        # if b_inp is not None:
+        #     b_irz, b_in = _op.split(b_inp, [2*hidden_size], axis=0)
+        #     rz_gates += b_irz
+        # if b_hid is not None:
+        #     b_hrz, b_hn = _op.split(b_hid, [2*hidden_size], axis=0)
+        #     rz_gates += b_hrz
+        # # TODO(vvchernov): check similarity of r_act and z_act and change sequence act->split
+        # # any gate shape = (batch, hidden_size)
+        # r_gate, z_gate = _op.split(rz_gates, 2, axis=-1)
+
+        # r_gate = r_act(r_gate)
+        # z_gate = z_act(z_gate)
+
+        # ni_gate = _op.nn.dense(x_t, w_in)
+        # if b_inp is not None:
+        #     ni_gate += b_in
+        # nh_gate = _op.nn.dense(hidden_state, w_hn)
+        # if b_hid is not None:
+        #     nh_gate += b_hn
+
+        # n_gate = n_act(ni_gate + r_gate * nh_gate)
+
+        xwt = _op.nn.dense(x_t, w_inp)
+        i_r, i_z, i_n = _op.split(xwt, 3, axis=1)
+        h_r, h_z, h_n = _op.split(w_hid, 3, axis=0)
+        r_gate = i_r + _op.nn.dense(hidden_state, h_r)
+        z_gate = i_z + _op.nn.dense(hidden_state, h_z)
+        # TODO(vvchernov): It is assumed that both bias are or not
         if b_inp is not None:
-            b_irz, b_in = _op.split(b_inp, [2*hidden_size], axis=0)
-            rz_gates += b_irz
-        if b_hid is not None:
-            b_hrz, b_hn = _op.split(b_hid, [2*hidden_size], axis=0)
-            rz_gates += b_hrz
-        # TODO(vvchernov): check similarity of r_act and z_act and change sequence act->split
-        # any gate shape = (batch, hidden_size)
-        r_gate, z_gate = _op.split(rz_gates, 2, axis=-1)
-
-        r_gate = r_act(r_gate)
-        z_gate = z_act(z_gate)
-
-        ni_gate = _op.nn.dense(x_t, w_in)
-        if b_inp is not None:
-            ni_gate += b_in
-        nh_gate = _op.nn.dense(hidden_state, w_hn)
-        if b_hid is not None:
-            nh_gate += b_hn
+            i_br, i_bz, i_bn = _op.split(b_inp, 3, axis=-1)
+            h_br, h_bz, h_bn = _op.split(b_hid, 3, axis=-1)
+            z_gate += i_bz + h_bz
+            r_gate += i_br + h_br
+            if linear_before_reset:
+                n_gate = i_n + i_bn + (r_gate * (_op.nn.dense(hidden_state, h_n) + h_bn))
+            else:
+                n_gate = i_n + i_bn + _op.nn.dense((r_gate * hidden_state), h_n) + h_bn
+        else:
+            if linear_before_reset:
+                n_gate = i_n + (r_gate * (_op.nn.dense(hidden_state, h_n)))
+            else:
+                n_gate = i_n + _op.nn.dense((r_gate * hidden_state), h_n)
 
-        n_gate = n_act(ni_gate + r_gate * nh_gate)
+        r_gate = rz_act(r_gate)
+        z_gate = rz_act(z_gate)
+        n_gate = n_act(n_gate)
 
-        hidden_state = (_op.ones_like(z_gate) - z_gate) * n_gate + z_gate * hidden_state
+        hidden_state = (hidden_state - n_gate) * z_gate + n_gate
 
         outputs_list.append(hidden_state)  # [seq_num, (batch, hidden_size)]
 
diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index 0f78c32ef59f..dd47d4d7ff99 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -2386,7 +2386,7 @@ def generate_gru(
             r = f_act(r)
             h = g_act(h)
 
-            H_t = ((_expr.const(1, dtype=W_dtype) - z) * h) + (z * H_t)
+            H_t = (H_t - h) * z + h
             h_list.append(_op.expand_dims(H_t, axis=0))
 
         if backwards:

From 59eefa1f84fccb09f70f9b6fddcb8ce425807845 Mon Sep 17 00:00:00 2001
From: Valery Chernov <valery.chernov@deelvin.com>
Date: Tue, 17 Aug 2021 11:58:44 +0300
Subject: [PATCH 3/9] fix issue related to GRU accuracy in pytorch and ONNX
 frontend

---
 python/tvm/relay/frontend/common.py  | 25 +++++++++++++------------
 python/tvm/relay/frontend/onnx.py    |  3 ++-
 python/tvm/relay/frontend/pytorch.py | 15 +++------------
 3 files changed, 18 insertions(+), 25 deletions(-)

diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py
index bb703081adb7..9e16425e0868 100755
--- a/python/tvm/relay/frontend/common.py
+++ b/python/tvm/relay/frontend/common.py
@@ -743,26 +743,27 @@ def gru_cell(
 
         xwt = _op.nn.dense(x_t, w_inp)
         i_r, i_z, i_n = _op.split(xwt, 3, axis=1)
-        h_r, h_z, h_n = _op.split(w_hid, 3, axis=0)
-        r_gate = i_r + _op.nn.dense(hidden_state, h_r)
-        z_gate = i_z + _op.nn.dense(hidden_state, h_z)
+        w_hr, w_hz, w_hn = _op.split(w_hid, 3, axis=0)
+        r_gate = i_r + _op.nn.dense(hidden_state, w_hr)
+        z_gate = i_z + _op.nn.dense(hidden_state, w_hz)
         # TODO(vvchernov): It is assumed that both bias are or not
         if b_inp is not None:
-            i_br, i_bz, i_bn = _op.split(b_inp, 3, axis=-1)
-            h_br, h_bz, h_bn = _op.split(b_hid, 3, axis=-1)
-            z_gate += i_bz + h_bz
-            r_gate += i_br + h_br
+            b_ir, b_iz, b_in = _op.split(b_inp, 3, axis=-1)
+            b_hr, b_hz, b_hn = _op.split(b_hid, 3, axis=-1)
+            r_gate += b_ir + b_hr
+            r_gate = rz_act(r_gate)
+            z_gate += b_iz + b_hz
             if linear_before_reset:
-                n_gate = i_n + i_bn + (r_gate * (_op.nn.dense(hidden_state, h_n) + h_bn))
+                n_gate = i_n + b_in + (r_gate * (_op.nn.dense(hidden_state, w_hn) + b_hn))
             else:
-                n_gate = i_n + i_bn + _op.nn.dense((r_gate * hidden_state), h_n) + h_bn
+                n_gate = i_n + b_in + _op.nn.dense((r_gate * hidden_state), w_hn) + b_hn
         else:
+            r_gate = rz_act(r_gate)
             if linear_before_reset:
-                n_gate = i_n + (r_gate * (_op.nn.dense(hidden_state, h_n)))
+                n_gate = i_n + (r_gate * (_op.nn.dense(hidden_state, w_hn)))
             else:
-                n_gate = i_n + _op.nn.dense((r_gate * hidden_state), h_n)
+                n_gate = i_n + _op.nn.dense((r_gate * hidden_state), w_hn)
 
-        r_gate = rz_act(r_gate)
         z_gate = rz_act(z_gate)
         n_gate = n_act(n_gate)
 
diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index dd47d4d7ff99..fe3dcb6c0792 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -2372,18 +2372,19 @@ def generate_gru(
                 rbz, rbr, rbh = _op.split(RB, 3, axis=-1)
                 z += wbz + rbz
                 r += wbr + rbr
+                r = f_act(r)
                 if linear_before_reset:
                     h = ch + (r * (_op.nn.dense(H_t, rh) + rbh)) + wbh
                 else:
                     h = ch + _op.nn.dense((r * H_t), rh) + wbh + rbh
             else:
+                r = f_act(r)
                 if linear_before_reset:
                     h = ch + (r * (_op.nn.dense(H_t, rh)))
                 else:
                     h = ch + _op.nn.dense((r * H_t), rh)
 
             z = f_act(z)
-            r = f_act(r)
             h = g_act(h)
 
             H_t = (H_t - h) * z + h
diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index 741e7f0d2be8..dff1ee3153a0 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -2372,15 +2372,8 @@ def gru_layers(self, input_data, layer_weights_dicts, bidirectional, hidden_size
                 # for input in input_seqs:
                 #     input = _op.dropout(input, dropout_p)
                 raise NotImplementedError("Dropout for LSTM has not been supported yet!")
-        final_hiddens = []
-        if bidirectional:
-            for output_hidden in output_hiddens:
-                final_hiddens.append(output_hidden[0])
-                final_hiddens.append(output_hidden[1])
-        else:
-            final_hiddens = output_hiddens
 
-        return _op.stack(input_seqs, 0), final_hiddens
+        return _op.stack(input_seqs, 0), _op.stack(output_hiddens, 0)
 
     def gru(self, inputs, input_types):
         """
@@ -2496,7 +2489,7 @@ def gru(self, inputs, input_types):
             len(layer_weights_dicts) == num_layers and k == num_layers
         ), "For stacked GRU number of weights sets should be the same as number of layers!"
 
-        outputs = self.gru_layers(
+        output, out_hidden_state = self.gru_layers(
             X,
             layer_weights_dicts,
             bidirectional,
@@ -2506,12 +2499,10 @@ def gru(self, inputs, input_types):
 
         # output shape = (seq_num, batch, hidden_size) or
         # (seq_num, batch, 2*feature_size) for bidirectional
-        output = outputs[0]
-
         if batch_first:
             output = _op.transpose(output, (1, 0, 2))
 
-        return (output, _op.stack(outputs[1], 0))
+        return (output, out_hidden_state)
 
     def bidir_lstm_cell(
         self,

From d2f49145d9ab07837c76d72813d14e34a487b421 Mon Sep 17 00:00:00 2001
From: Valery Chernov <valery.chernov@deelvin.com>
Date: Tue, 17 Aug 2021 20:04:49 +0300
Subject: [PATCH 4/9] small fixes and remove excess

---
 python/tvm/relay/frontend/common.py  | 35 ----------------------------
 python/tvm/relay/frontend/pytorch.py | 15 +++++-------
 2 files changed, 6 insertions(+), 44 deletions(-)

diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py
index 9e16425e0868..517b63dca273 100755
--- a/python/tvm/relay/frontend/common.py
+++ b/python/tvm/relay/frontend/common.py
@@ -661,7 +661,6 @@ def unbind(data, axis=0):
 def gru_cell(
     input_seqs,
     hidden_state,
-    hidden_size,
     w_inp,
     w_hid,
     b_inp=None,
@@ -683,8 +682,6 @@ def gru_cell(
         Shape = (batch, feature_size)
     hidden_state : relay.Expr
         Hidden state. shape = (batch_size, hidden_size)
-    hidden_size : int
-        The number of features in the hidden state. It is needed for correct and quick split of weights.
     w_inp, w_hid : relay.Expr
         weight matrices. wi shape = (3 * hidden_size, feature_size)
         wh shape = (3 * hidden_size, hidden_size)
@@ -709,38 +706,6 @@ def gru_cell(
 
     outputs_list = []
     for x_t in input_seqs if not backwards else reversed(input_seqs):
-        # x_t shape = (batch, feature size), step shape = (batch, feature size + hidden_size)
-        # step = _op.concatenate([x_t, hidden_state], axis=1)
-        # w_irz, w_in = _op.split(w_inp, [2*hidden_size], axis=0)
-        # w_hrz, w_hn = _op.split(w_hid, [2*hidden_size], axis=0)
-        # cat_w = _op.concatenate([w_irz, w_hrz], axis=1)
-        # # Instead of nn.dense(x_t, w_inp) + nn.dense(hidden_state, w_hid)
-        # # nn.dense(step, cat_w) is used
-        # # gates shape = (batch, 2 * hidden_size)
-        # rz_gates = _op.nn.dense(step, cat_w)
-        # # Add biases
-        # if b_inp is not None:
-        #     b_irz, b_in = _op.split(b_inp, [2*hidden_size], axis=0)
-        #     rz_gates += b_irz
-        # if b_hid is not None:
-        #     b_hrz, b_hn = _op.split(b_hid, [2*hidden_size], axis=0)
-        #     rz_gates += b_hrz
-        # # TODO(vvchernov): check similarity of r_act and z_act and change sequence act->split
-        # # any gate shape = (batch, hidden_size)
-        # r_gate, z_gate = _op.split(rz_gates, 2, axis=-1)
-
-        # r_gate = r_act(r_gate)
-        # z_gate = z_act(z_gate)
-
-        # ni_gate = _op.nn.dense(x_t, w_in)
-        # if b_inp is not None:
-        #     ni_gate += b_in
-        # nh_gate = _op.nn.dense(hidden_state, w_hn)
-        # if b_hid is not None:
-        #     nh_gate += b_hn
-
-        # n_gate = n_act(ni_gate + r_gate * nh_gate)
-
         xwt = _op.nn.dense(x_t, w_inp)
         i_r, i_z, i_n = _op.split(xwt, 3, axis=1)
         w_hr, w_hz, w_hn = _op.split(w_hid, 3, axis=0)
diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index dff1ee3153a0..f616c3bf7037 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -2318,7 +2318,6 @@ def flip(self, inputs, input_types):
     def bidir_gru_cell(
         self,
         input_seqs,
-        hidden_size,
         weights_dicts,
     ):
         """
@@ -2327,13 +2326,11 @@ def bidir_gru_cell(
         seq_len = len(input_seqs)
         forward_outputs, fw_H_t = gru_cell(
             input_seqs,
-            hidden_size=hidden_size,
             **weights_dicts[0],
         )
 
         reverse_outputs, rev_H_t = gru_cell(
             input_seqs,
-            hidden_size=hidden_size,
             **weights_dicts[1],
             backwards=True,
         )
@@ -2346,7 +2343,7 @@ def bidir_gru_cell(
 
         return final_outputs, _op.stack([fw_H_t, rev_H_t], axis=0)
 
-    def gru_layers(self, input_data, layer_weights_dicts, bidirectional, hidden_size, dropout_p=0.0):
+    def gru_layers(self, input_data, layer_weights_dicts, bidirectional, dropout_p=0.0):
         """
         Methods iterates layers for Stacked LSTM
         """
@@ -2359,9 +2356,9 @@ def gru_layers(self, input_data, layer_weights_dicts, bidirectional, hidden_size
             # input_seqs shape = [seq_num, (batch, feature_size)] or
             # [seq_num, (batch, 2*feature_size)] for bidirectional
             if bidirectional:
-                input_seqs, H_t = self.bidir_gru_cell(input_seqs, hidden_size, weights_dicts)
+                input_seqs, H_t = self.bidir_gru_cell(input_seqs, weights_dicts)
             else:
-                input_seqs, H_t = gru_cell(input_seqs, **weights_dicts[0], hidden_size=hidden_size)
+                input_seqs, H_t = gru_cell(input_seqs, **weights_dicts[0])
 
             output_hiddens.append(H_t)
 
@@ -2377,7 +2374,8 @@ def gru_layers(self, input_data, layer_weights_dicts, bidirectional, hidden_size
 
     def gru(self, inputs, input_types):
         """
-        Description of GRU in pytorch:https://pytorch.org/docs/stable/generated/torch.nn.GRU.html?highlight=gru#torch.nn.GRU
+        Description of GRU in pytorch:
+        https://pytorch.org/docs/stable/generated/torch.nn.GRU.html?highlight=gru#torch.nn.GRU
         """
         # TODO (vvchernov): support dropout
         assert len(inputs) == 9, "Input of size 9 is expected"
@@ -2430,7 +2428,7 @@ def gru(self, inputs, input_types):
         X_dtype = input_types[0]
         X_shape = _infer_shape(X)  # (seq_num, batch, feature_size)
 
-        hidden_size = _infer_shape(_weights[0])[0] / 3
+        hidden_size = int(_infer_shape(_weights[0])[0] / 3)
         batch_size = X_shape[1]
 
         # Initialize hidden states if not provided.
@@ -2493,7 +2491,6 @@ def gru(self, inputs, input_types):
             X,
             layer_weights_dicts,
             bidirectional,
-            hidden_size=hidden_size,
             dropout_p=dropout_p,
         )
 

From 439d076b843f48913b109d8ed6df6c128b9689b0 Mon Sep 17 00:00:00 2001
From: Valery Chernov <valery.chernov@deelvin.com>
Date: Tue, 17 Aug 2021 15:56:39 +0300
Subject: [PATCH 5/9] common GRU was additionally updated. tuned pytorch GRU was
 strongly accelerated

---
 python/tvm/relay/frontend/common.py | 49 ++++++++++++++++-------------
 1 file changed, 27 insertions(+), 22 deletions(-)

diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py
index 517b63dca273..e0dce1e212c2 100755
--- a/python/tvm/relay/frontend/common.py
+++ b/python/tvm/relay/frontend/common.py
@@ -707,30 +707,35 @@ def gru_cell(
     outputs_list = []
     for x_t in input_seqs if not backwards else reversed(input_seqs):
         xwt = _op.nn.dense(x_t, w_inp)
-        i_r, i_z, i_n = _op.split(xwt, 3, axis=1)
-        w_hr, w_hz, w_hn = _op.split(w_hid, 3, axis=0)
-        r_gate = i_r + _op.nn.dense(hidden_state, w_hr)
-        z_gate = i_z + _op.nn.dense(hidden_state, w_hz)
-        # TODO(vvchernov): It is assumed that both bias are or not
-        if b_inp is not None:
-            b_ir, b_iz, b_in = _op.split(b_inp, 3, axis=-1)
-            b_hr, b_hz, b_hn = _op.split(b_hid, 3, axis=-1)
-            r_gate += b_ir + b_hr
-            r_gate = rz_act(r_gate)
-            z_gate += b_iz + b_hz
-            if linear_before_reset:
-                n_gate = i_n + b_in + (r_gate * (_op.nn.dense(hidden_state, w_hn) + b_hn))
-            else:
-                n_gate = i_n + b_in + _op.nn.dense((r_gate * hidden_state), w_hn) + b_hn
+        if linear_before_reset:
+            hwt = _op.nn.dense(hidden_state, w_hid)
+            # TODO(vvchernov): It is assumed that both bias are or not
+            if b_inp is not None:
+                xwt += b_inp
+                hwt += b_hid
+            i_r, i_z, i_n = _op.split(xwt, 3, axis=-1)
+            h_r, h_z, h_n = _op.split(hwt, 3, axis=-1)
+            r_gate = rz_act(i_r + h_r)
+            z_gate = rz_act(i_z + h_z)
+            n_gate = n_act(i_n + r_gate * h_n)
         else:
-            r_gate = rz_act(r_gate)
-            if linear_before_reset:
-                n_gate = i_n + (r_gate * (_op.nn.dense(hidden_state, w_hn)))
+            i_r, i_z, i_n = _op.split(xwt, 3, axis=1)
+            w_hr, w_hz, w_hn = _op.split(w_hid, 3, axis=0)
+            r_gate = i_r + _op.nn.dense(hidden_state, w_hr)
+            z_gate = i_z + _op.nn.dense(hidden_state, w_hz)
+            # TODO(vvchernov): It is assumed that both bias are or not
+            if b_inp is not None:
+                b_ir, b_iz, b_in = _op.split(b_inp, 3, axis=-1)
+                b_hr, b_hz, b_hn = _op.split(b_hid, 3, axis=-1)
+                r_gate += b_ir + b_hr
+                z_gate += b_iz + b_hz
+                i_n += b_in
+                h_n = _op.nn.dense((r_gate * hidden_state), w_hn) + b_hn
             else:
-                n_gate = i_n + _op.nn.dense((r_gate * hidden_state), w_hn)
-
-        z_gate = rz_act(z_gate)
-        n_gate = n_act(n_gate)
+                h_n = _op.nn.dense((r_gate * hidden_state), w_hn)
+            r_gate = rz_act(r_gate)
+            z_gate = rz_act(z_gate)
+            n_gate = n_act(i_n + h_n)
 
         hidden_state = (hidden_state - n_gate) * z_gate + n_gate
 

From 677eafbec7b810129697e014509fdf2ceca4de5f Mon Sep 17 00:00:00 2001
From: Valery Chernov <valery.chernov@deelvin.com>
Date: Wed, 18 Aug 2021 22:51:58 +0300
Subject: [PATCH 6/9] GRU cell in ONNX frontend was used from common.py.
 previous implementation was removed

---
 python/tvm/relay/frontend/onnx.py | 150 ++++++++++++++----------------
 1 file changed, 72 insertions(+), 78 deletions(-)

diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index fe3dcb6c0792..5471f67ea106 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -47,6 +47,7 @@
     infer_value,
     new_var,
     unbind,
+    gru_cell,
     lstm_cell,
 )
 
@@ -2349,57 +2350,41 @@ class GRU(RNN):
     """Operator convert for GRU"""
 
     @classmethod
-    def generate_gru(
-        cls, X_steps, H_t, W, R, B, linear_before_reset, f_act, g_act, W_dtype, backwards=False
+    def bidir_gru_cell(
+        cls,
+        input_seqs,
+        weight_dicts,
+        acts,
     ):
-        """Create an unrolled gru loop.
-
-        See https://github.com/onnx/onnx/blob/master/docs/Operators.md for math.
         """
-        h_list = []
-        seq_length = len(X_steps)
-        for i in range(seq_length):
-            step = X_steps[i] if not backwards else X_steps[seq_length - (i + 1)]
-            step = _op.squeeze(step, axis=[0])
-            current = _op.nn.dense(step, W)
-            cz, cr, ch = _op.split(current, 3, axis=1)
-            rz, rr, rh = _op.split(R, 3, axis=0)
-            z = cz + _op.nn.dense(H_t, rz)
-            r = cr + _op.nn.dense(H_t, rr)
-            if B is not None:
-                WB, RB = _op.split(B, 2)
-                wbz, wbr, wbh = _op.split(WB, 3, axis=-1)
-                rbz, rbr, rbh = _op.split(RB, 3, axis=-1)
-                z += wbz + rbz
-                r += wbr + rbr
-                r = f_act(r)
-                if linear_before_reset:
-                    h = ch + (r * (_op.nn.dense(H_t, rh) + rbh)) + wbh
-                else:
-                    h = ch + _op.nn.dense((r * H_t), rh) + wbh + rbh
-            else:
-                r = f_act(r)
-                if linear_before_reset:
-                    h = ch + (r * (_op.nn.dense(H_t, rh)))
-                else:
-                    h = ch + _op.nn.dense((r * H_t), rh)
-
-            z = f_act(z)
-            h = g_act(h)
-
-            H_t = (H_t - h) * z + h
-            h_list.append(_op.expand_dims(H_t, axis=0))
+        Bidirectional GRU cell
+        """
+        seq_len = len(input_seqs)
+        forward_outputs, fw_H_t = gru_cell(
+            input_seqs,
+            **weight_dicts[0],
+            rz_act=acts[0],
+            n_act=acts[1],
+        )
 
-        if backwards:
-            # Canonical view is hidden states from the first token not last
-            h_list = h_list[::-1]
+        reverse_outputs, rev_H_t = gru_cell(
+            input_seqs,
+            **weight_dicts[1],
+            rz_act=acts[2],
+            n_act=acts[3],
+            backwards=True,
+        )
 
-        # Concatenate outputs and add back in direction axis.
-        concatenated = _op.concatenate(h_list, 0)
-        output = _op.expand_dims(concatenated, axis=1)
-        H_t = _op.expand_dims(H_t, axis=0)
+        final_outputs = []
+        for i in range(seq_len):
+            final_outputs.append(
+                _op.stack([forward_outputs[i], reverse_outputs[seq_len - 1 - i]], axis=0)
+            )
 
-        return output, H_t
+        return (
+            _op.stack(final_outputs, axis=0),
+            _op.stack([fw_H_t, rev_H_t], axis=0),
+        )
 
     @classmethod
     def _impl_v7(cls, inputs, attr, params):
@@ -2417,20 +2402,14 @@ def _impl_v7(cls, inputs, attr, params):
         W_dtype = infer_type(Wp).checked_type.dtype
 
         if num_directions not in [1, 2]:
-            raise NotImplementedError(
-                f"Directions for GRUs should be either 1 or 2 got {num_directions}"
-            )
+            raise ValueError("num_directions must be either 1 or 2!")
 
         X_shape = infer_shape(X)
         hidden_size = infer_shape(Rp)[-1]
         batch_size = X_shape[1]
 
-        # Initialize state if not provided.
-        # Otherwise remove bidirectional axis.
         if Hp_0 is None:
             Hp_0 = _op.zeros((num_directions, batch_size, hidden_size), W_dtype)
-        if Bp is None:
-            Bp = _op.zeros((num_directions, hidden_size * 6), W_dtype)
 
         if "activations" in attr:
             activations = attr["activations"]
@@ -2461,39 +2440,54 @@ def _impl_v7(cls, inputs, attr, params):
         else:
             acts = [_op.sigmoid, _op.tanh] * 2
 
-        result_output = []
-        result_H = []
+        # TODO (vvchernov): It can be replaced by _op.split if issue #8412 is resolved
+        X_steps = unbind(X, axis=0)
 
-        X_steps = _op.split(X, indices_or_sections=X_shape[0], axis=0)
         H_ts = _op.split(Hp_0, num_directions)
         Ws = _op.split(Wp, num_directions)
         Rs = _op.split(Rp, num_directions)
-        Bs = _op.split(Bp, num_directions)
 
+        if Bp is not None:
+            Bs = _op.split(Bp, num_directions)
+
+        weights_dicts = []
         for i in range(num_directions):
-            H_t = _op.squeeze(H_ts[i], axis=[0])
-            W = _op.squeeze(Ws[i], axis=[0])
-            R = _op.squeeze(Rs[i], axis=[0])
-            B = _op.squeeze(Bs[i], axis=[0])
-            f_act, g_act = acts[i * 2 : (i + 1) * 2]
-            output, H = GRU.generate_gru(
-                X_steps=X_steps,
-                H_t=H_t,
-                W=W,
-                R=R,
-                B=B,
-                linear_before_reset=linear_before_reset,
-                f_act=f_act,
-                g_act=g_act,
-                W_dtype=W_dtype,
-                backwards=i == 1,
-            )
+            weights_dict = {}
+
+            weights_dict["hidden_state"] = _op.squeeze(H_ts[i], axis=[0])
+            weights_dict["linear_before_reset"] = linear_before_reset
+
+            # Weights permutation: onnx format z-r-n, gru cell format r-z-n
+            matz, matr, matn = _op.split(_op.squeeze(Ws[i], axis=[0]), 3)
+            weights_dict["w_inp"] = _op.concatenate([matr, matz, matn], axis=0)
+            matz, matr, matn = _op.split(_op.squeeze(Rs[i], axis=[0]), 3)
+            weights_dict["w_hid"] = _op.concatenate([matr, matz, matn], axis=0)
+            if Bp is not None:
+                Bi, Bh = _op.split(Bs[i], 2, -1)
+                matz, matr, matn = _op.split(_op.squeeze(Bi, axis=[0]), 3)
+                weights_dict["b_inp"] = _op.concatenate([matr, matz, matn], axis=0)
+                matz, matr, matn = _op.split(_op.squeeze(Bh, axis=[0]), 3)
+                weights_dict["b_hid"] = _op.concatenate([matr, matz, matn], axis=0)
+            weights_dicts.append(weights_dict)
 
-            result_output.append(output)
-            result_H.append(H)
+        if num_directions == 2:
+            output, H = GRU.bidir_gru_cell(
+                input_seqs=X_steps,
+                weight_dicts=weights_dicts,
+                acts=acts,
+            )
+        else:
+            # outputs shape = [seqs_num, (batch_size, hidden_size)]
+            outputs, H = gru_cell(
+                input_seqs=X_steps,
+                **weights_dicts[0],
+                rz_act=acts[0],
+                n_act=acts[1],
+            )
 
-        output = _op.concatenate(result_output, axis=1)
-        H = _op.concatenate(result_H, axis=0)
+            # output shape = (seqs_num, num_directions, batch_size, hidden_size)
+            output = _op.expand_dims(_op.stack(outputs, axis=0), axis=1)
+            H = _op.expand_dims(H, axis=0)
 
         return _expr.TupleWrapper(_expr.Tuple((output, H)), 2)
 

From 1f889a397c550b1f03abc5426bf1075638d6dbec Mon Sep 17 00:00:00 2001
From: Valery Chernov <valery.chernov@deelvin.com>
Date: Fri, 20 Aug 2021 18:44:00 +0300
Subject: [PATCH 7/9] small fixes in comments

---
 python/tvm/relay/frontend/pytorch.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py
index f616c3bf7037..613643f091d7 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -2345,7 +2345,7 @@ def bidir_gru_cell(
 
     def gru_layers(self, input_data, layer_weights_dicts, bidirectional, dropout_p=0.0):
         """
-        Methods iterates layers for Stacked LSTM
+        Method iterates over the layers of a stacked GRU
         """
         layers_num = len(layer_weights_dicts)
         # split input sequence to samples set
@@ -2368,7 +2368,7 @@ def gru_layers(self, input_data, layer_weights_dicts, bidirectional, dropout_p=0
             if dropout_p != 0 and i < layers_num - 1:
                 # for input in input_seqs:
                 #     input = _op.dropout(input, dropout_p)
-                raise NotImplementedError("Dropout for LSTM has not been supported yet!")
+                raise NotImplementedError("Dropout for GRU has not been supported yet!")
 
         return _op.stack(input_seqs, 0), _op.stack(output_hiddens, 0)
 
@@ -2447,7 +2447,7 @@ def gru(self, inputs, input_types):
             names = ["hidden_state", "w_inp", "w_hid", "b_inp", "b_hid"]
             if bidirectional:
                 rsd = len(_weights) % (2 * weights_num)
-                assert rsd == 0, "got an incorrect number of LSTM weights"
+                assert rsd == 0, "got an incorrect number of GRU weights"
                 for i in range(0, len(_weights), 2 * weights_num):
                     fw_tensors = [layers_h[2 * k], *_weights[i : i + 4]]
                     fw_weights_dict = dict(zip(names, fw_tensors))
@@ -2457,7 +2457,7 @@ def gru(self, inputs, input_types):
                     layer_weights_dicts.append([fw_weights_dict, rev_weights_dict])
                     k += 1
             else:
-                assert len(_weights) % weights_num == 0, "got an incorrect number of LSTM weights"
+                assert len(_weights) % weights_num == 0, "got an incorrect number of GRU weights"
                 for i in range(0, len(_weights), weights_num):
                     fw_tensors = [layers_h[k], *_weights[i : i + 4]]
                     fw_weights_dict = dict(zip(names, fw_tensors))
@@ -2467,7 +2467,7 @@ def gru(self, inputs, input_types):
             names = ["hidden_state", "w_inp", "w_hid"]
             if bidirectional:
                 rsd = len(_weights) % (2 * weights_num)
-                assert rsd == 0, "got an incorrect number of LSTM weights"
+                assert rsd == 0, "got an incorrect number of GRU weights"
                 for i in range(0, len(_weights), 2 * weights_num):
                     fw_tensors = [layers_h[2 * k], *_weights[i : i + 2]]
                     fw_weights_dict = dict(zip(names, fw_tensors))
@@ -2477,7 +2477,7 @@ def gru(self, inputs, input_types):
                     layer_weights_dicts.append([fw_weights_dict, rev_weights_dict])
                     k += 1
             else:
-                assert len(_weights) % weights_num == 0, "got an incorrect number of LSTM weights"
+                assert len(_weights) % weights_num == 0, "got an incorrect number of GRU weights"
                 for i in range(0, len(_weights), weights_num):
                     fw_tensors = [layers_h[k], *_weights[i : i + 2]]
                     fw_weights_dict = dict(zip(names, fw_tensors))

From 0e19a6f817d40884315b99b18f1a5b3baf7e087d Mon Sep 17 00:00:00 2001
From: Valery Chernov <valery.chernov@deelvin.com>
Date: Mon, 23 Aug 2021 10:03:03 +0300
Subject: [PATCH 8/9] fixes after review. GRU test was implemented for pytorch
 frontend

---
 python/tvm/relay/frontend/common.py           |   8 +-
 .../pytorch/{test_lstms.py => test_rnns.py}   | 305 +++++++++++++++---
 2 files changed, 255 insertions(+), 58 deletions(-)
 rename tests/python/frontend/pytorch/{test_lstms.py => test_rnns.py} (53%)

diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py
index e0dce1e212c2..ce048105ae8b 100755
--- a/python/tvm/relay/frontend/common.py
+++ b/python/tvm/relay/frontend/common.py
@@ -672,7 +672,7 @@ def gru_cell(
 ):
     """
     Common implementation of GRU cell for all frontends of TVM
-    TODO(vvchernov): currently it is used by pytorch. Extend for other frontends
+    TODO(vvchernov): currently it is used by pytorch and ONNX. Extend for other frontends
 
     Parameters
     ----------
@@ -709,8 +709,7 @@ def gru_cell(
         xwt = _op.nn.dense(x_t, w_inp)
         if linear_before_reset:
             hwt = _op.nn.dense(hidden_state, w_hid)
-            # TODO(vvchernov): It is assumed that both bias are or not
-            if b_inp is not None:
+            if b_inp is not None and b_hid is not None:
                 xwt += b_inp
                 hwt += b_hid
             i_r, i_z, i_n = _op.split(xwt, 3, axis=-1)
@@ -723,8 +722,7 @@ def gru_cell(
             w_hr, w_hz, w_hn = _op.split(w_hid, 3, axis=0)
             r_gate = i_r + _op.nn.dense(hidden_state, w_hr)
             z_gate = i_z + _op.nn.dense(hidden_state, w_hz)
-            # TODO(vvchernov): It is assumed that both bias are or not
-            if b_inp is not None:
+            if b_inp is not None and b_hid is not None:
                 b_ir, b_iz, b_in = _op.split(b_inp, 3, axis=-1)
                 b_hr, b_hz, b_hn = _op.split(b_hid, 3, axis=-1)
                 r_gate += b_ir + b_hr
diff --git a/tests/python/frontend/pytorch/test_lstms.py b/tests/python/frontend/pytorch/test_rnns.py
similarity index 53%
rename from tests/python/frontend/pytorch/test_lstms.py
rename to tests/python/frontend/pytorch/test_rnns.py
index 967245e1ef9d..18f8e984ac38 100644
--- a/tests/python/frontend/pytorch/test_lstms.py
+++ b/tests/python/frontend/pytorch/test_rnns.py
@@ -22,22 +22,101 @@
 import onnx
 import io
 import sys
-import pytest
 
 from tvm import relay
 from tvm.contrib import graph_executor
 
 from torch import nn
 
-## Model parameters
-model_feature_size = 16
-model_hidden_size = 32
-model_num_layers = 2
-seqs_length = 2
+## LSTM parameters
+lstm_feature_size = 16
+lstm_hidden_size = 32
+lstm_num_layers = 2
 projection_size = 20
+
+## GRU parameters
+gru_feature_size = 8
+gru_hidden_size = 16
+gru_num_layers = 2
+
+seqs_length = 2
 batch_size = 2
 
 
+class GRU_Model(nn.Module):
+    def __init__(
+        self,
+        device,
+        seq_len=seqs_length,
+        batch_size=batch_size,
+        feature_size=gru_feature_size,
+        hidden_size=gru_hidden_size,
+        batch_first=False,
+        layer_num=1,
+        bidirectional=False,
+        use_bias=True,
+        rnd_weights_init=False,
+    ):
+        super().__init__()
+
+        self.batch_first = batch_first
+        self.seqs_length = seq_len
+        self.batch_size = batch_size
+        self.feature_size = feature_size
+
+        self.gru = nn.GRU(
+            input_size=self.feature_size,
+            hidden_size=hidden_size,
+            num_layers=layer_num,
+            bidirectional=bidirectional,
+            batch_first=batch_first,
+            bias=use_bias,
+        ).to(device)
+
+        if rnd_weights_init:
+            self.gen_rnd_weights()
+
+    def forward(self, input, hidden_init=None):
+        """
+        Computes the output tensor after input inference along GRU layer.
+
+        :param input: batch of data as a tensor of shape (seqs_length, batch_size, feature_size) or (batch_size, seqs_length, feature_size) if self.batch_first = True
+        :param hidden_init: initial hidden state of the GRU as a tensor of shape (num_layers, batch_size, hidden_size). Will default to a tensor of zeros if None.
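+        :return: the output tensor of shape (seqs_length, batch_size, num_directions * hidden_size) or (batch_size, seqs_length, num_directions * hidden_size) if self.batch_first = True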
+        """
+        out, hidden = self.gru(input, hidden_init)
+
+        return out
+
+    def gen_rnd_weights(self):
+        """
+        Generate random weights for the model with biases
+        For first uni- and bidirectional weights group:
+            Wi (3*hidden_size, feature_size)
+            Wh (3*hidden_size, hidden_size)
+            Bi (3*hidden_size)
+            Bh (3*hidden_size)
+        For other weights group:
+            Wi (3*hidden_size, hidden_size)
+            Wh (3*hidden_size, hidden_size)
+            Bi (3*hidden_size)
+            Bh (3*hidden_size)
+        For generation of random weights for the model without biases the Bi and Bh weights are skipped
+        """
+        with torch.no_grad():
+            for weight_group in self.gru.all_weights:
+                for weight in weight_group:
+                    weight.data = torch.rand(weight.shape)
+
+    def get_dummy_input(self):
+        shape = [self.seqs_length, self.batch_size, self.feature_size]
+        if self.batch_first:
+            shape = [self.batch_size, self.seqs_length, self.feature_size]
+        res = torch.rand(shape)
+
+        return res, shape
+
+
 def check_torch_version_for_proj_in_lstm():
     """
     proj_size parameter is supported in torch.nn.LSTM layer started from 1.8.0 torch version
@@ -75,8 +154,8 @@ def __init__(
 
         if check_torch_version_for_proj_in_lstm():
             self.lstm = nn.LSTM(
-                input_size=model_feature_size,
-                hidden_size=model_hidden_size,
+                input_size=lstm_feature_size,
+                hidden_size=lstm_hidden_size,
                 num_layers=layer_num,
                 bidirectional=bidirectional,
                 proj_size=proj_size,
@@ -91,8 +170,8 @@ def __init__(
                 )
                 # sys.exit()
             self.lstm = nn.LSTM(
-                input_size=model_feature_size,
-                hidden_size=model_hidden_size,
+                input_size=lstm_feature_size,
+                hidden_size=lstm_hidden_size,
                 num_layers=layer_num,
                 bidirectional=bidirectional,
                 batch_first=batch_first,
@@ -106,9 +185,9 @@ def forward(self, input, hidden_init=None):
         """
         Computes the output tensor after input inference along LSTM layer.
 
-        :param input: batch of data as a tensor of shape (seqs_length, batch_size, model_feature_size) or (batch_size, seqs_length, model_feature_size) if self.batch_first = True
+        :param input: batch of data as a tensor of shape (seqs_length, batch_size, lstm_feature_size) or (batch_size, seqs_length, lstm_feature_size) if self.batch_first = True
         :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers, batch_size, hidden_size). Will default to a tensor of zeros if None.
-        :return: the output tensor of shape (batch_size, model_hidden_size)
+        :return: the output tensor of shape (batch_size, lstm_hidden_size)
         """
         # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
         # and the final cell state.
@@ -121,49 +200,50 @@ def gen_rnd_weights(self):
         Generate random weigths for the model with biases
         Without projection:
             For first weights group:
-                Wi (4*model_hidden_size, model_feature_size)
-                Wh (4*model_hidden_size, model_hidden_size)
-                Bi (4*model_hidden_size)
-                Bh (4*model_hidden_size)
+                Wi (4*lstm_hidden_size, lstm_feature_size)
+                Wh (4*lstm_hidden_size, lstm_hidden_size)
+                Bi (4*lstm_hidden_size)
+                Bh (4*lstm_hidden_size)
             For first bidirectional weights group:
-                Wi (4*model_hidden_size, model_feature_size)
-                Wh (4*model_hidden_size, model_hidden_size)
-                Bi (4*model_hidden_size)
-                Bh (4*model_hidden_size)
+                Wi (4*lstm_hidden_size, lstm_feature_size)
+                Wh (4*lstm_hidden_size, lstm_hidden_size)
+                Bi (4*lstm_hidden_size)
+                Bh (4*lstm_hidden_size)
             For other weights group:
-                Wi (4*model_hidden_size, model_hidden_size)
-                Wh (4*model_hidden_size, model_hidden_size)
-                Bi (4*model_hidden_size)
-                Bh (4*model_hidden_size)
+                Wi (4*lstm_hidden_size, lstm_hidden_size)
+                Wh (4*lstm_hidden_size, lstm_hidden_size)
+                Bi (4*lstm_hidden_size)
+                Bh (4*lstm_hidden_size)
         With projection:
             For first weights group:
-                Wi (4*model_hidden_size, model_feature_size)
-                Wh (4*model_hidden_size, proj_size)
-                Bi (4*model_hidden_size)
-                Bh (4*model_hidden_size)
-                P  (proj_size, model_hidden_size)
+                Wi (4*lstm_hidden_size, lstm_feature_size)
+                Wh (4*lstm_hidden_size, proj_size)
+                Bi (4*lstm_hidden_size)
+                Bh (4*lstm_hidden_size)
+                P  (proj_size, lstm_hidden_size)
             For first bidirectional weights group:
-                Wi (4*model_hidden_size, model_feature_size)
-                Wh (4*model_hidden_size, proj_size)
-                Bi (4*model_hidden_size)
-                Bh (4*model_hidden_size)
-                P  (proj_size, model_hidden_size)
+                Wi (4*lstm_hidden_size, lstm_feature_size)
+                Wh (4*lstm_hidden_size, proj_size)
+                Bi (4*lstm_hidden_size)
+                Bh (4*lstm_hidden_size)
+                P  (proj_size, lstm_hidden_size)
             For other weights group:
-                Wi (4*model_hidden_size, proj_size * num_directions)
-                Wh (4*model_hidden_size, proj_size)
-                Bi (4*model_hidden_size)
-                Bh (4*model_hidden_size)
-                P  (proj_size, model_hidden_size)
+                Wi (4*lstm_hidden_size, proj_size * num_directions)
+                Wh (4*lstm_hidden_size, proj_size)
+                Bi (4*lstm_hidden_size)
+                Bh (4*lstm_hidden_size)
+                P  (proj_size, lstm_hidden_size)
         For generation of random weigths for the model without biases Bi and Bh are skipped
         """
-        for weight_group in self.lstm.all_weights:
-            for weight in weight_group:
-                weight.data = torch.rand(weight.shape)
+        with torch.no_grad():
+            for weight_group in self.lstm.all_weights:
+                for weight in weight_group:
+                    weight.data = torch.rand(weight.shape)
 
     def get_dummy_input(self):
-        shape = [seqs_length, batch_size, model_feature_size]
+        shape = [seqs_length, batch_size, lstm_feature_size]
         if self.batch_first:
-            shape = [batch_size, seqs_length, model_feature_size]
+            shape = [batch_size, seqs_length, lstm_feature_size]
         res = torch.rand(shape)
 
         return res, shape
@@ -173,6 +253,115 @@ def compare(input, gold_data, rtol=1e-5, atol=1e-5):
     tvm.testing.assert_allclose(input, gold_data, rtol=rtol, atol=atol)
 
 
+def check_gru_with_type(gru_type, target=tvm.target.Target("llvm -mcpu=core-avx2"), dev=tvm.cpu(0)):
+    device = torch.device("cpu")
+    hidden_layers_num = 1
+    model = None
+    for batch_first in (True, False):
+        for use_bias in (True, False):
+            for rnd_weights in [True]:  # (True, False):
+                if gru_type == "uni":
+                    model = GRU_Model(
+                        device,
+                        batch_first=batch_first,
+                        rnd_weights_init=rnd_weights,
+                        use_bias=use_bias,
+                    )
+                elif gru_type == "b":
+                    model = GRU_Model(
+                        device,
+                        batch_first=batch_first,
+                        bidirectional=True,
+                        rnd_weights_init=rnd_weights,
+                        use_bias=use_bias,
+                    )
+                    hidden_layers_num = 2
+                elif gru_type == "s":
+                    model = GRU_Model(
+                        device,
+                        batch_first=batch_first,
+                        layer_num=gru_num_layers,
+                        rnd_weights_init=rnd_weights,
+                        use_bias=use_bias,
+                    )
+                    hidden_layers_num = gru_num_layers
+                elif gru_type == "sb":
+                    model = GRU_Model(
+                        device,
+                        batch_first=batch_first,
+                        bidirectional=True,
+                        layer_num=gru_num_layers,
+                        rnd_weights_init=rnd_weights,
+                        use_bias=use_bias,
+                    )
+                    hidden_layers_num = 2 * gru_num_layers
+                else:
+                    print("WARNING: GRU type {} is not supported here!".format(gru_type))
+                    return
+
+                model.eval()
+
+                # Get golden output from original model
+                input_hidden_shape = (hidden_layers_num, batch_size, gru_hidden_size)
+                dummy_input, input_shape = model.get_dummy_input()
+                golden_output_batch = model.forward(dummy_input.to(device)).detach().cpu().numpy()
+
+                dtype = "float32"
+                h_zeros = np.zeros(input_hidden_shape, dtype=dtype)
+
+                tvm_output = None
+                for format in ["ts"]:  # ["ts", "onnx"]:
+                    if format == "ts":
+                        # Use torch.jit.trace to generate a torch.jit.ScriptModule via tracing.
+                        traced_script_module = torch.jit.trace(model, dummy_input).eval()
+
+                        # Import model to Relay
+                        shape_list = [("input", input_shape)]
+                        mod, params = relay.frontend.from_pytorch(traced_script_module, shape_list)
+
+                        # Model compilation by tvm
+                        with tvm.transform.PassContext(opt_level=3):
+                            lib = relay.build(mod, target=target, params=params)
+                    elif format == "onnx":
+                        onnx_io = io.BytesIO()
+                        with torch.no_grad():
+                            h0 = torch.rand(input_hidden_shape)
+                            input_names = ["input", "h0"]
+
+                            # default export (without dynamic input)
+                            torch.onnx.export(
+                                model, (dummy_input, h0), onnx_io, input_names=input_names
+                            )
+                        onnx_io.seek(0, 0)
+                        onnx_model = onnx.load_model(onnx_io)
+
+                        # Import model to Relay
+                        shape_dict = {
+                            "input": input_shape,
+                            "h0": input_hidden_shape,
+                        }
+                        mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)
+
+                        # Model compilation by tvm
+                        with tvm.transform.PassContext(opt_level=1):
+                            lib = relay.build(mod, target=target, params=params)
+
+                    # Inference of the model with given input data
+                    m = graph_executor.GraphModule(lib["default"](dev))
+
+                    # Set inputs
+                    m.set_input(
+                        input=tvm.nd.array(dummy_input.numpy().astype(dtype)),
+                        h0=tvm.nd.array(h_zeros),
+                    )
+                    # Execute
+                    m.run()
+                    # Get outputs (converted to numpy array)
+                    tvm_output = m.get_output(0).numpy()
+
+                    compare(tvm_output, golden_output_batch)
+
+
 def check_lstm_with_type(
     lstm_type, target=tvm.target.Target("llvm -mcpu=core-avx2"), dev=tvm.cpu(0)
 ):
@@ -212,31 +401,31 @@ def check_lstm_with_type(
                     model = LSTM_Model(
                         device,
                         batch_first=batch_first,
-                        layer_num=model_num_layers,
+                        layer_num=lstm_num_layers,
                         rnd_weights_init=rnd_weights,
                         use_bias=use_bias,
                     )
-                    hidden_layers_num = model_num_layers
+                    hidden_layers_num = lstm_num_layers
                 elif lstm_type == "sb":
                     model = LSTM_Model(
                         device,
                         batch_first=batch_first,
                         bidirectional=True,
-                        layer_num=model_num_layers,
+                        layer_num=lstm_num_layers,
                         rnd_weights_init=rnd_weights,
                         use_bias=use_bias,
                     )
-                    hidden_layers_num = 2 * model_num_layers
+                    hidden_layers_num = 2 * lstm_num_layers
                 elif lstm_type == "sp":
                     model = LSTM_Model(
                         device,
                         batch_first=batch_first,
-                        layer_num=model_num_layers,
+                        layer_num=lstm_num_layers,
                         proj_size=projection_size,
                         rnd_weights_init=rnd_weights,
                         use_bias=use_bias,
                     )
-                    hidden_layers_num = model_num_layers
+                    hidden_layers_num = lstm_num_layers
                 elif lstm_type == "bp":
                     model = LSTM_Model(
                         device,
@@ -252,12 +441,12 @@ def check_lstm_with_type(
                         device,
                         batch_first=batch_first,
                         bidirectional=True,
-                        layer_num=model_num_layers,
+                        layer_num=lstm_num_layers,
                         proj_size=projection_size,
                         rnd_weights_init=rnd_weights,
                         use_bias=use_bias,
                     )
-                    hidden_layers_num = 2 * model_num_layers
+                    hidden_layers_num = 2 * lstm_num_layers
                 else:
                     print("WARNING: LSTM type {} is not supported here!".format(lstm_type))
                     return
@@ -265,7 +454,7 @@ def check_lstm_with_type(
                 model.eval()
 
                 # Get golden output from original model
-                input_hidden_shape = (hidden_layers_num, batch_size, model_hidden_size)
+                input_hidden_shape = (hidden_layers_num, batch_size, lstm_hidden_size)
                 input_hidden_shape_with_proj = (hidden_layers_num, batch_size, projection_size)
                 dummy_input, input_shape = model.get_dummy_input()
                 golden_output_batch = model.forward(dummy_input.to(device)).detach().cpu().numpy()
@@ -346,6 +535,15 @@ def check_lstm_with_type(
                     compare(tvm_output, golden_output_batch)
 
 
+@tvm.testing.uses_gpu
+def test_grus():
+    for target, dev in tvm.testing.enabled_targets():
+        check_gru_with_type("uni", target, dev)
+        check_gru_with_type("s", target, dev)
+        check_gru_with_type("b", target, dev)
+        check_gru_with_type("sb", target, dev)
+
+
 @tvm.testing.uses_gpu
 def test_lstms():
     for target, dev in tvm.testing.enabled_targets():
@@ -361,3 +559,4 @@ def test_lstms():
 
 if __name__ == "__main__":
     test_lstms()
+    test_grus()

From c722e27739d77058915a18e42764852fd7a4e1fa Mon Sep 17 00:00:00 2001
From: Valery Chernov <valery.chernov@deelvin.com>
Date: Tue, 24 Aug 2021 12:36:55 +0300
Subject: [PATCH 9/9] tests for RNN layers were unified for pytorch frontend

---
 tests/python/frontend/pytorch/test_rnns.py | 544 ++++++++-------------
 1 file changed, 206 insertions(+), 338 deletions(-)

diff --git a/tests/python/frontend/pytorch/test_rnns.py b/tests/python/frontend/pytorch/test_rnns.py
index 18f8e984ac38..b5784a6fe1e1 100644
--- a/tests/python/frontend/pytorch/test_rnns.py
+++ b/tests/python/frontend/pytorch/test_rnns.py
@@ -17,7 +17,6 @@
 
 import tvm
 import tvm.testing
-import numpy as np
 import torch
 import onnx
 import io
@@ -31,22 +30,72 @@
 ## LSTM parameters
 lstm_feature_size = 16
 lstm_hidden_size = 32
-lstm_num_layers = 2
-projection_size = 20
+lstm_projection_size = 20
 
 ## GRU parameters
 gru_feature_size = 8
 gru_hidden_size = 16
-gru_num_layers = 2
 
+num_layers = 2
 seqs_length = 2
 batch_size = 2
 
 
-class GRU_Model(nn.Module):
+class RNN_Model(nn.Module):
+    """
+    Base class for the RNN layer model classes.
+    It holds the fields and methods shared by the child classes.
+    """
+
+    def __init__(
+        self,
+    ):
+        super().__init__()
+
+        # model is defined in child class
+        self.model = None
+
+    def forward(self, input, hidden_init=None):
+        """
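+        Computes the output tensor by running the input through the RNN layer.
+
+        :param input: batch of data as a tensor of shape (seqs_length, batch_size, feature_size) or (batch_size, seqs_length, feature_size) if the model was built with batch_first = True
+        :param hidden_init: initial hidden state(s) of the RNN as tensor(s) of shape (num_directions * num_layers, batch_size, hidden_size). Will default to a tensor of zeros if None.
+        :return: the output tensor of shape (seqs_length, batch_size, num_directions * hidden_size) or (batch_size, seqs_length, num_directions * hidden_size) if the model was built with batch_first = True (proj_size replaces hidden_size for an LSTM with projection)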
+        """
+        if self.model is None:
+            raise NotImplementedError("self.model must be defined in subclasses!")
+        out, _ = self.model(input, hidden_init)
+
+        return out
+
+    def gen_rnd_weights(self):
+        """
+        Generate random weights for the model
+        """
+        if self.model is None:
+            raise NotImplementedError("self.model must be defined in subclasses!")
+        with torch.no_grad():
+            for weight_group in self.model.all_weights:
+                for weight in weight_group:
+                    weight.data = torch.rand(weight.shape)
+
+    def get_dummy_inputs(self):
+        raise NotImplementedError("subclasses must override get_dummy_inputs()!")
+
+    def get_input_names(self):
+        raise NotImplementedError("subclasses must override get_input_names()!")
+
+    def get_shape_desc(self, frontend_type):
+        raise NotImplementedError("subclasses must override get_shape_desc(frontend_type)!")
+
+    def get_tvm_inputs(self, dtype):
+        raise NotImplementedError("subclasses must override get_tvm_inputs(dtype)!")
+
+
+class GRU_Model(RNN_Model):
     def __init__(
         self,
-        device,
         seq_len=seqs_length,
         batch_size=batch_size,
         feature_size=gru_feature_size,
@@ -59,35 +108,27 @@ def __init__(
     ):
         super().__init__()
 
-        self.batch_first = batch_first
-        self.seqs_length = seq_len
-        self.batch_size = batch_size
-        self.feature_size = feature_size
-
-        self.gru = nn.GRU(
-            input_size=self.feature_size,
+        # Shapes
+        self.shape = [seq_len, batch_size, feature_size]
+        if batch_first:
+            self.shape = [batch_size, seq_len, feature_size]
+        layers_num = 2 * layer_num if bidirectional else layer_num
+        self.h0_shape = [layers_num, batch_size, hidden_size]
+        # Dummy inputs
+        self.dummy_inputs = (torch.rand(self.shape), torch.zeros(self.h0_shape))
+
+        self.model = nn.GRU(
+            input_size=feature_size,
             hidden_size=hidden_size,
             num_layers=layer_num,
             bidirectional=bidirectional,
             batch_first=batch_first,
             bias=use_bias,
-        ).to(device)
+        )
 
         if rnd_weights_init:
             self.gen_rnd_weights()
 
-    def forward(self, input, hidden_init=None):
-        """
-        Computes the output tensor after input inference along GRU layer.
-
-        :param input: batch of data as a tensor of shape (seqs_length, batch_size, feature_size) or (batch_size, seqs_length, feature_size) if self.batch_first = True
-        :param hidden_init: initial hidden state of the GRU as a tensor of shape (num_layers, batch_size, hidden_size). Will default to a tensor of zeros if None.
-        :return: the output tensor of shape (batch_size, hidden_size)
-        """
-        out, hidden = self.gru(input, hidden_init)
-
-        return out
-
     def gen_rnd_weights(self):
         """
         Generate random weigths for the model with biases
@@ -103,18 +144,30 @@ def gen_rnd_weights(self):
             Bh (3*hidden_size)
         For generation of random weigths for the model without biases the Bi and Bh weights are skipped
         """
-        with torch.no_grad():
-            for weight_group in self.gru.all_weights:
-                for weight in weight_group:
-                    weight.data = torch.rand(weight.shape)
+        super().gen_rnd_weights()
 
-    def get_dummy_input(self):
-        shape = [self.seqs_length, self.batch_size, self.feature_size]
-        if self.batch_first:
-            shape = [self.batch_size, self.seqs_length, self.feature_size]
-        res = torch.rand(shape)
+    def get_dummy_inputs(self):
+        return self.dummy_inputs
 
-        return res, shape
+    def get_input_names(self):
+        return ["input", "h0"]
+
+    def get_shape_desc(self, frontend_type):
+        shape_desc = None
+        if frontend_type == "pt":  # PyTorch
+            shape_desc = [("input", self.shape)]
+        elif frontend_type == "onnx":  # ONNX
+            shape_desc = {
+                "input": self.shape,
+                "h0": self.h0_shape,
+            }
+        return shape_desc
+
+    def get_tvm_inputs(self, dtype):
+        return {
+            "input": tvm.nd.array(self.dummy_inputs[0].numpy().astype(dtype)),
+            "h0": tvm.nd.array(self.dummy_inputs[1].numpy().astype(dtype)),
+        }
 
 
 def check_torch_version_for_proj_in_lstm():
@@ -135,10 +188,13 @@ def check_torch_version_for_proj_in_lstm():
     return me
 
 
-class LSTM_Model(nn.Module):
+class LSTM_Model(RNN_Model):
     def __init__(
         self,
-        device,
+        seq_len=seqs_length,
+        batch_size=batch_size,
+        feature_size=lstm_feature_size,
+        hidden_size=lstm_hidden_size,
         batch_first=False,
         layer_num=1,
         bidirectional=False,
@@ -148,12 +204,23 @@ def __init__(
     ):
         super().__init__()
 
-        self.device = device
-        self.batch_first = batch_first
-        self.use_bias = use_bias
+        # Shapes
+        self.shape = [seq_len, batch_size, feature_size]
+        if batch_first:
+            self.shape = [batch_size, seq_len, feature_size]
+        layers_num = 2 * layer_num if bidirectional else layer_num
+        self.h0_shape = [layers_num, batch_size, hidden_size]
+        if proj_size > 0:
+            self.h0_shape = [layers_num, batch_size, proj_size]
+        self.c0_shape = [layers_num, batch_size, hidden_size]
+        # Dummy inputs
+        self.dummy_inputs = (
+            torch.rand(self.shape),
+            (torch.zeros(self.h0_shape), torch.zeros(self.c0_shape)),
+        )
 
         if check_torch_version_for_proj_in_lstm():
-            self.lstm = nn.LSTM(
+            self.model = nn.LSTM(
                 input_size=lstm_feature_size,
                 hidden_size=lstm_hidden_size,
                 num_layers=layer_num,
@@ -161,7 +228,7 @@ def __init__(
                 proj_size=proj_size,
                 batch_first=batch_first,
                 bias=use_bias,
-            ).to(device)
+            )
         else:
             if proj_size > 0:
                 print(
@@ -169,32 +236,18 @@ def __init__(
                     "LSTM was constructed without projection!",
                 )
                 # sys.exit()
-            self.lstm = nn.LSTM(
+            self.model = nn.LSTM(
                 input_size=lstm_feature_size,
                 hidden_size=lstm_hidden_size,
                 num_layers=layer_num,
                 bidirectional=bidirectional,
                 batch_first=batch_first,
                 bias=use_bias,
-            ).to(device)
+            )
 
         if rnd_weights_init:
             self.gen_rnd_weights()
 
-    def forward(self, input, hidden_init=None):
-        """
-        Computes the output tensor after input inference along LSTM layer.
-
-        :param input: batch of data as a tensor of shape (seqs_length, batch_size, lstm_feature_size) or (batch_size, seqs_length, lstm_feature_size) if self.batch_first = True
-        :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers, batch_size, hidden_size). Will default to a tensor of zeros if None.
-        :return: the output tensor of shape (batch_size, lstm_hidden_size)
-        """
-        # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
-        # and the final cell state.
-        out, (hidden, cell) = self.lstm(input, hidden_init)
-
-        return out
-
     def gen_rnd_weights(self):
         """
         Generate random weigths for the model with biases
@@ -235,328 +288,143 @@ def gen_rnd_weights(self):
                 P  (proj_size, lstm_hidden_size)
         For generation of random weigths for the model without biases Bi and Bh are skipped
         """
-        with torch.no_grad():
-            for weight_group in self.lstm.all_weights:
-                for weight in weight_group:
-                    weight.data = torch.rand(weight.shape)
-
-    def get_dummy_input(self):
-        shape = [seqs_length, batch_size, lstm_feature_size]
-        if self.batch_first:
-            shape = [batch_size, seqs_length, lstm_feature_size]
-        res = torch.rand(shape)
-
-        return res, shape
+        super().gen_rnd_weights()
+
+    def get_dummy_inputs(self):
+        return self.dummy_inputs
+
+    def get_input_names(self):
+        return ["input", "h0", "c0"]
+
+    def get_shape_desc(self, frontend_type):
+        shape_desc = None
+        if frontend_type == "pt":  # PyTorch
+            shape_desc = [("input", self.shape)]
+        elif frontend_type == "onnx":  # ONNX
+            shape_desc = {
+                "input": self.shape,
+                "h0": self.h0_shape,
+                "c0": self.c0_shape,
+            }
+        return shape_desc
+
+    def get_tvm_inputs(self, dtype):
+        return {
+            "input": tvm.nd.array(self.dummy_inputs[0].numpy().astype(dtype)),
+            "h0": tvm.nd.array(self.dummy_inputs[1][0].numpy().astype(dtype)),
+            "c0": tvm.nd.array(self.dummy_inputs[1][1].numpy().astype(dtype)),
+        }
 
 
 def compare(input, gold_data, rtol=1e-5, atol=1e-5):
     tvm.testing.assert_allclose(input, gold_data, rtol=rtol, atol=atol)
 
 
-def check_gru_with_type(gru_type, target=tvm.target.Target("llvm -mcpu=core-avx2"), dev=tvm.cpu(0)):
-    device = torch.device("cpu")
-    hidden_layers_num = 1
-    model = None
-    for batch_first in (True, False):
-        for use_bias in (True, False):
-            for rnd_weights in [True]:  # (True, False):
-                if gru_type == "uni":
-                    model = GRU_Model(
-                        device,
-                        batch_first=batch_first,
-                        rnd_weights_init=rnd_weights,
-                        use_bias=use_bias,
-                    )
-                elif gru_type == "b":
-                    model = GRU_Model(
-                        device,
-                        batch_first=batch_first,
-                        bidirectional=True,
-                        rnd_weights_init=rnd_weights,
-                        use_bias=use_bias,
-                    )
-                    hidden_layers_num = 2
-                elif gru_type == "s":
-                    model = GRU_Model(
-                        device,
-                        batch_first=batch_first,
-                        layer_num=gru_num_layers,
-                        rnd_weights_init=rnd_weights,
-                        use_bias=use_bias,
-                    )
-                    hidden_layers_num = gru_num_layers
-                elif gru_type == "sb":
-                    model = GRU_Model(
-                        device,
-                        batch_first=batch_first,
-                        bidirectional=True,
-                        layer_num=gru_num_layers,
-                        rnd_weights_init=rnd_weights,
-                        use_bias=use_bias,
-                    )
-                    hidden_layers_num = 2 * gru_num_layers
-                else:
-                    print("WARNING: GRU type {} is not supported here!".format(gru_type))
-                    return
-
-                model.eval()
-
-                # Get golden output from original model
-                input_hidden_shape = (hidden_layers_num, batch_size, gru_hidden_size)
-                dummy_input, input_shape = model.get_dummy_input()
-                golden_output_batch = model.forward(dummy_input.to(device)).detach().cpu().numpy()
-
-                dtype = "float32"
-                h_zeros = np.zeros(input_hidden_shape, dtype=dtype)
-
-                tvm_output = None
-                for format in ["ts"]:  # ["ts", "onnx"]:
-                    if format == "ts":
-                        # Use torch.jit.trace to generate a torch.jit.ScriptModule via tracing.
-                        traced_script_module = torch.jit.trace(model, dummy_input).eval()
-
-                        # Import model to Relay
-                        shape_list = [("input", input_shape)]
-                        mod, params = relay.frontend.from_pytorch(traced_script_module, shape_list)
-
-                        # Model compilation by tvm
-                        with tvm.transform.PassContext(opt_level=3):
-                            lib = relay.build(mod, target=target, params=params)
-                    elif format == "onnx":
-                        onnx_io = io.BytesIO()
-                        with torch.no_grad():
-                            h0 = torch.rand(input_hidden_shape)
-                            input_names = ["input", "h0"]
-
-                            # default export (without dynamic input)
-                            torch.onnx.export(
-                                model, (dummy_input, h0), onnx_io, input_names=input_names
-                            )
-                        onnx_io.seek(0, 0)
-                        onnx_model = onnx.load_model(onnx_io)
-
-                        # Import model to Relay
-                        shape_dict = {
-                            "input": input_shape,
-                            "h0": input_hidden_shape,
-                        }
-                        mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)
-
-                        # Model compilation by tvm
-                        with tvm.transform.PassContext(opt_level=1):
-                            lib = relay.build(mod, target=target, params=params)
-
-                    # Inference of the model with given input data
-                    m = graph_executor.GraphModule(lib["default"](dev))
-
-                    # Set inputs
-                    m.set_input(
-                        input=tvm.nd.array(dummy_input.numpy().astype(dtype)),
-                        h0=tvm.nd.array(h_zeros),
-                    )
-                    # Execute
-                    m.run()
-                    # Get outputs (converted to numpy array)
-                    tvm_output = m.get_output(0).numpy()
-
-                    compare(tvm_output, golden_output_batch)
+def check_rnn(rnn_type, rnn_mod, target=tvm.target.Target("llvm -mcpu=core-avx2"), dev=tvm.cpu(0)):
+    def get_model(
+        rnn_type,
+        rnn_mod,
+        args,
+    ):
+        # Fill args
+        if "b" in rnn_mod:
+            args["bidirectional"] = True
+        if "s" in rnn_mod:
+            args["layer_num"] = num_layers
+
+        if rnn_type == "GRU":
+            RNN_Model_selector = GRU_Model
+        elif rnn_type == "LSTM":
+            RNN_Model_selector = LSTM_Model
+            if "p" in rnn_mod:
+                args["proj_size"] = lstm_projection_size
+
+        return RNN_Model_selector(**args)
+
+    def get_onnx_model(model):
+        onnx_io = io.BytesIO()
+        with torch.no_grad():
+            input_names = model.get_input_names()
+            inputs = model.get_dummy_inputs()
 
+            # default export (without dynamic input)
+            torch.onnx.export(model, inputs, onnx_io, input_names=input_names)
 
-def check_lstm_with_type(
-    lstm_type, target=tvm.target.Target("llvm -mcpu=core-avx2"), dev=tvm.cpu(0)
-):
-    has_proj = "p" in lstm_type
+        onnx_io.seek(0, 0)
+        return onnx.load_model(onnx_io)
 
-    device = torch.device("cpu")
-    hidden_layers_num = 1
     model = None
+    dtype = "float32"
+    device = torch.device("cpu")
     for batch_first in (True, False):
         for use_bias in (True, False):
             for rnd_weights in [True]:  # (True, False):
-                if lstm_type == "uni":
-                    model = LSTM_Model(
-                        device,
-                        batch_first=batch_first,
-                        rnd_weights_init=rnd_weights,
-                        use_bias=use_bias,
-                    )
-                elif lstm_type == "b":
-                    model = LSTM_Model(
-                        device,
-                        batch_first=batch_first,
-                        bidirectional=True,
-                        rnd_weights_init=rnd_weights,
-                        use_bias=use_bias,
-                    )
-                    hidden_layers_num = 2
-                elif lstm_type == "p":
-                    model = LSTM_Model(
-                        device,
-                        batch_first=batch_first,
-                        proj_size=projection_size,
-                        rnd_weights_init=rnd_weights,
-                        use_bias=use_bias,
-                    )
-                elif lstm_type == "s":
-                    model = LSTM_Model(
-                        device,
-                        batch_first=batch_first,
-                        layer_num=lstm_num_layers,
-                        rnd_weights_init=rnd_weights,
-                        use_bias=use_bias,
-                    )
-                    hidden_layers_num = lstm_num_layers
-                elif lstm_type == "sb":
-                    model = LSTM_Model(
-                        device,
-                        batch_first=batch_first,
-                        bidirectional=True,
-                        layer_num=lstm_num_layers,
-                        rnd_weights_init=rnd_weights,
-                        use_bias=use_bias,
-                    )
-                    hidden_layers_num = 2 * lstm_num_layers
-                elif lstm_type == "sp":
-                    model = LSTM_Model(
-                        device,
-                        batch_first=batch_first,
-                        layer_num=lstm_num_layers,
-                        proj_size=projection_size,
-                        rnd_weights_init=rnd_weights,
-                        use_bias=use_bias,
-                    )
-                    hidden_layers_num = lstm_num_layers
-                elif lstm_type == "bp":
-                    model = LSTM_Model(
-                        device,
-                        batch_first=batch_first,
-                        bidirectional=True,
-                        proj_size=projection_size,
-                        rnd_weights_init=rnd_weights,
-                        use_bias=use_bias,
-                    )
-                    hidden_layers_num = 2
-                elif lstm_type == "sbp":
-                    model = LSTM_Model(
-                        device,
-                        batch_first=batch_first,
-                        bidirectional=True,
-                        layer_num=lstm_num_layers,
-                        proj_size=projection_size,
-                        rnd_weights_init=rnd_weights,
-                        use_bias=use_bias,
-                    )
-                    hidden_layers_num = 2 * lstm_num_layers
-                else:
-                    print("WARNING: LSTM type {} is not supported here!".format(lstm_type))
-                    return
-
+                model_inputs = {
+                    "batch_first": batch_first,
+                    "use_bias": use_bias,
+                    "rnd_weights_init": rnd_weights,
+                }
+                model = get_model(rnn_type, rnn_mod, model_inputs)
+                model.to(device)
                 model.eval()
 
                 # Get golden output from original model
-                input_hidden_shape = (hidden_layers_num, batch_size, lstm_hidden_size)
-                input_hidden_shape_with_proj = (hidden_layers_num, batch_size, projection_size)
-                dummy_input, input_shape = model.get_dummy_input()
-                golden_output_batch = model.forward(dummy_input.to(device)).detach().cpu().numpy()
-
-                dtype = "float32"
-                h_zeros = np.zeros(input_hidden_shape, dtype=dtype)
-                if has_proj:
-                    h_zeros = np.zeros(input_hidden_shape_with_proj, dtype=dtype)
-                c_zeros = np.zeros(input_hidden_shape, dtype=dtype)
+                dummy_inputs = model.get_dummy_inputs()
+                golden_output = model.forward(dummy_inputs[0].to(device)).detach().cpu().numpy()
 
                 tvm_output = None
-                for format in ["ts"]:  # ["ts", "onnx"]:
-                    if format == "ts":
+                for format in ["pt"]:  # ["pt", "onnx"]:
+                    shape_desc = model.get_shape_desc(format)
+                    if format == "pt":
                         # Use torch.jit.trace to generate a torch.jit.ScriptModule via tracing.
-                        traced_script_module = torch.jit.trace(model, dummy_input).eval()
+                        traced_script_module = torch.jit.trace(model, dummy_inputs[0]).eval()
 
                         # Import model to Relay
-                        shape_list = [("input", input_shape)]
-                        mod, params = relay.frontend.from_pytorch(traced_script_module, shape_list)
-
-                        # Model compilation by tvm
-                        with tvm.transform.PassContext(opt_level=3):
-                            lib = relay.build(mod, target=target, params=params)
+                        mod, params = relay.frontend.from_pytorch(traced_script_module, shape_desc)
                     elif format == "onnx":
-                        if has_proj:
+                        try:
+                            onnx_model = get_onnx_model(model)
+                        except:
                             print(
                                 "WARNING: torch.onnx.export does not support conversion LSTM with projection "
                                 "from pytorch! TODO: waiting for the support and correct test after that."
                             )
                             continue
-                        onnx_io = io.BytesIO()
-                        with torch.no_grad():
-                            h0 = torch.rand(input_hidden_shape)
-                            if has_proj:
-                                h0 = torch.rand(input_hidden_shape_with_proj)
-                            c0 = torch.rand(input_hidden_shape)
-                            input_names = ["input", "h0", "c0"]
-
-                            # default export (without dynamic input)
-                            torch.onnx.export(
-                                model, (dummy_input, (h0, c0)), onnx_io, input_names=input_names
-                            )
-                        onnx_io.seek(0, 0)
-                        onnx_model = onnx.load_model(onnx_io)
 
                         # Import model to Relay
-                        shape_dict = {
-                            "input": input_shape,
-                            "h0": input_hidden_shape,
-                            "c0": input_hidden_shape,
-                        }
-                        if has_proj:
-                            shape_dict = {
-                                "input": input_shape,
-                                "h0": input_hidden_shape_with_proj,
-                                "c0": input_hidden_shape,
-                            }
-                        mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)
-
-                        # Model compilation by tvm
-                        with tvm.transform.PassContext(opt_level=1):
-                            lib = relay.build(mod, target=target, params=params)
+                        mod, params = relay.frontend.from_onnx(onnx_model, shape_desc)
+
+                    # Model compilation by tvm
+                    with tvm.transform.PassContext(opt_level=3):
+                        lib = relay.build(mod, target=target, params=params)
 
                     # Inference of the model with given input data
                     m = graph_executor.GraphModule(lib["default"](dev))
 
                     # Set inputs
-                    m.set_input(
-                        input=tvm.nd.array(dummy_input.numpy().astype(dtype)),
-                        h0=tvm.nd.array(h_zeros),
-                        c0=tvm.nd.array(c_zeros),
-                    )
+                    tvm_inputs = model.get_tvm_inputs(dtype)
+                    m.set_input(**tvm_inputs)
                     # Execute
                     m.run()
                     # Get outputs (converted to numpy array)
                     tvm_output = m.get_output(0).numpy()
 
-                    compare(tvm_output, golden_output_batch)
-
-
-@tvm.testing.uses_gpu
-def test_grus():
-    for target, dev in tvm.testing.enabled_targets():
-        check_gru_with_type("uni", target, dev)
-        check_gru_with_type("s", target, dev)
-        check_gru_with_type("b", target, dev)
-        check_gru_with_type("sb", target, dev)
+                    compare(tvm_output, golden_output)
 
 
 @tvm.testing.uses_gpu
-def test_lstms():
+def test_rnns():
     for target, dev in tvm.testing.enabled_targets():
-        check_lstm_with_type("uni", target, dev)
-        # check_lstm_with_type("p", target, dev)
-        check_lstm_with_type("s", target, dev)
-        check_lstm_with_type("b", target, dev)
-        # check_lstm_with_type("bp", target, dev)
-        # check_lstm_with_type("sp", target, dev)
-        check_lstm_with_type("sb", target, dev)
-        # check_lstm_with_type("sbp", target, dev)
+        # RNN types: GRU, LSTM
+        # GRU modifications: unidirectional, stacked, bidirectional, stacked bidirectional
+        for mod_type in ["uni", "s", "b", "sb"]:
+            check_rnn("GRU", mod_type, target, dev)
+        # LSTM modifications: unidirectional, stacked, bidirectional, stacked bidirectional,
+        # and all these types with projection ("p", "sp", "bp", "sbp")
+        # The latter are skipped to speed up the tests
+        for mod_type in ["uni", "s", "b", "sb"]:
+            check_rnn("LSTM", mod_type, target, dev)
 
 
 if __name__ == "__main__":
-    test_lstms()
-    test_grus()
+    test_rnns()