diff --git a/README.md b/README.md index 357ff43fba..6f72f21c14 100644 --- a/README.md +++ b/README.md @@ -94,6 +94,7 @@ A full [document](doc/train/train-input-auto.rst) on options in the training inp - [Descriptor `"se_e2_a"`](doc/model/train-se-e2-a.md) - [Descriptor `"se_e2_r"`](doc/model/train-se-e2-r.md) - [Descriptor `"se_e3"`](doc/model/train-se-e3.md) + - [Descriptor `"se_atten"`](doc/model/train-se-atten.md) - [Descriptor `"hybrid"`](doc/model/train-hybrid.md) - [Descriptor `sel`](doc/model/sel.md) - [Fit energy](doc/model/train-energy.md) diff --git a/deepmd/common.py b/deepmd/common.py index 1146f291d5..6129011bbc 100644 --- a/deepmd/common.py +++ b/deepmd/common.py @@ -405,8 +405,8 @@ def j_loader(filename: Union[str, Path]) -> Dict[str, Any]: def get_activation_func( - activation_fn: "_ACTIVATION", -) -> Callable[[tf.Tensor], tf.Tensor]: + activation_fn: Union["_ACTIVATION", None], +) -> Union[Callable[[tf.Tensor], tf.Tensor], None]: """Get activation function callable based on string name. 
Parameters @@ -424,6 +424,8 @@ def get_activation_func( RuntimeError if unknown activation function is specified """ + if activation_fn is None or activation_fn in ['none', 'None']: + return None if activation_fn not in ACTIVATION_FN_DICT: raise RuntimeError(f"{activation_fn} is not a valid activation function") return ACTIVATION_FN_DICT[activation_fn] diff --git a/deepmd/descriptor/__init__.py b/deepmd/descriptor/__init__.py index dd022a4e08..e4df063a81 100644 --- a/deepmd/descriptor/__init__.py +++ b/deepmd/descriptor/__init__.py @@ -7,3 +7,4 @@ from .se_a_ef import DescrptSeAEf from .se_a_ef import DescrptSeAEfLower from .loc_frame import DescrptLocFrame +from .se_atten import DescrptSeAtten diff --git a/deepmd/descriptor/se_atten.py b/deepmd/descriptor/se_atten.py new file mode 100644 index 0000000000..ead4f93d46 --- /dev/null +++ b/deepmd/descriptor/se_atten.py @@ -0,0 +1,777 @@ +import math +import numpy as np +from typing import Tuple, List, Dict, Any +from packaging.version import Version + +from deepmd.env import tf +from deepmd.common import get_activation_func, get_precision, cast_precision +from deepmd.env import GLOBAL_TF_FLOAT_PRECISION +from deepmd.env import TF_VERSION +from deepmd.env import GLOBAL_NP_FLOAT_PRECISION +from deepmd.env import op_module +from deepmd.env import default_tf_session_config +from deepmd.utils.network import one_layer, embedding_net, embedding_net_rand_seed_shift +from deepmd.utils.tabulate import DPTabulate +from deepmd.utils.type_embed import embed_atom_type +from deepmd.utils.sess import run_sess +from deepmd.utils.graph import load_graph_def, get_tensor_by_name_from_graph, get_tensor_by_name +from deepmd.utils.errors import GraphWithoutTensorError +from .descriptor import Descriptor +from .se_a import DescrptSeA + + +@Descriptor.register("se_atten") +class DescrptSeAtten(DescrptSeA): + """ + Parameters + ---------- + rcut + The cut-off radius :math:`r_c` + rcut_smth + From where the environment matrix should be smoothed 
:math:`r_s` + sel : list[str] + sel[i] specifies the maxmum number of type i atoms in the cut-off radius + neuron : list[int] + Number of neurons in each hidden layers of the embedding net :math:`\mathcal{N}` + axis_neuron + Number of the axis neuron :math:`M_2` (number of columns of the sub-matrix of the embedding matrix) + resnet_dt + Time-step `dt` in the resnet construction: + y = x + dt * \phi (Wx + b) + trainable + If the weights of embedding net are trainable. + seed + Random seed for initializing the network parameters. + type_one_side + Try to build N_types embedding nets. Otherwise, building N_types^2 embedding nets + exclude_types : List[List[int]] + The excluded pairs of types which have no interaction with each other. + For example, `[[0, 1]]` means no interaction between type 0 and type 1. + set_davg_zero + Set the shift of embedding net input to zero. + activation_function + The activation function in the embedding net. Supported options are |ACTIVATION_FN| + precision + The precision of the embedding net parameters. Supported options are |PRECISION| + uniform_seed + Only for the purpose of backward compatibility, retrieves the old behavior of using the random seed + attn + The length of hidden vector during scale-dot attention computation. + attn_layer + The number of layers in attention mechanism. + attn_dotr + Whether to dot the relative coordinates on the attention weights as a gated scheme. + attn_mask + Whether to mask the diagonal in the attention weights. 
+ """ + + def __init__(self, + rcut: float, + rcut_smth: float, + sel: int, + ntypes: int, + neuron: List[int] = [24, 48, 96], + axis_neuron: int = 8, + resnet_dt: bool = False, + trainable: bool = True, + seed: int = None, + type_one_side: bool = True, + exclude_types: List[List[int]] = [], + set_davg_zero: bool = False, + activation_function: str = 'tanh', + precision: str = 'default', + uniform_seed: bool = False, + attn: int = 128, + attn_layer: int = 2, + attn_dotr: bool = True, + attn_mask: bool = False + ) -> None: + DescrptSeA.__init__(self, + rcut, + rcut_smth, + [sel], + neuron=neuron, + axis_neuron=axis_neuron, + resnet_dt=resnet_dt, + trainable=trainable, + seed=seed, + type_one_side=type_one_side, + exclude_types=exclude_types, + set_davg_zero=set_davg_zero, + activation_function=activation_function, + precision=precision, + uniform_seed=uniform_seed + ) + """ + Constructor + """ + assert (Version(TF_VERSION) > Version('2')), "se_atten only support tensorflow version 2.0 or higher." 
+ self.ntypes = ntypes + self.att_n = attn + self.attn_layer = attn_layer + self.attn_mask = attn_mask + self.attn_dotr = attn_dotr + + # descrpt config + self.sel_all_a = [sel] + self.sel_all_r = [0] + avg_zero = np.zeros([self.ntypes, self.ndescrpt]).astype(GLOBAL_NP_FLOAT_PRECISION) + std_ones = np.ones([self.ntypes, self.ndescrpt]).astype(GLOBAL_NP_FLOAT_PRECISION) + sub_graph = tf.Graph() + with sub_graph.as_default(): + name_pfx = 'd_sea_' + for ii in ['coord', 'box']: + self.place_holders[ii] = tf.placeholder(GLOBAL_NP_FLOAT_PRECISION, [None, None], + name=name_pfx + 't_' + ii) + self.place_holders['type'] = tf.placeholder(tf.int32, [None, None], name=name_pfx + 't_type') + self.place_holders['natoms_vec'] = tf.placeholder(tf.int32, [self.ntypes + 2], name=name_pfx + 't_natoms') + self.place_holders['default_mesh'] = tf.placeholder(tf.int32, [None], name=name_pfx + 't_mesh') + self.stat_descrpt, self.descrpt_deriv_t, self.rij_t, self.nlist_t, self.nei_type_vec_t, self.nmask_t \ + = op_module.prod_env_mat_a_mix(self.place_holders['coord'], + self.place_holders['type'], + self.place_holders['natoms_vec'], + self.place_holders['box'], + self.place_holders['default_mesh'], + tf.constant(avg_zero), + tf.constant(std_ones), + rcut_a=self.rcut_a, + rcut_r=self.rcut_r, + rcut_r_smth=self.rcut_r_smth, + sel_a=self.sel_all_a, + sel_r=self.sel_all_r) + self.sub_sess = tf.Session(graph=sub_graph, config=default_tf_session_config) + + def compute_input_stats(self, + data_coord: list, + data_box: list, + data_atype: list, + natoms_vec: list, + mesh: list, + input_dict: dict, + mixed_type: bool = False, + real_natoms_vec: list = None + ) -> None: + """ + Compute the statisitcs (avg and std) of the training data. The input will be normalized by the statistics. + + Parameters + ---------- + data_coord + The coordinates. Can be generated by deepmd.model.make_stat_input + data_box + The box. Can be generated by deepmd.model.make_stat_input + data_atype + The atom types. 
Can be generated by deepmd.model.make_stat_input + natoms_vec + The vector for the number of atoms of the system and different types of atoms. + If mixed_type is True, this para is blank. See real_natoms_vec. + mesh + The mesh for neighbor searching. Can be generated by deepmd.model.make_stat_input + input_dict + Dictionary for additional input + mixed_type + Whether to perform the mixed_type mode. + If True, the input data has the mixed_type format (see doc/model/train_se_atten.md), + in which frames in a system may have different natoms_vec(s), with the same nloc. + real_natoms_vec + If mixed_type is True, it takes in the real natoms_vec for each frame. + """ + all_davg = [] + all_dstd = [] + if True: + sumr = [] + suma = [] + sumn = [] + sumr2 = [] + suma2 = [] + if mixed_type: + sys_num = 0 + for cc, bb, tt, nn, mm, r_n in zip(data_coord, data_box, data_atype, natoms_vec, mesh, real_natoms_vec): + sysr, sysr2, sysa, sysa2, sysn \ + = self._compute_dstats_sys_smth(cc, bb, tt, nn, mm, mixed_type, r_n) + sys_num += 1 + sumr.append(sysr) + suma.append(sysa) + sumn.append(sysn) + sumr2.append(sysr2) + suma2.append(sysa2) + else: + for cc, bb, tt, nn, mm in zip(data_coord, data_box, data_atype, natoms_vec, mesh): + sysr, sysr2, sysa, sysa2, sysn \ + = self._compute_dstats_sys_smth(cc, bb, tt, nn, mm) + sumr.append(sysr) + suma.append(sysa) + sumn.append(sysn) + sumr2.append(sysr2) + suma2.append(sysa2) + sumr = np.sum(sumr, axis=0) + suma = np.sum(suma, axis=0) + sumn = np.sum(sumn, axis=0) + sumr2 = np.sum(sumr2, axis=0) + suma2 = np.sum(suma2, axis=0) + for type_i in range(self.ntypes): + davgunit = [sumr[type_i] / (sumn[type_i] + 1e-15), 0, 0, 0] + dstdunit = [self._compute_std(sumr2[type_i], sumr[type_i], sumn[type_i]), + self._compute_std(suma2[type_i], suma[type_i], sumn[type_i]), + self._compute_std(suma2[type_i], suma[type_i], sumn[type_i]), + self._compute_std(suma2[type_i], suma[type_i], sumn[type_i]) + ] + davg = np.tile(davgunit, self.ndescrpt // 4) + 
dstd = np.tile(dstdunit, self.ndescrpt // 4) + all_davg.append(davg) + all_dstd.append(dstd) + + if not self.set_davg_zero: + self.davg = np.array(all_davg) + self.dstd = np.array(all_dstd) + + def build(self, + coord_: tf.Tensor, + atype_: tf.Tensor, + natoms: tf.Tensor, + box_: tf.Tensor, + mesh: tf.Tensor, + input_dict: dict, + reuse: bool = None, + suffix: str = '' + ) -> tf.Tensor: + """ + Build the computational graph for the descriptor + + Parameters + ---------- + coord_ + The coordinate of atoms + atype_ + The type of atoms + natoms + The number of atoms. This tensor has the length of Ntypes + 2 + natoms[0]: number of local atoms + natoms[1]: total number of atoms held by this processor + natoms[i]: 2 <= i < Ntypes+2, number of type i atoms + mesh + For historical reasons, only the length of the Tensor matters. + if size of mesh == 6, pbc is assumed. + if size of mesh == 0, no-pbc is assumed. + input_dict + Dictionary for additional inputs + reuse + The weights in the networks should be reused when get the variable. 
+ suffix + Name suffix to identify this descriptor + + Returns + ------- + descriptor + The output descriptor + """ + davg = self.davg + dstd = self.dstd + with tf.variable_scope('descrpt_attr' + suffix, reuse=reuse): + if davg is None: + davg = np.zeros([self.ntypes, self.ndescrpt]) + if dstd is None: + dstd = np.ones([self.ntypes, self.ndescrpt]) + t_rcut = tf.constant(np.max([self.rcut_r, self.rcut_a]), + name='rcut', + dtype=GLOBAL_TF_FLOAT_PRECISION) + t_ntypes = tf.constant(self.ntypes, + name='ntypes', + dtype=tf.int32) + t_ndescrpt = tf.constant(self.ndescrpt, + name='ndescrpt', + dtype=tf.int32) + t_sel = tf.constant(self.sel_a, + name='sel', + dtype=tf.int32) + t_original_sel = tf.constant(self.original_sel if self.original_sel is not None else self.sel_a, + name='original_sel', + dtype=tf.int32) + self.t_avg = tf.get_variable('t_avg', + davg.shape, + dtype=GLOBAL_TF_FLOAT_PRECISION, + trainable=False, + initializer=tf.constant_initializer(davg)) + self.t_std = tf.get_variable('t_std', + dstd.shape, + dtype=GLOBAL_TF_FLOAT_PRECISION, + trainable=False, + initializer=tf.constant_initializer(dstd)) + + with tf.control_dependencies([t_sel, t_original_sel]): + coord = tf.reshape(coord_, [-1, natoms[1] * 3]) + box = tf.reshape(box_, [-1, 9]) + atype = tf.reshape(atype_, [-1, natoms[1]]) + self.attn_weight = [None for i in range(self.attn_layer)] + self.angular_weight = [None for i in range(self.attn_layer)] + self.attn_weight_final = [None for i in range(self.attn_layer)] + self.G = None + self.qs = [None for i in range(self.attn_layer)] + self.ks = [None for i in range(self.attn_layer)] + self.vs = [None for i in range(self.attn_layer)] + + self.descrpt, self.descrpt_deriv, self.rij, self.nlist, self.nei_type_vec, self.nmask \ + = op_module.prod_env_mat_a_mix(coord, + atype, + natoms, + box, + mesh, + self.t_avg, + self.t_std, + rcut_a=self.rcut_a, + rcut_r=self.rcut_r, + rcut_r_smth=self.rcut_r_smth, + sel_a=self.sel_all_a, + sel_r=self.sel_all_r) + 
self.nei_type_vec = tf.reshape(self.nei_type_vec, [-1]) + self.nmask = tf.cast(tf.reshape(self.nmask, [-1, 1, self.sel_all_a[0]]), GLOBAL_TF_FLOAT_PRECISION) + self.negative_mask = -(2 << 32) * (1.0 - self.nmask) + # only used when tensorboard was set as true + tf.summary.histogram('descrpt', self.descrpt) + tf.summary.histogram('rij', self.rij) + tf.summary.histogram('nlist', self.nlist) + + self.descrpt_reshape = tf.reshape(self.descrpt, [-1, self.ndescrpt]) + self.atype_nloc = tf.reshape(tf.slice(atype, [0, 0], [-1, natoms[0]]), + [-1]) ## lammps will have error without this + self._identity_tensors(suffix=suffix) + + self.dout, self.qmat = self._pass_filter(self.descrpt_reshape, + self.atype_nloc, + natoms, + input_dict, + suffix=suffix, + reuse=reuse, + trainable=self.trainable) + + # only used when tensorboard was set as true + tf.summary.histogram('embedding_net_output', self.dout) + return self.dout + + def _pass_filter(self, + inputs, + atype, + natoms, + input_dict, + reuse=None, + suffix='', + trainable=True): + assert (input_dict is not None and input_dict.get('type_embedding', None) is not None), \ + 'se_atten desctiptor must use type_embedding' + type_embedding = input_dict.get('type_embedding', None) + inputs = tf.reshape(inputs, [-1, natoms[0], self.ndescrpt]) + output = [] + output_qmat = [] + inputs_i = inputs + inputs_i = tf.reshape(inputs_i, [-1, self.ndescrpt]) + type_i = -1 + layer, qmat = self._filter(inputs_i, type_i, natoms, name='filter_type_all' + suffix, reuse=reuse, + trainable=trainable, activation_fn=self.filter_activation_fn, + type_embedding=type_embedding, atype=atype) + layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[0], self.get_dim_out()]) + qmat = tf.reshape(qmat, [tf.shape(inputs)[0], natoms[0], self.get_dim_rot_mat_1() * 3]) + output.append(layer) + output_qmat.append(qmat) + output = tf.concat(output, axis=1) + output_qmat = tf.concat(output_qmat, axis=1) + return output, output_qmat + + def 
_compute_dstats_sys_smth(self, + data_coord, + data_box, + data_atype, + natoms_vec, + mesh, + mixed_type=False, + real_natoms_vec=None): + dd_all, descrpt_deriv_t, rij_t, nlist_t, nei_type_vec_t, nmask_t \ + = run_sess(self.sub_sess, [self.stat_descrpt, self.descrpt_deriv_t, self.rij_t, self.nlist_t, self.nei_type_vec_t, self.nmask_t], + feed_dict={ + self.place_holders['coord']: data_coord, + self.place_holders['type']: data_atype, + self.place_holders['natoms_vec']: natoms_vec, + self.place_holders['box']: data_box, + self.place_holders['default_mesh']: mesh, + }) + if mixed_type: + nframes = dd_all.shape[0] + sysr = [0. for i in range(self.ntypes)] + sysa = [0. for i in range(self.ntypes)] + sysn = [0 for i in range(self.ntypes)] + sysr2 = [0. for i in range(self.ntypes)] + sysa2 = [0. for i in range(self.ntypes)] + for ff in range(nframes): + natoms = real_natoms_vec[ff] + dd_ff = np.reshape(dd_all[ff], [-1, self.ndescrpt * natoms[0]]) + start_index = 0 + for type_i in range(self.ntypes): + end_index = start_index + self.ndescrpt * natoms[2 + type_i] # center atom split + dd = dd_ff[:, start_index:end_index] + dd = np.reshape(dd, [-1, self.ndescrpt]) # nframes * typen_atoms , nnei * 4 + start_index = end_index + # compute + dd = np.reshape(dd, [-1, 4]) # nframes * typen_atoms * nnei, 4 + ddr = dd[:, :1] + dda = dd[:, 1:] + sumr = np.sum(ddr) + suma = np.sum(dda) / 3. + sumn = dd.shape[0] + sumr2 = np.sum(np.multiply(ddr, ddr)) + suma2 = np.sum(np.multiply(dda, dda)) / 3. 
+ sysr[type_i] += sumr + sysa[type_i] += suma + sysn[type_i] += sumn + sysr2[type_i] += sumr2 + sysa2[type_i] += suma2 + else: + natoms = natoms_vec + dd_all = np.reshape(dd_all, [-1, self.ndescrpt * natoms[0]]) + start_index = 0 + sysr = [] + sysa = [] + sysn = [] + sysr2 = [] + sysa2 = [] + for type_i in range(self.ntypes): + end_index = start_index + self.ndescrpt * natoms[2 + type_i] # center atom split + dd = dd_all[:, start_index:end_index] + dd = np.reshape(dd, [-1, self.ndescrpt]) # nframes * typen_atoms , nnei * 4 + start_index = end_index + # compute + dd = np.reshape(dd, [-1, 4]) # nframes * typen_atoms * nnei, 4 + ddr = dd[:, :1] + dda = dd[:, 1:] + sumr = np.sum(ddr) + suma = np.sum(dda) / 3. + sumn = dd.shape[0] + sumr2 = np.sum(np.multiply(ddr, ddr)) + suma2 = np.sum(np.multiply(dda, dda)) / 3. + sysr.append(sumr) + sysa.append(suma) + sysn.append(sumn) + sysr2.append(sumr2) + sysa2.append(suma2) + return sysr, sysr2, sysa, sysa2, sysn + + def _lookup_type_embedding( + self, + xyz_scatter, + natype, + type_embedding, + ): + '''Concatenate `type_embedding` of neighbors and `xyz_scatter`. + If not self.type_one_side, concatenate `type_embedding` of center atoms as well. + + Parameters + ---------- + xyz_scatter: + shape is [nframes*natoms[0]*self.nnei, 1] + nframes: + shape is [] + natoms: + shape is [1+1+self.ntypes] + type_embedding: + shape is [self.ntypes, Y] where Y=jdata['type_embedding']['neuron'][-1] + + Returns + ------- + embedding: + environment of each atom represented by embedding. 
+ ''' + te_out_dim = type_embedding.get_shape().as_list()[-1] + self.test_type_embedding = type_embedding + self.test_nei_embed = tf.nn.embedding_lookup(type_embedding, + self.nei_type_vec) # shape is [self.nnei, 1+te_out_dim] + # nei_embed = tf.tile(nei_embed, (nframes * natoms[0], 1)) # shape is [nframes*natoms[0]*self.nnei, te_out_dim] + nei_embed = tf.reshape(self.test_nei_embed, [-1, te_out_dim]) + self.embedding_input = tf.concat([xyz_scatter, nei_embed], + 1) # shape is [nframes*natoms[0]*self.nnei, 1+te_out_dim] + if not self.type_one_side: + self.atm_embed = tf.nn.embedding_lookup(type_embedding, natype) # shape is [nframes*natoms[0], te_out_dim] + self.atm_embed = tf.tile(self.atm_embed, + [1, self.nnei]) # shape is [nframes*natoms[0], self.nnei*te_out_dim] + self.atm_embed = tf.reshape(self.atm_embed, + [-1, te_out_dim]) # shape is [nframes*natoms[0]*self.nnei, te_out_dim] + self.embedding_input_2 = tf.concat([self.embedding_input, self.atm_embed], + 1) # shape is [nframes*natoms[0]*self.nnei, 1+te_out_dim+te_out_dim] + return self.embedding_input_2 + return self.embedding_input + + def _feedforward(self, input_xyz, d_in, d_mid): + residual = input_xyz + input_xyz = tf.nn.relu(one_layer( + input_xyz, + d_mid, + name='c_ffn1', + reuse=tf.AUTO_REUSE, + seed=self.seed, + activation_fn=None, + precision=self.filter_precision, + trainable=True, + uniform_seed=self.uniform_seed)) + input_xyz = one_layer( + input_xyz, + d_in, + name='c_ffn2', + reuse=tf.AUTO_REUSE, + seed=self.seed, + activation_fn=None, + precision=self.filter_precision, + trainable=True, + uniform_seed=self.uniform_seed) + input_xyz += residual + input_xyz = tf.keras.layers.LayerNormalization()(input_xyz) + return input_xyz + + def _scaled_dot_attn(self, Q, K, V, temperature, input_r, dotr=False, do_mask=False, layer=0, save_weights=True): + attn = tf.matmul(Q / temperature, K, transpose_b=True) + attn *= self.nmask + attn += self.negative_mask + attn = tf.nn.softmax(attn, axis=-1) + attn *= 
tf.reshape(self.nmask, [-1, attn.shape[-1], 1]) + if save_weights: + self.attn_weight[layer] = attn[0] # atom 0 + if dotr: + angular_weight = tf.matmul(input_r, input_r, transpose_b=True) # normalized + attn *= angular_weight + if save_weights: + self.angular_weight[layer] = angular_weight[0] # atom 0 + self.attn_weight_final[layer] = attn[0] # atom 0 + if do_mask: + nei = int(attn.shape[-1]) + mask = tf.cast(tf.ones((nei, nei)) - tf.eye(nei), self.filter_precision) + attn *= mask + output = tf.matmul(attn, V) + return output + + def _attention_layers( + self, + input_xyz, + layer_num, + shape_i, + outputs_size, + input_r, + dotr=False, + do_mask=False, + trainable=True + ): + sd_k = tf.sqrt(tf.cast(1., dtype=self.filter_precision)) + self.G = tf.reshape(input_xyz, (-1, shape_i[1] // 4, outputs_size[-1]))[0] + for i in range(layer_num): + with tf.variable_scope('attention_layer{}_'.format(i), reuse=tf.AUTO_REUSE): + # input_xyz_in = tf.nn.l2_normalize(input_xyz, -1) + Q_c = one_layer( + input_xyz, + self.att_n, + name='c_query', + reuse=tf.AUTO_REUSE, + seed=self.seed, + activation_fn=None, + precision=self.filter_precision, + trainable=trainable, + uniform_seed=self.uniform_seed) + K_c = one_layer( + input_xyz, + self.att_n, + name='c_key', + reuse=tf.AUTO_REUSE, + seed=self.seed, + activation_fn=None, + precision=self.filter_precision, + trainable=trainable, + uniform_seed=self.uniform_seed) + V_c = one_layer( + input_xyz, + self.att_n, + name='c_value', + reuse=tf.AUTO_REUSE, + seed=self.seed, + activation_fn=None, + precision=self.filter_precision, + trainable=trainable, + uniform_seed=self.uniform_seed) + # # natom x nei_type_i x out_size + # xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1] // 4, outputs_size[-1])) + # natom x nei_type_i x att_n + Q_c = tf.nn.l2_normalize(tf.reshape(Q_c, (-1, shape_i[1] // 4, self.att_n)), -1) + K_c = tf.nn.l2_normalize(tf.reshape(K_c, (-1, shape_i[1] // 4, self.att_n)), -1) + V_c = tf.nn.l2_normalize(tf.reshape(V_c, (-1, 
shape_i[1] // 4, self.att_n)), -1) + # Q_c = tf.reshape(Q_c, (-1, shape_i[1] // 4, self.att_n)) + # K_c = tf.reshape(K_c, (-1, shape_i[1] // 4, self.att_n)) + # V_c = tf.reshape(V_c, (-1, shape_i[1] // 4, self.att_n)) + self.qs[i] = Q_c[0] + self.ks[i] = K_c[0] + self.vs[i] = V_c[0] + + input_att = self._scaled_dot_attn(Q_c, K_c, V_c, sd_k, input_r, dotr=dotr, do_mask=do_mask, layer=i) + input_att = tf.reshape(input_att, (-1, self.att_n)) + + # A_c = tf.nn.softmax(tf.matmul(Q_c, K_c, transpose_b=True)/sd_k) + # # (natom x nei_type_i) x att_n + # input_att = tf.reshape(tf.matmul(A_c, V_c), (-1, self.att_n)) + + # (natom x nei_type_i) x out_size + input_xyz += one_layer( + input_att, + outputs_size[-1], + name='c_out', + reuse=tf.AUTO_REUSE, + seed=self.seed, + activation_fn=None, + precision=self.filter_precision, + trainable=trainable, + uniform_seed=self.uniform_seed) + input_xyz = tf.keras.layers.LayerNormalization()(input_xyz) + # input_xyz = self._feedforward(input_xyz, outputs_size[-1], self.att_n) + return input_xyz + + def _filter_lower( + self, + type_i, + type_input, + start_index, + incrs_index, + inputs, + type_embedding=None, + atype=None, + is_exclude=False, + activation_fn=None, + bavg=0.0, + stddev=1.0, + trainable=True, + suffix='', + name='filter_', + reuse=None + ): + """ + input env matrix, returns R.G + """ + outputs_size = [1] + self.filter_neuron + # cut-out inputs + # with natom x (nei_type_i x 4) + inputs_i = tf.slice(inputs, + [0, start_index * 4], + [-1, incrs_index * 4]) + shape_i = inputs_i.get_shape().as_list() + natom = tf.shape(inputs_i)[0] + # with (natom x nei_type_i) x 4 + inputs_reshape = tf.reshape(inputs_i, [-1, 4]) + # with (natom x nei_type_i) x 1 + xyz_scatter = tf.reshape(tf.slice(inputs_reshape, [0, 0], [-1, 1]), [-1, 1]) + assert atype is not None, 'atype must exist!!' 
+ type_embedding = tf.cast(type_embedding, self.filter_precision) + xyz_scatter = self._lookup_type_embedding( + xyz_scatter, atype, type_embedding) + if self.compress: + raise RuntimeError('compression of attention descriptor is not supported at the moment') + # natom x 4 x outputs_size + if (not is_exclude): + with tf.variable_scope(name, reuse=reuse): + # with (natom x nei_type_i) x out_size + xyz_scatter = embedding_net( + xyz_scatter, + self.filter_neuron, + self.filter_precision, + activation_fn=activation_fn, + resnet_dt=self.filter_resnet_dt, + name_suffix=suffix, + stddev=stddev, + bavg=bavg, + seed=self.seed, + trainable=trainable, + uniform_seed=self.uniform_seed, + initial_variables=self.embedding_net_variables, + mixed_prec=self.mixed_prec) + if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift + input_r = tf.slice(tf.reshape(inputs_i, (-1, shape_i[1] // 4, 4)), [0, 0, 1], [-1, -1, 3]) + input_r = tf.nn.l2_normalize(input_r, -1) + # natom x nei_type_i x out_size + xyz_scatter_att = tf.reshape( + self._attention_layers(xyz_scatter, self.attn_layer, shape_i, outputs_size, input_r, + dotr=self.attn_dotr, do_mask=self.attn_mask, trainable=trainable), + (-1, shape_i[1] // 4, outputs_size[-1])) + # xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1] // 4, outputs_size[-1])) + else: + # we can safely return the final xyz_scatter filled with zero directly + return tf.cast(tf.fill((natom, 4, outputs_size[-1]), 0.), self.filter_precision) + # When using tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]) below + # [588 24] -> [588 6 4] correct + # but if sel is zero + # [588 0] -> [147 0 4] incorrect; the correct one is [588 0 4] + # So we need to explicitly assign the shape to tf.shape(inputs_i)[0] instead of -1 + return tf.matmul(tf.reshape(inputs_i, [natom, shape_i[1] // 4, 4]), xyz_scatter_att, transpose_a=True) + + @cast_precision + def _filter( + self, + inputs, + type_input, + natoms, + type_embedding=None, + atype=None, + 
activation_fn=tf.nn.tanh, + stddev=1.0, + bavg=0.0, + name='linear', + reuse=None, + trainable=True): + nframes = tf.shape(tf.reshape(inputs, [-1, natoms[0], self.ndescrpt]))[0] + # natom x (nei x 4) + shape = inputs.get_shape().as_list() + outputs_size = [1] + self.filter_neuron + outputs_size_2 = self.n_axis_neuron + all_excluded = all([(type_input, type_i) in self.exclude_types for type_i in range(self.ntypes)]) + if all_excluded: + # all types are excluded so result and qmat should be zeros + # we can safaly return a zero matrix... + # See also https://stackoverflow.com/a/34725458/9567349 + # result: natom x outputs_size x outputs_size_2 + # qmat: natom x outputs_size x 3 + natom = tf.shape(inputs)[0] + result = tf.cast(tf.fill((natom, outputs_size_2, outputs_size[-1]), 0.), GLOBAL_TF_FLOAT_PRECISION) + qmat = tf.cast(tf.fill((natom, outputs_size[-1], 3), 0.), GLOBAL_TF_FLOAT_PRECISION) + return result, qmat + + start_index = 0 + type_i = 0 + # natom x 4 x outputs_size + xyz_scatter_1 = self._filter_lower( + type_i, type_input, + start_index, np.cumsum(self.sel_a)[-1], + inputs, + type_embedding=type_embedding, + is_exclude=False, + activation_fn=activation_fn, + stddev=stddev, + bavg=bavg, + trainable=trainable, + name=name, + reuse=reuse, + atype=atype) + # natom x nei x outputs_size + # xyz_scatter = tf.concat(xyz_scatter_total, axis=1) + # natom x nei x 4 + # inputs_reshape = tf.reshape(inputs, [-1, shape[1]//4, 4]) + # natom x 4 x outputs_size + # xyz_scatter_1 = tf.matmul(inputs_reshape, xyz_scatter, transpose_a = True) + if self.original_sel is None: + # shape[1] = nnei * 4 + nnei = shape[1] / 4 + else: + nnei = tf.cast(tf.Variable(np.sum(self.original_sel), dtype=tf.int32, trainable=False, name="nnei"), + self.filter_precision) + xyz_scatter_1 = xyz_scatter_1 / nnei + # natom x 4 x outputs_size_2 + xyz_scatter_2 = tf.slice(xyz_scatter_1, [0, 0, 0], [-1, -1, outputs_size_2]) + # # natom x 3 x outputs_size_2 + # qmat = tf.slice(xyz_scatter_2, [0,1,0], 
[-1, 3, -1]) + # natom x 3 x outputs_size_1 + qmat = tf.slice(xyz_scatter_1, [0, 1, 0], [-1, 3, -1]) + # natom x outputs_size_1 x 3 + qmat = tf.transpose(qmat, perm=[0, 2, 1]) + # natom x outputs_size x outputs_size_2 + result = tf.matmul(xyz_scatter_1, xyz_scatter_2, transpose_a=True) + # natom x (outputs_size x outputs_size_2) + result = tf.reshape(result, [-1, outputs_size_2 * outputs_size[-1]]) + + return result, qmat diff --git a/deepmd/entrypoints/train.py b/deepmd/entrypoints/train.py index 4a67eee9d4..080867c38c 100755 --- a/deepmd/entrypoints/train.py +++ b/deepmd/entrypoints/train.py @@ -90,7 +90,10 @@ def train( jdata = normalize(jdata) - if not is_compress and not skip_neighbor_stat: + if jdata['model']['descriptor']['type'] in ['se_atten'] and isinstance(jdata['model']['descriptor']['sel'], list): + jdata['model']['descriptor']['sel'] = sum(jdata['model']['descriptor']['sel']) + + if not is_compress and not skip_neighbor_stat and jdata['model']['descriptor']['type'] not in ['se_atten']: jdata = update_sel(jdata) with open(output, "w") as fp: diff --git a/deepmd/fit/ener.py b/deepmd/fit/ener.py index 61d70045d8..876af78493 100644 --- a/deepmd/fit/ener.py +++ b/deepmd/fit/ener.py @@ -168,7 +168,8 @@ def get_numb_aparam(self) -> int: return self.numb_fparam def compute_output_stats(self, - all_stat: dict + all_stat: dict, + mixed_type: bool = False ) -> None: """ Compute the ouput statistics @@ -179,10 +180,14 @@ def compute_output_stats(self, must have the following components: all_stat['energy'] of shape n_sys x n_batch x n_frame can be prepared by model.make_stat_input + mixed_type + Whether to perform the mixed_type mode. + If True, the input data has the mixed_type format (see doc/model/train_se_atten.md), + in which frames in a system may have different natoms_vec(s), with the same nloc. 
""" - self.bias_atom_e = self._compute_output_stats(all_stat, rcond = self.rcond) + self.bias_atom_e = self._compute_output_stats(all_stat, rcond=self.rcond, mixed_type=mixed_type) - def _compute_output_stats(self, all_stat, rcond = 1e-3): + def _compute_output_stats(self, all_stat, rcond=1e-3, mixed_type=False): data = all_stat['energy'] # data[sys_idx][batch_idx][frame_idx] sys_ener = np.array([]) @@ -193,11 +198,22 @@ def _compute_output_stats(self, all_stat, rcond = 1e-3): sys_data.append(data[ss][ii][jj]) sys_data = np.concatenate(sys_data) sys_ener = np.append(sys_ener, np.average(sys_data)) - data = all_stat['natoms_vec'] sys_tynatom = np.array([]) - nsys = len(data) - for ss in range(len(data)): - sys_tynatom = np.append(sys_tynatom, data[ss][0].astype(np.float64)) + if mixed_type: + data = all_stat['real_natoms_vec'] + nsys = len(data) + for ss in range(len(data)): + tmp_tynatom = [] + for ii in range(len(data[ss])): + for jj in range(len(data[ss][ii])): + tmp_tynatom.append(data[ss][ii][jj].astype(np.float64)) + tmp_tynatom = np.average(np.array(tmp_tynatom), axis=0) + sys_tynatom = np.append(sys_tynatom, tmp_tynatom) + else: + data = all_stat['natoms_vec'] + nsys = len(data) + for ss in range(len(data)): + sys_tynatom = np.append(sys_tynatom, data[ss][0].astype(np.float64)) sys_tynatom = np.reshape(sys_tynatom, [nsys,-1]) sys_tynatom = sys_tynatom[:,2:] if len(self.atom_ener) > 0: @@ -402,6 +418,11 @@ def build (self, t_daparam = tf.constant(self.numb_aparam, name = 'daparam', dtype = tf.int32) + self.t_bias_atom_e = tf.get_variable('t_bias_atom_e', + self.bias_atom_e.shape, + dtype=GLOBAL_TF_FLOAT_PRECISION, + trainable=False, + initializer=tf.constant_initializer(self.bias_atom_e)) if self.numb_fparam > 0: t_fparam_avg = tf.get_variable('t_fparam_avg', self.numb_fparam, @@ -452,12 +473,16 @@ def build (self, aparam = tf.reshape(aparam, [-1, self.numb_aparam * natoms[0]]) type_embedding = input_dict.get('type_embedding', None) + atype = 
input_dict.get('atype', None) if type_embedding is not None: - atype_embed = embed_atom_type(self.ntypes, natoms, type_embedding) - atype_embed = tf.tile(atype_embed,[tf.shape(inputs)[0],1]) + atype_nall = tf.reshape(atype, [-1, natoms[1]]) + self.atype_nloc = tf.reshape(tf.slice(atype_nall, [0, 0], [-1, natoms[0]]), [-1]) ## lammps will make error + atype_embed = tf.nn.embedding_lookup(type_embedding, self.atype_nloc) else: atype_embed = None + self.atype_embed = atype_embed + if atype_embed is None: start_index = 0 outs_list = [] @@ -503,11 +528,11 @@ def build (self, bias_atom_e=0.0, suffix=suffix, reuse=reuse ) outs = tf.reshape(final_layer, [tf.shape(inputs)[0], natoms[0]]) - # add atom energy bias; TF will broadcast to all batches - # tf.repeat is avaiable in TF>=2.1 or TF 1.15 - _TF_VERSION = Version(TF_VERSION) - if (Version('1.15') <= _TF_VERSION < Version('2') or _TF_VERSION >= Version('2.1')) and self.bias_atom_e is not None: - outs += tf.repeat(tf.Variable(self.bias_atom_e, dtype=self.fitting_precision, trainable=False, name="bias_atom_ei"), natoms[2:]) + # add bias + self.atom_ener_before = outs + self.add_type = tf.reshape(tf.nn.embedding_lookup(self.t_bias_atom_e, self.atype_nloc), [tf.shape(inputs)[0], natoms[0]]) + outs = outs + self.add_type + self.atom_ener_after = outs if self.tot_ener_zero: force_tot_ener = 0.0 diff --git a/deepmd/model/ener.py b/deepmd/model/ener.py index 471a0b4743..7504e22906 100644 --- a/deepmd/model/ener.py +++ b/deepmd/model/ener.py @@ -92,23 +92,36 @@ def get_type_map (self) : def data_stat(self, data): all_stat = make_stat_input(data, self.data_stat_nbatch, merge_sys = False) m_all_stat = merge_sys_stat(all_stat) - self._compute_input_stat(m_all_stat, protection = self.data_stat_protect) - self._compute_output_stat(all_stat) + self._compute_input_stat(m_all_stat, protection=self.data_stat_protect, mixed_type=data.mixed_type) + self._compute_output_stat(all_stat, mixed_type=data.mixed_type) # self.bias_atom_e = 
data.compute_energy_shift(self.rcond) - def _compute_input_stat (self, all_stat, protection = 1e-2) : - self.descrpt.compute_input_stats(all_stat['coord'], - all_stat['box'], - all_stat['type'], - all_stat['natoms_vec'], - all_stat['default_mesh'], - all_stat) - self.fitting.compute_input_stats(all_stat, protection = protection) + def _compute_input_stat (self, all_stat, protection=1e-2, mixed_type=False): + if mixed_type: + self.descrpt.compute_input_stats(all_stat['coord'], + all_stat['box'], + all_stat['type'], + all_stat['natoms_vec'], + all_stat['default_mesh'], + all_stat, + mixed_type, + all_stat['real_natoms_vec']) + else: + self.descrpt.compute_input_stats(all_stat['coord'], + all_stat['box'], + all_stat['type'], + all_stat['natoms_vec'], + all_stat['default_mesh'], + all_stat) + self.fitting.compute_input_stats(all_stat, protection=protection) + + def _compute_output_stat (self, all_stat, mixed_type=False): + if mixed_type: + self.fitting.compute_output_stats(all_stat, mixed_type=mixed_type) + else: + self.fitting.compute_output_stats(all_stat) - def _compute_output_stat (self, all_stat) : - self.fitting.compute_output_stats(all_stat) - def build (self, coord_, atype_, @@ -158,6 +171,7 @@ def build (self, suffix = suffix, ) input_dict['type_embedding'] = type_embedding + input_dict['atype'] = atype_ if frz_model == None: dout \ @@ -195,6 +209,7 @@ def build (self, input_dict, reuse = reuse, suffix = suffix) + self.atom_ener = atom_ener if self.srtab is not None : sw_lambda, sw_deriv \ diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index 86edb8db78..52cee2427b 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -24,6 +24,7 @@ from deepmd.utils.sess import run_sess from deepmd.utils.type_embed import TypeEmbedNet from deepmd.utils.graph import load_graph_def, get_tensor_by_name_from_graph +from deepmd.utils.argcheck import type_embedding_args from tensorflow.python.client import timeline from deepmd.env import op_module, 
TF_VERSION @@ -69,17 +70,20 @@ def _init_param(self, jdata): # nvnmd self.nvnmd_param = jdata.get('nvnmd', {}) nvnmd_cfg.init_from_jdata(self.nvnmd_param) - nvnmd_cfg.init_from_deepmd_input(model_param) if nvnmd_cfg.enable: + nvnmd_cfg.init_from_deepmd_input(model_param) nvnmd_cfg.disp_message() nvnmd_cfg.save() # descriptor try: descrpt_type = descrpt_param['type'] + self.descrpt_type = descrpt_type except KeyError: raise KeyError('the type of descriptor should be set by `type`') + if descrpt_param['type'] in ['se_atten']: + descrpt_param['ntypes'] = len(model_param['type_map']) self.descrpt = Descriptor(**descrpt_param) # fitting net @@ -112,6 +116,9 @@ def _init_param(self, jdata): raise RuntimeError('unknow fitting type ' + fitting_type) # type embedding + padding = False + if descrpt_type == 'se_atten': + padding = True if typeebd_param is not None: self.typeebd = TypeEmbedNet( neuron=typeebd_param['neuron'], @@ -119,7 +126,20 @@ def _init_param(self, jdata): activation_function=typeebd_param['activation_function'], precision=typeebd_param['precision'], trainable=typeebd_param['trainable'], - seed=typeebd_param['seed'] + seed=typeebd_param['seed'], + padding=padding + ) + elif descrpt_type == 'se_atten': + default_args = type_embedding_args() + default_args_dict = {i.name: i.default for i in default_args} + self.typeebd = TypeEmbedNet( + neuron=default_args_dict['neuron'], + resnet_dt=default_args_dict['resnet_dt'], + activation_function=None, + precision=default_args_dict['precision'], + trainable=default_args_dict['trainable'], + seed=default_args_dict['seed'], + padding=padding ) else: self.typeebd = None @@ -272,6 +292,10 @@ def build (self, self.ntypes = self.model.get_ntypes() self.stop_batch = stop_batch + if not self.is_compress and data.mixed_type: + assert self.descrpt_type in ['se_atten'], 'Data in mixed_type format must use attention descriptor!' + assert self.fitting_type in ['ener'], 'Data in mixed_type format must use ener fitting!' 
+ if self.numb_fparam > 0 : log.info("training with %d frame parameter(s)" % self.numb_fparam) else: @@ -585,7 +609,7 @@ def save_checkpoint(self, cur_batch: int): def get_feed_dict(self, batch, is_training): feed_dict = {} for kk in batch.keys(): - if kk == 'find_type' or kk == 'type': + if kk == 'find_type' or kk == 'type' or kk == 'real_natoms_vec': continue if 'find_' in kk: feed_dict[self.place_holders[kk]] = batch[kk] diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index e7c7edb170..7ba62f9953 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -28,17 +28,17 @@ def type_embedding_args(): doc_neuron = 'Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built.' doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection' doc_seed = 'Random seed for parameter initialization' - doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}. Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.' + doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.' doc_precision = f'The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.' 
doc_trainable = 'If the parameters in the embedding net are trainable' return [ - Argument("neuron", list, optional = True, default = [2, 4, 8], doc = doc_neuron), + Argument("neuron", list, optional = True, default = [8], doc = doc_neuron), Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function), Argument("resnet_dt", bool, optional = True, default = False, doc = doc_resnet_dt), Argument("precision", str, optional = True, default = "default", doc = doc_precision), Argument("trainable", bool, optional = True, default = True, doc = doc_trainable), - Argument("seed", [int,None], optional = True, doc = doc_seed), + Argument("seed", [int,None], optional = True, default = None, doc = doc_seed), ] @@ -128,7 +128,7 @@ def descrpt_se_a_args(): doc_rcut_smth = 'Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth`' doc_neuron = 'Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built.' doc_axis_neuron = 'Size of the submatrix of G (embedding matrix).' - doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}. Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.' + doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.' doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection' doc_type_one_side = 'Try to build N_types embedding nets. 
Otherwise, building N_types^2 embedding nets' doc_precision = f'The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.' @@ -162,7 +162,7 @@ def descrpt_se_t_args(): doc_rcut = 'The cut-off radius.' doc_rcut_smth = 'Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth`' doc_neuron = 'Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built.' - doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}. Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.' + doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.' doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection' doc_precision = f'The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.' doc_trainable = 'If the parameters in the embedding net are trainable' @@ -205,7 +205,7 @@ def descrpt_se_r_args(): doc_rcut = 'The cut-off radius.' doc_rcut_smth = 'Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth`' doc_neuron = 'Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built.' - doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}. 
Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.' + doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.' doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection' doc_type_one_side = 'Try to build N_types embedding nets. Otherwise, building N_types^2 embedding nets' doc_precision = f'The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.' @@ -239,6 +239,48 @@ def descrpt_hybrid_args(): ] +@descrpt_args_plugin.register("se_atten") +def descrpt_se_atten_args(): + doc_sel = 'This parameter set the number of selected neighbors. Note that this parameter is a little different from that in other descriptors. Instead of separating each type of atoms, only the summation matters. And this number is highly related with the efficiency, thus one should not make it too large. Usually 200 or less is enough, far away from the GPU limitation 4096. It can be:\n\n\ + - `int`. The maximum number of neighbor atoms to be considered. We recommend it to be less than 200. \n\n\ + - `List[int]`. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. Only the summation of `sel[i]` matters, and it is recommended to be less than 200.' + doc_rcut = 'The cut-off radius.' + doc_rcut_smth = 'Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth`' + doc_neuron = 'Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built.' + doc_axis_neuron = 'Size of the submatrix of G (embedding matrix).' 
+ doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.' + doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection' + doc_type_one_side = 'Whether to consider the information from only one side or both sides.' + doc_precision = f'The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.' + doc_trainable = 'If the parameters in the embedding net is trainable' + doc_seed = 'Random seed for parameter initialization' + doc_exclude_types = 'The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1.' + doc_set_davg_zero = 'Set the normalization average to zero. This option should be set when `atom_ener` in the energy fitting is used' + doc_attn = 'The length of hidden vectors in attention layers' + doc_attn_layer = 'The number of attention layers' + doc_attn_dotr = 'Whether to do dot product with the normalized relative coordinates' + doc_attn_mask = 'Whether to do mask on the diagonal in the attention matrix' + + return [ + Argument("sel", [int, list], optional=True, default=200, doc=doc_sel), + Argument("rcut", float, optional=True, default=6.0, doc=doc_rcut), + Argument("rcut_smth", float, optional=True, default=0.5, doc=doc_rcut_smth), + Argument("neuron", list, optional=True, default=[10, 20, 40], doc=doc_neuron), + Argument("axis_neuron", int, optional=True, default=4, alias=['n_axis_neuron'], doc=doc_axis_neuron), + Argument("activation_function", str, optional=True, default='tanh', doc=doc_activation_function), + Argument("resnet_dt", bool, optional=True, default=False, doc=doc_resnet_dt), + Argument("type_one_side", bool, optional=True, default=False, doc=doc_type_one_side), + 
Argument("precision", str, optional=True, default="default", doc=doc_precision), + Argument("trainable", bool, optional=True, default=True, doc=doc_trainable), + Argument("seed", [int, None], optional=True, doc=doc_seed), + Argument("exclude_types", list, optional=True, default=[], doc=doc_exclude_types), + Argument("set_davg_zero", bool, optional=True, default=False, doc=doc_set_davg_zero), + Argument("attn", int, optional=True, default=100, doc=doc_attn), + Argument("attn_layer", int, optional=True, default=4, doc=doc_attn_layer), + Argument("attn_dotr", bool, optional=True, default=False, doc=doc_attn_dotr), + Argument("attn_mask", bool, optional=True, default=False, doc=doc_attn_mask) + ] + def descrpt_variant_type_args(exclude_hybrid: bool = False) -> Variant: link_lf = make_link('loc_frame', 'model/descriptor[loc_frame]') link_se_e2_a = make_link('se_e2_a', 'model/descriptor[se_e2_a]') @@ -246,12 +288,14 @@ def descrpt_variant_type_args(exclude_hybrid: bool = False) -> Variant: link_se_e3 = make_link('se_e3', 'model/descriptor[se_e3]') link_se_a_tpe = make_link('se_a_tpe', 'model/descriptor[se_a_tpe]') link_hybrid = make_link('hybrid', 'model/descriptor[hybrid]') + link_se_atten = make_link('se_atten', 'model/descriptor[se_atten]') doc_descrpt_type = f'The type of the descritpor. See explanation below. \n\n\ - `loc_frame`: Defines a local frame at each atom, and the compute the descriptor as local coordinates under this frame.\n\n\ - `se_e2_a`: Used by the smooth edition of Deep Potential. The full relative coordinates are used to construct the descriptor.\n\n\ - `se_e2_r`: Used by the smooth edition of Deep Potential. Only the distance between atoms is used to construct the descriptor.\n\n\ - `se_e3`: Used by the smooth edition of Deep Potential. The full relative coordinates are used to construct the descriptor. Three-body embedding will be used by this descriptor.\n\n\ - `se_a_tpe`: Used by the smooth edition of Deep Potential. 
The full relative coordinates are used to construct the descriptor. Type embedding will be used by this descriptor.\n\n\ +- `se_atten`: Used by the smooth edition of Deep Potential. The full relative coordinates are used to construct the descriptor. Attention mechanism will be used by this descriptor.\n\n\ - `hybrid`: Concatenate of a list of descriptors as a new descriptor.' return Variant("type", descrpt_args_plugin.get_all_argument(), doc = doc_descrpt_type) @@ -262,7 +306,7 @@ def fitting_ener(): doc_numb_fparam = 'The dimension of the frame parameter. If set to >0, file `fparam.npy` should be included to provided the input fparams.' doc_numb_aparam = 'The dimension of the atomic parameter. If set to >0, file `aparam.npy` should be included to provided the input aparams.' doc_neuron = 'The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built.' - doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}. Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.' + doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.' doc_precision = f'The precision of the fitting net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.' doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection' doc_trainable = 'Whether the parameters in the fitting net are trainable. This option can be\n\n\ @@ -288,7 +332,7 @@ def fitting_ener(): def fitting_polar(): doc_neuron = 'The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built.' 
- doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}. Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.' + doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.' doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection' doc_precision = f'The precision of the fitting net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.' doc_scale = 'The output of the fitting net (polarizability matrix) will be scaled by ``scale``' @@ -320,7 +364,7 @@ def fitting_polar(): def fitting_dipole(): doc_neuron = 'The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built.' - doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())}. Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.' + doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.' doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection' doc_precision = f'The precision of the fitting net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.' doc_sel_type = 'The atom types for which the atomic dipole will be provided. If not set, all types will be selected.' 
diff --git a/deepmd/utils/data.py b/deepmd/utils/data.py index 0709352d7a..b5faf15c34 100644 --- a/deepmd/utils/data.py +++ b/deepmd/utils/data.py @@ -48,6 +48,7 @@ def __init__ (self, root = DPPath(sys_path) self.dirs = root.glob(set_prefix + ".*") self.dirs.sort() + self.mixed_type = self._check_mode(self.dirs[0]) # mixed_type format only has one set # load atom type self.atom_type = self._load_type(root) self.natoms = len(self.atom_type) @@ -59,9 +60,12 @@ def __init__ (self, self.pbc = self._check_pbc(root) # enforce type_map if necessary if type_map is not None and self.type_map is not None: - atom_type_ = [type_map.index(self.type_map[ii]) for ii in self.atom_type] - self.atom_type = np.array(atom_type_, dtype = np.int32) + if not self.mixed_type: + atom_type_ = [type_map.index(self.type_map[ii]) for ii in self.atom_type] + self.atom_type = np.array(atom_type_, dtype = np.int32) self.type_map = type_map + if type_map is None and self.type_map is None and self.mixed_type: + raise RuntimeError('mixed_type format must have type_map!') # make idx map self.idx_map = self._make_idx_map(self.atom_type) # train dirs @@ -408,8 +412,7 @@ def _shuffle_data (self, if type(data[kk]) == np.ndarray and \ len(data[kk].shape) == 2 and \ data[kk].shape[0] == nframes and \ - not('find_' in kk) and \ - 'type' != kk: + not('find_' in kk): ret[kk] = data[kk][idx] else : ret[kk] = data[kk] @@ -430,7 +433,6 @@ def _load_set(self, set_name: DPPath) : assert(coord.shape[1] == self.data_dict['coord']['ndof'] * self.natoms) # load keys data = {} - data['type'] = np.tile (self.atom_type[self.idx_map], (nframes, 1)) for kk in self.data_dict.keys(): if self.data_dict[kk]['reduce'] is None : data['find_'+kk], data[kk] \ @@ -453,6 +455,17 @@ def _load_set(self, set_name: DPPath) : tmp_in = data[k_in].astype(GLOBAL_ENER_FLOAT_PRECISION) data[kk] = np.sum(np.reshape(tmp_in, [nframes, self.natoms, ndof]), axis = 1) + if self.mixed_type: + type_path = set_name / "real_atom_types.npy" + real_type 
= type_path.load_numpy().astype(np.int32).reshape([nframes, -1]) + data['type'] = real_type + natoms = data['type'].shape[1] + data['real_natoms_vec'] = np.concatenate((np.tile(np.array([natoms, natoms], dtype=np.int32), (nframes, 1)), + np.array([(real_type == i).sum(axis=-1) for i in range(self.get_ntypes())], + dtype=np.int32).T), axis=-1) + else: + data['type'] = np.tile(self.atom_type[self.idx_map], (nframes, 1)) + return data @@ -524,6 +537,9 @@ def _check_pbc(self, sys_path: DPPath): pbc = False return pbc + def _check_mode(self, set_path: DPPath): + return (set_path / 'real_atom_types.npy').is_file() + class DataSets (object): """ diff --git a/deepmd/utils/data_system.py b/deepmd/utils/data_system.py index 656a7a2e7b..a3782a906e 100644 --- a/deepmd/utils/data_system.py +++ b/deepmd/utils/data_system.py @@ -84,6 +84,17 @@ def __init__ (self, modifier = modifier, trn_all_set = trn_all_set )) + # check mix_type format + error_format_msg = "if one of the system is of mixed_type format, " \ + "then all of the systems should be of mixed_type format!" 
+ if self.data_systems[0].mixed_type: + for data_sys in self.data_systems[1:]: + assert data_sys.mixed_type, error_format_msg + self.mixed_type = True + else: + for data_sys in self.data_systems[1:]: + assert not data_sys.mixed_type, error_format_msg + self.mixed_type = False # batch size self.batch_size = batch_size if isinstance(self.batch_size, int): diff --git a/deepmd/utils/network.py b/deepmd/utils/network.py index befd571f24..22542adfba 100644 --- a/deepmd/utils/network.py +++ b/deepmd/utils/network.py @@ -204,7 +204,10 @@ def embedding_net(xx, xx = tf.cast(xx, get_precision(mixed_prec['compute_prec'])) w = tf.cast(w, get_precision(mixed_prec['compute_prec'])) b = tf.cast(b, get_precision(mixed_prec['compute_prec'])) - hidden = tf.reshape(activation_fn(tf.nn.bias_add(tf.matmul(xx, w), b)), [-1, outputs_size[ii]]) + if activation_fn is not None: + hidden = tf.reshape(activation_fn(tf.nn.bias_add(tf.matmul(xx, w), b)), [-1, outputs_size[ii]]) + else: + hidden = tf.reshape(tf.nn.bias_add(tf.matmul(xx, w), b), [-1, outputs_size[ii]]) if resnet_dt : idt_initializer = tf.random_normal_initializer( stddev=0.001, diff --git a/deepmd/utils/type_embed.py b/deepmd/utils/type_embed.py index 5d5b5c9888..fcd890e224 100644 --- a/deepmd/utils/type_embed.py +++ b/deepmd/utils/type_embed.py @@ -1,5 +1,5 @@ import numpy as np -from typing import Tuple, List +from typing import Tuple, List, Union from deepmd.env import tf from deepmd.utils.network import one_layer @@ -72,16 +72,19 @@ class TypeEmbedNet(): Random seed for initializing the network parameters. uniform_seed Only for the purpose of backward compatibility, retrieves the old behavior of using the random seed + padding + Concat the zero padding to the output, as the default embedding of empty type. 
""" def __init__( self, neuron: List[int]=[], resnet_dt: bool = False, - activation_function: str = 'tanh', + activation_function: Union[str, None] = 'tanh', precision: str = 'default', trainable: bool = True, seed: int = None, uniform_seed: bool = False, + padding: bool = False, )->None: """ Constructor @@ -94,6 +97,7 @@ def __init__( self.trainable = trainable self.uniform_seed = uniform_seed self.type_embedding_net_variables = None + self.padding = padding def build( @@ -134,10 +138,13 @@ def build( precision = self.filter_precision, resnet_dt = self.filter_resnet_dt, seed = self.seed, - trainable = self.trainable, + trainable = self.trainable, initial_variables = self.type_embedding_net_variables, uniform_seed = self.uniform_seed) - ebd_type = tf.reshape(ebd_type, [-1, self.neuron[-1]]) # nnei * neuron[-1] + ebd_type = tf.reshape(ebd_type, [-1, self.neuron[-1]]) # ntypes * neuron[-1] + if self.padding: + last_type = tf.cast(tf.zeros([1, self.neuron[-1]]), self.filter_precision) + ebd_type = tf.concat([ebd_type, last_type], 0) # (ntypes + 1) * neuron[-1] self.ebd_type = tf.identity(ebd_type, name ='t_typeebd') return self.ebd_type diff --git a/doc/data/data-conv.md b/doc/data/data-conv.md index d3c0632464..ee98ecf556 100644 --- a/doc/data/data-conv.md +++ b/doc/data/data-conv.md @@ -30,6 +30,8 @@ O H ``` The type `0` is named by `"O"` and the type `1` is named by `"H"`. +For training models with descriptor `se_atten`, a [new system format](../model/train-se-atten.md#data-format) is supported to put together the frame-sparse systems with the same atom number. + ## HDF5 format A system with the HDF5 format has the same strucutre as the Numpy format, but in a HDF5 file, a system is organized as an [HDF5 group](https://docs.h5py.org/en/stable/high/group.html). The file name of a Numpy file is the key in a HDF5 file, and the data is the value to the key. 
One need to use `#` in a DP path to divide the path to the HDF5 file and the HDF5 key: diff --git a/doc/data/system.md b/doc/data/system.md index afaa5c3d17..d94c325cf5 100644 --- a/doc/data/system.md +++ b/doc/data/system.md @@ -1,6 +1,6 @@ # System -DeePMD-kit takes a **system** as data structure. A snapshot of a system is called a **frame**. A system may contain multiple frames with the same atom types and numbers, i.e. the same formula (like `H2O`). To contains data with different formula, one need to divide data into multiple systems. +DeePMD-kit takes a **system** as data structure. A snapshot of a system is called a **frame**. A system may contain multiple frames with the same atom types and numbers, i.e. the same formula (like `H2O`). To contain data with a different formula, one usually needs to divide data into multiple systems, which may sometimes result in sparse-frame systems. See a [new system format](../model/train-se-atten.md#data-format) to further combine different systems with the same atom numbers, when training with descriptor `se_atten`. A system should contain system properties, input frame properties, and labeled frame properties.
The system property contains the following property: diff --git a/doc/images/model_se_atten.png b/doc/images/model_se_atten.png new file mode 100644 index 0000000000..c7fbb6d9c4 Binary files /dev/null and b/doc/images/model_se_atten.png differ diff --git a/doc/model/index.md b/doc/model/index.md index 12df4e22f3..7b58d09ef8 100644 --- a/doc/model/index.md +++ b/doc/model/index.md @@ -4,6 +4,7 @@ - [Descriptor `"se_e2_a"`](train-se-e2-a.md) - [Descriptor `"se_e2_r"`](train-se-e2-r.md) - [Descriptor `"se_e3"`](train-se-e3.md) +- [Descriptor `"se_atten"`](train-se-atten.md) - [Descriptor `"hybrid"`](train-hybrid.md) - [Descriptor `sel`](sel.md) - [Fit energy](train-energy.md) diff --git a/doc/model/index.rst b/doc/model/index.rst index 6177452342..fab6673495 100644 --- a/doc/model/index.rst +++ b/doc/model/index.rst @@ -8,6 +8,7 @@ Model train-se-e2-a train-se-e2-r train-se-e3 + train-se-atten train-hybrid sel train-energy diff --git a/doc/model/train-se-atten.md b/doc/model/train-se-atten.md new file mode 100644 index 0000000000..a742070391 --- /dev/null +++ b/doc/model/train-se-atten.md @@ -0,0 +1,120 @@ +# Descriptor `"se_atten"` + +## DPA-1: Pretraining of Attention-based Deep Potential Model for Molecular Simulation + +![ALT](../images/model_se_atten.png "model_se_atten") + +Here we propose DPA-1, a Deep Potential model with a novel attention mechanism, which is highly effective for representing the conformation and chemical spaces of atomic systems and learning the PES. + +See [this paper](https://arxiv.org/abs/2208.08236) for more information. DPA-1 is implemented as a new descriptor `"se_atten"` for model training, which can be used after simply editing the input.json. + +## Installation +Follow the [standard installation](../install/install-from-source.md#install-the-python-interface) of python interface in DeePMD-kit. +After that, you can smoothly use the DPA-1 model with following instructions. 
+ +## Introduction to new features of DPA-1 +Next we will list the detailed settings in input.json and the data format, especially for large systems with dozens of elements. An example of DPA-1 input can be found [here](../../examples/water/se_atten/input.json). + +### Descriptor `"se_atten"` + +The notation of `se_atten` is short for the smooth edition of Deep Potential with an attention mechanism. +This descriptor was described in detail in [the DPA-1 paper](https://arxiv.org/abs/2208.08236) and the images above. + +In this example we will train a DPA-1 model for a water system. A complete training input script of this example can be found in the directory: +```bash +$deepmd_source_dir/examples/water/se_atten/input.json +``` +With the training input script, data are also provided in the example directory. One may train the model with the DeePMD-kit from the directory. + +An example of the DPA-1 descriptor is provided as follows +```json + "descriptor" :{ + "type": "se_atten", + "rcut_smth": 0.50, + "rcut": 6.00, + "sel": 120, + "neuron": [25, 50, 100], + "axis_neuron": 16, + "resnet_dt": false, + "attn": 128, + "attn_layer": 2, + "attn_mask": false, + "attn_dotr": true, + "seed": 1 + } +``` +* The {ref}`type ` of the descriptor is set to `"se_atten"`, which will use DPA-1 structures. +* {ref}`rcut ` is the cut-off radius for neighbor searching, and the {ref}`rcut_smth ` gives where the smoothing starts. +* **{ref}`sel `** gives the maximum possible number of neighbors in the cut-off radius. It is an int. Note that this number highly affects the efficiency of training, so we usually use less than 200. (We use 120 for training 56 elements in [OC2M dataset](https://github.com/Open-Catalyst-Project/ocp/blob/main/DATASET.md)) +* The {ref}`neuron ` specifies the size of the embedding net. From left to right the members denote the sizes of each hidden layer from input end to the output end, respectively.
If the outer layer is of twice size as the inner layer, then the inner layer is copied and concatenated, then a [ResNet architecture](https://arxiv.org/abs/1512.03385) is built between them. +* The {ref}`axis_neuron ` specifies the size of submatrix of the embedding matrix, the axis matrix as explained in the [DeepPot-SE paper](https://arxiv.org/abs/1805.09003) +* If the option {ref}`resnet_dt ` is set to `true`, then a timestep is used in the ResNet. +* {ref}`seed ` gives the random seed that is used to generate random numbers when initializing the model parameters. +* {ref}`attn ` sets the length of hidden vector during scale-dot attention computation. +* {ref}`attn_layer ` sets the number of layers in attention mechanism. +* {ref}`attn_mask ` determines whether to mask the diagonal in the attention weights and False is recommended. +* {ref}`attn_dotr ` determines whether to dot the relative coordinates on the attention weights as a gated scheme, True is recommended. + +### Fitting `"ener"` +DPA-1 only supports the `"ener"` fitting type, and you can refer [here](train-energy.md) for detailed information. + +### Type embedding +DPA-1 only supports models with type embeddings on, and the default setting is as follows: +```json +"type_embedding":{ + "neuron": [8], + "resnet_dt": false, + "seed": 1 + } +``` +You can add these settings in input.json if you want to change the default ones, see [here](train-se-e2-a-tebd.md) for detailed information. + + +### Type map +For training large systems, especially those with dozens of elements, the {ref}`type ` determines the element index of training data: +```json +"type_map": [ + "Mg", + "Al", + "Cu" + ] +``` +which should include all the elements in the dataset you want to train on. +## Data format +DPA-1 supports the standard data format, which is detailed in [data-conv.md](../data/data-conv.md) and [system.md](../data/system.md). +Note that in this format, only those frames with the same fingerprint (i.e.
the number of atoms of each element) can be put together as a unit system.
+This may lead to a sparse number of frames in those rare systems.
+
+An ideal way is to put systems with the same total number of atoms together, which is the way we trained DPA-1 on [OC2M](https://github.com/Open-Catalyst-Project/ocp/blob/main/DATASET.md).
+This system format, which is called `mixed_type`, is suitable for putting frame-sparse systems together and is slightly different from the standard one.
+For example, a `mixed_type` system may contain the following files:
+```
+type.raw
+type_map.raw
+set.000/box.npy
+set.000/coord.npy
+set.000/energy.npy
+set.000/force.npy
+set.000/real_atom_types.npy
+```
+This system contains `Nframes` frames with the same atom number `Natoms`; the total number of element types contained in all frames is `Ntypes`. Note that we put all the frames in one set `set.000`. Most files are the same as those in the [standard format](../data/system.md); here we only list the distinct ones:
+
+ID | Property | File | Required/Optional | Shape | Description
+---------- | -------------------------------- | ------------------- | -------------------- | ----------------------- | -----------
+/ | Atom type indexes (place holder) | type.raw | Required | Natoms | All zeros to fake the type input
+type_map | Atom type names | type_map.raw | Required | Ntypes | Atom names that map to the atom types contained in all the frames, which are not required to be part of the periodic table
+type | Atom type indexes of each frame | real_atom_types.npy | Required | Nframes \* Natoms | Integers that describe atom types in each frame, corresponding to indexes in type_map
+
+With these edited files, one can put together frames with the same `Natoms`, instead of the same formula (like `H2O`). Note that this `mixed_type` format only supports the `se_atten` descriptor.
+ +The API to generate or transfer to `mixed_type` format will be uploaded on [dpdata](https://github.com/deepmodeling/dpdata) soon for a more convenient experience. + +## Training example +Here we upload the AlMgCu example showed in the paper, you can download here: +[Baidu disk](https://pan.baidu.com/s/1Mk9CihPHCmf8quwaMhT-nA?pwd=d586); +[Google disk](https://drive.google.com/file/d/11baEpRrvHoqxORFPSdJiGWusb3Y4AnRE/view?usp=sharing). + + + + + diff --git a/examples/water/se_atten/input.json b/examples/water/se_atten/input.json new file mode 100644 index 0000000000..2d54eaa1b6 --- /dev/null +++ b/examples/water/se_atten/input.json @@ -0,0 +1,70 @@ +{ + "_comment": " model parameters", + "model": { + "type_map": ["O", "H"], + "descriptor" :{ + "type": "se_atten", + "sel": 120, + "rcut_smth": 0.50, + "rcut": 6.00, + "neuron": [25, 50, 100], + "resnet_dt": false, + "axis_neuron": 16, + "seed": 1, + "attn": 128, + "attn_layer": 2, + "attn_dotr": true, + "attn_mask": false, + "_comment": " that's all" + }, + "fitting_net" : { + "neuron": [240, 240, 240], + "resnet_dt": true, + "seed": 1, + "_comment": " that's all" + }, + "_comment": " that's all" + }, + + "learning_rate" :{ + "type": "exp", + "decay_steps": 5000, + "start_lr": 0.001, + "stop_lr": 3.51e-8, + "_comment": "that's all" + }, + + "loss" :{ + "type": "ener", + "start_pref_e": 0.02, + "limit_pref_e": 1, + "start_pref_f": 1000, + "limit_pref_f": 1, + "start_pref_v": 0, + "limit_pref_v": 0, + "_comment": " that's all" + }, + + "training" : { + "training_data": { + "systems": ["../data/data_0/", "../data/data_1/", "../data/data_2/"], + "batch_size": "auto", + "_comment": "that's all" + }, + "validation_data":{ + "systems": ["../data/data_3"], + "batch_size": 1, + "numb_btch": 3, + "_comment": "that's all" + }, + "numb_steps": 1000000, + "seed": 10, + "disp_file": "lcurve.out", + "disp_freq": 100, + "save_freq": 1000, + "_comment": "that's all" + }, + + "_comment": "that's all" +} + diff --git 
a/source/lib/include/neighbor_list.h b/source/lib/include/neighbor_list.h index 53e6d83d2c..0155715cdc 100644 --- a/source/lib/include/neighbor_list.h +++ b/source/lib/include/neighbor_list.h @@ -85,6 +85,17 @@ build_nlist_cpu( const int & mem_size, const float & rcut); +void use_nei_info_cpu( + int * nlist, + int * ntype, + bool * nmask, + const int * type, + const int * nlist_map, + const int nloc, + const int nnei, + const int ntypes, + const bool b_nlist_map); + #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM /** *@brief Convert the a host memory InputNlist to a device memory InputNlist @@ -140,6 +151,17 @@ build_nlist_gpu( const int & mem_size, const float & rcut); +void use_nei_info_gpu( + int * nlist, + int * ntype, + bool * nmask, + const int * type, + const int * nlist_map, + const int nloc, + const int nnei, + const int ntypes, + const bool b_nlist_map); + #endif // GOOGLE_CUDA @@ -167,6 +189,17 @@ build_nlist_gpu_rocm( const int & mem_size, const float & rcut); +void use_nei_info_gpu_rocm( + int * nlist, + int * ntype, + bool * nmask, + const int * type, + const int * nlist_map, + const int nloc, + const int nnei, + const int ntypes, + const bool b_nlist_map); + #endif // TENSORFLOW_USE_ROCM } // namespace deepmd diff --git a/source/lib/include/prod_env_mat.h b/source/lib/include/prod_env_mat.h index 58f1ae8485..cab2cda93a 100644 --- a/source/lib/include/prod_env_mat.h +++ b/source/lib/include/prod_env_mat.h @@ -21,7 +21,8 @@ void prod_env_mat_a_cpu( const int nall, const float rcut, const float rcut_smth, - const std::vector sec); + const std::vector sec, + const int * f_type = NULL); template void prod_env_mat_r_cpu( @@ -60,7 +61,8 @@ void prod_env_mat_a_gpu_cuda( const int nall, const float rcut, const float rcut_smth, - const std::vector sec); + const std::vector sec, + const int * f_type=NULL); template void prod_env_mat_r_gpu_cuda( @@ -110,7 +112,8 @@ void prod_env_mat_a_gpu_rocm( const int nall, const float rcut, const float rcut_smth, - const std::vector 
sec); + const std::vector sec, + const int * f_type=NULL); template void prod_env_mat_r_gpu_rocm( diff --git a/source/lib/src/cuda/neighbor_list.cu b/source/lib/src/cuda/neighbor_list.cu index 100fcd6aca..3089ab956b 100644 --- a/source/lib/src/cuda/neighbor_list.cu +++ b/source/lib/src/cuda/neighbor_list.cu @@ -144,6 +144,58 @@ __global__ void map_nlist( } } +__global__ void map_nei_info( + int * nlist, + int * ntype, + bool * nmask, + const int * type, + const int * nlist_map, + const int nloc, + const int nnei, + const int ntypes +) +{ + int atom_idx=blockIdx.x; + int nei_idx=blockIdx.y*blockDim.y+threadIdx.y; + if(nei_idx>=nnei){return;} + int nlist_idx=atom_idx*nnei+nei_idx; + int nlist_item=nlist[nlist_idx]; + int temp=0; + if(nlist_item!=-1){ + temp=nlist_map[nlist_item]; + nlist[nlist_idx]=temp; + ntype[nlist_idx]=type[temp]; + nmask[nlist_idx]=true; + } + else{ + ntype[nlist_idx]=ntypes; + } +} + +__global__ void map_nei_info_noconvert( + int * nlist, + int * ntype, + bool * nmask, + const int * type, + const int nloc, + const int nnei, + const int ntypes +) +{ + int atom_idx=blockIdx.x; + int nei_idx=blockIdx.y*blockDim.y+threadIdx.y; + if(nei_idx>=nnei){return;} + int nlist_idx=atom_idx*nnei+nei_idx; + int nlist_item=nlist[nlist_idx]; + if(nlist_item!=-1){ + ntype[nlist_idx]=type[nlist_item]; + nmask[nlist_idx]=true; + } + else{ + ntype[nlist_idx]=ntypes; + } +} + namespace deepmd { template int build_nlist_gpu( @@ -220,6 +272,32 @@ void use_nlist_map( DPErrcheck(cudaDeviceSynchronize()); } +void use_nei_info_gpu( + int * nlist, + int * ntype, + bool * nmask, + const int * type, + const int * nlist_map, + const int nloc, + const int nnei, + const int ntypes, + const bool b_nlist_map) +{ + int nblock=(nnei+TPB-1)/TPB; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, TPB); + DPErrcheck(cudaMemset(ntype, 0, sizeof(int) * nloc * nnei)); + DPErrcheck(cudaMemset(nmask, 0, sizeof(bool) * nloc * nnei)); + if (b_nlist_map){ + map_nei_info<<>>(nlist, ntype, 
nmask, type, nlist_map, nloc, nnei, ntypes); + } + else{ + map_nei_info_noconvert<<>>(nlist, ntype, nmask, type, nloc, nnei, ntypes); + } + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); +} + template int build_nlist_gpu(InputNlist & nlist, int * max_list_size, int * nlist_data, const float * c_cpy, const int & nloc, const int & nall, const int & mem_size, const float & rcut); template int build_nlist_gpu(InputNlist & nlist, int * max_list_size, int * nlist_data, const double * c_cpy, const int & nloc, const int & nall, const int & mem_size, const float & rcut); } \ No newline at end of file diff --git a/source/lib/src/cuda/prod_env_mat.cu b/source/lib/src/cuda/prod_env_mat.cu index 93a2b6a787..a68067e949 100644 --- a/source/lib/src/cuda/prod_env_mat.cu +++ b/source/lib/src/cuda/prod_env_mat.cu @@ -610,8 +610,12 @@ void prod_env_mat_a_gpu_cuda( const int nall, const float rcut, const float rcut_smth, - const std::vector sec) + const std::vector sec, + const int * f_type) { + if (f_type == NULL){ + f_type = type; + } const int nnei = sec.back(); const int ndescrpt = nnei * 4; DPErrcheck(cudaMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt)); @@ -620,7 +624,7 @@ void prod_env_mat_a_gpu_cuda( format_nbor_list_gpu_cuda( nlist, - coord, type, gpu_inlist, array_int, array_longlong, max_nbor_size, nloc, nall, rcut, sec); + coord, f_type, gpu_inlist, array_int, array_longlong, max_nbor_size, nloc, nall, rcut, sec); nborErrcheck(cudaGetLastError()); nborErrcheck(cudaDeviceSynchronize()); @@ -688,8 +692,8 @@ void test_encoding_decoding_nbor_info_gpu_cuda( DPErrcheck(cudaDeviceSynchronize()); } -template void prod_env_mat_a_gpu_cuda(float * em, float * em_deriv, float * rij, int * nlist, const float * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const float * avg, const float * std, const int nloc, const int nall, const float rcut, const float rcut_smth, 
const std::vector sec); -template void prod_env_mat_a_gpu_cuda(double * em, double * em_deriv, double * rij, int * nlist, const double * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const double * avg, const double * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec); +template void prod_env_mat_a_gpu_cuda(float * em, float * em_deriv, float * rij, int * nlist, const float * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const float * avg, const float * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec, const int * f_type); +template void prod_env_mat_a_gpu_cuda(double * em, double * em_deriv, double * rij, int * nlist, const double * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const double * avg, const double * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec, const int * f_type); template void prod_env_mat_r_gpu_cuda(float * em, float * em_deriv, float * rij, int * nlist, const float * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const float * avg, const float * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec); template void prod_env_mat_r_gpu_cuda(double * em, double * em_deriv, double * rij, int * nlist, const double * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const double * avg, const double * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec); template void 
format_nbor_list_gpu_cuda(int * nlist, const float * coord, const int * type, const deepmd::InputNlist & gpu_inlist,int * array_int,uint_64 * array_longlong,const int max_nbor_size,const int nloc, const int nall, const float rcut, const std::vector sec); diff --git a/source/lib/src/neighbor_list.cc b/source/lib/src/neighbor_list.cc index cae7630430..99362bcc08 100644 --- a/source/lib/src/neighbor_list.cc +++ b/source/lib/src/neighbor_list.cc @@ -820,6 +820,55 @@ build_nlist_cpu( return 0; } +void +deepmd:: +use_nei_info_cpu( + int * nlist, + int * ntype, + bool * nmask, + const int * type, + const int * nlist_map, + const int nloc, + const int nnei, + const int ntypes, + const bool b_nlist_map) +{ + if(b_nlist_map){ + for (int ii = 0; ii < nloc; ++ii){ + for (int jj = 0; jj < nnei; ++jj){ + int nlist_idx = ii*nnei+jj; + int record = nlist[nlist_idx]; + if (record >= 0){ + int temp = nlist_map[record]; + nlist[nlist_idx] = temp; + ntype[nlist_idx]=type[temp]; + nmask[nlist_idx]=true; + } + else{ + ntype[nlist_idx]=ntypes; + nmask[nlist_idx]=false; + } + } + } + } + else{ + for (int ii = 0; ii < nloc; ++ii){ + for (int jj = 0; jj < nnei; ++jj){ + int nlist_idx = ii*nnei+jj; + int record = nlist[nlist_idx]; + if (record >= 0){ + ntype[nlist_idx]=type[record]; + nmask[nlist_idx]=true; + } + else{ + ntype[nlist_idx]=ntypes; + nmask[nlist_idx]=false; + } + } + } + } +} + template int deepmd:: diff --git a/source/lib/src/prod_env_mat.cc b/source/lib/src/prod_env_mat.cc index 303542699c..d64e6a4b09 100644 --- a/source/lib/src/prod_env_mat.cc +++ b/source/lib/src/prod_env_mat.cc @@ -25,8 +25,12 @@ prod_env_mat_a_cpu( const int nall, const float rcut, const float rcut_smth, - const std::vector sec) + const std::vector sec, + const int * f_type) { + if (f_type == NULL){ + f_type = type; + } const int nnei = sec.back(); const int nem = nnei * 4; @@ -39,11 +43,11 @@ prod_env_mat_a_cpu( } // set type - std::vector d_type (nall); + std::vector d_f_type(nall); for (int ii = 0; ii 
< nall; ++ii) { - d_type[ii] = type[ii]; + d_f_type[ii] = f_type[ii]; } - + // build nlist std::vector > d_nlist_a(nloc); @@ -62,13 +66,13 @@ prod_env_mat_a_cpu( #pragma omp parallel for for (int ii = 0; ii < nloc; ++ii) { std::vector fmt_nlist_a; - int ret = format_nlist_i_cpu(fmt_nlist_a, d_coord3, d_type, ii, d_nlist_a[ii], rcut, sec); + int ret = format_nlist_i_cpu(fmt_nlist_a, d_coord3, d_f_type, ii, d_nlist_a[ii], rcut, sec); std::vector d_em_a; std::vector d_em_a_deriv; std::vector d_em_r; std::vector d_em_r_deriv; std::vector d_rij_a; - env_mat_a_cpu (d_em_a, d_em_a_deriv, d_rij_a, d_coord3, d_type, ii, fmt_nlist_a, sec, rcut_smth, rcut); + env_mat_a_cpu (d_em_a, d_em_a_deriv, d_rij_a, d_coord3, d_f_type, ii, fmt_nlist_a, sec, rcut_smth, rcut); // check sizes assert (d_em_a.size() == nem); @@ -77,10 +81,10 @@ prod_env_mat_a_cpu( assert (fmt_nlist_a.size() == nnei); // record outputs for (int jj = 0; jj < nem; ++jj) { - em[ii * nem + jj] = (d_em_a[jj] - avg[d_type[ii] * nem + jj]) / std[d_type[ii] * nem + jj]; + em[ii * nem + jj] = (d_em_a[jj] - avg[type[ii] * nem + jj]) / std[type[ii] * nem + jj]; } for (int jj = 0; jj < nem * 3; ++jj) { - em_deriv[ii * nem * 3 + jj] = d_em_a_deriv[jj] / std[d_type[ii] * nem + jj / 3]; + em_deriv[ii * nem * 3 + jj] = d_em_a_deriv[jj] / std[type[ii] * nem + jj / 3]; } for (int jj = 0; jj < nnei * 3; ++jj) { rij[ii * nnei * 3 + jj] = d_rij_a[jj]; @@ -175,7 +179,6 @@ prod_env_mat_r_cpu( } } - template void deepmd:: @@ -194,7 +197,8 @@ prod_env_mat_a_cpu( const int nall, const float rcut, const float rcut_smth, - const std::vector sec); + const std::vector sec, + const int * f_type); template void @@ -214,7 +218,8 @@ prod_env_mat_a_cpu( const int nall, const float rcut, const float rcut_smth, - const std::vector sec); + const std::vector sec, + const int * f_type); template void diff --git a/source/lib/src/rocm/neighbor_list.hip.cu b/source/lib/src/rocm/neighbor_list.hip.cu index 795bdd59d3..3bdfbee770 100644 --- 
a/source/lib/src/rocm/neighbor_list.hip.cu +++ b/source/lib/src/rocm/neighbor_list.hip.cu @@ -144,6 +144,58 @@ __global__ void map_nlist( } } +__global__ void map_nei_info( + int * nlist, + int * ntype, + bool * nmask, + const int * type, + const int * nlist_map, + const int nloc, + const int nnei, + const int ntypes +) +{ + int atom_idx=blockIdx.x; + int nei_idx=blockIdx.y*blockDim.y+threadIdx.y; + if(nei_idx>=nnei){return;} + int nlist_idx=atom_idx*nnei+nei_idx; + int nlist_item=nlist[nlist_idx]; + int temp=0; + if(nlist_item!=-1){ + temp=nlist_map[nlist_item]; + nlist[nlist_idx]=temp; + ntype[nlist_idx]=type[temp]; + nmask[nlist_idx]=true; + } + else{ + ntype[nlist_idx]=ntypes; + } +} + +__global__ void map_nei_info_noconvert( + int * nlist, + int * ntype, + bool * nmask, + const int * type, + const int nloc, + const int nnei, + const int ntypes +) +{ + int atom_idx=blockIdx.x; + int nei_idx=blockIdx.y*blockDim.y+threadIdx.y; + if(nei_idx>=nnei){return;} + int nlist_idx=atom_idx*nnei+nei_idx; + int nlist_item=nlist[nlist_idx]; + if(nlist_item!=-1){ + ntype[nlist_idx]=type[nlist_item]; + nmask[nlist_idx]=true; + } + else{ + ntype[nlist_idx]=ntypes; + } +} + namespace deepmd { template int build_nlist_gpu_rocm( @@ -221,6 +273,32 @@ void use_nlist_map( DPErrcheck(hipDeviceSynchronize()); } +void use_nei_info_gpu_rocm( + int * nlist, + int * ntype, + bool * nmask, + const int * type, + const int * nlist_map, + const int nloc, + const int nnei, + const int ntypes, + const bool b_nlist_map) +{ + int nblock=(nnei+TPB-1)/TPB; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(1, TPB); + DPErrcheck(hipMemset(ntype, 0, sizeof(int) * nloc * nnei)); + DPErrcheck(hipMemset(nmask, 0, sizeof(FPTYPE) * nloc * nnei)); + if (b_nlist_map){ + hipLaunchKernelGGL(map_nei_info, block_grid, thread_grid, 0, 0, nlist, ntype, nmask, type, nlist_map, nloc, nnei, ntypes); + } + else{ + hipLaunchKernelGGL(map_nei_info_noconvert, block_grid, thread_grid, 0, 0, nlist, ntype, nmask, type, 
nloc, nnei, ntypes); + } + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); +} + template int build_nlist_gpu_rocm(InputNlist & nlist, int * max_list_size, int * nlist_data, const float * c_cpy, const int & nloc, const int & nall, const int & mem_size, const float & rcut); template int build_nlist_gpu_rocm(InputNlist & nlist, int * max_list_size, int * nlist_data, const double * c_cpy, const int & nloc, const int & nall, const int & mem_size, const float & rcut); } \ No newline at end of file diff --git a/source/lib/src/rocm/prod_env_mat.hip.cu b/source/lib/src/rocm/prod_env_mat.hip.cu index 506a844a04..ce6227a7f8 100644 --- a/source/lib/src/rocm/prod_env_mat.hip.cu +++ b/source/lib/src/rocm/prod_env_mat.hip.cu @@ -608,8 +608,12 @@ void prod_env_mat_a_gpu_rocm( const int nall, const float rcut, const float rcut_smth, - const std::vector sec) + const std::vector sec, + const int * f_type) { + if (f_type == NULL){ + f_type = type; + } const int nnei = sec.back(); const int ndescrpt = nnei * 4; DPErrcheck(hipMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt)); @@ -618,7 +622,7 @@ void prod_env_mat_a_gpu_rocm( format_nbor_list_gpu_rocm( nlist, - coord, type, gpu_inlist, array_int, array_longlong, max_nbor_size, nloc, nall, rcut, sec); + coord, f_type, gpu_inlist, array_int, array_longlong, max_nbor_size, nloc, nall, rcut, sec); nborErrcheck(hipGetLastError()); nborErrcheck(hipDeviceSynchronize()); @@ -686,8 +690,8 @@ void test_encoding_decoding_nbor_info_gpu_rocm( DPErrcheck(hipDeviceSynchronize()); } -template void prod_env_mat_a_gpu_rocm(float * em, float * em_deriv, float * rij, int * nlist, const float * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const float * avg, const float * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec); -template void prod_env_mat_a_gpu_rocm(double * em, double * em_deriv, 
double * rij, int * nlist, const double * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const double * avg, const double * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec); +template void prod_env_mat_a_gpu_rocm(float * em, float * em_deriv, float * rij, int * nlist, const float * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const float * avg, const float * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec, const int * f_type); +template void prod_env_mat_a_gpu_rocm(double * em, double * em_deriv, double * rij, int * nlist, const double * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const double * avg, const double * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec, const int * f_type); template void prod_env_mat_r_gpu_rocm(float * em, float * em_deriv, float * rij, int * nlist, const float * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const float * avg, const float * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec); template void prod_env_mat_r_gpu_rocm(double * em, double * em_deriv, double * rij, int * nlist, const double * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const double * avg, const double * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec); template void format_nbor_list_gpu_rocm(int * nlist, const float * coord, const int * type, const deepmd::InputNlist & 
gpu_inlist,int * array_int,uint_64 * array_longlong,const int max_nbor_size,const int nloc, const int nall, const float rcut, const std::vector sec); diff --git a/source/lib/tests/test_env_mat_a_mix.cc b/source/lib/tests/test_env_mat_a_mix.cc new file mode 100644 index 0000000000..03d01fe01d --- /dev/null +++ b/source/lib/tests/test_env_mat_a_mix.cc @@ -0,0 +1,1055 @@ +#include +#include +#include "fmt_nlist.h" +#include "env_mat.h" +#include "prod_env_mat.h" +#include "neighbor_list.h" +#include "device.h" + +class TestEnvMatAMix : public ::testing::Test +{ +protected: + std::vector posi = {12.83, 2.56, 2.18, + 12.09, 2.87, 2.74, + 00.25, 3.32, 1.68, + 3.36, 3.00, 1.81, + 3.51, 2.51, 2.60, + 4.27, 3.22, 1.56 + }; + std::vector atype = {0, 1, 1, 0, 1, 1}; + std::vector f_atype = {0, 0, 0, 0, 0, 0}; + std::vector posi_cpy; +// std::vector atype_cpy; + std::vector f_atype_cpy; + int nloc, nall; + double rc = 6; + double rc_smth = 0.8; + SimulationRegion region; + std::vector mapping, ncell, ngcell; + std::vector sec_a = {0, 20}; + std::vector sec_r = {0, 0}; + std::vector nat_stt, ext_stt, ext_end; + std::vector> nlist_a, nlist_r; + std::vector> nlist_a_cpy, nlist_r_cpy; + std::vector> ntype; + int f_ntypes = 1; + int ntypes = 2; // this information normally comes from natoms or avg/std + int nnei = sec_a.back(); + int ndescrpt = nnei * 4; + std::vector expected_env = { + 1.02167, -0.77271, 0.32370, 0.58475, 0.99745, 0.41810, 0.75655, -0.49773, 0.12206, 0.12047, 0.01502, -0.01263, 0.10564, 0.10495, -0.00143, 0.01198, 0.03103, 0.03041, 0.00452, -0.00425, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 
0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, + 1.02167, 0.77271, -0.32370, -0.58475, 0.59220, 0.42028, 0.16304, -0.38405, 0.04135, 0.04039, 0.00123, -0.00880, 0.03694, 0.03680, -0.00300, -0.00117, 0.00336, 0.00327, 0.00022, -0.00074, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, + 0.99745, -0.41810, -0.75655, 0.49773, 0.59220, -0.42028, -0.16304, 0.38405, 0.19078, 0.18961, -0.01951, 0.00793, 0.13499, 0.12636, -0.03140, 0.03566, 0.07054, 0.07049, -0.00175, -0.00210, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, + 1.06176, 0.16913, -0.55250, 0.89077, 1.03163, 0.96880, 0.23422, -0.26615, 0.19078, -0.18961, 0.01951, -0.00793, 0.12206, -0.12047, -0.01502, 0.01263, 0.04135, -0.04039, -0.00123, 0.00880, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 
0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, + 1.06176, -0.16913, 0.55250, -0.89077, 0.66798, 0.34516, 0.32245, -0.47232, 0.13499, -0.12636, 0.03140, -0.03566, 0.10564, -0.10495, 0.00143, -0.01198, 0.03694, -0.03680, 0.00300, 0.00117, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, + 1.03163, -0.96880, -0.23422, 0.26615, 0.66798, -0.34516, -0.32245, 0.47232, 0.07054, -0.07049, 0.00175, 0.00210, 0.03103, -0.03041, -0.00452, 0.00425, 0.00336, -0.00327, -0.00022, 0.00074, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, + }; + std::vector expected_ntype = { + 1, 1, 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 1, 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 1, 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
+ 1, 1, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 1, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 1, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + }; + std::vector expected_nmask = { + 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }; + + void SetUp() override { + double box[] = {13., 0., 0., 0., 13., 0., 0., 0., 13.}; + region.reinitBox(box); + copy_coord(posi_cpy, f_atype_cpy, mapping, ncell, ngcell, posi, f_atype, rc, region); + nloc = posi.size() / 3; + nall = posi_cpy.size() / 3; + nat_stt.resize(3); + ext_stt.resize(3); + ext_end.resize(3); + for (int dd = 0; dd < 3; ++dd){ + ext_stt[dd] = -ngcell[dd]; + ext_end[dd] = ncell[dd] + ngcell[dd]; + } + build_nlist(nlist_a, nlist_r, posi, rc, rc, ncell, region); + build_nlist(nlist_a_cpy, nlist_r_cpy, posi_cpy, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region, ncell); + } + void TearDown() override { + } +}; + + +class TestEnvMatAMixShortSel : public ::testing::Test +{ +protected: + std::vector posi = {12.83, 2.56, 2.18, + 12.09, 2.87, 2.74, + 00.25, 3.32, 1.68, + 3.36, 3.00, 1.81, + 3.51, 2.51, 2.60, + 4.27, 3.22, 1.56 + }; + std::vector atype = {0, 1, 1, 0, 1, 1}; + std::vector f_atype = {0, 0, 0, 0, 0, 0}; + std::vector posi_cpy; +// std::vector atype_cpy; + std::vector f_atype_cpy; + int nloc, nall; + double rc = 6; + double rc_smth = 0.8; + SimulationRegion region; + std::vector mapping, ncell, ngcell; + std::vector sec_a = {0, 4}; + std::vector sec_r = {0, 0}; + std::vector nat_stt, ext_stt, ext_end; + std::vector> nlist_a, nlist_r; + std::vector> nlist_a_cpy, nlist_r_cpy; + std::vector> ntype; + int f_ntypes = 1; + int ntypes = 2; // this 
information normally comes from natoms or avg/std + int nnei = sec_a.back(); + int ndescrpt = nnei * 4; + std::vector expected_env = { + 1.02167, -0.77271, 0.32370, 0.58475, 0.99745, 0.41810, 0.75655, -0.49773, 0.12206, 0.12047, 0.01502, -0.01263, 0.10564, 0.10495, -0.00143, 0.01198, + 1.02167, 0.77271, -0.32370, -0.58475, 0.59220, 0.42028, 0.16304, -0.38405, 0.04135, 0.04039, 0.00123, -0.00880, 0.03694, 0.03680, -0.00300, -0.00117, + 0.99745, -0.41810, -0.75655, 0.49773, 0.59220, -0.42028, -0.16304, 0.38405, 0.19078, 0.18961, -0.01951, 0.00793, 0.13499, 0.12636, -0.03140, 0.03566, + 1.06176, 0.16913, -0.55250, 0.89077, 1.03163, 0.96880, 0.23422, -0.26615, 0.19078, -0.18961, 0.01951, -0.00793, 0.12206, -0.12047, -0.01502, 0.01263, + 1.06176, -0.16913, 0.55250, -0.89077, 0.66798, 0.34516, 0.32245, -0.47232, 0.13499, -0.12636, 0.03140, -0.03566, 0.10564, -0.10495, 0.00143, -0.01198, + 1.03163, -0.96880, -0.23422, 0.26615, 0.66798, -0.34516, -0.32245, 0.47232, 0.07054, -0.07049, 0.00175, 0.00210, 0.03103, -0.03041, -0.00452, 0.00425, + }; + std::vector expected_ntype = { + 1, 1, 0, 1, + 1, 1, 1, 0, + 0, 1, 0, 1, + 0, 1, 0, 1, + 0, 1, 1, 0, + 0, 1, 1, 0, + }; + std::vector expected_nmask = { + 1, 1, 1, 1, + 1, 1, 1, 1, + 1, 1, 1, 1, + 1, 1, 1, 1, + 1, 1, 1, 1, + 1, 1, 1, 1, + }; + + void SetUp() override { + double box[] = {13., 0., 0., 0., 13., 0., 0., 0., 13.}; + region.reinitBox(box); + copy_coord(posi_cpy, f_atype_cpy, mapping, ncell, ngcell, posi, f_atype, rc, region); + nloc = posi.size() / 3; + nall = posi_cpy.size() / 3; + nat_stt.resize(3); + ext_stt.resize(3); + ext_end.resize(3); + for (int dd = 0; dd < 3; ++dd){ + ext_stt[dd] = -ngcell[dd]; + ext_end[dd] = ncell[dd] + ngcell[dd]; + } + build_nlist(nlist_a, nlist_r, posi, rc, rc, ncell, region); + build_nlist(nlist_a_cpy, nlist_r_cpy, posi_cpy, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region, ncell); + } + void TearDown() override { + } +}; + + +TEST_F(TestEnvMatAMix, orig_cpy) +{ + std::vector 
fmt_nlist_a, fmt_nlist_r; + std::vector env, env_deriv, rij_a; + bool pbc = false; + for(int ii = 0; ii < nloc; ++ii){ + int ret = format_nlist_i_cpu(fmt_nlist_a, posi_cpy, f_atype_cpy, ii, nlist_a_cpy[ii], rc, sec_a); + EXPECT_EQ(ret, -1); + env_mat_a(env, env_deriv, rij_a, posi_cpy, f_ntypes, f_atype_cpy, region, pbc, ii, fmt_nlist_a, sec_a, rc_smth, rc); + EXPECT_EQ(env.size(), sec_a.back()*4); + EXPECT_EQ(env.size(), env_deriv.size()/3); + EXPECT_EQ(rij_a.size(), sec_a.back()*3); + for (int jj = 0; jj < sec_a.back(); ++jj){ + for (int dd = 0; dd < 4; ++dd){ + EXPECT_LT(fabs(env[jj*4+dd] - expected_env[ii*sec_a.back()*4 + jj*4 + dd]) , 1e-5); + } + } + // for (int jj = 0; jj < sec_a.back(); ++jj){ + // printf("%7.5f, %7.5f, %7.5f, %7.5f, ", env[jj*4+0], env[jj*4+1], env[jj*4+2], env[jj*4+3]); + // } + // printf("\n"); + } +} + +TEST_F(TestEnvMatAMix, orig_pbc) +{ + std::vector fmt_nlist_a, fmt_nlist_r; + std::vector env, env_deriv, rij_a; + bool pbc = true; + for(int ii = 0; ii < nloc; ++ii){ + int ret = format_nlist_i_fill_a(fmt_nlist_a, fmt_nlist_r, posi, f_ntypes, f_atype, region, pbc, ii, nlist_a[ii], nlist_r[ii], rc, sec_a, sec_r); + EXPECT_EQ(ret, -1); + env_mat_a(env, env_deriv, rij_a, posi, f_ntypes, f_atype, region, pbc, ii, fmt_nlist_a, sec_a, rc_smth, rc); + EXPECT_EQ(env.size(), sec_a.back()*4); + EXPECT_EQ(env.size(), env_deriv.size()/3); + EXPECT_EQ(rij_a.size(), sec_a.back()*3); + for (int jj = 0; jj < sec_a.back(); ++jj){ + for (int dd = 0; dd < 4; ++dd){ + EXPECT_LT(fabs(env[jj*4+dd] - expected_env[ii*sec_a.back()*4 + jj*4 + dd]) , 1e-5); + } + } + } +} + +TEST_F(TestEnvMatAMix, orig_cpy_equal_pbc) +{ + std::vector fmt_nlist_a_0, fmt_nlist_r_0; + std::vector fmt_nlist_a_1, fmt_nlist_r_1; + std::vector env_0, env_deriv_0, rij_a_0; + std::vector env_1, env_deriv_1, rij_a_1; + for(int ii = 0; ii < nloc; ++ii){ + int ret_0 = format_nlist_i_cpu(fmt_nlist_a_0, posi_cpy, f_atype_cpy, ii, nlist_a_cpy[ii], rc, sec_a); + EXPECT_EQ(ret_0, -1); + 
env_mat_a(env_0, env_deriv_0, rij_a_0, posi_cpy, f_ntypes, f_atype_cpy, region, false, ii, fmt_nlist_a_0, sec_a, rc_smth, rc); + int ret_1 = format_nlist_i_fill_a(fmt_nlist_a_1, fmt_nlist_r_1, posi, f_ntypes, f_atype, region, true, ii, nlist_a[ii], nlist_r[ii], rc, sec_a, sec_r); + EXPECT_EQ(ret_1, -1); + env_mat_a(env_1, env_deriv_1, rij_a_1, posi, f_ntypes, f_atype, region, true, ii, fmt_nlist_a_1, sec_a, rc_smth, rc); + EXPECT_EQ(env_0.size(), env_1.size()); + EXPECT_EQ(env_deriv_0.size(), env_deriv_1.size()); + EXPECT_EQ(rij_a_0.size(), rij_a_1.size()); + for (unsigned jj = 0; jj < env_0.size(); ++jj){ + EXPECT_LT(fabs(env_0[jj] - env_1[jj]), 1e-10); + } + for (unsigned jj = 0; jj < env_deriv_0.size(); ++jj){ + EXPECT_LT(fabs(env_deriv_0[jj] - env_deriv_1[jj]), 1e-10); + } + for (unsigned jj = 0; jj < rij_a_0.size(); ++jj){ + EXPECT_LT(fabs(rij_a_0[jj] - rij_a_1[jj]), 1e-10); + } + } +} + +TEST_F(TestEnvMatAMix, orig_cpy_num_deriv) +{ + std::vector fmt_nlist_a, fmt_nlist_r; + std::vector env, env_0, env_1, env_deriv, env_deriv_tmp, rij_a; + bool pbc = false; + double hh = 1e-5; + for(int ii = 0; ii < nloc; ++ii){ + int ret = format_nlist_i_cpu(fmt_nlist_a, posi_cpy, f_atype_cpy, ii, nlist_a_cpy[ii], rc, sec_a); + EXPECT_EQ(ret, -1); + env_mat_a(env, env_deriv, rij_a, posi_cpy, f_ntypes, f_atype_cpy, region, pbc, ii, fmt_nlist_a, sec_a, rc_smth, rc); + EXPECT_EQ(env.size(), sec_a.back()*4); + EXPECT_EQ(env.size(), env_deriv.size()/3); + EXPECT_EQ(rij_a.size(), sec_a.back()*3); + + for (int jj = 0; jj < sec_a.back(); ++jj){ + int j_idx = fmt_nlist_a[jj]; + if (j_idx < 0) continue; + for (int kk = 0; kk < 4; ++kk){ + for (int dd = 0; dd < 3; ++dd){ + std::vector posi_0 = posi_cpy; + std::vector posi_1 = posi_cpy; + posi_0[j_idx*3+dd] -= hh; + posi_1[j_idx*3+dd] += hh; + env_mat_a(env_0, env_deriv_tmp, rij_a, posi_0, f_ntypes, f_atype_cpy, region, pbc, ii, fmt_nlist_a, sec_a, rc_smth, rc); + env_mat_a(env_1, env_deriv_tmp, rij_a, posi_1, f_ntypes, f_atype_cpy, 
region, pbc, ii, fmt_nlist_a, sec_a, rc_smth, rc); + double num_deriv = (env_1[jj*4+kk] - env_0[jj*4+kk])/(2.*hh); + double ana_deriv = -env_deriv[jj*12+kk*3+dd]; + EXPECT_LT(fabs(num_deriv - ana_deriv), 1e-5); + } + } + } + // for (int jj = 0; jj < sec_a.back(); ++jj){ + // printf("%7.5f, %7.5f, %7.5f, %7.5f, ", env[jj*4+0], env[jj*4+1], env[jj*4+2], env[jj*4+3]); + // } + // printf("\n"); + } +} + +TEST_F(TestEnvMatAMix, cpu) +{ + std::vector fmt_nlist_a, fmt_nlist_r; + std::vector env, env_deriv, rij_a; + bool pbc = false; + for(int ii = 0; ii < nloc; ++ii){ + int ret = format_nlist_i_cpu(fmt_nlist_a, posi_cpy, f_atype_cpy, ii, nlist_a_cpy[ii], rc, sec_a); + EXPECT_EQ(ret, -1); + deepmd::env_mat_a_cpu(env, env_deriv, rij_a, posi_cpy, f_atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc); + EXPECT_EQ(env.size(), sec_a.back()*4); + EXPECT_EQ(env.size(), env_deriv.size()/3); + EXPECT_EQ(rij_a.size(), sec_a.back()*3); + for (int jj = 0; jj < sec_a.back(); ++jj){ + for (int dd = 0; dd < 4; ++dd){ + EXPECT_LT(fabs(env[jj*4+dd] - expected_env[ii*sec_a.back()*4 + jj*4 + dd]) , 1e-5); + } + } + } +} + +TEST_F(TestEnvMatAMix, cpu_equal_orig_cpy) +{ + std::vector fmt_nlist_a_0, fmt_nlist_r_0; + std::vector fmt_nlist_a_1, fmt_nlist_r_1; + std::vector env_0, env_deriv_0, rij_a_0; + std::vector env_1, env_deriv_1, rij_a_1; + for(int ii = 0; ii < nloc; ++ii){ + int ret_0 = format_nlist_i_cpu(fmt_nlist_a_0, posi_cpy, f_atype_cpy, ii, nlist_a_cpy[ii], rc, sec_a); + EXPECT_EQ(ret_0, -1); + env_mat_a(env_0, env_deriv_0, rij_a_0, posi_cpy, f_ntypes, f_atype_cpy, region, false, ii, fmt_nlist_a_0, sec_a, rc_smth, rc); + + int ret_1 = format_nlist_i_cpu(fmt_nlist_a_1, posi_cpy, f_atype_cpy, ii, nlist_a_cpy[ii], rc, sec_a); + + EXPECT_EQ(ret_1, -1); + deepmd::env_mat_a_cpu(env_1, env_deriv_1, rij_a_1, posi_cpy, f_atype_cpy, ii, fmt_nlist_a_1, sec_a, rc_smth, rc); + + EXPECT_EQ(env_0.size(), env_1.size()); + EXPECT_EQ(env_deriv_0.size(), env_deriv_1.size()); + EXPECT_EQ(rij_a_0.size(), 
rij_a_1.size()); + for (unsigned jj = 0; jj < env_0.size(); ++jj){ + EXPECT_LT(fabs(env_0[jj] - env_1[jj]), 1e-10); + } + for (unsigned jj = 0; jj < env_deriv_0.size(); ++jj){ + EXPECT_LT(fabs(env_deriv_0[jj] - env_deriv_1[jj]), 1e-10); + } + for (unsigned jj = 0; jj < rij_a_0.size(); ++jj){ + EXPECT_LT(fabs(rij_a_0[jj] - rij_a_1[jj]), 1e-10); + } + } +} + +TEST_F(TestEnvMatAMix, cpu_num_deriv) +{ + std::vector fmt_nlist_a, fmt_nlist_r; + std::vector env, env_0, env_1, env_deriv, env_deriv_tmp, rij_a; + bool pbc = false; + double hh = 1e-5; + for(int ii = 0; ii < nloc; ++ii){ + int ret = format_nlist_i_cpu(fmt_nlist_a, posi_cpy, f_atype_cpy, ii, nlist_a_cpy[ii], rc, sec_a); + EXPECT_EQ(ret, -1); + deepmd::env_mat_a_cpu(env, env_deriv, rij_a, posi_cpy, f_atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc); + EXPECT_EQ(env.size(), sec_a.back()*4); + EXPECT_EQ(env.size(), env_deriv.size()/3); + EXPECT_EQ(rij_a.size(), sec_a.back()*3); + + for (int jj = 0; jj < sec_a.back(); ++jj){ + int j_idx = fmt_nlist_a[jj]; + if (j_idx < 0) continue; + for (int kk = 0; kk < 4; ++kk){ + for (int dd = 0; dd < 3; ++dd){ + std::vector posi_0 = posi_cpy; + std::vector posi_1 = posi_cpy; + posi_0[j_idx*3+dd] -= hh; + posi_1[j_idx*3+dd] += hh; + env_mat_a(env_0, env_deriv_tmp, rij_a, posi_0, f_ntypes, f_atype_cpy, region, pbc, ii, fmt_nlist_a, sec_a, rc_smth, rc); + env_mat_a(env_1, env_deriv_tmp, rij_a, posi_1, f_ntypes, f_atype_cpy, region, pbc, ii, fmt_nlist_a, sec_a, rc_smth, rc); + double num_deriv = (env_1[jj*4+kk] - env_0[jj*4+kk])/(2.*hh); + double ana_deriv = -env_deriv[jj*12+kk*3+dd]; + EXPECT_LT(fabs(num_deriv - ana_deriv), 1e-5); + } + } + } + // for (int jj = 0; jj < sec_a.back(); ++jj){ + // printf("%7.5f, %7.5f, %7.5f, %7.5f, ", env[jj*4+0], env[jj*4+1], env[jj*4+2], env[jj*4+3]); + // } + // printf("\n"); + } +} + +TEST_F(TestEnvMatAMixShortSel, orig_cpy) +{ + std::vector fmt_nlist_a, fmt_nlist_r; + std::vector env, env_deriv, rij_a; + bool pbc = false; + for(int ii = 0; ii < 
nloc; ++ii){ + int ret = format_nlist_i_cpu(fmt_nlist_a, posi_cpy, f_atype_cpy, ii, nlist_a_cpy[ii], rc, sec_a); + EXPECT_EQ(ret, 0); + env_mat_a(env, env_deriv, rij_a, posi_cpy, f_ntypes, f_atype_cpy, region, pbc, ii, fmt_nlist_a, sec_a, rc_smth, rc); + EXPECT_EQ(env.size(), sec_a.back()*4); + EXPECT_EQ(env.size(), env_deriv.size()/3); + EXPECT_EQ(rij_a.size(), sec_a.back()*3); + for (int jj = 0; jj < sec_a.back(); ++jj){ + for (int dd = 0; dd < 4; ++dd){ + EXPECT_LT(fabs(env[jj*4+dd] - expected_env[ii*sec_a.back()*4 + jj*4 + dd]) , 1e-5); + } + } + // for (int jj = 0; jj < sec_a.back(); ++jj){ + // printf("%8.5f, %8.5f, %8.5f, %8.5f, ", env[jj*4+0], env[jj*4+1], env[jj*4+2], env[jj*4+3]); + // } + // printf("\n"); + } +} + +TEST_F(TestEnvMatAMixShortSel, orig_pbc) +{ + std::vector fmt_nlist_a, fmt_nlist_r; + std::vector env, env_deriv, rij_a; + bool pbc = true; + for(int ii = 0; ii < nloc; ++ii){ + int ret = format_nlist_i_fill_a(fmt_nlist_a, fmt_nlist_r, posi, f_ntypes, f_atype, region, pbc, ii, nlist_a[ii], nlist_r[ii], rc, sec_a, sec_r); + EXPECT_EQ(ret, 0); + env_mat_a(env, env_deriv, rij_a, posi, f_ntypes, f_atype, region, pbc, ii, fmt_nlist_a, sec_a, rc_smth, rc); + EXPECT_EQ(env.size(), sec_a.back()*4); + EXPECT_EQ(env.size(), env_deriv.size()/3); + EXPECT_EQ(rij_a.size(), sec_a.back()*3); + for (int jj = 0; jj < sec_a.back(); ++jj){ + for (int dd = 0; dd < 4; ++dd){ + EXPECT_LT(fabs(env[jj*4+dd] - expected_env[ii*sec_a.back()*4 + jj*4 + dd]) , 1e-5); + } + } + } +} + +TEST_F(TestEnvMatAMixShortSel, cpu) +{ + std::vector fmt_nlist_a, fmt_nlist_r; + std::vector env, env_deriv, rij_a; + bool pbc = false; + for(int ii = 0; ii < nloc; ++ii){ + int ret = format_nlist_i_cpu(fmt_nlist_a, posi_cpy, f_atype_cpy, ii, nlist_a_cpy[ii], rc, sec_a); + EXPECT_EQ(ret, 0); + deepmd::env_mat_a_cpu(env, env_deriv, rij_a, posi_cpy, f_atype_cpy, ii, fmt_nlist_a, sec_a, rc_smth, rc); + EXPECT_EQ(env.size(), sec_a.back()*4); + EXPECT_EQ(env.size(), env_deriv.size()/3); + 
EXPECT_EQ(rij_a.size(), sec_a.back()*3); + for (int jj = 0; jj < sec_a.back(); ++jj){ + for (int dd = 0; dd < 4; ++dd){ + EXPECT_LT(fabs(env[jj*4+dd] - expected_env[ii*sec_a.back()*4 + jj*4 + dd]) , 1e-5); + } + } + } +} + +TEST_F(TestEnvMatAMix, prod_cpu) +{ + EXPECT_EQ(nlist_r_cpy.size(), nloc); + int tot_nnei = 0; + int max_nbor_size = 0; + for(int ii = 0; ii < nlist_a_cpy.size(); ++ii){ + tot_nnei += nlist_a_cpy[ii].size(); + if (nlist_a_cpy[ii].size() > max_nbor_size){ + max_nbor_size = nlist_a_cpy[ii].size(); + } + } + std::vector ilist(nloc), numneigh(nloc); + std::vector firstneigh(nloc); + deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]); + deepmd::convert_nlist(inlist, nlist_a_cpy); + + std::vector em(nloc * ndescrpt), em_deriv(nloc * ndescrpt * 3), rij(nloc * nnei * 3); + std::vector nlist(nloc * nnei); + std::vector ntype(nloc * nnei); + bool * nmask = new bool [nloc * nnei]; + memset(nmask, 0, sizeof(bool) * nloc * nnei); + std::vector avg(ntypes * ndescrpt, 0); + std::vector std(ntypes * ndescrpt, 1); + deepmd::prod_env_mat_a_cpu( + &em[0], + &em_deriv[0], + &rij[0], + &nlist[0], + &posi_cpy[0], + &atype[0], + inlist, + max_nbor_size, + &avg[0], + &std[0], + nloc, + nall, + rc, + rc_smth, + sec_a, + &f_atype_cpy[0]); + deepmd::use_nei_info_cpu( + &nlist[0], + &ntype[0], + nmask, + &atype[0], + &mapping[0], + nloc, + nnei, + ntypes, + true); + + for(int ii = 0; ii < nloc; ++ii){ + for (int jj = 0; jj < nnei; ++jj){ + for (int dd = 0; dd < 4; ++dd){ + EXPECT_LT(fabs(em[ii*nnei*4 + jj*4 + dd] - + expected_env[ii*nnei*4 + jj*4 + dd]) , + 1e-5); + } + EXPECT_EQ(ntype[ii*nnei+jj], expected_ntype[ii*nnei+jj]); + EXPECT_EQ(nmask[ii*nnei+jj], expected_nmask[ii*nnei+jj]); + } + } + free(nmask); +} + +TEST_F(TestEnvMatAMix, prod_cpu_equal_cpu) +{ + EXPECT_EQ(nlist_r_cpy.size(), nloc); + int tot_nnei = 0; + int max_nbor_size = 0; + for(int ii = 0; ii < nlist_a_cpy.size(); ++ii){ + tot_nnei += nlist_a_cpy[ii].size(); + if 
(nlist_a_cpy[ii].size() > max_nbor_size){ + max_nbor_size = nlist_a_cpy[ii].size(); + } + } + std::vector ilist(nloc), numneigh(nloc); + std::vector firstneigh(nloc); + deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]); + convert_nlist(inlist, nlist_a_cpy); + std::vector em(nloc * ndescrpt), em_deriv(nloc * ndescrpt * 3), rij(nloc * nnei * 3); + std::vector nlist(nloc * nnei); + std::vector avg(ntypes * ndescrpt, 0); + std::vector std(ntypes * ndescrpt, 1); + deepmd::prod_env_mat_a_cpu( + &em[0], + &em_deriv[0], + &rij[0], + &nlist[0], + &posi_cpy[0], + &atype[0], + inlist, + max_nbor_size, + &avg[0], + &std[0], + nloc, + nall, + rc, + rc_smth, + sec_a, + &f_atype_cpy[0]); + + std::vector fmt_nlist_a_1, fmt_nlist_r_1; + std::vector env_1, env_deriv_1, rij_a_1; + for(int ii = 0; ii < nloc; ++ii){ + int ret_1 = format_nlist_i_cpu(fmt_nlist_a_1, posi_cpy, f_atype_cpy, ii, nlist_a_cpy[ii], rc, sec_a); + EXPECT_EQ(ret_1, -1); + deepmd::env_mat_a_cpu(env_1, env_deriv_1, rij_a_1, posi_cpy, f_atype_cpy, ii, fmt_nlist_a_1, sec_a, rc_smth, rc); + EXPECT_EQ(env_1.size(), nnei * 4); + EXPECT_EQ(env_deriv_1.size(), nnei * 4 * 3); + EXPECT_EQ(rij_a_1.size(), nnei * 3); + EXPECT_EQ(fmt_nlist_a_1.size(), nnei); + EXPECT_EQ(env_1.size() * nloc, em.size()); + EXPECT_EQ(env_deriv_1.size() * nloc, em_deriv.size()); + EXPECT_EQ(rij_a_1.size() * nloc, rij.size()); + EXPECT_EQ(fmt_nlist_a_1.size() * nloc, nlist.size()); + for (unsigned jj = 0; jj < env_1.size(); ++jj){ + EXPECT_LT(fabs(em[ii*nnei*4+jj] - env_1[jj]), 1e-10); + } + for (unsigned jj = 0; jj < env_deriv_1.size(); ++jj){ + EXPECT_LT(fabs(em_deriv[ii*nnei*4*3+jj] - env_deriv_1[jj]), 1e-10); + } + for (unsigned jj = 0; jj < rij_a_1.size(); ++jj){ + EXPECT_LT(fabs(rij[ii*nnei*3+jj] - rij_a_1[jj]), 1e-10); + } + for (unsigned jj = 0; jj < fmt_nlist_a_1.size(); ++jj){ + EXPECT_EQ(nlist[ii*nnei+jj], fmt_nlist_a_1[jj]); + } + } + + // for(int ii = 0; ii < nloc; ++ii){ + // for (int jj = 0; jj < nnei; ++jj){ + 
// for (int dd = 0; dd < 4; ++dd){ + // EXPECT_LT(fabs(em[ii*nnei*4 + jj*4 + dd] - + // expected_env[ii*nnei*4 + jj*4 + dd]) , + // 1e-5); + // } + // } + // } +} + + +#if GOOGLE_CUDA +TEST_F(TestEnvMatAMix, prod_gpu_cuda) +{ + EXPECT_EQ(nlist_r_cpy.size(), nloc); + int tot_nnei = 0; + int max_nbor_size = 0; + for(int ii = 0; ii < nlist_a_cpy.size(); ++ii){ + tot_nnei += nlist_a_cpy[ii].size(); + if (nlist_a_cpy[ii].size() > max_nbor_size){ + max_nbor_size = nlist_a_cpy[ii].size(); + } + } + assert(max_nbor_size <= GPU_MAX_NBOR_SIZE); + if (max_nbor_size <= 1024) { + max_nbor_size = 1024; + } + else if (max_nbor_size <= 2048) { + max_nbor_size = 2048; + } + else { + max_nbor_size = 4096; + } + std::vector ilist(nloc), numneigh(nloc); + std::vector firstneigh(nloc); + deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist; + convert_nlist(inlist, nlist_a_cpy); + std::vector em(nloc * ndescrpt, 0.0), em_deriv(nloc * ndescrpt * 3, 0.0), rij(nloc * nnei * 3, 0.0); + std::vector nlist(nloc * nnei, 0); + std::vector ntype(nloc * nnei, 0); + bool * nmask = new bool [nloc * nnei]; + memset(nmask, 0, sizeof(bool) * nloc * nnei); + std::vector avg(ntypes * ndescrpt, 0); + std::vector std(ntypes * ndescrpt, 1); + + double * em_dev = NULL, * em_deriv_dev = NULL, * rij_dev = NULL; + bool * nmask_dev = NULL; + double * posi_cpy_dev = NULL, * avg_dev = NULL, * std_dev = NULL; + int * f_atype_cpy_dev = NULL, * atype_dev = NULL, * nlist_dev = NULL, * ntype_dev = NULL, * mapping_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL; + uint_64 * array_longlong_dev = NULL; + deepmd::malloc_device_memory_sync(em_dev, em); + deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv); + deepmd::malloc_device_memory_sync(rij_dev, rij); + deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy); + deepmd::malloc_device_memory_sync(avg_dev, avg); + deepmd::malloc_device_memory_sync(std_dev, std); + deepmd::malloc_device_memory_sync(f_atype_cpy_dev, 
f_atype_cpy); + deepmd::malloc_device_memory_sync(atype_dev, atype); + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory_sync(ntype_dev, ntype); + deepmd::malloc_device_memory_sync(mapping_dev, mapping); + deepmd::malloc_device_memory_sync(nmask_dev, nmask, nloc * nnei); + deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); + deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); + deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); + + deepmd::prod_env_mat_a_gpu_cuda( + em_dev, + em_deriv_dev, + rij_dev, + nlist_dev, + posi_cpy_dev, + atype_dev, + gpu_inlist, + array_int_dev, + array_longlong_dev, + max_nbor_size, + avg_dev, + std_dev, + nloc, + nall, + rc, + rc_smth, + sec_a, + f_atype_cpy_dev); + + deepmd::use_nei_info_gpu( + nlist_dev, + ntype_dev, + nmask_dev, + atype_dev, + mapping_dev, + nloc, + nnei, + ntypes, + true); + deepmd::memcpy_device_to_host(em_dev, em); + deepmd::memcpy_device_to_host(ntype_dev, ntype); + deepmd::memcpy_device_to_host(nmask_dev, nmask, nloc * nnei); + deepmd::delete_device_memory(em_dev); + deepmd::delete_device_memory(em_deriv_dev); + deepmd::delete_device_memory(rij_dev); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(ntype_dev); + deepmd::delete_device_memory(nmask_dev); + deepmd::delete_device_memory(posi_cpy_dev); + deepmd::delete_device_memory(f_atype_cpy_dev); + deepmd::delete_device_memory(atype_dev); + deepmd::delete_device_memory(mapping_dev); + deepmd::delete_device_memory(array_int_dev); + deepmd::delete_device_memory(array_longlong_dev); + deepmd::delete_device_memory(avg_dev); + deepmd::delete_device_memory(std_dev); + deepmd::delete_device_memory(memory_dev); + deepmd::free_nlist_gpu_device(gpu_inlist); + + for(int ii = 0; ii < nloc; ++ii){ + for (int jj = 0; jj < nnei; ++jj){ + for (int dd = 0; dd < 4; 
++dd){ + EXPECT_LT(fabs(em[ii*nnei*4 + jj*4 + dd] - + expected_env[ii*nnei*4 + jj*4 + dd]) , + 1e-5); + } + EXPECT_EQ(ntype[ii*nnei+jj], expected_ntype[ii*nnei+jj]); + EXPECT_EQ(nmask[ii*nnei+jj], expected_nmask[ii*nnei+jj]); + } + } + free(nmask); +} + + +TEST_F(TestEnvMatAMix, prod_gpu_cuda_equal_cpu) +{ + EXPECT_EQ(nlist_r_cpy.size(), nloc); + int tot_nnei = 0; + int max_nbor_size = 0; + for(int ii = 0; ii < nlist_a_cpy.size(); ++ii){ + tot_nnei += nlist_a_cpy[ii].size(); + if (nlist_a_cpy[ii].size() > max_nbor_size){ + max_nbor_size = nlist_a_cpy[ii].size(); + } + } + assert(max_nbor_size <= GPU_MAX_NBOR_SIZE); + if (max_nbor_size <= 1024) { + max_nbor_size = 1024; + } + else if (max_nbor_size <= 2048) { + max_nbor_size = 2048; + } + else { + max_nbor_size = 4096; + } + std::vector ilist(nloc), numneigh(nloc); + std::vector firstneigh(nloc); + deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist; + convert_nlist(inlist, nlist_a_cpy); + std::vector em(nloc * ndescrpt, 0.0), em_deriv(nloc * ndescrpt * 3, 0.0), rij(nloc * nnei * 3, 0.0); + std::vector nlist(nloc * nnei, 0); + std::vector avg(ntypes * ndescrpt, 0); + std::vector std(ntypes * ndescrpt, 1); + + double * em_dev = NULL, * em_deriv_dev = NULL, * rij_dev = NULL; + double * posi_cpy_dev = NULL, * avg_dev = NULL, * std_dev = NULL; + int * f_atype_cpy_dev = NULL, * atype_dev = NULL, * nlist_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL; + uint_64 * array_longlong_dev = NULL; + deepmd::malloc_device_memory_sync(em_dev, em); + deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv); + deepmd::malloc_device_memory_sync(rij_dev, rij); + deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy); + deepmd::malloc_device_memory_sync(avg_dev, avg); + deepmd::malloc_device_memory_sync(std_dev, std); + + deepmd::malloc_device_memory_sync(f_atype_cpy_dev, f_atype_cpy); + deepmd::malloc_device_memory_sync(atype_dev, atype); + deepmd::malloc_device_memory_sync(nlist_dev, 
nlist); + deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); + deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); + deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); + + deepmd::prod_env_mat_a_gpu_cuda( + em_dev, + em_deriv_dev, + rij_dev, + nlist_dev, + posi_cpy_dev, + atype_dev, + gpu_inlist, + array_int_dev, + array_longlong_dev, + max_nbor_size, + avg_dev, + std_dev, + nloc, + nall, + rc, + rc_smth, + sec_a, + f_atype_cpy_dev); + deepmd::memcpy_device_to_host(em_dev, em); + deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv); + deepmd::memcpy_device_to_host(rij_dev, rij); + deepmd::memcpy_device_to_host(nlist_dev, nlist); + deepmd::delete_device_memory(em_dev); + deepmd::delete_device_memory(em_deriv_dev); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(posi_cpy_dev); + deepmd::delete_device_memory(f_atype_cpy_dev); + deepmd::delete_device_memory(atype_dev); + deepmd::delete_device_memory(array_int_dev); + deepmd::delete_device_memory(array_longlong_dev); + deepmd::delete_device_memory(avg_dev); + deepmd::delete_device_memory(std_dev); + deepmd::delete_device_memory(memory_dev); + deepmd::free_nlist_gpu_device(gpu_inlist); + + std::vector fmt_nlist_a_1, fmt_nlist_r_1; + std::vector env_1, env_deriv_1, rij_a_1; + for(int ii = 0; ii < nloc; ++ii){ + int ret_1 = format_nlist_i_cpu(fmt_nlist_a_1, posi_cpy, f_atype_cpy, ii, nlist_a_cpy[ii], rc, sec_a); + EXPECT_EQ(ret_1, -1); + deepmd::env_mat_a_cpu(env_1, env_deriv_1, rij_a_1, posi_cpy, f_atype_cpy, ii, fmt_nlist_a_1, sec_a, rc_smth, rc); + EXPECT_EQ(env_1.size(), nnei * 4); + EXPECT_EQ(env_deriv_1.size(), nnei * 4 * 3); + EXPECT_EQ(rij_a_1.size(), nnei * 3); + EXPECT_EQ(fmt_nlist_a_1.size(), nnei); + EXPECT_EQ(env_1.size() * nloc, em.size()); + EXPECT_EQ(env_deriv_1.size() * nloc, em_deriv.size()); + EXPECT_EQ(rij_a_1.size() * 
nloc, rij.size()); + EXPECT_EQ(fmt_nlist_a_1.size() * nloc, nlist.size()); + for (unsigned jj = 0; jj < env_1.size(); ++jj){ + EXPECT_LT(fabs(em[ii*nnei*4+jj] - env_1[jj]), 1e-10); + } + for (unsigned jj = 0; jj < env_deriv_1.size(); ++jj){ + EXPECT_LT(fabs(em_deriv[ii*nnei*4*3+jj] - env_deriv_1[jj]), 1e-10); + } + for (unsigned jj = 0; jj < rij_a_1.size(); ++jj){ + EXPECT_LT(fabs(rij[ii*nnei*3+jj] - rij_a_1[jj]), 1e-10); + } + for (unsigned jj = 0; jj < fmt_nlist_a_1.size(); ++jj){ + EXPECT_EQ(nlist[ii*nnei+jj], fmt_nlist_a_1[jj]); + } + } + + for(int ii = 0; ii < nloc; ++ii){ + for (int jj = 0; jj < nnei; ++jj){ + for (int dd = 0; dd < 4; ++dd){ + EXPECT_LT(fabs(em[ii*nnei*4 + jj*4 + dd] - + expected_env[ii*nnei*4 + jj*4 + dd]) , + 1e-5); + } + } + } +} +#endif //GOOGLE_CUDA + +#if TENSORFLOW_USE_ROCM +TEST_F(TestEnvMatAMix, prod_gpu_rocm) +{ + EXPECT_EQ(nlist_r_cpy.size(), nloc); + int tot_nnei = 0; + int max_nbor_size = 0; + for(int ii = 0; ii < nlist_a_cpy.size(); ++ii){ + tot_nnei += nlist_a_cpy[ii].size(); + if (nlist_a_cpy[ii].size() > max_nbor_size){ + max_nbor_size = nlist_a_cpy[ii].size(); + } + } + assert(max_nbor_size <= GPU_MAX_NBOR_SIZE); + if (max_nbor_size <= 1024) { + max_nbor_size = 1024; + } + else if (max_nbor_size <= 2048) { + max_nbor_size = 2048; + } + else { + max_nbor_size = 4096; + } + std::vector ilist(nloc), numneigh(nloc); + std::vector firstneigh(nloc); + deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist; + convert_nlist(inlist, nlist_a_cpy); + std::vector em(nloc * ndescrpt, 0.0), em_deriv(nloc * ndescrpt * 3, 0.0), rij(nloc * nnei * 3, 0.0); + std::vector nlist(nloc * nnei, 0); + std::vector ntype(nloc * nnei, 0); + bool * nmask = new bool [nloc * nnei]; + memset(nmask, 0, sizeof(bool) * nloc * nnei); + std::vector avg(ntypes * ndescrpt, 0); + std::vector std(ntypes * ndescrpt, 1); + + double * em_dev = NULL, * em_deriv_dev = NULL, * rij_dev = NULL, * nmask_dev = NULL; + double * posi_cpy_dev = 
NULL, * avg_dev = NULL, * std_dev = NULL; + int * f_atype_cpy_dev = NULL, * atype_dev = NULL, * nlist_dev = NULL, * ntype_dev = NULL, * mapping_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL; + uint_64 * array_longlong_dev = NULL; + deepmd::malloc_device_memory_sync(em_dev, em); + deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv); + deepmd::malloc_device_memory_sync(rij_dev, rij); + deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy); + deepmd::malloc_device_memory_sync(avg_dev, avg); + deepmd::malloc_device_memory_sync(std_dev, std); + deepmd::malloc_device_memory_sync(f_atype_cpy_dev, f_atype_cpy); + deepmd::malloc_device_memory_sync(atype_dev, atype); + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory_sync(ntype_dev, ntype); + deepmd::malloc_device_memory_sync(mapping_dev, mapping); + deepmd::malloc_device_memory_sync(nmask_dev, nmask, nloc * nnei); + deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); + deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); + deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); + + deepmd::prod_env_mat_a_gpu_rocm( + em_dev, + em_deriv_dev, + rij_dev, + nlist_dev, + posi_cpy_dev, + atype_dev, + gpu_inlist, + array_int_dev, + array_longlong_dev, + max_nbor_size, + avg_dev, + std_dev, + nloc, + nall, + rc, + rc_smth, + sec_a, + f_atype_cpy_dev); + + deepmd::use_nei_info_gpu_rocm( + nlist_dev, + ntype_dev, + nmask_dev, + atype_dev, + mapping_dev, + nloc, + nnei, + ntypes, + true); + deepmd::memcpy_device_to_host(em_dev, em); + deepmd::memcpy_device_to_host(ntype_dev, ntype); + deepmd::memcpy_device_to_host(nmask_dev, nmask, nloc * nnei); + deepmd::delete_device_memory(em_dev); + deepmd::delete_device_memory(em_deriv_dev); + deepmd::delete_device_memory(rij_dev); + deepmd::delete_device_memory(nlist_dev); + 
deepmd::delete_device_memory(ntype_dev); + deepmd::delete_device_memory(nmask_dev); + deepmd::delete_device_memory(posi_cpy_dev); + deepmd::delete_device_memory(f_atype_cpy_dev); + deepmd::delete_device_memory(atype_dev); + deepmd::delete_device_memory(mapping_dev); + deepmd::delete_device_memory(array_int_dev); + deepmd::delete_device_memory(array_longlong_dev); + deepmd::delete_device_memory(avg_dev); + deepmd::delete_device_memory(std_dev); + deepmd::delete_device_memory(memory_dev); + deepmd::free_nlist_gpu_device(gpu_inlist); + + for(int ii = 0; ii < nloc; ++ii){ + for (int jj = 0; jj < nnei; ++jj){ + for (int dd = 0; dd < 4; ++dd){ + EXPECT_LT(fabs(em[ii*nnei*4 + jj*4 + dd] - + expected_env[ii*nnei*4 + jj*4 + dd]) , + 1e-5); + } + EXPECT_EQ(ntype[ii*nnei+jj], expected_ntype[ii*nnei+jj]); + EXPECT_EQ(nmask[ii*nnei+jj], expected_nmask[ii*nnei+jj]); + } + } + free(nmask); +} + + +TEST_F(TestEnvMatAMix, prod_gpu_rocm_equal_cpu) +{ + EXPECT_EQ(nlist_r_cpy.size(), nloc); + int tot_nnei = 0; + int max_nbor_size = 0; + for(int ii = 0; ii < nlist_a_cpy.size(); ++ii){ + tot_nnei += nlist_a_cpy[ii].size(); + if (nlist_a_cpy[ii].size() > max_nbor_size){ + max_nbor_size = nlist_a_cpy[ii].size(); + } + } + assert(max_nbor_size <= GPU_MAX_NBOR_SIZE); + if (max_nbor_size <= 1024) { + max_nbor_size = 1024; + } + else if (max_nbor_size <= 2048) { + max_nbor_size = 2048; + } + else { + max_nbor_size = 4096; + } + std::vector ilist(nloc), numneigh(nloc); + std::vector firstneigh(nloc); + deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist; + convert_nlist(inlist, nlist_a_cpy); + std::vector em(nloc * ndescrpt, 0.0), em_deriv(nloc * ndescrpt * 3, 0.0), rij(nloc * nnei * 3, 0.0); + std::vector nlist(nloc * nnei, 0); + std::vector avg(ntypes * ndescrpt, 0); + std::vector std(ntypes * ndescrpt, 1); + + double * em_dev = NULL, * em_deriv_dev = NULL, * rij_dev = NULL; + double * posi_cpy_dev = NULL, * avg_dev = NULL, * std_dev = NULL; + int * 
f_atype_cpy_dev = NULL, * atype_dev = NULL, * nlist_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL; + uint_64 * array_longlong_dev = NULL; + deepmd::malloc_device_memory_sync(em_dev, em); + deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv); + deepmd::malloc_device_memory_sync(rij_dev, rij); + deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy); + deepmd::malloc_device_memory_sync(avg_dev, avg); + deepmd::malloc_device_memory_sync(std_dev, std); + + deepmd::malloc_device_memory_sync(f_atype_cpy_dev, f_atype_cpy); + deepmd::malloc_device_memory_sync(atype_dev, atype); + deepmd::malloc_device_memory_sync(nlist_dev, nlist); + deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc); + deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2); + deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size); + deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); + + deepmd::prod_env_mat_a_gpu_rocm( + em_dev, + em_deriv_dev, + rij_dev, + nlist_dev, + posi_cpy_dev, + atype_dev, + gpu_inlist, + array_int_dev, + array_longlong_dev, + max_nbor_size, + avg_dev, + std_dev, + nloc, + nall, + rc, + rc_smth, + sec_a, + f_atype_cpy_dev); + deepmd::memcpy_device_to_host(em_dev, em); + deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv); + deepmd::memcpy_device_to_host(rij_dev, rij); + deepmd::memcpy_device_to_host(nlist_dev, nlist); + deepmd::delete_device_memory(em_dev); + deepmd::delete_device_memory(em_deriv_dev); + deepmd::delete_device_memory(nlist_dev); + deepmd::delete_device_memory(posi_cpy_dev); + deepmd::delete_device_memory(f_atype_cpy_dev); + deepmd::delete_device_memory(atype_dev); + deepmd::delete_device_memory(array_int_dev); + deepmd::delete_device_memory(array_longlong_dev); + deepmd::delete_device_memory(avg_dev); + deepmd::delete_device_memory(std_dev); + deepmd::delete_device_memory(memory_dev); + deepmd::free_nlist_gpu_device(gpu_inlist); + + std::vector 
fmt_nlist_a_1, fmt_nlist_r_1; + std::vector env_1, env_deriv_1, rij_a_1; + for(int ii = 0; ii < nloc; ++ii){ + int ret_1 = format_nlist_i_cpu(fmt_nlist_a_1, posi_cpy, f_atype_cpy, ii, nlist_a_cpy[ii], rc, sec_a); + EXPECT_EQ(ret_1, -1); + deepmd::env_mat_a_cpu(env_1, env_deriv_1, rij_a_1, posi_cpy, f_atype_cpy, ii, fmt_nlist_a_1, sec_a, rc_smth, rc); + EXPECT_EQ(env_1.size(), nnei * 4); + EXPECT_EQ(env_deriv_1.size(), nnei * 4 * 3); + EXPECT_EQ(rij_a_1.size(), nnei * 3); + EXPECT_EQ(fmt_nlist_a_1.size(), nnei); + EXPECT_EQ(env_1.size() * nloc, em.size()); + EXPECT_EQ(env_deriv_1.size() * nloc, em_deriv.size()); + EXPECT_EQ(rij_a_1.size() * nloc, rij.size()); + EXPECT_EQ(fmt_nlist_a_1.size() * nloc, nlist.size()); + for (unsigned jj = 0; jj < env_1.size(); ++jj){ + EXPECT_LT(fabs(em[ii*nnei*4+jj] - env_1[jj]), 1e-10); + } + for (unsigned jj = 0; jj < env_deriv_1.size(); ++jj){ + EXPECT_LT(fabs(em_deriv[ii*nnei*4*3+jj] - env_deriv_1[jj]), 1e-10); + } + for (unsigned jj = 0; jj < rij_a_1.size(); ++jj){ + EXPECT_LT(fabs(rij[ii*nnei*3+jj] - rij_a_1[jj]), 1e-10); + } + for (unsigned jj = 0; jj < fmt_nlist_a_1.size(); ++jj){ + EXPECT_EQ(nlist[ii*nnei+jj], fmt_nlist_a_1[jj]); + } + } + + for(int ii = 0; ii < nloc; ++ii){ + for (int jj = 0; jj < nnei; ++jj){ + for (int dd = 0; dd < 4; ++dd){ + EXPECT_LT(fabs(em[ii*nnei*4 + jj*4 + dd] - + expected_env[ii*nnei*4 + jj*4 + dd]) , + 1e-5); + } + } + } +} +#endif //TENSORFLOW_USE_ROCM diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc index f88606053d..4e32904907 100644 --- a/source/op/prod_env_mat_multi_device.cc +++ b/source/op/prod_env_mat_multi_device.cc @@ -147,6 +147,36 @@ REGISTER_OP("DescrptSeR") .Output("rij: T") .Output("nlist: int32"); +REGISTER_OP("ProdEnvMatAMix") + .Attr("T: {float, double} = DT_DOUBLE") + .Input("coord: T") //atomic coordinates + .Input("type: int32") //atomic type + .Input("natoms: int32") //local atomic number; each type atomic number + .Input("box : T") 
+ .Input("mesh : int32") + .Input("davg: T") //average value of data + .Input("dstd: T") //standard deviation + .Attr("rcut_a: float") //no use + .Attr("rcut_r: float") + .Attr("rcut_r_smth: float") + .Attr("sel_a: list(int)") + .Attr("sel_r: list(int)") //all zero + .Output("descrpt: T") + .Output("descrpt_deriv: T") + .Output("rij: T") + .Output("nlist: int32") + .Output("ntype: int32") + .Output("nmask: bool") + .Doc(R"(Compute the environment matrix mixing the atom types. +The sorting of neighbor atoms depends not on atom types, but on the distance and index. +The atoms in nlist matrix will gather forward and thus save space for gaps of types in ProdEnvMatA, +resulting in optimized and relative small sel_a. + +The additional outputs are listed as following: +ntype: The corresponding atom types in nlist. +nmask: The atom mask in nlist. +)"); + template static int _norm_copy_coord_cpu( @@ -184,6 +214,18 @@ _map_nlist_cpu( const int & nloc, const int & nnei); +static void +_map_nei_info_cpu( + int * nlist, + int * ntype, + bool * nmask, + const int * type, + const int * idx_mapping, + const int & nloc, + const int & nnei, + const int & ntypes, + const bool & b_nlist_map); + template static void _prepare_coord_nlist_cpu( @@ -252,6 +294,18 @@ _map_nlist_gpu( const int & nloc, const int & nnei); +static void +_map_nei_info_gpu( + int * nlist, + int * ntype, + bool * nmask, + const int * type, + const int * idx_mapping, + const int & nloc, + const int & nnei, + const int & ntypes, + const bool & b_nlist_map); + template static void _prepare_coord_nlist_gpu( @@ -326,6 +380,18 @@ _map_nlist_gpu_rocm( const int & nloc, const int & nnei); +static void +_map_nei_info_gpu_rocm( + int * nlist, + int * ntype, + bool * nmask, + const int * type, + const int * idx_mapping, + const int & nloc, + const int & nnei, + const int & ntypes, + const bool & b_nlist_map); + template static void _prepare_coord_nlist_gpu_rocm( @@ -869,7 +935,7 @@ class ProdEnvMatROp : public OpKernel { 
frame_nall, mem_cpy, mem_nnei, max_nbor_size, box, mesh_tensor.flat().data(), nloc, nei_mode, rcut, max_cpy_trial, max_nnei_trial); // launch the cpu compute function - prod_env_mat_r_cpu( + deepmd::prod_env_mat_r_cpu( em, em_deriv, rij, nlist, coord, type, inlist, max_nbor_size, avg, std, nloc, frame_nall, rcut, rcut_smth, sec); if(b_nlist_map) _map_nlist_cpu(nlist, &idx_mapping[0], nloc, nnei); @@ -896,7 +962,314 @@ class ProdEnvMatROp : public OpKernel { int * nbor_list_dev = NULL; }; +template +class ProdEnvMatAMixOp : public OpKernel { +public: + explicit ProdEnvMatAMixOp(OpKernelConstruction* context) : OpKernel(context) { + float nloc_f, nall_f; + OP_REQUIRES_OK(context, context->GetAttr("rcut_a", &rcut_a)); + OP_REQUIRES_OK(context, context->GetAttr("rcut_r", &rcut_r)); + OP_REQUIRES_OK(context, context->GetAttr("rcut_r_smth", &rcut_r_smth)); + OP_REQUIRES_OK(context, context->GetAttr("sel_a", &sel_a)); + OP_REQUIRES_OK(context, context->GetAttr("sel_r", &sel_r)); + // OP_REQUIRES_OK(context, context->GetAttr("nloc", &nloc_f)); + // OP_REQUIRES_OK(context, context->GetAttr("nall", &nall_f)); + deepmd::cum_sum (sec_a, sel_a); + deepmd::cum_sum (sec_r, sel_r); + ndescrpt_a = sec_a.back() * 4; + ndescrpt_r = sec_r.back() * 1; + ndescrpt = ndescrpt_a + ndescrpt_r; + nnei_a = sec_a.back(); + nnei_r = sec_r.back(); + nnei = nnei_a + nnei_r; + max_nbor_size = 1024; + max_cpy_trial = 100; + mem_cpy = 256; + max_nnei_trial = 100; + mem_nnei = 256; + } + + void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { + // Grab the input tensor + int context_input_index = 0; + const Tensor& coord_tensor = context->input(context_input_index++); + const Tensor& type_tensor = context->input(context_input_index++); + const Tensor& natoms_tensor = context->input(context_input_index++); + const Tensor& box_tensor = 
context->input(context_input_index++); + const Tensor& mesh_tensor = context->input(context_input_index++); + const Tensor& avg_tensor = context->input(context_input_index++); + const Tensor& std_tensor = context->input(context_input_index++); + // set size of the sample. assume 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]], then shape(t) ==> [2, 2, 3] + OP_REQUIRES (context, (coord_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of coord should be 2")); + OP_REQUIRES (context, (type_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of type should be 2")); + OP_REQUIRES (context, (natoms_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of natoms should be 1")); + OP_REQUIRES (context, (box_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of box should be 2")); + OP_REQUIRES (context, (mesh_tensor.shape().dims() == 1), errors::InvalidArgument ("Dim of mesh should be 1")); + OP_REQUIRES (context, (avg_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of avg should be 2")); + OP_REQUIRES (context, (std_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of std should be 2")); + OP_REQUIRES (context, (sec_r.back() == 0), errors::InvalidArgument ("Rotational free descriptor only support all-angular information: sel_r should be all zero.")); + OP_REQUIRES (context, (natoms_tensor.shape().dim_size(0) >= 3), errors::InvalidArgument ("number of atoms should be larger than (or equal to) 3")); + DeviceFunctor() ( + device, + context->eigen_device() + ); + const int * natoms = natoms_tensor.flat().data(); + int nloc = natoms[0]; + int nall = natoms[1]; + int ntypes = natoms_tensor.shape().dim_size(0) - 2; + int nsamples = coord_tensor.shape().dim_size(0); + //// check the sizes + OP_REQUIRES (context, (nsamples == type_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of samples should match")); + OP_REQUIRES (context, (nsamples == box_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of 
samples should match")); + OP_REQUIRES (context, (ntypes == avg_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of avg should be ntype")); + OP_REQUIRES (context, (ntypes == std_tensor.shape().dim_size(0)), errors::InvalidArgument ("number of std should be ntype")); + + OP_REQUIRES (context, (nall * 3 == coord_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); + OP_REQUIRES (context, (nall == type_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of atoms should match")); + OP_REQUIRES (context, (9 == box_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of box should be 9")); + OP_REQUIRES (context, (ndescrpt == avg_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of avg should be ndescrpt")); + OP_REQUIRES (context, (ndescrpt == std_tensor.shape().dim_size(1)), errors::InvalidArgument ("number of std should be ndescrpt")); + + OP_REQUIRES (context, (1 == int(sel_a.size())), errors::InvalidArgument ("the length of sel array should be 1 in this op")); + OP_REQUIRES (context, (1 == int(sel_r.size())), errors::InvalidArgument ("the length of sel array should be 1 in this op")); + + int nei_mode = 0; + bool b_nlist_map = false; + if (mesh_tensor.shape().dim_size(0) == 16) { + // lammps neighbor list + nei_mode = 3; + } + else if (mesh_tensor.shape().dim_size(0) == 6) { + // manual copied pbc + assert (nloc == nall); + nei_mode = 1; + b_nlist_map = true; + } + else if (mesh_tensor.shape().dim_size(0) == 0) { + // no pbc + assert (nloc == nall); + nei_mode = -1; + } + else { + throw deepmd::deepmd_exception("invalid mesh tensor"); + } + + // Create output tensors + TensorShape descrpt_shape ; + descrpt_shape.AddDim (nsamples); + descrpt_shape.AddDim (int_64(nloc) * ndescrpt); + TensorShape descrpt_deriv_shape ; + descrpt_deriv_shape.AddDim (nsamples); + descrpt_deriv_shape.AddDim (int_64(nloc) * ndescrpt * 3); + TensorShape rij_shape ; + rij_shape.AddDim (nsamples); + 
rij_shape.AddDim (int_64(nloc) * nnei * 3); + TensorShape nlist_shape ; + nlist_shape.AddDim (nsamples); + nlist_shape.AddDim (int_64(nloc) * nnei); + TensorShape ntype_shape ; + ntype_shape.AddDim (nsamples); + ntype_shape.AddDim (nloc * nnei); + TensorShape nmask_shape ; + nmask_shape.AddDim (nsamples); + nmask_shape.AddDim (nloc * nnei); + // define output tensor + int context_output_index = 0; + Tensor* descrpt_tensor = NULL; + Tensor* descrpt_deriv_tensor = NULL; + Tensor* rij_tensor = NULL; + Tensor* nlist_tensor = NULL; + Tensor* ntype_tensor = NULL; + Tensor* nmask_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output( + context_output_index++, + descrpt_shape, + &descrpt_tensor)); + OP_REQUIRES_OK(context, context->allocate_output( + context_output_index++, + descrpt_deriv_shape, + &descrpt_deriv_tensor)); + OP_REQUIRES_OK(context, context->allocate_output( + context_output_index++, + rij_shape, + &rij_tensor)); + OP_REQUIRES_OK(context, context->allocate_output( + context_output_index++, + nlist_shape, + &nlist_tensor)); + OP_REQUIRES_OK(context, context->allocate_output( + context_output_index++, + ntype_shape, + &ntype_tensor)); + OP_REQUIRES_OK(context, context->allocate_output( + context_output_index++, + nmask_shape, + &nmask_tensor)); + + FPTYPE * p_em = descrpt_tensor->flat().data(); + FPTYPE * p_em_deriv = descrpt_deriv_tensor->flat().data(); + FPTYPE * p_rij = rij_tensor->flat().data(); + int * p_nlist = nlist_tensor->flat().data(); + int * p_ntype = ntype_tensor->flat().data(); + bool * p_nmask = nmask_tensor->flat().data(); + const FPTYPE * p_coord = coord_tensor.flat().data(); + const FPTYPE * p_box = box_tensor.flat().data(); + const FPTYPE * avg = avg_tensor.flat().data(); + const FPTYPE * std = std_tensor.flat().data(); + const int * p_type = type_tensor.flat().data(); + + // loop over samples + for(int_64 ff = 0; ff < nsamples; ++ff){ + FPTYPE * em = p_em + ff*nloc*ndescrpt; + FPTYPE * em_deriv = p_em_deriv + 
ff*nloc*ndescrpt*3; + FPTYPE * rij = p_rij + ff*nloc*nnei*3; + int * nlist = p_nlist + ff*nloc*nnei; + int * ntype = p_ntype + ff*nloc*nnei; + bool * nmask = p_nmask + ff*nloc*nnei; + const FPTYPE * coord = p_coord + ff*nall*3; + const FPTYPE * box = p_box + ff*9; + const int * type = p_type + ff*nall; + + if(device == "GPU") { + #if GOOGLE_CUDA + int * idx_mapping = NULL; + int * ilist = NULL, * numneigh = NULL; + int ** firstneigh = NULL; + deepmd::malloc_device_memory(firstneigh, nloc); + int * jlist = NULL; + FPTYPE * coord_cpy; + int * type_cpy; + int frame_nall = nall; + int mesh_tensor_size = static_cast(mesh_tensor.NumElements()); + std::vector tensor_list(7); + Tensor fake_type; // all zeros + TensorShape fake_type_shape; + fake_type_shape.AddDim(nall); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, fake_type_shape, &fake_type)); + deepmd::memset_device_memory(fake_type.flat().data(), 0, nall); + const int * f_type = fake_type.flat().data(); + // prepare coord and nlist + _prepare_coord_nlist_gpu( + context, &tensor_list[0], &coord, coord_cpy, &f_type, type_cpy, idx_mapping, + gpu_inlist, ilist, numneigh, firstneigh, jlist, nbor_list_dev, + frame_nall, mem_cpy, mem_nnei, max_nbor_size, + box, mesh_tensor.flat().data(), mesh_tensor_size, nloc, nei_mode, rcut_r, max_cpy_trial, max_nnei_trial); + + // allocate temp memory, temp memory must not be used after this operation! 
+ Tensor int_temp; + TensorShape int_shape; + int_shape.AddDim(sec_a.size() + int_64(nloc) * sec_a.size() + nloc); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); + Tensor uint64_temp; + TensorShape uint64_shape; + uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp)); + array_int = int_temp.flat().data(); + array_longlong = uint64_temp.flat().data(); + // launch the gpu(nv) compute function + deepmd::prod_env_mat_a_gpu_cuda( + em, em_deriv, rij, nlist, + coord, type, gpu_inlist, array_int, array_longlong, max_nbor_size, avg, std, nloc, frame_nall, rcut_r, rcut_r_smth, sec_a, f_type); + _map_nei_info_gpu(nlist, ntype, nmask, type, idx_mapping, nloc, nnei, ntypes, b_nlist_map); + deepmd::delete_device_memory(firstneigh); + #endif //GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + int * idx_mapping = NULL; + int * ilist = NULL, * numneigh = NULL; + int ** firstneigh = NULL; + deepmd::malloc_device_memory(firstneigh, nloc); + int * jlist = NULL; + FPTYPE * coord_cpy; + int * type_cpy; + int frame_nall = nall; + int mesh_tensor_size = static_cast(mesh_tensor.NumElements()); + std::vector tensor_list(7); + Tensor fake_type; // all zeros + TensorShape fake_type_shape; + fake_type_shape.AddDim(nall); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, fake_type_shape, &fake_type)); + deepmd::memset_device_memory(fake_type.flat().data(), 0, nall); + const int * f_type = fake_type.flat().data(); + // prepare coord and nlist + _prepare_coord_nlist_gpu_rocm( + context, &tensor_list[0], &coord, coord_cpy, &f_type, type_cpy, idx_mapping, + gpu_inlist, ilist, numneigh, firstneigh, jlist, nbor_list_dev, + frame_nall, mem_cpy, mem_nnei, max_nbor_size, + box, mesh_tensor.flat().data(), mesh_tensor_size, nloc, nei_mode, rcut_r, max_cpy_trial, max_nnei_trial); + + // allocate temp memory, temp memory must not be used after this operation! 
+ Tensor int_temp; + TensorShape int_shape; + int_shape.AddDim(sec_a.size() + int_64(nloc) * sec_a.size() + nloc); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); + Tensor uint64_temp; + TensorShape uint64_shape; + uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp)); + array_int = int_temp.flat().data(); + array_longlong = uint64_temp.flat().data(); + + // launch the gpu(nv) compute function + deepmd::prod_env_mat_a_gpu_rocm( + em, em_deriv, rij, nlist, + coord, type, gpu_inlist, array_int, array_longlong, max_nbor_size, avg, std, nloc, frame_nall, rcut_r, rcut_r_smth, sec_a, f_type); + _map_nei_info_gpu_rocm(nlist, ntype, nmask, type, idx_mapping, nloc, nnei, ntypes, b_nlist_map); + deepmd::delete_device_memory(firstneigh); + #endif //TENSORFLOW_USE_ROCM + } + else if (device == "CPU") { + deepmd::InputNlist inlist; + // some buffers, be freed after the evaluation of this frame + std::vector idx_mapping; + std::vector ilist(nloc), numneigh(nloc); + std::vector firstneigh(nloc); + std::vector> jlist(nloc); + std::vector coord_cpy; + std::vector type_cpy; + int frame_nall = nall; + std::vector fake_type(nall, 0); + const int * f_type = &fake_type[0]; + // prepare coord and nlist + _prepare_coord_nlist_cpu( + context, &coord, coord_cpy, &f_type, type_cpy, idx_mapping, + inlist, ilist, numneigh, firstneigh, jlist, + frame_nall, mem_cpy, mem_nnei, max_nbor_size, + box, mesh_tensor.flat().data(), nloc, nei_mode, rcut_r, max_cpy_trial, max_nnei_trial); + // launch the cpu compute function + deepmd::prod_env_mat_a_cpu( + em, em_deriv, rij, nlist, + coord, type, inlist, max_nbor_size, avg, std, nloc, frame_nall, rcut_r, rcut_r_smth, sec_a, f_type); + // do nlist mapping if coords were copied + _map_nei_info_cpu(nlist, ntype, nmask, type, &idx_mapping[0], nloc, nnei, ntypes, b_nlist_map); + } + } + } + 
+///////////////////////////////////////////////////////////////////////////////////////////// +private: + float rcut_a; + float rcut_r; + float rcut_r_smth; + std::vector sel_r; + std::vector sel_a; + std::vector sec_a; + std::vector sec_r; + int ndescrpt, ndescrpt_a, ndescrpt_r; + int nnei, nnei_a, nnei_r, nloc, nall, max_nbor_size; + int mem_cpy, max_cpy_trial; + int mem_nnei, max_nnei_trial; + std::string device; + int * array_int = NULL; + unsigned long long * array_longlong = NULL; + deepmd::InputNlist gpu_inlist; + int * nbor_list_dev = NULL; +}; template @@ -989,6 +1362,21 @@ _map_nlist_cpu( } } +static void +_map_nei_info_cpu( + int * nlist, + int * ntype, + bool * nmask, + const int * type, + const int * idx_mapping, + const int & nloc, + const int & nnei, + const int & ntypes, + const bool & b_nlist_map) +{ + deepmd::use_nei_info_cpu(nlist, ntype, nmask, type, idx_mapping, nloc, nnei, ntypes, b_nlist_map); +} + template static void _prepare_coord_nlist_cpu( @@ -1188,6 +1576,21 @@ _map_nlist_gpu( deepmd::use_nlist_map(nlist, idx_mapping, nloc, nnei); } +static void +_map_nei_info_gpu( + int * nlist, + int * ntype, + bool * nmask, + const int * type, + const int * idx_mapping, + const int & nloc, + const int & nnei, + const int & ntypes, + const bool & b_nlist_map) +{ + deepmd::use_nei_info_gpu(nlist, ntype, nmask, type, idx_mapping, nloc, nnei, ntypes, b_nlist_map); +} + template static void _prepare_coord_nlist_gpu( @@ -1403,6 +1806,22 @@ _map_nlist_gpu_rocm( deepmd::use_nlist_map(nlist, idx_mapping, nloc, nnei); } +static void +_map_nei_info_gpu_rocm( + int * nlist, + int * ntype, + bool * nmask, + const int * type, + const int * idx_mapping, + const int & nloc, + const int & nnei, + const int & ntypes, + const bool & b_nlist_map) +{ + deepmd::use_nei_info_gpu_rocm(nlist, ntype, nmask, type, idx_mapping, nloc, nnei, ntypes, b_nlist_map); +} + + template static void _prepare_coord_nlist_gpu_rocm( @@ -1484,6 +1903,9 @@ REGISTER_KERNEL_BUILDER( 
REGISTER_KERNEL_BUILDER( \ Name("ProdEnvMatR").Device(DEVICE_CPU).TypeConstraint("T"), \ ProdEnvMatROp); \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdEnvMatAMix").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdEnvMatAMixOp); \ REGISTER_KERNEL_BUILDER( \ Name("DescrptSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ ProdEnvMatAOp); \ @@ -1506,6 +1928,9 @@ REGISTER_KERNEL_BUILDER( REGISTER_KERNEL_BUILDER( \ Name("ProdEnvMatR").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms").HostMemory("box"), \ ProdEnvMatROp); \ +REGISTER_KERNEL_BUILDER( \ + Name("ProdEnvMatAMix").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms").HostMemory("box"), \ + ProdEnvMatAMixOp); \ REGISTER_KERNEL_BUILDER( \ Name("DescrptSeA").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms").HostMemory("box"), \ ProdEnvMatAOp); \ diff --git a/source/tests/common.py b/source/tests/common.py index e767f8a3eb..38c5bc1aee 100644 --- a/source/tests/common.py +++ b/source/tests/common.py @@ -24,9 +24,11 @@ def j_loader(filename): def del_data(): if os.path.isdir('system'): shutil.rmtree('system') + if os.path.isdir('system_mixed_type'): + shutil.rmtree('system_mixed_type') -def gen_data(nframes = 1) : - tmpdata = Data(rand_pert = 0.1, seed = 1, nframes = nframes) +def gen_data_type_specific(nframes = 1): + tmpdata = Data(rand_pert=0.1, seed=1, nframes=nframes) sys = dpdata.LabeledSystem() sys.data['atom_names'] = ['foo', 'bar'] sys.data['coords'] = tmpdata.coord @@ -34,14 +36,41 @@ def gen_data(nframes = 1) : sys.data['cells'] = tmpdata.cell nframes = tmpdata.nframes natoms = tmpdata.natoms - sys.data['coords'] = sys.data['coords'].reshape([nframes,natoms,3]) - sys.data['cells'] = sys.data['cells'].reshape([nframes,3,3]) - sys.data['energies'] = np.zeros([nframes,1]) - sys.data['forces'] = np.zeros([nframes,natoms,3]) - sys.to_deepmd_npy('system', prec=np.float64) + sys.data['coords'] = sys.data['coords'].reshape([nframes, natoms, 3]) + sys.data['cells'] = 
sys.data['cells'].reshape([nframes, 3, 3]) + sys.data['energies'] = np.zeros([nframes, 1]) + sys.data['forces'] = np.zeros([nframes, natoms, 3]) + sys.to_deepmd_npy('system', prec=np.float64) np.save('system/set.000/fparam.npy', tmpdata.fparam) np.save('system/set.000/aparam.npy', tmpdata.aparam.reshape([nframes, natoms, 2])) +def gen_data_mixed_type(nframes = 1): + tmpdata = Data(rand_pert=0.1, seed=1, nframes=nframes) + sys = dpdata.LabeledSystem() + real_type_map = ['foo', 'bar'] + sys.data['atom_names'] = ['X'] + sys.data['coords'] = tmpdata.coord + sys.data['atom_types'] = np.zeros_like(tmpdata.atype) + sys.data['cells'] = tmpdata.cell + nframes = tmpdata.nframes + natoms = tmpdata.natoms + sys.data['coords'] = sys.data['coords'].reshape([nframes, natoms, 3]) + sys.data['cells'] = sys.data['cells'].reshape([nframes, 3, 3]) + sys.data['energies'] = np.zeros([nframes, 1]) + sys.data['forces'] = np.zeros([nframes, natoms, 3]) + sys.to_deepmd_npy('system_mixed_type', prec=np.float64) + np.savetxt('system_mixed_type/type_map.raw', real_type_map, fmt='%s') + np.save('system_mixed_type/set.000/real_atom_types.npy', tmpdata.atype.reshape(1, -1).repeat(nframes, 0)) + np.save('system_mixed_type/set.000/fparam.npy', tmpdata.fparam) + np.save('system_mixed_type/set.000/aparam.npy', tmpdata.aparam.reshape([nframes, natoms, 2])) + +def gen_data(nframes = 1, mixed_type=False) : + if not mixed_type: + gen_data_type_specific(nframes) + else: + gen_data_mixed_type(nframes) + + class Data(): def __init__ (self, rand_pert = 0.1, @@ -377,4 +406,4 @@ def strerch_box(old_coord, old_box, new_box): obox = old_box.reshape(3,3) nbox = new_box.reshape(3,3) ncoord = ocoord @ np.linalg.inv(obox) @ nbox - return ncoord.reshape(old_coord.shape) \ No newline at end of file + return ncoord.reshape(old_coord.shape) diff --git a/source/tests/test_data_large_batch.py b/source/tests/test_data_large_batch.py new file mode 100644 index 0000000000..f84aaeedf6 --- /dev/null +++ 
b/source/tests/test_data_large_batch.py @@ -0,0 +1,173 @@ +import dpdata, os, sys, unittest +import numpy as np +from deepmd.env import tf +import pickle +from common import Data, gen_data, j_loader + +from deepmd.utils.data_system import DeepmdDataSystem +from deepmd.descriptor import DescrptSeAtten +from deepmd.common import data_requirement +from deepmd.fit import EnerFitting +from deepmd.model import EnerModel +from deepmd.utils.type_embed import TypeEmbedNet +from deepmd.common import j_must_have +from common import tf +from packaging.version import parse as parse_version + +GLOBAL_ENER_FLOAT_PRECISION = tf.float64 +GLOBAL_TF_FLOAT_PRECISION = tf.float64 +GLOBAL_NP_FLOAT_PRECISION = np.float64 + + +@unittest.skipIf(parse_version(tf.__version__) < parse_version("1.15"), + f"The current tf version {tf.__version__} is too low to run the new testing model.") +class TestDataLargeBatch(tf.test.TestCase): + def setUp(self): + gen_data(mixed_type=True) + + def test_data_mixed_type(self): + jfile = 'water_se_atten_mixed_type.json' + jdata = j_loader(jfile) + + systems = j_must_have(jdata, 'systems') + batch_size = 1 + test_size = 1 + rcut = j_must_have(jdata['model']['descriptor'], 'rcut') + + data = DeepmdDataSystem(systems, batch_size, test_size, rcut) + data_requirement = {'energy': {'ndof': 1, + 'atomic': False, + 'must': False, + 'high_prec': True, + 'type_sel': None, + 'repeat': 1, + 'default': 0.0}, + 'force': {'ndof': 3, + 'atomic': True, + 'must': False, + 'high_prec': False, + 'type_sel': None, + 'repeat': 1, + 'default': 0.0}, + 'virial': {'ndof': 9, + 'atomic': False, + 'must': False, + 'high_prec': False, + 'type_sel': None, + 'repeat': 1, + 'default': 0.0}, + 'atom_ener': {'ndof': 1, + 'atomic': True, + 'must': False, + 'high_prec': False, + 'type_sel': None, + 'repeat': 1, + 'default': 0.0}, + 'atom_pref': {'ndof': 1, + 'atomic': True, + 'must': False, + 'high_prec': False, + 'type_sel': None, + 'repeat': 3, + 'default': 0.0}} + 
data.add_dict(data_requirement) + + test_data = data.get_test() + numb_test = 1 + + jdata['model']['descriptor'].pop('type', None) + jdata['model']['descriptor']['ntypes'] = 2 + descrpt = DescrptSeAtten(**jdata['model']['descriptor'], uniform_seed=True) + jdata['model']['fitting_net']['descrpt'] = descrpt + fitting = EnerFitting(**jdata['model']['fitting_net'], uniform_seed=True) + typeebd_param = jdata['model']['type_embedding'] + typeebd = TypeEmbedNet( + neuron=typeebd_param['neuron'], + resnet_dt=typeebd_param['resnet_dt'], + activation_function=None, + seed=typeebd_param['seed'], + uniform_seed=True, + padding=True) + model = EnerModel(descrpt, fitting, typeebd) + + # model._compute_dstats([test_data['coord']], [test_data['box']], [test_data['type']], [test_data['natoms_vec']], [test_data['default_mesh']]) + input_data = {'coord': [test_data['coord']], + 'box': [test_data['box']], + 'type': [test_data['type']], + 'natoms_vec': [test_data['natoms_vec']], + 'default_mesh': [test_data['default_mesh']], + 'real_natoms_vec': [test_data['real_natoms_vec']] + } + model._compute_input_stat(input_data, mixed_type=True) + model.descrpt.bias_atom_e = np.array([0., 0.]) + + t_energy = tf.placeholder(GLOBAL_ENER_FLOAT_PRECISION, [None], name='t_energy') + t_force = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None], name='t_force') + t_virial = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None], name='t_virial') + t_atom_ener = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None], name='t_atom_ener') + t_coord = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None], name='i_coord') + t_type = tf.placeholder(tf.int32, [None], name='i_type') + t_natoms = tf.placeholder(tf.int32, [model.ntypes + 2], name='i_natoms') + t_box = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None, 9], name='i_box') + t_mesh = tf.placeholder(tf.int32, [None], name='i_mesh') + is_training = tf.placeholder(tf.bool) + t_fparam = None + inputs_dict = {} + + model_pred \ + = model.build(t_coord, + t_type, + t_natoms, 
+ t_box, + t_mesh, + inputs_dict, + suffix="se_atten", + reuse=False) + + energy = model_pred['energy'] + force = model_pred['force'] + virial = model_pred['virial'] + atom_ener = model_pred['atom_ener'] + + feed_dict_test = {t_energy: np.reshape(test_data['energy'][:numb_test], [-1]), + t_force: np.reshape(test_data['force'][:numb_test, :], [-1]), + t_virial: np.reshape(test_data['virial'][:numb_test, :], [-1]), + t_atom_ener: np.reshape(test_data['atom_ener'][:numb_test, :], [-1]), + t_coord: np.reshape(test_data['coord'][:numb_test, :], [-1]), + t_box: test_data['box'][:numb_test, :], + t_type: np.reshape(test_data['type'][:numb_test, :], [-1]), + t_natoms: test_data['natoms_vec'], + t_mesh: test_data['default_mesh'], + is_training: False} + + + sess = self.test_session().__enter__() + sess.run(tf.global_variables_initializer()) + [e, f, v] = sess.run([energy, force, virial], + feed_dict=feed_dict_test) + # print(sess.run(model.type_embedding)) + # np.savetxt('tmp.out', sess.run(descrpt.dout, feed_dict = feed_dict_test), fmt='%.10e') + # # print(sess.run(model.atype_embed, feed_dict = feed_dict_test)) + # print(sess.run(fitting.inputs, feed_dict = feed_dict_test)) + # print(sess.run(fitting.outs, feed_dict = feed_dict_test)) + # print(sess.run(fitting.atype_embed, feed_dict = feed_dict_test)) + + e = e.reshape([-1]) + f = f.reshape([-1]) + v = v.reshape([-1]) + np.savetxt('e.out', e.reshape([1, -1]), delimiter=',') + np.savetxt('f.out', f.reshape([1, -1]), delimiter=',') + np.savetxt('v.out', v.reshape([1, -1]), delimiter=',') + + refe = [6.12188445792698e+01] + reff = [-2.7590100298321299e-03, -2.7392865283639755e-03, 8.5672424478673337e-05, 7.3154109032780492e-03, 7.6754109031673332e-04, -1.0882393042639207e-03, 9.8633073531477645e-03, 3.6631966083397029e-03, -2.2379079261940034e-04, -4.2393697523149913e-03, 4.9491210390296492e-04, 1.6970049039709007e-04, -8.9021867696626039e-03, -4.7967452269658322e-03, 9.2569990351204447e-04, -1.2781517046160920e-03, 
2.6103819527704053e-03, 1.3095727849551296e-04] + refv = [-1.0171833662757776e-02, -6.7981543912862021e-03, 6.1480942994810296e-04, -6.7981543912861942e-03, 3.0092645628232335e-03, 3.8060849919518031e-04, 6.1480942994810383e-04, 3.8060849919518036e-04, -5.6890657188056002e-05] + + refe = np.reshape(refe, [-1]) + reff = np.reshape(reff, [-1]) + refv = np.reshape(refv, [-1]) + + places = 10 + np.testing.assert_almost_equal(e, refe, places) + np.testing.assert_almost_equal(f, reff, places) + np.testing.assert_almost_equal(v, refv, places) diff --git a/source/tests/test_descrpt_se_atten.py b/source/tests/test_descrpt_se_atten.py new file mode 100644 index 0000000000..777ba0e6bc --- /dev/null +++ b/source/tests/test_descrpt_se_atten.py @@ -0,0 +1,252 @@ +import dpdata, os, sys, unittest +import numpy as np +from deepmd.env import tf +import pickle +from common import Data, gen_data, j_loader + +from deepmd.utils.data_system import DataSystem +from deepmd.descriptor import DescrptSeAtten +from deepmd.fit import EnerFitting +from deepmd.model import EnerModel +from deepmd.common import j_must_have +from deepmd.utils.type_embed import embed_atom_type, TypeEmbedNet +from common import tf +from packaging.version import parse as parse_version + +GLOBAL_ENER_FLOAT_PRECISION = tf.float64 +GLOBAL_TF_FLOAT_PRECISION = tf.float64 +GLOBAL_NP_FLOAT_PRECISION = np.float64 + + +@unittest.skipIf(parse_version(tf.__version__) < parse_version("1.15"), + f"The current tf version {tf.__version__} is too low to run the new testing model.") +class TestModel(tf.test.TestCase): + def setUp(self): + gen_data(nframes=2) + + def test_descriptor_two_sides(self): + jfile = 'water_se_atten.json' + jdata = j_loader(jfile) + + systems = j_must_have(jdata, 'systems') + set_pfx = j_must_have(jdata, 'set_prefix') + batch_size = j_must_have(jdata, 'batch_size') + test_size = j_must_have(jdata, 'numb_test') + batch_size = 2 + test_size = 1 + stop_batch = j_must_have(jdata, 'stop_batch') + rcut = 
j_must_have(jdata['model']['descriptor'], 'rcut') + sel = j_must_have(jdata['model']['descriptor'], 'sel') + ntypes = len(jdata['model']['type_map']) + + data = DataSystem(systems, set_pfx, batch_size, test_size, rcut, run_opt=None) + + test_data = data.get_test() + numb_test = 1 + + # set parameters + jdata['model']['descriptor']['neuron'] = [5, 5, 5] + jdata['model']['descriptor']['axis_neuron'] = 2 + typeebd_param = {'neuron': [5], + 'resnet_dt': False, + 'seed': 1, + } + + # init models + typeebd = TypeEmbedNet( + neuron=typeebd_param['neuron'], + activation_function=None, + resnet_dt=typeebd_param['resnet_dt'], + seed=typeebd_param['seed'], + uniform_seed=True, + padding=True + ) + + jdata['model']['descriptor'].pop('type', None) + jdata['model']['descriptor']['ntypes'] = ntypes + descrpt = DescrptSeAtten(**jdata['model']['descriptor'], uniform_seed=True) + + # model._compute_dstats([test_data['coord']], [test_data['box']], [test_data['type']], [test_data['natoms_vec']], [test_data['default_mesh']]) + input_data = {'coord': [test_data['coord']], + 'box': [test_data['box']], + 'type': [test_data['type']], + 'natoms_vec': [test_data['natoms_vec']], + 'default_mesh': [test_data['default_mesh']] + } + descrpt.bias_atom_e = data.compute_energy_shift() + + t_prop_c = tf.placeholder(tf.float32, [5], name='t_prop_c') + t_energy = tf.placeholder(GLOBAL_ENER_FLOAT_PRECISION, [None], name='t_energy') + t_force = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None], name='t_force') + t_virial = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None], name='t_virial') + t_atom_ener = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None], name='t_atom_ener') + t_coord = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None], name='i_coord') + t_type = tf.placeholder(tf.int32, [None], name='i_type') + t_natoms = tf.placeholder(tf.int32, [ntypes + 2], name='i_natoms') + t_box = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None, 9], name='i_box') + t_mesh = tf.placeholder(tf.int32, [None], 
name='i_mesh') + is_training = tf.placeholder(tf.bool) + t_fparam = None + + type_embedding = typeebd.build( + ntypes, + suffix="_se_atten_type_des_ebd_2sdies" + ) + + dout \ + = descrpt.build( + t_coord, + t_type, + t_natoms, + t_box, + t_mesh, + {'type_embedding': type_embedding}, + reuse=False, + suffix="_se_atten_type_des_2sides" + ) + + feed_dict_test = {t_prop_c: test_data['prop_c'], + t_energy: test_data['energy'][:numb_test], + t_force: np.reshape(test_data['force'][:numb_test, :], [-1]), + t_virial: np.reshape(test_data['virial'][:numb_test, :], [-1]), + t_atom_ener: np.reshape(test_data['atom_ener'][:numb_test, :], [-1]), + t_coord: np.reshape(test_data['coord'][:numb_test, :], [-1]), + t_box: test_data['box'][:numb_test, :], + t_type: np.reshape(test_data['type'][:numb_test, :], [-1]), + t_natoms: test_data['natoms_vec'], + t_mesh: test_data['default_mesh'], + is_training: False} + + sess = self.test_session().__enter__() + sess.run(tf.global_variables_initializer()) + [model_dout] = sess.run([dout], + feed_dict=feed_dict_test) + model_dout = model_dout.reshape([-1]) + + ref_dout = [1.3503570575883254e-04, -9.3606804794552518e-05, -9.3606804794552518e-05, 6.4931435609575354e-05, -3.4432462227712845e-04, 2.3883309310633266e-04, -2.1612770334269806e-04, 1.4980041766865035e-04, + 5.1902342465554648e-04, -3.5995814159000579e-04, 1.0061650355705337e-04, -7.5148260042556979e-05, -7.5148260042556979e-05, 5.6249549384058458e-05, -2.7820514647114664e-04, 2.0819618461713165e-04, + -1.5698895407951743e-04, 1.1721016363267746e-04, 4.0972585703616773e-04, -3.0650763759131061e-04, 7.5599650998659526e-05, -5.8808888720672558e-05, -5.8808888720672558e-05, 4.5766209906762655e-05, + -2.1712714013251668e-04, 1.6899894453623564e-04, -1.2167120597162636e-04, 9.4648599144861605e-05, 3.2200758382615601e-04, -2.5060486486718734e-04, 1.1293831101452813e-04, -7.9512063028041913e-05, + -7.9512063028041913e-05, 5.5979262682797850e-05, -2.9058515610909440e-04, 
2.0457554106366365e-04, -1.8732839505532627e-04, 1.3188376232775540e-04, 4.4448730317793450e-04, -3.1292650304617497e-04, + 1.3015885894252541e-04, -8.8816609587789126e-05, -8.8816609587789126e-05, 6.0613949400496957e-05, -3.2308121544925519e-04, 2.2046786823295058e-04, -2.1781481424814687e-04, 1.4862599684199924e-04, + 4.9955378034266583e-04, -3.4089120488765758e-04, 1.0160496779809329e-04, -7.4538471222199861e-05, -7.4538471222199861e-05, 5.4703671679263269e-05, -2.7394267959121653e-04, 2.0103409637607701e-04, + -1.6657135958432620e-04, 1.2219321453198225e-04, 4.1344754259964935e-04, -3.0339251136512270e-04] + + places = 10 + np.testing.assert_almost_equal(model_dout, ref_dout, places) + + def test_descriptor_one_side(self): + jfile = 'water_se_atten.json' + jdata = j_loader(jfile) + + systems = j_must_have(jdata, 'systems') + set_pfx = j_must_have(jdata, 'set_prefix') + batch_size = j_must_have(jdata, 'batch_size') + test_size = j_must_have(jdata, 'numb_test') + batch_size = 1 + test_size = 1 + stop_batch = j_must_have(jdata, 'stop_batch') + rcut = j_must_have(jdata['model']['descriptor'], 'rcut') + sel = j_must_have(jdata['model']['descriptor'], 'sel') + ntypes = len(jdata['model']['type_map']) + + data = DataSystem(systems, set_pfx, batch_size, test_size, rcut, run_opt=None) + + test_data = data.get_test() + numb_test = 1 + + # set parameters + jdata['model']['descriptor']['neuron'] = [5, 5, 5] + jdata['model']['descriptor']['axis_neuron'] = 2 + jdata['model']['descriptor']['type_one_side'] = True + typeebd_param = {'neuron': [5], + 'resnet_dt': False, + 'seed': 1, + } + + # init models + typeebd = TypeEmbedNet( + neuron=typeebd_param['neuron'], + activation_function=None, + resnet_dt=typeebd_param['resnet_dt'], + seed=typeebd_param['seed'], + uniform_seed=True, + padding=True + ) + + jdata['model']['descriptor'].pop('type', None) + jdata['model']['descriptor']['ntypes'] = ntypes + descrpt = DescrptSeAtten(**jdata['model']['descriptor'], uniform_seed=True) + + 
# model._compute_dstats([test_data['coord']], [test_data['box']], [test_data['type']], [test_data['natoms_vec']], [test_data['default_mesh']]) + input_data = {'coord': [test_data['coord']], + 'box': [test_data['box']], + 'type': [test_data['type']], + 'natoms_vec': [test_data['natoms_vec']], + 'default_mesh': [test_data['default_mesh']] + } + descrpt.bias_atom_e = data.compute_energy_shift() + + t_prop_c = tf.placeholder(tf.float32, [5], name='t_prop_c') + t_energy = tf.placeholder(GLOBAL_ENER_FLOAT_PRECISION, [None], name='t_energy') + t_force = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None], name='t_force') + t_virial = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None], name='t_virial') + t_atom_ener = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None], name='t_atom_ener') + t_coord = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None], name='i_coord') + t_type = tf.placeholder(tf.int32, [None], name='i_type') + t_natoms = tf.placeholder(tf.int32, [ntypes + 2], name='i_natoms') + t_box = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None, 9], name='i_box') + t_mesh = tf.placeholder(tf.int32, [None], name='i_mesh') + is_training = tf.placeholder(tf.bool) + t_fparam = None + + type_embedding = typeebd.build( + ntypes, + suffix="_se_atten_type_des_ebd_1side" + ) + + dout \ + = descrpt.build( + t_coord, + t_type, + t_natoms, + t_box, + t_mesh, + {'type_embedding': type_embedding}, + reuse=False, + suffix="_se_atten_type_des_1side" + ) + + feed_dict_test = {t_prop_c: test_data['prop_c'], + t_energy: test_data['energy'][:numb_test], + t_force: np.reshape(test_data['force'][:numb_test, :], [-1]), + t_virial: np.reshape(test_data['virial'][:numb_test, :], [-1]), + t_atom_ener: np.reshape(test_data['atom_ener'][:numb_test, :], [-1]), + t_coord: np.reshape(test_data['coord'][:numb_test, :], [-1]), + t_box: test_data['box'][:numb_test, :], + t_type: np.reshape(test_data['type'][:numb_test, :], [-1]), + t_natoms: test_data['natoms_vec'], + t_mesh: test_data['default_mesh'], + 
is_training: False} + + sess = self.test_session().__enter__() + sess.run(tf.global_variables_initializer()) + [model_dout] = sess.run([dout], + feed_dict=feed_dict_test) + model_dout = model_dout.reshape([-1]) + + ref_dout = [8.9336098555659429e-05, -3.8921422089719007e-05, -3.8921422089719007e-05, 1.6975109833017758e-05, -2.9184951813034413e-04, 1.2724836941382651e-04, -1.8062533253590169e-04, 7.8681048972093648e-05, + 4.2206017420030542e-04, -1.8398310612921889e-04, 6.4996467281506633e-05, -3.0812041327073575e-05, -3.0812041327073575e-05, 1.4663988013438402e-05, -2.3274950984084172e-04, 1.1059587214865573e-04, + -1.3043761448464089e-04, 6.1788865409826698e-05, 3.2900269837104958e-04, -1.5623668424484728e-04, 5.0697927477465942e-05, -2.3511768544350768e-05, -2.3511768544350768e-05, 1.0919808814040025e-05, + -1.8622373494960208e-04, 8.6439275444049409e-05, -1.0326450661269683e-04, 4.7880797898768150e-05, 2.6230208262918372e-04, -1.2172811361250681e-04, 7.8240863239649707e-05, -3.2501260967978116e-05, + -3.2501260967978116e-05, 1.3502267073810926e-05, -2.5360559687597850e-04, 1.0535336854834091e-04, -1.6047265448841568e-04, 6.6660202062744658e-05, 3.6833864909272261e-04, -1.5301457671691837e-04, + 9.1148582997925288e-05, -3.6614945467066073e-05, -3.6614945467066073e-05, 1.4709958908948206e-05, -2.8364168092837332e-04, 1.1394466218003484e-04, -1.8721615730559043e-04, 7.5203967811613109e-05, + 4.1632420070310456e-04, -1.6724364343353009e-04, 6.9506193268190631e-05, -3.0228106532898472e-05, -3.0228106532898472e-05, 1.3156705594652870e-05, -2.3740975974826574e-04, 1.0328972070195332e-04, + -1.4218547815143072e-04, 6.1827596642872941e-05, 3.4031715116440432e-04, -1.4804591640658066e-04] + + places = 10 + np.testing.assert_almost_equal(model_dout, ref_dout, places) + + + + diff --git a/source/tests/test_examples.py b/source/tests/test_examples.py index 9bcbc2b3d7..61e1801345 100644 --- a/source/tests/test_examples.py +++ b/source/tests/test_examples.py @@ -17,6 +17,7 @@ 
p_examples / "water" / "se_e3" / "input.json", p_examples / "water" / "se_e2_a_tebd" / "input.json", p_examples / "water" / "se_e2_a_mixed_prec" / "input.json", + p_examples / "water" / "se_atten" / "input.json", p_examples / "water" / "dplr" / "train" / "dw.json", p_examples / "water" / "dplr" / "train" / "ener.json", p_examples / "nopbc" / "train" / "input.json", diff --git a/source/tests/test_fitting_ener_type.py b/source/tests/test_fitting_ener_type.py index 1eb1002147..5ccb4797ab 100644 --- a/source/tests/test_fitting_ener_type.py +++ b/source/tests/test_fitting_ener_type.py @@ -76,14 +76,17 @@ def test_fitting(self): 0.0005154794374703516,-0.00019422534512034776,-0.00019422534512034776,7.318151797939947e-05,-0.0013576642997136488,0.0005115548790018505,-0.0010275333676074971,0.00038716440070070385,0.0005376426714609369,-0.00020257810468163985, 0.0004482204892297628,-0.00016887749501640607,-0.00016887749501640607,6.364643102775375e-05,-0.001181345877677835,0.0004452029242063362,-0.0008941636427724908,0.0003369586197174627,0.0004677878512312651,-0.00017625260641095753]) type_embedding = np.array([1.4916816460764615,0.2720153234707013,-2.4385153754181985,-1.8454294510880027,2.874575701113528,1.1225116575801295,0.4204818970813372,-2.3784087249787587,-1.5053748251050598,2.769329403073084]) + atype = np.array([0, 0, 1, 1, 1, 1], dtype=np.int32) dout= dout.reshape([-1,10]) type_embedding = type_embedding.reshape([ntypes,-1]) + atype = atype.reshape([-1]) atom_ener = fitting.build(tf.convert_to_tensor(dout), t_natoms, - {'type_embedding':tf.convert_to_tensor(type_embedding)}, - reuse = False, - suffix = "se_a_type_fit_") + {'type_embedding':tf.convert_to_tensor(type_embedding), + 'atype':tf.convert_to_tensor(atype)}, + reuse=False, + suffix="se_a_type_fit_") feed_dict_test = {t_prop_c: test_data['prop_c'], t_energy: test_data['energy'] [:numb_test], diff --git a/source/tests/test_model_se_atten.py b/source/tests/test_model_se_atten.py new file mode 100644 index 
0000000000..d8bce857b0 --- /dev/null +++ b/source/tests/test_model_se_atten.py @@ -0,0 +1,138 @@ +import dpdata, os, sys, unittest +import numpy as np +from deepmd.env import tf +import pickle +from common import Data, gen_data, j_loader + +from deepmd.utils.data_system import DataSystem +from deepmd.descriptor import DescrptSeAtten +from deepmd.fit import EnerFitting +from deepmd.model import EnerModel +from deepmd.utils.type_embed import TypeEmbedNet +from deepmd.common import j_must_have +from common import tf +from packaging.version import parse as parse_version + +GLOBAL_ENER_FLOAT_PRECISION = tf.float64 +GLOBAL_TF_FLOAT_PRECISION = tf.float64 +GLOBAL_NP_FLOAT_PRECISION = np.float64 + + +@unittest.skipIf(parse_version(tf.__version__) < parse_version("1.15"), + f"The current tf version {tf.__version__} is too low to run the new testing model.") +class TestModel(tf.test.TestCase): + def setUp(self): + gen_data() + + def test_model(self): + jfile = 'water_se_atten.json' + jdata = j_loader(jfile) + + systems = j_must_have(jdata, 'systems') + set_pfx = j_must_have(jdata, 'set_prefix') + batch_size = j_must_have(jdata, 'batch_size') + test_size = j_must_have(jdata, 'numb_test') + batch_size = 1 + test_size = 1 + stop_batch = j_must_have(jdata, 'stop_batch') + rcut = j_must_have(jdata['model']['descriptor'], 'rcut') + + data = DataSystem(systems, set_pfx, batch_size, test_size, rcut, run_opt=None) + + test_data = data.get_test() + numb_test = 1 + + jdata['model']['descriptor'].pop('type', None) + jdata['model']['descriptor']['ntypes'] = 2 + descrpt = DescrptSeAtten(**jdata['model']['descriptor'], uniform_seed=True) + jdata['model']['fitting_net']['descrpt'] = descrpt + fitting = EnerFitting(**jdata['model']['fitting_net'], uniform_seed=True) + typeebd_param = jdata['model']['type_embedding'] + typeebd = TypeEmbedNet( + neuron=typeebd_param['neuron'], + activation_function=None, + resnet_dt=typeebd_param['resnet_dt'], + seed=typeebd_param['seed'], + uniform_seed=True, 
+ padding=True) + model = EnerModel(descrpt, fitting, typeebd) + + # model._compute_dstats([test_data['coord']], [test_data['box']], [test_data['type']], [test_data['natoms_vec']], [test_data['default_mesh']]) + input_data = {'coord': [test_data['coord']], + 'box': [test_data['box']], + 'type': [test_data['type']], + 'natoms_vec': [test_data['natoms_vec']], + 'default_mesh': [test_data['default_mesh']] + } + model._compute_input_stat(input_data) + model.descrpt.bias_atom_e = data.compute_energy_shift() + + t_prop_c = tf.placeholder(tf.float32, [5], name='t_prop_c') + t_energy = tf.placeholder(GLOBAL_ENER_FLOAT_PRECISION, [None], name='t_energy') + t_force = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None], name='t_force') + t_virial = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None], name='t_virial') + t_atom_ener = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None], name='t_atom_ener') + t_coord = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None], name='i_coord') + t_type = tf.placeholder(tf.int32, [None], name='i_type') + t_natoms = tf.placeholder(tf.int32, [model.ntypes + 2], name='i_natoms') + t_box = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None, 9], name='i_box') + t_mesh = tf.placeholder(tf.int32, [None], name='i_mesh') + is_training = tf.placeholder(tf.bool) + t_fparam = None + inputs_dict = {} + + model_pred \ + = model.build(t_coord, + t_type, + t_natoms, + t_box, + t_mesh, + inputs_dict, + suffix="se_atten", + reuse=False) + energy = model_pred['energy'] + force = model_pred['force'] + virial = model_pred['virial'] + atom_ener = model_pred['atom_ener'] + + feed_dict_test = {t_prop_c: test_data['prop_c'], + t_energy: test_data['energy'][:numb_test], + t_force: np.reshape(test_data['force'][:numb_test, :], [-1]), + t_virial: np.reshape(test_data['virial'][:numb_test, :], [-1]), + t_atom_ener: np.reshape(test_data['atom_ener'][:numb_test, :], [-1]), + t_coord: np.reshape(test_data['coord'][:numb_test, :], [-1]), + t_box: test_data['box'][:numb_test, :], + 
t_type: np.reshape(test_data['type'][:numb_test, :], [-1]), + t_natoms: test_data['natoms_vec'], + t_mesh: test_data['default_mesh'], + is_training: False} + sess = self.test_session().__enter__() + sess.run(tf.global_variables_initializer()) + [e, f, v] = sess.run([energy, force, virial], + feed_dict=feed_dict_test) + # print(sess.run(model.type_embedding)) + # np.savetxt('tmp.out', sess.run(descrpt.dout, feed_dict = feed_dict_test), fmt='%.10e') + # # print(sess.run(model.atype_embed, feed_dict = feed_dict_test)) + # print(sess.run(fitting.inputs, feed_dict = feed_dict_test)) + # print(sess.run(fitting.outs, feed_dict = feed_dict_test)) + # print(sess.run(fitting.atype_embed, feed_dict = feed_dict_test)) + + e = e.reshape([-1]) + f = f.reshape([-1]) + v = v.reshape([-1]) + np.savetxt('e.out', e.reshape([1, -1]), delimiter=',') + np.savetxt('f.out', f.reshape([1, -1]), delimiter=',') + np.savetxt('v.out', v.reshape([1, -1]), delimiter=',') + + refe = [6.12188445792698e+01] + reff = [-2.7590100298321299e-03, -2.7392865283639755e-03, 8.5672424478673337e-05, 7.3154109032780492e-03, 7.6754109031673332e-04, -1.0882393042639207e-03, 9.8633073531477645e-03, 3.6631966083397029e-03, -2.2379079261940034e-04, -4.2393697523149913e-03, 4.9491210390296492e-04, 1.6970049039709007e-04, -8.9021867696626039e-03, -4.7967452269658322e-03, 9.2569990351204447e-04, -1.2781517046160920e-03, 2.6103819527704053e-03, 1.3095727849551296e-04] + refv = [-1.0171833662757776e-02, -6.7981543912862021e-03, 6.1480942994810296e-04, -6.7981543912861942e-03, 3.0092645628232335e-03, 3.8060849919518031e-04, 6.1480942994810383e-04, 3.8060849919518036e-04, -5.6890657188056002e-05] + + refe = np.reshape(refe, [-1]) + reff = np.reshape(reff, [-1]) + refv = np.reshape(refv, [-1]) + + places = 10 + np.testing.assert_almost_equal(e, refe, places) + np.testing.assert_almost_equal(f, reff, places) + np.testing.assert_almost_equal(v, refv, places) diff --git a/source/tests/water_se_atten.json 
b/source/tests/water_se_atten.json new file mode 100644 index 0000000000..1a1e02c11c --- /dev/null +++ b/source/tests/water_se_atten.json @@ -0,0 +1,66 @@ +{ + "_comment": " model parameters", + "model" : { + "type_map": ["O", "H"], + "type_embedding":{ + "neuron": [8], + "resnet_dt": false, + "seed": 1 + }, + "descriptor" :{ + "type": "se_atten", + "sel": 120, + "rcut_smth": 5.80, + "rcut": 6.00, + "neuron": [25, 50, 100], + "resnet_dt": false, + "type_one_side": false, + "axis_neuron": 16, + "seed": 1, + "attn": 128, + "attn_layer": 2, + "attn_dotr": true, + "attn_mask": false + }, + "fitting_net" : { + "neuron": [240, 240, 240], + "resnet_dt": true, + "seed": 1 + } + }, + + + "_comment": " training controls", + "systems": ["system"], + "set_prefix": "set", + "stop_batch": 1000000, + "batch_size": 1, + "start_lr": 0.005, + "decay_steps": 5000, + "decay_rate": 0.95, + + "start_pref_e": 0.02, + "limit_pref_e": 1, + "start_pref_f": 1000, + "limit_pref_f": 1, + "start_pref_v": 0, + "limit_pref_v": 0, + + "seed": 1, + + "_comment": " display and restart", + "_comment": " frequencies counted in batch", + "disp_file": "lcurve.out", + "disp_freq": 100, + "numb_test": 1, + "save_freq": 1000, + "save_ckpt": "model.ckpt", + "load_ckpt": "model.ckpt", + "disp_training": true, + "time_training": true, + "profiling": false, + "profiling_file": "timeline.json", + + "_comment": "that's all" +} + diff --git a/source/tests/water_se_atten_mixed_type.json b/source/tests/water_se_atten_mixed_type.json new file mode 100644 index 0000000000..e83e7c3f39 --- /dev/null +++ b/source/tests/water_se_atten_mixed_type.json @@ -0,0 +1,66 @@ +{ + "_comment": " model parameters", + "model" : { + "type_map": ["O", "H"], + "type_embedding":{ + "neuron": [8], + "resnet_dt": false, + "seed": 1 + }, + "descriptor" :{ + "type": "se_atten", + "sel": 120, + "rcut_smth": 5.80, + "rcut": 6.00, + "neuron": [25, 50, 100], + "resnet_dt": false, + "type_one_side": false, + "axis_neuron": 16, + "seed": 1, + 
"attn": 128, + "attn_layer": 2, + "attn_dotr": true, + "attn_mask": false + }, + "fitting_net" : { + "neuron": [240, 240, 240], + "resnet_dt": true, + "seed": 1 + } + }, + + + "_comment": " training controls", + "systems": ["system_mixed_type"], + "set_prefix": "set", + "stop_batch": 1000000, + "batch_size": 1, + "start_lr": 0.005, + "decay_steps": 5000, + "decay_rate": 0.95, + + "start_pref_e": 0.02, + "limit_pref_e": 1, + "start_pref_f": 1000, + "limit_pref_f": 1, + "start_pref_v": 0, + "limit_pref_v": 0, + + "seed": 1, + + "_comment": " display and restart", + "_comment": " frequencies counted in batch", + "disp_file": "lcurve.out", + "disp_freq": 100, + "numb_test": 1, + "save_freq": 1000, + "save_ckpt": "model.ckpt", + "load_ckpt": "model.ckpt", + "disp_training": true, + "time_training": true, + "profiling": false, + "profiling_file": "timeline.json", + + "_comment": "that's all" +} +