diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py
index 2d03635e00..3d502730de 100644
--- a/deepmd/descriptor/se_a.py
+++ b/deepmd/descriptor/se_a.py
@@ -112,7 +112,7 @@ def __init__ (self,
             self.place_holders['natoms_vec'] = tf.placeholder(tf.int32, [self.ntypes+2], name=name_pfx+'t_natoms')
             self.place_holders['default_mesh'] = tf.placeholder(tf.int32, [None], name=name_pfx+'t_mesh')
             self.stat_descrpt, descrpt_deriv, rij, nlist \
-                = op_module.descrpt_se_a(self.place_holders['coord'],
+                = op_module.prod_env_mat_a(self.place_holders['coord'],
                                          self.place_holders['type'],
                                          self.place_holders['natoms_vec'],
                                          self.place_holders['box'],
@@ -340,7 +340,7 @@ def build (self,
         atype = tf.reshape (atype_, [-1, natoms[1]])
 
         self.descrpt, self.descrpt_deriv, self.rij, self.nlist \
-            = op_module.descrpt_se_a (coord,
+            = op_module.prod_env_mat_a (coord,
                                        atype,
                                        natoms,
                                        box,
diff --git a/source/lib/include/neighbor_list.h b/source/lib/include/neighbor_list.h
index 44a7fc417b..58d5f150ed 100644
--- a/source/lib/include/neighbor_list.h
+++ b/source/lib/include/neighbor_list.h
@@ -16,6 +16,9 @@ struct InputNlist
   int * ilist;
   int * numneigh;
   int ** firstneigh;
+  InputNlist () 
+      : inum(0), ilist(NULL), numneigh(NULL), firstneigh(NULL)
+      {};
   InputNlist (
       int inum_, 
       int * ilist_,
@@ -23,7 +26,7 @@ struct InputNlist
       int ** firstneigh_
       ) 
       : inum(inum_), ilist(ilist_), numneigh(numneigh_), firstneigh(firstneigh_)
-      {}
+      {};
 };
 
 void convert_nlist(
@@ -31,10 +34,12 @@ void convert_nlist(
     std::vector<std::vector<int> > & from_nlist
     );
 
+int max_numneigh(
+    const InputNlist & to_nlist
+    );
 
 // build neighbor list.
 // outputs
-
 //	nlist, max_list_size
 //	max_list_size is the maximal size of jlist.
 // inputs
diff --git a/source/lib/include/prod_env_mat.h b/source/lib/include/prod_env_mat.h
index 2b09d07d22..ff12ba1710 100644
--- a/source/lib/include/prod_env_mat.h
+++ b/source/lib/include/prod_env_mat.h
@@ -1,6 +1,7 @@
 #pragma once
 #include <vector>
 #include "device.h"
+#include "neighbor_list.h"
 
 template<typename FPTYPE>
 void prod_env_mat_a_cpu(
@@ -10,9 +11,7 @@ void prod_env_mat_a_cpu(
     int * nlist, 
     const FPTYPE * coord, 
     const int * type, 
-    const int * ilist, 
-    const int * jrange, 
-    const int * jlist,
+    const InputNlist & inlist,
     const int max_nbor_size,
     const FPTYPE * avg, 
     const FPTYPE * std, 
@@ -30,9 +29,7 @@ void prod_env_mat_r_cpu(
     int * nlist, 
     const FPTYPE * coord, 
     const int * type, 
-    const int * ilist, 
-    const int * jrange, 
-    const int * jlist,
+    const InputNlist & inlist,
     const int max_nbor_size,
     const FPTYPE * avg, 
     const FPTYPE * std, 
diff --git a/source/lib/src/coord.cc b/source/lib/src/coord.cc
index 5066eac2a5..fb57b2c403 100644
--- a/source/lib/src/coord.cc
+++ b/source/lib/src/coord.cc
@@ -33,10 +33,11 @@ copy_coord_cpu(
     const FPTYPE * in_c,
     const int * in_t,
     const int & nloc,
-    const int & mem_nall,
+    const int & mem_nall_,
     const float & rcut,
     const Region<FPTYPE> & region)
 {
+  const int mem_nall = mem_nall_;
   std::vector<double> coord(nloc * 3);
   std::vector<int> atype(nloc);
   std::copy(in_c, in_c+nloc*3, coord.begin());
diff --git a/source/lib/src/neighbor_list.cc b/source/lib/src/neighbor_list.cc
index 0643037e3f..6a603dc94e 100644
--- a/source/lib/src/neighbor_list.cc
+++ b/source/lib/src/neighbor_list.cc
@@ -757,6 +757,18 @@ convert_nlist(
   }
 }
 
+int
+max_numneigh(
+    const InputNlist & nlist
+    )
+{
+  int max_num = 0;
+  for(int ii = 0; ii < nlist.inum; ++ii){
+    if(nlist.numneigh[ii] > max_num) max_num = nlist.numneigh[ii];
+  }
+  return max_num;
+}
+
 template <typename FPTYPE>
 int
 build_nlist_cpu(
@@ -765,9 +777,10 @@ build_nlist_cpu(
     const FPTYPE * c_cpy,
     const int & nloc, 
     const int & nall, 
-    const int & mem_size,
+    const int & mem_size_,
     const float & rcut)
 {
+  const int mem_size = mem_size_;
   *max_list_size = 0;
   nlist.inum = nloc;
   FPTYPE rcut2 = rcut * rcut;  
diff --git a/source/lib/src/prod_env_mat.cc b/source/lib/src/prod_env_mat.cc
index ce052b3df0..6bad611ed7 100644
--- a/source/lib/src/prod_env_mat.cc
+++ b/source/lib/src/prod_env_mat.cc
@@ -13,9 +13,7 @@ void prod_env_mat_a_cpu(
     int * nlist, 
     const FPTYPE * coord, 
     const int * type, 
-    const int * ilist, 
-    const int * jrange, 
-    const int * jlist,
+    const InputNlist & inlist,
     const int max_nbor_size,
     const FPTYPE * avg, 
     const FPTYPE * std, 
@@ -45,13 +43,14 @@ void prod_env_mat_a_cpu(
   // build nlist
   std::vector<std::vector<int > > d_nlist_a(nloc);
 
+  assert(nloc == inlist.inum);
   for (unsigned ii = 0; ii < nloc; ++ii) {
-    d_nlist_a.reserve (jrange[nloc] / nloc + 10);
+    d_nlist_a[ii].reserve(max_nbor_size);
   }
   for (unsigned ii = 0; ii < nloc; ++ii) {
-    int i_idx = ilist[ii];
-    for (unsigned jj = jrange[ii]; jj < jrange[ii+1]; ++jj) {
-      int j_idx = jlist[jj];
+    int i_idx = inlist.ilist[ii];
+    for(unsigned jj = 0; jj < inlist.numneigh[ii]; ++jj){
+      int j_idx = inlist.firstneigh[ii][jj];
       d_nlist_a[i_idx].push_back (j_idx);
     }
   }
@@ -96,9 +95,7 @@ void prod_env_mat_r_cpu(
     int * nlist, 
     const FPTYPE * coord, 
     const int * type, 
-    const int * ilist, 
-    const int * jrange, 
-    const int * jlist,
+    const InputNlist & inlist,
     const int max_nbor_size,
     const FPTYPE * avg, 
     const FPTYPE * std, 
@@ -128,13 +125,14 @@ void prod_env_mat_r_cpu(
   // build nlist
   std::vector<std::vector<int > > d_nlist_a(nloc);
 
+  assert(nloc == inlist.inum);
   for (unsigned ii = 0; ii < nloc; ++ii) {
-    d_nlist_a.reserve (jrange[nloc] / nloc + 10);
+    d_nlist_a[ii].reserve(max_nbor_size);
   }
   for (unsigned ii = 0; ii < nloc; ++ii) {
-    int i_idx = ilist[ii];
-    for (unsigned jj = jrange[ii]; jj < jrange[ii+1]; ++jj) {
-      int j_idx = jlist[jj];
+    int i_idx = inlist.ilist[ii];
+    for(unsigned jj = 0; jj < inlist.numneigh[ii]; ++jj){
+      int j_idx = inlist.firstneigh[ii][jj];
       d_nlist_a[i_idx].push_back (j_idx);
     }
   }
@@ -180,9 +178,7 @@ void prod_env_mat_a_cpu<double>(
     int * nlist, 
     const double * coord, 
     const int * type, 
-    const int * ilist, 
-    const int * jrange, 
-    const int * jlist,
+    const InputNlist & inlist,
     const int max_nbor_size,
     const double * avg, 
     const double * std, 
@@ -200,9 +196,7 @@ void prod_env_mat_a_cpu<float>(
     int * nlist, 
     const float * coord, 
     const int * type, 
-    const int * ilist, 
-    const int * jrange, 
-    const int * jlist,
+    const InputNlist & inlist,
     const int max_nbor_size,
     const float * avg, 
     const float * std, 
@@ -220,9 +214,7 @@ void prod_env_mat_r_cpu<double>(
     int * nlist, 
     const double * coord, 
     const int * type, 
-    const int * ilist, 
-    const int * jrange, 
-    const int * jlist,
+    const InputNlist & inlist,
     const int max_nbor_size,
     const double * avg, 
     const double * std, 
@@ -240,9 +232,7 @@ void prod_env_mat_r_cpu<float>(
     int * nlist, 
     const float * coord, 
     const int * type, 
-    const int * ilist, 
-    const int * jrange, 
-    const int * jlist,
+    const InputNlist & inlist,
     const int max_nbor_size,
     const float * avg, 
     const float * std, 
diff --git a/source/lib/tests/test_env_mat_a.cc b/source/lib/tests/test_env_mat_a.cc
index 85d66ec5dd..1e630ecd7d 100644
--- a/source/lib/tests/test_env_mat_a.cc
+++ b/source/lib/tests/test_env_mat_a.cc
@@ -397,15 +397,11 @@ TEST_F(TestEnvMatA, prod_cpu)
       max_nbor_size = nlist_a_cpy[ii].size();
     }
   }
-  std::vector<int> ilist(nloc), jlist(tot_nnei), jrange(nloc+1, 0);
-  for (int ii = 0; ii < nloc; ++ii){
-    ilist[ii] = ii;
-    jrange[ii+1] = jrange[ii] + nlist_a_cpy[ii].size();
-    int jj, cc;
-    for (jj = jrange[ii], cc = 0; jj < jrange[ii+1]; ++jj, ++cc){
-      jlist[jj] = nlist_a_cpy[ii][cc];
-    }
-  }
+  std::vector<int> ilist(nloc), numneigh(nloc);
+  std::vector<int*> firstneigh(nloc);
+  InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  convert_nlist(inlist, nlist_a_cpy);
+  
   std::vector<double > em(nloc * ndescrpt), em_deriv(nloc * ndescrpt * 3), rij(nloc * nnei * 3);
   std::vector<int> nlist(nloc * nnei);
   std::vector<double > avg(ntypes * ndescrpt, 0);
@@ -417,9 +413,7 @@ TEST_F(TestEnvMatA, prod_cpu)
       &nlist[0],
       &posi_cpy[0],
       &atype_cpy[0],
-      &ilist[0],
-      &jrange[0],
-      &jlist[0],
+      inlist,
       max_nbor_size,
       &avg[0],
       &std[0],
@@ -452,15 +446,10 @@ TEST_F(TestEnvMatA, prod_cpu_equal_cpu)
       max_nbor_size = nlist_a_cpy[ii].size();
     }
   }
-  std::vector<int> ilist(nloc), jlist(tot_nnei), jrange(nloc+1, 0);
-  for (int ii = 0; ii < nloc; ++ii){
-    ilist[ii] = ii;
-    jrange[ii+1] = jrange[ii] + nlist_a_cpy[ii].size();
-    int jj, cc;
-    for (jj = jrange[ii], cc = 0; jj < jrange[ii+1]; ++jj, ++cc){
-      jlist[jj] = nlist_a_cpy[ii][cc];
-    }
-  }
+  std::vector<int> ilist(nloc), numneigh(nloc);
+  std::vector<int*> firstneigh(nloc);
+  InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  convert_nlist(inlist, nlist_a_cpy);
   std::vector<double > em(nloc * ndescrpt), em_deriv(nloc * ndescrpt * 3), rij(nloc * nnei * 3);
   std::vector<int> nlist(nloc * nnei);
   std::vector<double > avg(ntypes * ndescrpt, 0);
@@ -472,9 +461,7 @@ TEST_F(TestEnvMatA, prod_cpu_equal_cpu)
       &nlist[0],
       &posi_cpy[0],
       &atype_cpy[0],
-      &ilist[0],
-      &jrange[0],
-      &jlist[0],
+      inlist,
       max_nbor_size,
       &avg[0],
       &std[0],
diff --git a/source/lib/tests/test_env_mat_r.cc b/source/lib/tests/test_env_mat_r.cc
index d70e5ef0fe..153c10d582 100644
--- a/source/lib/tests/test_env_mat_r.cc
+++ b/source/lib/tests/test_env_mat_r.cc
@@ -249,15 +249,11 @@ TEST_F(TestEnvMatR, prod_cpu)
       max_nbor_size = nlist_a_cpy[ii].size();
     }
   }
-  std::vector<int> ilist(nloc), jlist(tot_nnei), jrange(nloc+1, 0);
-  for (int ii = 0; ii < nloc; ++ii){
-    ilist[ii] = ii;
-    jrange[ii+1] = jrange[ii] + nlist_a_cpy[ii].size();
-    int jj, cc;
-    for (jj = jrange[ii], cc = 0; jj < jrange[ii+1]; ++jj, ++cc){
-      jlist[jj] = nlist_a_cpy[ii][cc];
-    }
-  }
+  std::vector<int> ilist(nloc), numneigh(nloc);
+  std::vector<int*> firstneigh(nloc);
+  InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  convert_nlist(inlist, nlist_a_cpy);
+  
   std::vector<double > em(nloc * ndescrpt), em_deriv(nloc * ndescrpt * 3), rij(nloc * nnei * 3);
   std::vector<int> nlist(nloc * nnei);
   std::vector<double > avg(ntypes * ndescrpt, 0);
@@ -269,9 +265,7 @@ TEST_F(TestEnvMatR, prod_cpu)
       &nlist[0],
       &posi_cpy[0],
       &atype_cpy[0],
-      &ilist[0],
-      &jrange[0],
-      &jlist[0],
+      inlist,
       max_nbor_size,
       &avg[0],
       &std[0],
@@ -304,15 +298,10 @@ TEST_F(TestEnvMatR, prod_cpu_equal_cpu)
       max_nbor_size = nlist_a_cpy[ii].size();
     }
   }
-  std::vector<int> ilist(nloc), jlist(tot_nnei), jrange(nloc+1, 0);
-  for (int ii = 0; ii < nloc; ++ii){
-    ilist[ii] = ii;
-    jrange[ii+1] = jrange[ii] + nlist_a_cpy[ii].size();
-    int jj, cc;
-    for (jj = jrange[ii], cc = 0; jj < jrange[ii+1]; ++jj, ++cc){
-      jlist[jj] = nlist_a_cpy[ii][cc];
-    }
-  }
+  std::vector<int> ilist(nloc), numneigh(nloc);
+  std::vector<int*> firstneigh(nloc);
+  InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  convert_nlist(inlist, nlist_a_cpy);
   std::vector<double > em(nloc * ndescrpt), em_deriv(nloc * ndescrpt * 3), rij(nloc * nnei * 3);
   std::vector<int> nlist(nloc * nnei);
   std::vector<double > avg(ntypes * ndescrpt, 0);
@@ -324,9 +313,7 @@ TEST_F(TestEnvMatR, prod_cpu_equal_cpu)
       &nlist[0],
       &posi_cpy[0],
       &atype_cpy[0],
-      &ilist[0],
-      &jrange[0],
-      &jlist[0],
+      inlist,
       max_nbor_size,
       &avg[0],
       &std[0],
diff --git a/source/lib/tests/test_pair_tab.cc b/source/lib/tests/test_pair_tab.cc
index 867956e7ae..0c7cf5ea6f 100644
--- a/source/lib/tests/test_pair_tab.cc
+++ b/source/lib/tests/test_pair_tab.cc
@@ -160,34 +160,34 @@ TEST_F(TestPairTab, cpu)
 }
 
 
-int make_inter_nlist(
-    std::vector<int> &ilist,
-    std::vector<int> &jrange,
-    std::vector<int> &jlist,
-    const int & nloc,
-    const std::vector<std::vector<int>> & nlist_cpy)
-{
-  ilist.resize(nloc);
-  jrange.resize(nloc+1);
-  int tot_nnei = 0;
-  int max_nbor_size = 0;
-  for(int ii = 0; ii < nlist_cpy.size(); ++ii){
-    tot_nnei += nlist_cpy[ii].size();
-    if (nlist_cpy[ii].size() > max_nbor_size){
-      max_nbor_size = nlist_cpy[ii].size();
-    }
-  }
-  jlist.resize(tot_nnei);
-  for (int ii = 0; ii < nloc; ++ii){
-    ilist[ii] = ii;
-    jrange[ii+1] = jrange[ii] + nlist_cpy[ii].size();
-    int jj, cc;
-    for (jj = jrange[ii], cc = 0; jj < jrange[ii+1]; ++jj, ++cc){
-      jlist[jj] = nlist_cpy[ii][cc];
-    }
-  }
-  return max_nbor_size;
-}
+// int make_inter_nlist(
+//     std::vector<int> &ilist,
+//     std::vector<int> &jrange,
+//     std::vector<int> &jlist,
+//     const int & nloc,
+//     const std::vector<std::vector<int>> & nlist_cpy)
+// {
+//   ilist.resize(nloc);
+//   jrange.resize(nloc+1);
+//   int tot_nnei = 0;
+//   int max_nbor_size = 0;
+//   for(int ii = 0; ii < nlist_cpy.size(); ++ii){
+//     tot_nnei += nlist_cpy[ii].size();
+//     if (nlist_cpy[ii].size() > max_nbor_size){
+//       max_nbor_size = nlist_cpy[ii].size();
+//     }
+//   }
+//   jlist.resize(tot_nnei);
+//   for (int ii = 0; ii < nloc; ++ii){
+//     ilist[ii] = ii;
+//     jrange[ii+1] = jrange[ii] + nlist_cpy[ii].size();
+//     int jj, cc;
+//     for (jj = jrange[ii], cc = 0; jj < jrange[ii+1]; ++jj, ++cc){
+//       jlist[jj] = nlist_cpy[ii][cc];
+//     }
+//   }
+//   return max_nbor_size;
+// }
 
 
 TEST_F(TestPairTab, cpu_f_num_deriv)
@@ -240,18 +240,22 @@ TEST_F(TestPairTab, cpu_f_num_deriv)
       std::vector<std::vector<int>> nlist_cpy_0, nlist_cpy_1, t_nlist;
       build_nlist(nlist_cpy_0, t_nlist, posi_cpy_0, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region, ncell);
       build_nlist(nlist_cpy_1, t_nlist, posi_cpy_1, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region, ncell);
-      std::vector<int> ilist_0, jlist_0, jrange_0;
-      std::vector<int> ilist_1, jlist_1, jrange_1;
-      int max_nnei_0 = make_inter_nlist(ilist_0, jrange_0, jlist_0, nloc, nlist_cpy_0);
-      int max_nnei_1 = make_inter_nlist(ilist_1, jrange_1, jlist_1, nloc, nlist_cpy_1);
+      std::vector<int> ilist_0(nloc), numneigh_0(nloc), ilist_1(nloc), numneigh_1(nloc);;
+      std::vector<int*> firstneigh_0(nloc), firstneigh_1(nloc);
+      InputNlist inlist_0(nloc, &ilist_0[0], &numneigh_0[0], &firstneigh_0[0]);
+      InputNlist inlist_1(nloc, &ilist_1[0], &numneigh_1[0], &firstneigh_1[0]);
+      convert_nlist(inlist_0, nlist_cpy_0);
+      convert_nlist(inlist_1, nlist_cpy_1);
+      int max_nnei_0 = max_numneigh(inlist_0);
+      int max_nnei_1 = max_numneigh(inlist_1);
       EXPECT_EQ(max_nnei_0, max_nnei_1);
       std::vector<double> t_em(nloc * ndescrpt), t_em_deriv(nloc * ndescrpt * 3);
       std::vector<double> rij_0(nloc * nnei * 3), rij_1(nloc * nnei * 3);
       std::vector<int> nlist_0(nloc * nnei), nlist_1(nloc * nnei);
       std::vector<double > avg(ntypes * ndescrpt, 0);
       std::vector<double > std(ntypes * ndescrpt, 1);
-      prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_0[0], &nlist_0[0], &posi_cpy_0[0], &atype_cpy_0[0], &ilist_0[0], &jrange_0[0], &jlist_0[0], max_nnei_0, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
-      prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_1[0], &nlist_1[0], &posi_cpy_1[0], &atype_cpy_1[0], &ilist_1[0], &jrange_1[0], &jlist_1[0], max_nnei_1, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
+      prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_0[0], &nlist_0[0], &posi_cpy_0[0], &atype_cpy_0[0], inlist_0, max_nnei_0, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
+      prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_1[0], &nlist_1[0], &posi_cpy_1[0], &atype_cpy_1[0], inlist_1, max_nnei_1, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
       std::vector<double> energy_0(nloc), energy_1(nloc);
       std::vector<double> t_force(nall * 3), t_virial(nall * 9);
       pair_tab_cpu(
@@ -344,18 +348,22 @@ TEST_F(TestPairTab, cpu_f_num_deriv_scale)
       std::vector<std::vector<int>> nlist_cpy_0, nlist_cpy_1, t_nlist;
       build_nlist(nlist_cpy_0, t_nlist, posi_cpy_0, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region, ncell);
       build_nlist(nlist_cpy_1, t_nlist, posi_cpy_1, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region, ncell);
-      std::vector<int> ilist_0, jlist_0, jrange_0;
-      std::vector<int> ilist_1, jlist_1, jrange_1;
-      int max_nnei_0 = make_inter_nlist(ilist_0, jrange_0, jlist_0, nloc, nlist_cpy_0);
-      int max_nnei_1 = make_inter_nlist(ilist_1, jrange_1, jlist_1, nloc, nlist_cpy_1);
+      std::vector<int> ilist_0(nloc), numneigh_0(nloc), ilist_1(nloc), numneigh_1(nloc);;
+      std::vector<int*> firstneigh_0(nloc), firstneigh_1(nloc);
+      InputNlist inlist_0(nloc, &ilist_0[0], &numneigh_0[0], &firstneigh_0[0]);
+      InputNlist inlist_1(nloc, &ilist_1[0], &numneigh_1[0], &firstneigh_1[0]);
+      convert_nlist(inlist_0, nlist_cpy_0);
+      convert_nlist(inlist_1, nlist_cpy_1);
+      int max_nnei_0 = max_numneigh(inlist_0);
+      int max_nnei_1 = max_numneigh(inlist_1);
       EXPECT_EQ(max_nnei_0, max_nnei_1);
       std::vector<double> t_em(nloc * ndescrpt), t_em_deriv(nloc * ndescrpt * 3);
       std::vector<double> rij_0(nloc * nnei * 3), rij_1(nloc * nnei * 3);
       std::vector<int> nlist_0(nloc * nnei), nlist_1(nloc * nnei);
       std::vector<double > avg(ntypes * ndescrpt, 0);
       std::vector<double > std(ntypes * ndescrpt, 1);
-      prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_0[0], &nlist_0[0], &posi_cpy_0[0], &atype_cpy_0[0], &ilist_0[0], &jrange_0[0], &jlist_0[0], max_nnei_0, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
-      prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_1[0], &nlist_1[0], &posi_cpy_1[0], &atype_cpy_1[0], &ilist_1[0], &jrange_1[0], &jlist_1[0], max_nnei_1, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
+      prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_0[0], &nlist_0[0], &posi_cpy_0[0], &atype_cpy_0[0], inlist_0, max_nnei_0, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
+      prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_1[0], &nlist_1[0], &posi_cpy_1[0], &atype_cpy_1[0], inlist_1, max_nnei_1, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
       std::vector<double> energy_0(nloc), energy_1(nloc);
       std::vector<double> t_force(nall * 3), t_virial(nall * 9);
       pair_tab_cpu(
@@ -461,18 +469,22 @@ TEST_F(TestPairTab, cpu_v_num_deriv)
       std::vector<std::vector<int>> nlist_cpy_0, nlist_cpy_1, t_nlist;
       build_nlist(nlist_cpy_0, t_nlist, posi_cpy_0, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region_0, ncell);
       build_nlist(nlist_cpy_1, t_nlist, posi_cpy_1, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region_1, ncell);
-      std::vector<int> ilist_0, jlist_0, jrange_0;
-      std::vector<int> ilist_1, jlist_1, jrange_1;
-      int max_nnei_0 = make_inter_nlist(ilist_0, jrange_0, jlist_0, nloc, nlist_cpy_0);
-      int max_nnei_1 = make_inter_nlist(ilist_1, jrange_1, jlist_1, nloc, nlist_cpy_1);
+      std::vector<int> ilist_0(nloc), numneigh_0(nloc), ilist_1(nloc), numneigh_1(nloc);;
+      std::vector<int*> firstneigh_0(nloc), firstneigh_1(nloc);
+      InputNlist inlist_0(nloc, &ilist_0[0], &numneigh_0[0], &firstneigh_0[0]);
+      InputNlist inlist_1(nloc, &ilist_1[0], &numneigh_1[0], &firstneigh_1[0]);
+      convert_nlist(inlist_0, nlist_cpy_0);
+      convert_nlist(inlist_1, nlist_cpy_1);
+      int max_nnei_0 = max_numneigh(inlist_0);
+      int max_nnei_1 = max_numneigh(inlist_1);
       EXPECT_EQ(max_nnei_0, max_nnei_1);
       std::vector<double> t_em(nloc * ndescrpt), t_em_deriv(nloc * ndescrpt * 3);
       std::vector<double> rij_0(nloc * nnei * 3), rij_1(nloc * nnei * 3);
       std::vector<int> nlist_0(nloc * nnei), nlist_1(nloc * nnei);
       std::vector<double > avg(ntypes * ndescrpt, 0);
       std::vector<double > std(ntypes * ndescrpt, 1);
-      prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_0[0], &nlist_0[0], &posi_cpy_0[0], &atype_cpy_0[0], &ilist_0[0], &jrange_0[0], &jlist_0[0], max_nnei_0, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
-      prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_1[0], &nlist_1[0], &posi_cpy_1[0], &atype_cpy_1[0], &ilist_1[0], &jrange_1[0], &jlist_1[0], max_nnei_1, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
+      prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_0[0], &nlist_0[0], &posi_cpy_0[0], &atype_cpy_0[0], inlist_0, max_nnei_0, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
+      prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_1[0], &nlist_1[0], &posi_cpy_1[0], &atype_cpy_1[0], inlist_1, max_nnei_1, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
       std::vector<double> energy_0(nloc), energy_1(nloc);
       std::vector<double> t_force(nall * 3), t_virial(nall * 9);
       pair_tab_cpu(
@@ -589,18 +601,22 @@ TEST_F(TestPairTab, cpu_v_num_deriv_scale)
       std::vector<std::vector<int>> nlist_cpy_0, nlist_cpy_1, t_nlist;
       build_nlist(nlist_cpy_0, t_nlist, posi_cpy_0, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region_0, ncell);
       build_nlist(nlist_cpy_1, t_nlist, posi_cpy_1, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region_1, ncell);
-      std::vector<int> ilist_0, jlist_0, jrange_0;
-      std::vector<int> ilist_1, jlist_1, jrange_1;
-      int max_nnei_0 = make_inter_nlist(ilist_0, jrange_0, jlist_0, nloc, nlist_cpy_0);
-      int max_nnei_1 = make_inter_nlist(ilist_1, jrange_1, jlist_1, nloc, nlist_cpy_1);
+      std::vector<int> ilist_0(nloc), numneigh_0(nloc), ilist_1(nloc), numneigh_1(nloc);;
+      std::vector<int*> firstneigh_0(nloc), firstneigh_1(nloc);
+      InputNlist inlist_0(nloc, &ilist_0[0], &numneigh_0[0], &firstneigh_0[0]);
+      InputNlist inlist_1(nloc, &ilist_1[0], &numneigh_1[0], &firstneigh_1[0]);
+      convert_nlist(inlist_0, nlist_cpy_0);
+      convert_nlist(inlist_1, nlist_cpy_1);
+      int max_nnei_0 = max_numneigh(inlist_0);
+      int max_nnei_1 = max_numneigh(inlist_1);
       EXPECT_EQ(max_nnei_0, max_nnei_1);
       std::vector<double> t_em(nloc * ndescrpt), t_em_deriv(nloc * ndescrpt * 3);
       std::vector<double> rij_0(nloc * nnei * 3), rij_1(nloc * nnei * 3);
       std::vector<int> nlist_0(nloc * nnei), nlist_1(nloc * nnei);
       std::vector<double > avg(ntypes * ndescrpt, 0);
       std::vector<double > std(ntypes * ndescrpt, 1);
-      prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_0[0], &nlist_0[0], &posi_cpy_0[0], &atype_cpy_0[0], &ilist_0[0], &jrange_0[0], &jlist_0[0], max_nnei_0, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
-      prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_1[0], &nlist_1[0], &posi_cpy_1[0], &atype_cpy_1[0], &ilist_1[0], &jrange_1[0], &jlist_1[0], max_nnei_1, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
+      prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_0[0], &nlist_0[0], &posi_cpy_0[0], &atype_cpy_0[0], inlist_0, max_nnei_0, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
+      prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_1[0], &nlist_1[0], &posi_cpy_1[0], &atype_cpy_1[0], inlist_1, max_nnei_1, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
       std::vector<double> energy_0(nloc), energy_1(nloc);
       std::vector<double> t_force(nall * 3), t_virial(nall * 9);
       pair_tab_cpu(
@@ -717,18 +733,22 @@ TEST_F(TestPairTabTriBox, cpu_v_num_deriv)
       std::vector<std::vector<int>> nlist_cpy_0, nlist_cpy_1, t_nlist;
       build_nlist(nlist_cpy_0, t_nlist, posi_cpy_0, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region_0, ncell);
       build_nlist(nlist_cpy_1, t_nlist, posi_cpy_1, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region_1, ncell);
-      std::vector<int> ilist_0, jlist_0, jrange_0;
-      std::vector<int> ilist_1, jlist_1, jrange_1;
-      int max_nnei_0 = make_inter_nlist(ilist_0, jrange_0, jlist_0, nloc, nlist_cpy_0);
-      int max_nnei_1 = make_inter_nlist(ilist_1, jrange_1, jlist_1, nloc, nlist_cpy_1);
+      std::vector<int> ilist_0(nloc), numneigh_0(nloc), ilist_1(nloc), numneigh_1(nloc);;
+      std::vector<int*> firstneigh_0(nloc), firstneigh_1(nloc);
+      InputNlist inlist_0(nloc, &ilist_0[0], &numneigh_0[0], &firstneigh_0[0]);
+      InputNlist inlist_1(nloc, &ilist_1[0], &numneigh_1[0], &firstneigh_1[0]);
+      convert_nlist(inlist_0, nlist_cpy_0);
+      convert_nlist(inlist_1, nlist_cpy_1);
+      int max_nnei_0 = max_numneigh(inlist_0);
+      int max_nnei_1 = max_numneigh(inlist_1);
       EXPECT_EQ(max_nnei_0, max_nnei_1);
       std::vector<double> t_em(nloc * ndescrpt), t_em_deriv(nloc * ndescrpt * 3);
       std::vector<double> rij_0(nloc * nnei * 3), rij_1(nloc * nnei * 3);
       std::vector<int> nlist_0(nloc * nnei), nlist_1(nloc * nnei);
       std::vector<double > avg(ntypes * ndescrpt, 0);
       std::vector<double > std(ntypes * ndescrpt, 1);
-      prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_0[0], &nlist_0[0], &posi_cpy_0[0], &atype_cpy_0[0], &ilist_0[0], &jrange_0[0], &jlist_0[0], max_nnei_0, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
-      prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_1[0], &nlist_1[0], &posi_cpy_1[0], &atype_cpy_1[0], &ilist_1[0], &jrange_1[0], &jlist_1[0], max_nnei_1, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
+      prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_0[0], &nlist_0[0], &posi_cpy_0[0], &atype_cpy_0[0], inlist_0, max_nnei_0, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
+      prod_env_mat_a_cpu(&t_em[0], &t_em_deriv[0], &rij_1[0], &nlist_1[0], &posi_cpy_1[0], &atype_cpy_1[0], inlist_1, max_nnei_1, &avg[0], &std[0], nloc, nall, rc, rc_smth, sec_a);
       std::vector<double> energy_0(nloc), energy_1(nloc);
       std::vector<double> t_force(nall * 3), t_virial(nall * 9);
       pair_tab_cpu(
diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt
index 67ffc7f844..ced3cd0bf5 100644
--- a/source/op/CMakeLists.txt
+++ b/source/op/CMakeLists.txt
@@ -3,7 +3,7 @@
 set(OP_LIB ${PROJECT_SOURCE_DIR}/lib/src/SimulationRegion.cpp ${PROJECT_SOURCE_DIR}/lib/src/neighbor_list.cc)
 
 set (OP_CXX_FLAG -D_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI} )
-file(GLOB OP_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_a_ef.cc descrpt_se_a_ef.cc descrpt_se_a_ef_para.cc descrpt_se_a_ef_vert.cc descrpt_se_r.cc pair_tab.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu_multi_device.cc map_aparam.cc neighbor_stat.cc unaggregated_grad.cc tabulate_multi_device.cc)
+file(GLOB OP_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_a_ef.cc descrpt_se_a_ef.cc descrpt_se_a_ef_para.cc descrpt_se_a_ef_vert.cc descrpt_se_r.cc pair_tab.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu_multi_device.cc map_aparam.cc neighbor_stat.cc unaggregated_grad.cc tabulate_multi_device.cc prod_env_mat_multi_device.cc)
 file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc prod_env_mat_multi_device.cc pair_tab.cc prod_force_multi_device.cc prod_virial_multi_device.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_multi_device.cc tabulate_multi_device.cc)
 file(GLOB OP_GRADS_SRC prod_force_grad.cc prod_force_se_a_grad.cc prod_force_se_r_grad.cc prod_virial_grad.cc prod_virial_se_a_grad.cc prod_virial_se_r_grad.cc soft_min_force_grad.cc soft_min_virial_grad.cc )
 file(GLOB OP_PY *.py)
diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc
index a50461a274..42310ed30b 100644
--- a/source/op/prod_env_mat_multi_device.cc
+++ b/source/op/prod_env_mat_multi_device.cc
@@ -1,8 +1,11 @@
 #include "custom_op.h"
 #include "utilities.h"
+#include "coord.h"
+#include "region.h"
+#include "neighbor_list.h"
 #include "prod_env_mat.h"
 
-REGISTER_OP("DescrptSeA")
+REGISTER_OP("ProdEnvMatA")
     .Attr("T: {float, double}")
     .Input("coord: T")          //atomic coordinates
     .Input("type: int32")       //atomic type
@@ -22,7 +25,7 @@ REGISTER_OP("DescrptSeA")
     .Output("nlist: int32");
     // only sel_a and rcut_r uesd.
 
-REGISTER_OP("DescrptSeR")
+REGISTER_OP("ProdEnvMatR")
     .Attr("T: {float, double}")
     .Input("coord: T")
     .Input("type: int32")
@@ -39,10 +42,49 @@ REGISTER_OP("DescrptSeR")
     .Output("rij: T")
     .Output("nlist: int32");
 
+
+template<typename FPTYPE>
+static int
+_norm_copy_coord_cpu(
+    std::vector<FPTYPE> & coord_cpy,
+    std::vector<int> & type_cpy,
+    std::vector<int> & mapping,
+    int & nall,
+    int & mem_cpy,
+    const FPTYPE * coord,
+    const FPTYPE * box,
+    const int * type,
+    const int &nloc, 
+    const int &max_cpy_trial, 
+    const float & rcut_r);
+
+template<typename FPTYPE>
+static int
+_build_nlist_cpu(
+    std::vector<int> &ilist, 
+    std::vector<int> &numneigh,
+    std::vector<int*> &firstneigh,
+    std::vector<std::vector<int>> &jlist,
+    int & max_nnei,
+    int & mem_nnei,
+    const FPTYPE *coord,
+    const int & nloc,
+    const int & new_nall,
+    const int & max_nnei_trial,
+    const float & rcut_r);
+
+static void
+_map_nlist_cpu(
+    int * nlist,
+    const int * idx_mapping,
+    const int & nloc,
+    const int & nnei);
+
+
 template <typename Device, typename FPTYPE>
-class DescrptSeAOp : public OpKernel {
+class ProdEnvMatAOp : public OpKernel {
 public:
-  explicit DescrptSeAOp(OpKernelConstruction* context) : OpKernel(context) {
+  explicit ProdEnvMatAOp(OpKernelConstruction* context) : OpKernel(context) {
     float nloc_f, nall_f;
     OP_REQUIRES_OK(context, context->GetAttr("rcut_a", &rcut_a));
     OP_REQUIRES_OK(context, context->GetAttr("rcut_r", &rcut_r));
@@ -60,6 +102,10 @@ class DescrptSeAOp : public OpKernel {
     nnei_r = sec_r.back();
     nnei = nnei_a + nnei_r;
     max_nbor_size = 1024;
+    max_cpy_trial = 100;
+    mem_cpy = 256;
+    max_nnei_trial = 100;
+    mem_nnei = 256;
   }
 
   void Compute(OpKernelContext* context) override {
@@ -105,6 +151,28 @@ class DescrptSeAOp : public OpKernel {
     
     OP_REQUIRES (context, (ntypes == int(sel_a.size())),  errors::InvalidArgument ("number of types should match the length of sel array"));
     OP_REQUIRES (context, (ntypes == int(sel_r.size())),  errors::InvalidArgument ("number of types should match the length of sel array"));
+
+    int nei_mode = 0;
+    bool b_nlist_map = false;
+    if (mesh_tensor.shape().dim_size(0) == 16) {
+      // lammps neighbor list
+      nei_mode = 3;
+    }
+    else if (mesh_tensor.shape().dim_size(0) == 6) {
+      // manual copied pbc
+      assert (nloc == nall);
+      nei_mode = 1;
+      b_nlist_map = true;
+    }
+    else if (mesh_tensor.shape().dim_size(0) == 0) {
+      // no pbc
+      assert (nloc == nall);
+      nei_mode = -1;
+    }
+    else {
+      throw std::runtime_error("invalid mesh tensor");
+    }
+
     // Create output tensors
     TensorShape descrpt_shape ;
     descrpt_shape.AddDim (nsamples);
@@ -141,16 +209,29 @@ class DescrptSeAOp : public OpKernel {
         nlist_shape,
         &nlist_tensor));
 
-    FPTYPE * em = descrpt_tensor->flat<FPTYPE>().data();
-    FPTYPE * em_deriv = descrpt_deriv_tensor->flat<FPTYPE>().data();
-    FPTYPE * rij = rij_tensor->flat<FPTYPE>().data();
-    int * nlist = nlist_tensor->flat<int>().data();
-    const FPTYPE * coord = coord_tensor.flat<FPTYPE>().data();
+    FPTYPE * p_em = descrpt_tensor->flat<FPTYPE>().data();
+    FPTYPE * p_em_deriv = descrpt_deriv_tensor->flat<FPTYPE>().data();
+    FPTYPE * p_rij = rij_tensor->flat<FPTYPE>().data();
+    int * p_nlist = nlist_tensor->flat<int>().data();
+    const FPTYPE * p_coord = coord_tensor.flat<FPTYPE>().data();
+    const FPTYPE * p_box = box_tensor.flat<FPTYPE>().data();
     const FPTYPE * avg = avg_tensor.flat<FPTYPE>().data();
     const FPTYPE * std = std_tensor.flat<FPTYPE>().data();
-    const int * type = type_tensor.flat<int>().data();
+    const int * p_type = type_tensor.flat<int>().data();
+
+    // loop over samples
+    for(int ff = 0; ff < nsamples; ++ff){
+      FPTYPE * em = p_em + ff*nloc*ndescrpt;
+      FPTYPE * em_deriv = p_em_deriv + ff*nloc*ndescrpt*3;
+      FPTYPE * rij = p_rij + ff*nloc*nnei*3;
+      int * nlist = p_nlist + ff*nloc*nnei;
+      const FPTYPE * coord = p_coord + ff*nall*3;
+      const FPTYPE * box = p_box + ff*9;
+      const int * type = p_type + ff*nall;
+      std::vector<int> idx_mapping;
 
     if(device == "GPU") {
+      #if GOOGLE_CUDA
       // allocate temp memory, temp memory must not be used after this operation!
       Tensor int_temp;
       TensorShape int_shape;
@@ -166,7 +247,6 @@ class DescrptSeAOp : public OpKernel {
           init, ilist, jrange, jlist, ilist_size, jrange_size, jlist_size, max_nbor_size,
           mesh_tensor.flat<int>().data(), static_cast<int>(mesh_tensor.NumElements()));
       OP_REQUIRES (context, (max_nbor_size <= GPU_MAX_NBOR_SIZE), errors::InvalidArgument ("Assert failed, max neighbor size of atom(lammps) " + std::to_string(max_nbor_size) + " is larger than " + std::to_string(GPU_MAX_NBOR_SIZE) + ", which currently is not supported by deepmd-kit."));
-      #if GOOGLE_CUDA
       // launch the gpu(nv) compute function
       prod_env_mat_a_gpu_cuda(
           em, em_deriv, rij, nlist, 
@@ -174,13 +254,49 @@ class DescrptSeAOp : public OpKernel {
       #endif //GOOGLE_CUDA
     }
     else if (device == "CPU") {
-      memcpy (&ilist,  4  + mesh_tensor.flat<int>().data(), sizeof(int *));
-      memcpy (&jrange, 8  + mesh_tensor.flat<int>().data(), sizeof(int *));
-      memcpy (&jlist,  12 + mesh_tensor.flat<int>().data(), sizeof(int *));
+      int new_nall = nall;
+      InputNlist inlist;
+      inlist.inum = nloc;
+      // some buffers, be free after prod_env_mat_a_cpu
+      std::vector<int> ilist(nloc), numneigh(nloc);
+      std::vector<int*> firstneigh(nloc);
+      std::vector<std::vector<int>> jlist(nloc);
+      std::vector<FPTYPE> coord_cpy;
+      std::vector<int> type_cpy;
+      if(nei_mode != 3){
+	// build nlist by myself
+	// normalize and copy coord
+	if(nei_mode == 1){
+	  int copy_ok = _norm_copy_coord_cpu(
+	      coord_cpy, type_cpy, idx_mapping, new_nall, mem_cpy,
+	      coord, box, type, nloc, max_cpy_trial, rcut_r);
+	  OP_REQUIRES (context, copy_ok, errors::Aborted("cannot allocate mem for copied coords"));
+	  coord = &coord_cpy[0];
+	  type = &type_cpy[0];
+	}
+	// build nlist
+	int build_ok = _build_nlist_cpu(
+	    ilist, numneigh, firstneigh, jlist, max_nbor_size, mem_nnei,
+	    coord, nloc, new_nall, max_nnei_trial, rcut_r);
+	OP_REQUIRES (context, build_ok, errors::Aborted("cannot allocate mem for nlist"));
+	inlist.ilist = &ilist[0];
+	inlist.numneigh = &numneigh[0];
+	inlist.firstneigh = &firstneigh[0];
+      }
+      else{
+	// copy pointers to nlist data
+	memcpy(&inlist.ilist, 4 + mesh_tensor.flat<int>().data(), sizeof(int *));
+	memcpy(&inlist.numneigh, 8 + mesh_tensor.flat<int>().data(), sizeof(int *));
+	memcpy(&inlist.firstneigh, 12 + mesh_tensor.flat<int>().data(), sizeof(int **));
+	max_nbor_size = max_numneigh(inlist);
+      }
       // launch the cpu compute function
       prod_env_mat_a_cpu(
-          em, em_deriv, rij, nlist, 
-          coord, type, ilist, jrange, jlist, max_nbor_size, avg, std, nloc, nall, rcut_r, rcut_r_smth, sec_a);
+	  em, em_deriv, rij, nlist, 
+	  coord, type, inlist, max_nbor_size, avg, std, nloc, new_nall, rcut_r, rcut_r_smth, sec_a);
+      // do nlist mapping if coords were copied
+      if(b_nlist_map) _map_nlist_cpu(nlist, &idx_mapping[0], nloc, nnei);
+    }
     }
   }
 
@@ -195,6 +311,8 @@ class DescrptSeAOp : public OpKernel {
   std::vector<int> sec_r;
   int ndescrpt, ndescrpt_a, ndescrpt_r;
   int nnei, nnei_a, nnei_r, nloc, nall, max_nbor_size;
+  int mem_cpy, max_cpy_trial;
+  int mem_nnei, max_nnei_trial;
   std::string device;
   int * array_int = NULL;
   unsigned long long * array_longlong = NULL;
@@ -204,9 +322,9 @@ class DescrptSeAOp : public OpKernel {
 };
 
 template<typename Device, typename FPTYPE>
-class DescrptSeROp : public OpKernel {
+class ProdEnvMatROp : public OpKernel {
 public:
-  explicit DescrptSeROp(OpKernelConstruction* context) : OpKernel(context) {
+  explicit ProdEnvMatROp(OpKernelConstruction* context) : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("rcut", &rcut));
     OP_REQUIRES_OK(context, context->GetAttr("rcut_smth", &rcut_smth));
     OP_REQUIRES_OK(context, context->GetAttr("sel", &sel));
@@ -304,6 +422,7 @@ class DescrptSeROp : public OpKernel {
     const int * type = type_tensor.flat<int>().data();
 
     if(device == "GPU") {
+      #if GOOGLE_CUDA
       // allocate temp memory, temp memory must not be used after this operation!
       Tensor int_temp;
       TensorShape int_shape;
@@ -319,7 +438,6 @@ class DescrptSeROp : public OpKernel {
           init, ilist, jrange, jlist, ilist_size, jrange_size, jlist_size, max_nbor_size,
           mesh_tensor.flat<int>().data(), static_cast<int>(mesh_tensor.NumElements()));
       OP_REQUIRES (context, (max_nbor_size <= GPU_MAX_NBOR_SIZE), errors::InvalidArgument ("Assert failed, max neighbor size of atom(lammps) " + std::to_string(max_nbor_size) + " is larger than " + std::to_string(GPU_MAX_NBOR_SIZE) + ", which currently is not supported by deepmd-kit."));
-      #if GOOGLE_CUDA
       // launch the gpu(nv) compute function
       prod_env_mat_r_gpu_cuda(
           em, em_deriv, rij, nlist, 
@@ -331,9 +449,9 @@ class DescrptSeROp : public OpKernel {
       memcpy (&jrange, 8  + mesh_tensor.flat<int>().data(), sizeof(int *));
       memcpy (&jlist,  12 + mesh_tensor.flat<int>().data(), sizeof(int *));
       // launch the cpu compute function
-      prod_env_mat_r_cpu(
-          em, em_deriv, rij, nlist, 
-          coord, type, ilist, jrange, jlist, max_nbor_size, avg, std, nloc, nall, rcut, rcut_smth, sec);
+      // prod_env_mat_r_cpu(
+      //     em, em_deriv, rij, nlist, 
+      //     coord, type, ilist, jrange, jlist, max_nbor_size, avg, std, nloc, nall, rcut, rcut_smth, sec);
     }
   }
 
@@ -355,14 +473,106 @@ class DescrptSeROp : public OpKernel {
   int ilist_size = 0, jrange_size = 0, jlist_size = 0;
 };
 
+template<typename FPTYPE>
+static int
+_norm_copy_coord_cpu(
+    std::vector<FPTYPE> & coord_cpy,
+    std::vector<int> & type_cpy,
+    std::vector<int> & idx_mapping,
+    int & nall,
+    int & mem_cpy,
+    const FPTYPE * coord,
+    const FPTYPE * box,
+    const int * type,
+    const int &nloc, 
+    const int &max_cpy_trial, 
+    const float & rcut_r)
+{
+  std::vector<FPTYPE> tmp_coord(nall*3);
+  std::copy(coord, coord+nall*3, tmp_coord.begin());
+  Region<FPTYPE> region;
+  init_region_cpu(region, box);
+  normalize_coord_cpu(&tmp_coord[0], nall, region);
+  int tt;
+  for(tt = 0; tt < max_cpy_trial; ++tt){
+    coord_cpy.resize(mem_cpy*3);
+    type_cpy.resize(mem_cpy);
+    idx_mapping.resize(mem_cpy);
+    int ret = copy_coord_cpu(
+	&coord_cpy[0], &type_cpy[0], &idx_mapping[0], &nall, 
+	&tmp_coord[0], type, nloc, mem_cpy, rcut_r, region);
+    if(ret == 0){
+      break;
+    }
+    else{
+      mem_cpy *= 2;
+    }
+  }
+  return (tt != max_cpy_trial);
+}
+
+template<typename FPTYPE>
+static int
+_build_nlist_cpu(
+    std::vector<int> &ilist, 
+    std::vector<int> &numneigh,
+    std::vector<int*> &firstneigh,
+    std::vector<std::vector<int>> &jlist,
+    int & max_nnei,
+    int & mem_nnei,
+    const FPTYPE *coord,
+    const int & nloc,
+    const int & new_nall,
+    const int & max_nnei_trial,
+    const float & rcut_r)
+{
+  int tt;
+  for(tt = 0; tt < max_nnei_trial; ++tt){
+    for(int ii = 0; ii < nloc; ++ii){
+      jlist[ii].resize(mem_nnei);
+      firstneigh[ii] = &jlist[ii][0];
+    }
+    InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+    int ret = build_nlist_cpu(
+	inlist, &max_nnei, 
+	coord, nloc, new_nall, mem_nnei, rcut_r);
+    if(ret == 0){
+      break;
+    }
+    else{
+      mem_nnei *= 2;
+    }
+  }
+  return (tt != max_nnei_trial);
+}
+    
+static void
+_map_nlist_cpu(
+    int * nlist,
+    const int * idx_mapping,
+    const int & nloc,
+    const int & nnei)
+{
+  for (int ii = 0; ii < nloc; ++ii){
+    for (int jj = 0; jj < nnei; ++jj){
+      int record = nlist[ii*nnei+jj];
+      if (record >= 0) {		
+	nlist[ii*nnei+jj] = idx_mapping[record];	      
+      }
+    }
+  }  
+}
+
+
+
 // Register the CPU kernels.
 #define REGISTER_CPU(T)                                                                 \
 REGISTER_KERNEL_BUILDER(                                                                \
-    Name("DescrptSeA").Device(DEVICE_CPU).TypeConstraint<T>("T"),                       \
-    DescrptSeAOp<CPUDevice, T>);                                                        \
+    Name("ProdEnvMatA").Device(DEVICE_CPU).TypeConstraint<T>("T"),                       \
+    ProdEnvMatAOp<CPUDevice, T>);                                                        \
 REGISTER_KERNEL_BUILDER(                                                                \
-    Name("DescrptSeR").Device(DEVICE_CPU).TypeConstraint<T>("T"),                       \
-    DescrptSeROp<CPUDevice, T>); 
+    Name("ProdEnvMatR").Device(DEVICE_CPU).TypeConstraint<T>("T"),                       \
+    ProdEnvMatROp<CPUDevice, T>); 
 REGISTER_CPU(float);
 REGISTER_CPU(double);
 
@@ -370,11 +580,11 @@ REGISTER_CPU(double);
 #if GOOGLE_CUDA
 #define REGISTER_GPU(T)                                                                 \
 REGISTER_KERNEL_BUILDER(                                                                \
-    Name("DescrptSeA").Device(DEVICE_GPU).TypeConstraint<T>("T").HostMemory("natoms"),  \
-    DescrptSeAOp<GPUDevice, T>);                                                        \
+    Name("ProdEnvMatA").Device(DEVICE_GPU).TypeConstraint<T>("T").HostMemory("natoms"),  \
+    ProdEnvMatAOp<GPUDevice, T>);                                                        \
 REGISTER_KERNEL_BUILDER(                                                                \
-    Name("DescrptSeR").Device(DEVICE_GPU).TypeConstraint<T>("T").HostMemory("natoms"),  \
-    DescrptSeROp<GPUDevice, T>);
+    Name("ProdEnvMatR").Device(DEVICE_GPU).TypeConstraint<T>("T").HostMemory("natoms"),  \
+    ProdEnvMatROp<GPUDevice, T>);
 REGISTER_GPU(float);
 REGISTER_GPU(double);
 #endif  // GOOGLE_CUDA
diff --git a/source/tests/test_data_modifier.py b/source/tests/test_data_modifier.py
index d6574e39c8..70999f5734 100644
--- a/source/tests/test_data_modifier.py
+++ b/source/tests/test_data_modifier.py
@@ -43,7 +43,7 @@ def _setUp(self):
             restart=None,
             init_model=None,
             log_path=None,
-            log_level=0,
+            log_level=30,
             mpi_log="master",
             try_distrib=False
         )
diff --git a/source/tests/test_data_modifier_shuffle.py b/source/tests/test_data_modifier_shuffle.py
index 9057ba5836..39269c2ff5 100644
--- a/source/tests/test_data_modifier_shuffle.py
+++ b/source/tests/test_data_modifier_shuffle.py
@@ -48,7 +48,7 @@ def _setUp(self):
             restart=None,
             init_model=None,
             log_path=None,
-            log_level=0,
+            log_level=30,
             mpi_log="master",
             try_distrib=False
         )
diff --git a/source/tests/test_descrpt_smooth.py b/source/tests/test_descrpt_smooth.py
index c9c796799a..6a72af7db5 100644
--- a/source/tests/test_descrpt_smooth.py
+++ b/source/tests/test_descrpt_smooth.py
@@ -78,7 +78,7 @@ def comp_ef (self,
                  name,
                  reuse = None) :
         descrpt, descrpt_deriv, rij, nlist \
-            = op_module.descrpt_se_a (dcoord, 
+            = op_module.prod_env_mat_a (dcoord, 
                                        dtype,
                                        tnatoms,
                                        dbox, 
diff --git a/source/tests/test_prod_env_mat.py b/source/tests/test_prod_env_mat.py
new file mode 100644
index 0000000000..347eb913ea
--- /dev/null
+++ b/source/tests/test_prod_env_mat.py
@@ -0,0 +1,214 @@
+import os,sys
+import numpy as np
+import unittest
+
+import deepmd.op
+from deepmd.env import tf
+from deepmd.env import op_module
+from deepmd.run_options import GLOBAL_TF_FLOAT_PRECISION
+from deepmd.run_options import GLOBAL_NP_FLOAT_PRECISION
+from deepmd.run_options import GLOBAL_ENER_FLOAT_PRECISION
+
+class TestProdEnvMat(unittest.TestCase):
+    def setUp(self):
+        self.sess = tf.Session()
+        self.nframes = 2
+        self.dcoord = [
+            12.83, 2.56, 2.18,
+            12.09, 2.87, 2.74,
+            00.25, 3.32, 1.68,
+            3.36, 3.00, 1.81,
+            3.51, 2.51, 2.60,
+            4.27, 3.22, 1.56]
+        self.dtype = [0, 1, 1, 0, 1, 1]
+        self.dbox = [13., 0., 0., 0., 13., 0., 0., 0., 13.]
+        self.dcoord = np.reshape(self.dcoord, [1, -1])
+        self.dtype = np.reshape(self.dtype, [1, -1])
+        self.dbox = np.reshape(self.dbox, [1, -1])
+        self.dcoord = np.tile(self.dcoord, [self.nframes, 1])
+        self.dtype = np.tile(self.dtype, [self.nframes, 1])
+        self.dbox = np.tile(self.dbox, [self.nframes, 1])        
+        self.pbc_expected_output = [
+            0.12206, 0.12047, 0.01502, -0.01263, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 1.02167, -0.77271, 0.32370, 0.58475, 0.99745, 0.41810, 0.75655, -0.49773, 0.10564, 0.10495, -0.00143, 0.01198, 0.03103, 0.03041, 0.00452, -0.00425, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 
+            1.02167, 0.77271, -0.32370, -0.58475, 0.04135, 0.04039, 0.00123, -0.00880, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.59220, 0.42028, 0.16304, -0.38405, 0.03694, 0.03680, -0.00300, -0.00117, 0.00336, 0.00327, 0.00022, -0.00074, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 
+            0.99745, -0.41810, -0.75655, 0.49773, 0.19078, 0.18961, -0.01951, 0.00793, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.59220, -0.42028, -0.16304, 0.38405, 0.13499, 0.12636, -0.03140, 0.03566, 0.07054, 0.07049, -0.00175, -0.00210, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 
+            0.12206, -0.12047, -0.01502, 0.01263, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 1.06176, 0.16913, -0.55250, 0.89077, 1.03163, 0.96880, 0.23422, -0.26615, 0.19078, -0.18961, 0.01951, -0.00793, 0.04135, -0.04039, -0.00123, 0.00880, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 
+            1.06176, -0.16913, 0.55250, -0.89077, 0.10564, -0.10495, 0.00143, -0.01198, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.66798, 0.34516, 0.32245, -0.47232, 0.13499, -0.12636, 0.03140, -0.03566, 0.03694, -0.03680, 0.00300, 0.00117, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 
+            1.03163, -0.96880, -0.23422, 0.26615, 0.03103, -0.03041, -0.00452, 0.00425, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.66798, -0.34516, -0.32245, 0.47232, 0.07054, -0.07049, 0.00175, 0.00210, 0.00336, -0.00327, -0.00022, 0.00074, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000]
+        self.nopbc_expected_output = [
+            0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,1.02167,-0.77271,0.32370,0.58475,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,
+            1.02167,0.77271,-0.32370,-0.58475,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,
+            0.19078,0.18961,-0.01951,0.00793,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.13499,0.12636,-0.03140,0.03566,0.07054,0.07049,-0.00175,-0.00210,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,
+            0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,1.06176,0.16913,-0.55250,0.89077,1.03163,0.96880,0.23422,-0.26615,0.19078,-0.18961,0.01951,-0.00793,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,
+            1.06176,-0.16913,0.55250,-0.89077,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.66798,0.34516,0.32245,-0.47232,0.13499,-0.12636,0.03140,-0.03566,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,
+1.03163,-0.96880,-0.23422,0.26615,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.66798,-0.34516,-0.32245,0.47232,0.07054,-0.07049,0.00175,0.00210,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000]
+        self.sel = [10, 10]
+        self.sec = np.array([0, 0, 0], dtype = int)
+        self.sec[1:3] = np.cumsum(self.sel)
+        self.rcut = 6.
+        self.rcut_smth = 0.8
+        self.dnatoms = [6, 6, 2, 4]
+        self.tcoord = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None, self.dnatoms[0] * 3], name='t_coord')
+        self.tbox = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None, 9], name='t_box')
+        self.ttype = tf.placeholder(tf.int32, [None, self.dnatoms[0]], name = "t_type")
+        self.tnatoms = tf.placeholder(tf.int32, [None], name = "t_natoms")
+        self.nloc = self.dnatoms[0]
+        self.nall = self.dnatoms[1]
+        self.nnei = self.sec[-1]
+        self.ndescrpt = 4 * self.nnei
+        self.ntypes = np.max(self.dtype) + 1
+        davg = np.zeros ([self.ntypes, self.ndescrpt])
+        dstd = np.ones  ([self.ntypes, self.ndescrpt])
+        self.t_avg = tf.constant(davg.astype(GLOBAL_NP_FLOAT_PRECISION))
+        self.t_std = tf.constant(dstd.astype(GLOBAL_NP_FLOAT_PRECISION))
+ 
+    def test_pbc_self_built_nlist(self):
+        tem, tem_deriv, trij, tnlist \
+            = op_module.prod_env_mat_a (
+                self.tcoord,
+                self.ttype,
+                self.tnatoms,
+                self.tbox, 
+                tf.constant(np.zeros(6, dtype = np.int32)),
+                self.t_avg,
+                self.t_std,
+                rcut_a = -1, 
+                rcut_r = self.rcut, 
+                rcut_r_smth = self.rcut_smth,
+                sel_a = self.sel, 
+                sel_r = [0, 0])
+        self.sess.run (tf.global_variables_initializer())
+        dem, dem_deriv, drij, dnlist = self.sess.run(
+            [tem, tem_deriv, trij, tnlist],
+            feed_dict = {
+                self.tcoord: self.dcoord,
+                self.ttype: self.dtype,
+                self.tbox: self.dbox,
+                self.tnatoms: self.dnatoms}
+        )
+        self.assertEqual(dem.shape, (self.nframes, self.nloc*self.ndescrpt))
+        self.assertEqual(dem_deriv.shape, (self.nframes, self.nloc*self.ndescrpt*3))
+        self.assertEqual(drij.shape, (self.nframes, self.nloc*self.nnei*3))
+        self.assertEqual(dnlist.shape, (self.nframes, self.nloc*self.nnei))
+        for ff in range(self.nframes):
+            for ii in range(self.ndescrpt):
+                self.assertAlmostEqual(dem[ff][ii], self.pbc_expected_output[ii], places=5)
+
+    def test_pbc_self_built_nlist_deriv(self):
+        hh = 1e-4
+        tem, tem_deriv, trij, tnlist \
+            = op_module.prod_env_mat_a (
+                self.tcoord,
+                self.ttype,
+                self.tnatoms,
+                self.tbox, 
+                tf.constant(np.zeros(6, dtype = np.int32)),
+                self.t_avg,
+                self.t_std,
+                rcut_a = -1, 
+                rcut_r = self.rcut, 
+                rcut_r_smth = self.rcut_smth,
+                sel_a = self.sel, 
+                sel_r = [0, 0])
+        self.sess.run (tf.global_variables_initializer())
+        self.check_deriv_numerical_deriv(hh, tem, tem_deriv, trij, tnlist)
+
+    def test_nopbc_self_built_nlist(self):
+        tem, tem_deriv, trij, tnlist \
+            = op_module.prod_env_mat_a (
+                self.tcoord,
+                self.ttype,
+                self.tnatoms,
+                self.tbox, 
+                tf.constant(np.zeros(0, dtype = np.int32)),
+                self.t_avg,
+                self.t_std,
+                rcut_a = -1, 
+                rcut_r = self.rcut, 
+                rcut_r_smth = self.rcut_smth,
+                sel_a = self.sel, 
+                sel_r = [0, 0])
+        self.sess.run (tf.global_variables_initializer())
+        dem, dem_deriv, drij, dnlist = self.sess.run(
+            [tem, tem_deriv, trij, tnlist],
+            feed_dict = {
+                self.tcoord: self.dcoord,
+                self.ttype: self.dtype,
+                self.tbox: self.dbox,
+                self.tnatoms: self.dnatoms}
+        )
+        self.assertEqual(dem.shape, (self.nframes, self.nloc*self.ndescrpt))
+        self.assertEqual(dem_deriv.shape, (self.nframes, self.nloc*self.ndescrpt*3))
+        self.assertEqual(drij.shape, (self.nframes, self.nloc*self.nnei*3))
+        self.assertEqual(dnlist.shape, (self.nframes, self.nloc*self.nnei))
+        for ff in range(self.nframes):
+            for ii in range(self.ndescrpt):
+                self.assertAlmostEqual(dem[ff][ii], self.nopbc_expected_output[ii], places=5)
+
+
+    def test_nopbc_self_built_nlist_deriv(self):
+        hh = 1e-4
+        tem, tem_deriv, trij, tnlist \
+            = op_module.prod_env_mat_a (
+                self.tcoord,
+                self.ttype,
+                self.tnatoms,
+                self.tbox, 
+                tf.constant(np.zeros(0, dtype = np.int32)),
+                self.t_avg,
+                self.t_std,
+                rcut_a = -1, 
+                rcut_r = self.rcut, 
+                rcut_r_smth = self.rcut_smth,
+                sel_a = self.sel, 
+                sel_r = [0, 0])
+        self.sess.run (tf.global_variables_initializer())
+        self.check_deriv_numerical_deriv(hh, tem, tem_deriv, trij, tnlist)
+
+
+    def check_deriv_numerical_deriv(self, 
+                                    hh,
+                                    tem, tem_deriv, trij, tnlist):
+        dem_, dem_deriv_, drij_, dnlist_ = self.sess.run(
+            [tem, tem_deriv, trij, tnlist],
+            feed_dict = {
+                self.tcoord: self.dcoord,
+                self.ttype: self.dtype,
+                self.tbox: self.dbox,
+                self.tnatoms: self.dnatoms}
+        )
+        ff = 0
+        dem = dem_[ff]
+        dem_deriv = dem_deriv_[ff]
+        dnlist = dnlist_[ff]
+        for ii in range(self.dnatoms[0]):            
+            for jj in range(self.nnei):
+                j_idx = dnlist[ii*self.nnei+jj]
+                if j_idx < 0:
+                    continue
+                for kk in range(4):
+                    for dd in range(3):
+                        dcoord_0 = np.copy(self.dcoord)
+                        dcoord_1 = np.copy(self.dcoord)
+                        dcoord_0[ff][j_idx*3+dd] -= hh
+                        dcoord_1[ff][j_idx*3+dd] += hh
+                        dem_0, dem_deriv_0, drij_0, dnlist_0 = self.sess.run(
+                            [tem, tem_deriv, trij, tnlist],
+                            feed_dict = {
+                                self.tcoord: dcoord_0,
+                                self.ttype: self.dtype,
+                                self.tbox: self.dbox,
+                                self.tnatoms: self.dnatoms}
+                        )
+                        dem_1, dem_deriv_1, drij_1, dnlist_1 = self.sess.run(
+                            [tem, tem_deriv, trij, tnlist],
+                            feed_dict = {
+                                self.tcoord: dcoord_1,
+                                self.ttype: self.dtype,
+                                self.tbox: self.dbox,
+                                self.tnatoms: self.dnatoms}
+                        )
+                        num_deriv = (dem_1[0][ii*self.nnei*4+jj*4+kk] - dem_0[0][ii*self.ndescrpt+jj*4+kk]) / (2.*hh)
+                        ana_deriv = -dem_deriv[ii*self.nnei*4*3+jj*4*3+kk*3+dd]
+                        self.assertAlmostEqual(num_deriv, ana_deriv, places = 5)
+