diff --git a/source/lib/include/NNPInter.h b/source/lib/include/NNPInter.h
index 69a0ecf7d1..a32940a738 100644
--- a/source/lib/include/NNPInter.h
+++ b/source/lib/include/NNPInter.h
@@ -93,10 +93,8 @@ class NNPInter
   compute_t *array_double;
   InternalNeighborList nlist;
   NNPAtomMap<VALUETYPE> nnpmap;
-  unsigned long long *array_longlong;
-  int *ilist, *jrange, *jlist, *array_int;
+  int *ilist, *jrange, *jlist;
   int ilist_size, jrange_size, jlist_size;
-  int arr_int_size, arr_ll_size, arr_dou_size;
 
   // function used for neighbor list copy
   vector<int> get_sel_a() const;
@@ -191,13 +189,10 @@ class NNPInterModelDevi
   vector<vector<int> > sec;
   InternalNeighborList nlist;
   NNPAtomMap<VALUETYPE> nnpmap;
-  unsigned long long *array_longlong;
-  int max_sec_size = 0, max_sec_back = 0;
-  int *ilist, *jrange, *jlist, *array_int;
-  int ilist_size, jrange_size, jlist_size, arr_int_size, arr_ll_size, arr_dou_size;
+  int *ilist, *jrange, *jlist;
+  int ilist_size, jrange_size, jlist_size;
 
   // function used for nborlist copy
-  void get_max_sec();
   vector<vector<int> > get_sel() const;
   void cum_sum(const std::vector<std::vector<int32> > n_sel);
 #ifdef USE_CUDA_TOOLKIT
diff --git a/source/lib/include/common.h b/source/lib/include/common.h
index 8db7d817ce..3912f21f7f 100644
--- a/source/lib/include/common.h
+++ b/source/lib/include/common.h
@@ -144,9 +144,6 @@ session_input_tensors (vector<std::pair<string, Tensor>>& input_tensors,
 		       const int                        * ilist, 
 		       const int                        * jrange,
 		       const int                        * jlist,
-		       int                              * array_int,
-		       unsigned long long               * array_longlong, 
-		       double                           * array_double,
 		       const vector<VALUETYPE>		& fparam_,
 		       const vector<VALUETYPE>	        & aparam_,
 		       const NNPAtomMap<VALUETYPE>      & nnpmap,
diff --git a/source/lib/src/NNPInter.cc b/source/lib/src/NNPInter.cc
index f4f39945ff..f526cad911 100644
--- a/source/lib/src/NNPInter.cc
+++ b/source/lib/src/NNPInter.cc
@@ -3,7 +3,6 @@
 #include "SimulationRegion.h"
 #include <stdexcept>	
 
-#define MAGIC_NUMBER 1024
 
 #ifdef  USE_CUDA_TOOLKIT
 #include "cuda_runtime.h"
@@ -14,7 +13,7 @@
 #define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); }
 inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true)
 {
-    if (code != cudaSuccess) 
+    if (code != cudaSuccess)
     {
         fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line);
         if (abort) exit(code);
@@ -273,9 +272,6 @@ NNPInter::~NNPInter() {
         cudaErrcheck(cudaFree(ilist));
         cudaErrcheck(cudaFree(jrange));
         cudaErrcheck(cudaFree(jlist));
-        cudaErrcheck(cudaFree(array_int));
-        cudaErrcheck(cudaFree(array_longlong));
-        cudaErrcheck(cudaFree(array_double));
     }
     #endif
 }
@@ -283,24 +279,12 @@ NNPInter::~NNPInter() {
 #ifdef USE_CUDA_TOOLKIT
 void NNPInter::update_nbor(const InternalNeighborList & nlist, const int nloc) {
     if (!init_nbor) {
-        sec_a = cum_sum(get_sel_a());
         cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size()));
         cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size()));
         cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size()));
-        cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (sec_a.size() + nloc * sec_a.size() + nloc)));
-        cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2));
-        #ifdef HIGH_PREC
-            cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * sec_a.back() * 3));
-        #else
-            cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * sec_a.back() * 3));
-        #endif
         ilist_size = nlist.ilist.size();
         jrange_size = nlist.jrange.size();
         jlist_size = nlist.jlist.size();
-        arr_int_size = sec_a.size() + nloc * sec_a.size() + nloc;
-        arr_ll_size = nloc * MAGIC_NUMBER * 2;
-        arr_dou_size = nloc * sec_a.back() * 3;
-        init_nbor = true;
     }
     if (ilist_size < nlist.ilist.size()) {
         cudaErrcheck(cudaFree(ilist));
@@ -317,25 +301,7 @@ void NNPInter::update_nbor(const InternalNeighborList & nlist, const int nloc) {
         cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size()));
         jlist_size = nlist.jlist.size();
     }
-    if (arr_int_size < sec_a.size() + nloc * sec_a.size() + nloc) {
-        cudaErrcheck(cudaFree(array_int));
-        cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (sec_a.size() + nloc * sec_a.size() + nloc)));
-        arr_int_size = sec_a.size() + nloc * sec_a.size() + nloc;
-    }
-    if (arr_ll_size < nloc * MAGIC_NUMBER * 2) {
-        cudaErrcheck(cudaFree(array_longlong));
-        cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2));
-        arr_ll_size = nloc * MAGIC_NUMBER * 2;
-    }
-    if (arr_dou_size < nloc * sec_a.back() * 3) {
-        cudaErrcheck(cudaFree(array_double));
-        #ifdef HIGH_PREC
-            cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * sec_a.back() * 3));
-        #else
-            cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * sec_a.back() * 3));
-        #endif
-        arr_dou_size = nloc * sec_a.back() * 3;
-    }
+    
     cudaErrcheck(cudaMemcpy(ilist, &nlist.ilist[0], sizeof(int) * nlist.ilist.size(), cudaMemcpyHostToDevice));
     cudaErrcheck(cudaMemcpy(jrange, &nlist.jrange[0], sizeof(int) * nlist.jrange.size(), cudaMemcpyHostToDevice));
     cudaErrcheck(cudaMemcpy(jlist, &nlist.jlist[0], sizeof(int) * nlist.jlist.size(), cudaMemcpyHostToDevice));
@@ -378,14 +344,10 @@ init (const string & model, const int & gpu_rank)
   if (dfparam < 0) dfparam = 0;
   if (daparam < 0) daparam = 0;
   inited = true;
-
+  
   init_nbor = false;
-  array_int = NULL;
-  array_double = NULL;
-  array_longlong = NULL;
   ilist = NULL; jrange = NULL; jlist = NULL;
   ilist_size = 0; jrange_size = 0; jlist_size = 0;
-  arr_int_size = 0; arr_ll_size = 0; arr_dou_size = 0;
 }
 #else
 void
@@ -415,12 +377,8 @@ init (const string & model, const int & gpu_rank)
   inited = true;
 
   init_nbor = false;
-  array_int = NULL;
-  array_double = NULL;
-  array_longlong = NULL;
   ilist = NULL; jrange = NULL; jlist = NULL;
   ilist_size = 0; jrange_size = 0; jlist_size = 0;
-  arr_int_size = 0; arr_ll_size = 0; arr_dou_size = 0;
 }
 #endif
 
@@ -602,7 +560,7 @@ compute_inner (ENERGYTYPE &			dener,
     }
 
     #ifdef USE_CUDA_TOOLKIT
-        int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost);
+        int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost);
     #else
         int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost);
     #endif
@@ -669,7 +627,7 @@ compute (ENERGYTYPE &			dener,
     }
 
     #ifdef USE_CUDA_TOOLKIT
-        int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost);
+        int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost);
     #else
         int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost);
     #endif
@@ -710,9 +668,6 @@ NNPInterModelDevi::~NNPInterModelDevi() {
         cudaErrcheck(cudaFree(ilist));
         cudaErrcheck(cudaFree(jrange));
         cudaErrcheck(cudaFree(jlist));
-        cudaErrcheck(cudaFree(array_int));
-        cudaErrcheck(cudaFree(array_longlong));
-        cudaErrcheck(cudaFree(array_double));
     }
 #endif
 }
@@ -761,14 +716,10 @@ init (const vector<string> & models, const int & gpu_rank)
   // cell_size = rcut;
   // ntypes = get_ntypes();
   inited = true;
-
+  
   init_nbor = false;
-  array_int = NULL;
-  array_double = NULL;
-  array_longlong = NULL;
   ilist = NULL; jrange = NULL; jlist = NULL;
   ilist_size = 0; jrange_size = 0; jlist_size = 0;
-  arr_int_size = 0; arr_ll_size = 0; arr_dou_size = 0;
 }
 #else
 void
@@ -798,14 +749,10 @@ init (const vector<string> & models, const int & gpu_rank)
   // cell_size = rcut;
   // ntypes = get_ntypes();
   inited = true;
-
+  
   init_nbor = false;
-  array_int = NULL;
-  array_double = NULL;
-  array_longlong = NULL;
   ilist = NULL; jrange = NULL; jlist = NULL;
   ilist_size = 0; jrange_size = 0; jlist_size = 0;
-  arr_int_size = 0; arr_ll_size = 0; arr_dou_size = 0;
 }
 #endif
 
@@ -873,41 +820,18 @@ cum_sum (const std::vector<std::vector<int32> > n_sel)
     }
 }
 
-void  
-NNPInterModelDevi::
-get_max_sec() 
-{
-    for (int ii = 0; ii < numb_models; ii++) {
-        this->max_sec_size = max_sec_size < sec[ii].size() ? sec[ii].size() : max_sec_size;
-        this->max_sec_back = max_sec_back < sec[ii].back() ? sec[ii].back() : max_sec_back;
-    }
-}
-
 #ifdef USE_CUDA_TOOLKIT
 void
 NNPInterModelDevi::
 update_nbor(const InternalNeighborList & nlist, const int nloc) 
 {
     if (!init_nbor) {
-        cum_sum(get_sel());
-        get_max_sec();
         cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size()));
         cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size()));
         cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size()));
-        cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (max_sec_size + nloc * max_sec_size + nloc)));
-        cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2));
-        #ifdef HIGH_PREC
-            cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3));
-        #else
-            cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3));
-        #endif
         ilist_size = nlist.ilist.size();
         jrange_size = nlist.jrange.size();
         jlist_size = nlist.jlist.size();
-        arr_int_size = max_sec_size + nloc * max_sec_size + nloc;
-        arr_ll_size = nloc * MAGIC_NUMBER * 2;
-        arr_dou_size = nloc * max_sec_back * 3;
-        init_nbor = true;
     }
     if (ilist_size < nlist.ilist.size()) {
         cudaErrcheck(cudaFree(ilist));
@@ -924,25 +848,7 @@ update_nbor(const InternalNeighborList & nlist, const int nloc)
         cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size()));
         jlist_size = nlist.jlist.size();
     }
-    if (arr_int_size < max_sec_size + nloc * max_sec_size + nloc) {
-        cudaErrcheck(cudaFree(array_int));
-        cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (max_sec_size + nloc * max_sec_size + nloc)));
-        arr_int_size = max_sec_size + nloc * max_sec_size + nloc;
-    }
-    if (arr_ll_size < nloc * MAGIC_NUMBER * 2) {
-        cudaErrcheck(cudaFree(array_longlong));
-        cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2));
-        arr_ll_size = nloc * MAGIC_NUMBER * 2;
-    }
-    if (arr_dou_size < nloc * max_sec_back * 3) {
-        cudaErrcheck(cudaFree(array_double));
-        #ifdef HIGH_PREC
-            cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3));
-        #else
-            cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3));
-        #endif
-        arr_dou_size = nloc * max_sec_back * 3;
-    }
+
     cudaErrcheck(cudaMemcpy(ilist, &nlist.ilist[0], sizeof(int) * nlist.ilist.size(), cudaMemcpyHostToDevice));
     cudaErrcheck(cudaMemcpy(jrange, &nlist.jrange[0], sizeof(int) * nlist.jrange.size(), cudaMemcpyHostToDevice));
     cudaErrcheck(cudaMemcpy(jlist, &nlist.jlist[0], sizeof(int) * nlist.jlist.size(), cudaMemcpyHostToDevice));
@@ -1044,7 +950,7 @@ compute (vector<ENERGYTYPE> &		all_energy,
 
     }
     #ifdef USE_CUDA_TOOLKIT
-        int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost);
+        int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost);
     #else
         int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost);
     #endif
@@ -1094,7 +1000,7 @@ compute (vector<ENERGYTYPE> &			all_energy,
         
     }
     #ifdef USE_CUDA_TOOLKIT
-        int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost);
+        int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost);
     #else
         int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost);
     #endif
diff --git a/source/lib/src/common.cc b/source/lib/src/common.cc
index 296800a396..dd4c3f672a 100644
--- a/source/lib/src/common.cc
+++ b/source/lib/src/common.cc
@@ -479,9 +479,6 @@ session_input_tensors (
     const int                           *   ilist, 
     const int                           *   jrange,
     const int                           *   jlist,
-    int                                 *   array_int,
-    unsigned long long                  *   array_longlong, 
-    double                              *   array_double,
     const vector<VALUETYPE>	            &   fparam_,
     const vector<VALUETYPE>	            &   aparam_,
     const NNPAtomMap<VALUETYPE>         &   nnpmap,
@@ -511,7 +508,7 @@ session_input_tensors (
     box_shape.AddDim (nframes);
     box_shape.AddDim (9);
     TensorShape mesh_shape;
-    mesh_shape.AddDim (32);
+    mesh_shape.AddDim (16);
     TensorShape natoms_shape;
     natoms_shape.AddDim (2 + ntypes);
     TensorShape fparam_shape;
@@ -565,7 +562,7 @@ session_input_tensors (
         }
     }
     
-    for (int ii = 0; ii < 32; ++ii) mesh(ii) = 0;
+    for (int ii = 0; ii < 16; ++ii) mesh(ii) = 0;
     
     mesh (0) = sizeof(int *) / sizeof(int);
     assert (mesh(0) * sizeof(int) == sizeof(int *));
@@ -577,9 +574,6 @@ session_input_tensors (
     memcpy (&mesh(4), &(ilist), sizeof(int *));
     memcpy (&mesh(8), &(jrange), sizeof(int *));
     memcpy (&mesh(12), &(jlist), sizeof(int *));
-    memcpy (&mesh(16), &(array_int), sizeof(int *));
-    memcpy (&mesh(20), &(array_longlong), sizeof(unsigned long long *));
-    memcpy (&mesh(24), &(array_double), sizeof(double *));
 
     natoms (0) = nloc;
     natoms (1) = nall;
diff --git a/source/op/cuda/descrpt_se_a.cu b/source/op/cuda/descrpt_se_a.cu
index 4b309522fa..1636df8ff5 100644
--- a/source/op/cuda/descrpt_se_a.cu
+++ b/source/op/cuda/descrpt_se_a.cu
@@ -18,8 +18,6 @@ limitations under the License.
 #include <cub/block/block_radix_sort.cuh>
 #include <cuda_runtime.h>
 
-#define MAGIC_NUMBER 1024
-
 #ifdef HIGH_PREC
     typedef double  VALUETYPE;
 #else
@@ -118,11 +116,12 @@ __global__ void format_nlist_fill_a_se_a(const VALUETYPE * coord,
                             const int  * jlist,
                             const float rcut,
                             int_64 * key,
-                            int * i_idx)
+                            int * i_idx,
+                            const int MAGIC_NUMBER)
 {   
     // <<<nloc, MAGIC_NUMBER>>>
-    const unsigned int idx = blockIdx.x;
-    const unsigned int idy = threadIdx.x;
+    const unsigned int idx = blockIdx.y;
+    const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x;
     
     const int nsize = jrange[i_idx[idx] + 1] - jrange[i_idx[idx]];
     if (idy >= nsize) {
@@ -154,7 +153,8 @@ __global__ void format_nlist_fill_b_se_a(int * nlist,
                             int_64 * key,
                             const int * sec_a,
                             const int sec_a_size,
-                            int * nei_iter_dev)
+                            int * nei_iter_dev,
+                            const int MAGIC_NUMBER)
 { 
 
     const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x;
@@ -194,7 +194,6 @@ __global__ void compute_descriptor_se_a (VALUETYPE* descript,
                             const VALUETYPE* coord,
                             const VALUETYPE rmin,
                             const VALUETYPE rmax,
-                            compute_t* sel_a_diff_dev,
                             const int sec_a_size)
 {   
     // <<<nloc, sec_a.back()>>>
@@ -208,16 +207,14 @@ __global__ void compute_descriptor_se_a (VALUETYPE* descript,
     VALUETYPE * row_descript = descript + idx * ndescrpt;
     VALUETYPE * row_descript_deriv = descript_deriv + idx * descript_deriv_size;
     VALUETYPE * row_rij = rij + idx * rij_size;
-    compute_t * sel_a_diff = sel_a_diff_dev + idx * nlist_size * 3;
     int * row_nlist = nlist + idx * nlist_size;
 
     if (row_nlist[idy] >= 0) {
         const int & j_idx = row_nlist[idy];
         for (int kk = 0; kk < 3; kk++) {
-            sel_a_diff[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk];
-            row_rij[idy * 3 + kk] = sel_a_diff[idy * 3 + kk];
+            row_rij[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk];
         }
-        const compute_t * rr = &sel_a_diff[idy * 3 + 0];
+        const compute_t * rr = &row_rij[idy * 3 + 0];
         compute_t nr2 = dev_dot(rr, rr);
         compute_t inr = 1./sqrt(nr2);
         compute_t nr = nr2 * inr;
@@ -263,6 +260,166 @@ __global__ void compute_descriptor_se_a (VALUETYPE* descript,
     }
 }
 
+void format_nbor_list_256 (
+    const VALUETYPE* coord,
+    const int* type,
+    const int* jrange,
+    const int* jlist,
+    const int& nloc,       
+    const float& rcut_r, 
+    int * i_idx, 
+    int_64 * key
+) 
+{   
+    const int LEN = 256;
+    const int MAGIC_NUMBER = 256;
+    const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN;
+    dim3 block_grid(nblock, nloc);
+    format_nlist_fill_a_se_a
+    <<<block_grid, LEN>>> (
+        coord,
+        type,
+        jrange,
+        jlist,
+        rcut_r,
+        key,
+        i_idx,
+        MAGIC_NUMBER
+    );
+    const int ITEMS_PER_THREAD = 4;
+    const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD;
+    // BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
+    BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAGIC_NUMBER);
+}
+
+void format_nbor_list_512 (
+    const VALUETYPE* coord,
+    const int* type,
+    const int* jrange,
+    const int* jlist,
+    const int& nloc,       
+    const float& rcut_r, 
+    int * i_idx, 
+    int_64 * key
+) 
+{   
+    const int LEN = 256;
+    const int MAGIC_NUMBER = 512;
+    const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN;
+    dim3 block_grid(nblock, nloc);
+    format_nlist_fill_a_se_a
+    <<<block_grid, LEN>>> (
+        coord,
+        type,
+        jrange,
+        jlist,
+        rcut_r,
+        key,
+        i_idx,
+        MAGIC_NUMBER
+    );
+    const int ITEMS_PER_THREAD = 4;
+    const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD;
+    // BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
+    BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAGIC_NUMBER);
+}
+
+void format_nbor_list_1024 (
+    const VALUETYPE* coord,
+    const int* type,
+    const int* jrange,
+    const int* jlist,
+    const int& nloc,       
+    const float& rcut_r, 
+    int * i_idx, 
+    int_64 * key
+) 
+{   
+    const int LEN = 256;
+    const int MAGIC_NUMBER = 1024;
+    const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN;
+    dim3 block_grid(nblock, nloc);
+    format_nlist_fill_a_se_a
+    <<<block_grid, LEN>>> (
+        coord,
+        type,
+        jrange,
+        jlist,
+        rcut_r,
+        key,
+        i_idx,
+        MAGIC_NUMBER
+    );
+    const int ITEMS_PER_THREAD = 8;
+    const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD;
+    // BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
+    BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAGIC_NUMBER);
+}
+
+void format_nbor_list_2048 (
+    const VALUETYPE* coord,
+    const int* type,
+    const int* jrange,
+    const int* jlist,
+    const int& nloc,       
+    const float& rcut_r, 
+    int * i_idx, 
+    int_64 * key
+) 
+{   
+    const int LEN = 256;
+    const int MAGIC_NUMBER = 2048;
+    const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN;
+    dim3 block_grid(nblock, nloc);
+    format_nlist_fill_a_se_a
+    <<<block_grid, LEN>>> (
+        coord,
+        type,
+        jrange,
+        jlist,
+        rcut_r,
+        key,
+        i_idx,
+        MAGIC_NUMBER
+    );
+    const int ITEMS_PER_THREAD = 8;
+    const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD;
+    // BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
+    BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAGIC_NUMBER);
+}
+
+void format_nbor_list_4096 (
+    const VALUETYPE* coord,
+    const int* type,
+    const int* jrange,
+    const int* jlist,
+    const int& nloc,       
+    const float& rcut_r, 
+    int * i_idx, 
+    int_64 * key
+) 
+{   
+    const int LEN = 256;
+    const int MAGIC_NUMBER = 4096;
+    const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN;
+    dim3 block_grid(nblock, nloc);
+    format_nlist_fill_a_se_a
+    <<<block_grid, LEN>>> (
+        coord,
+        type,
+        jrange,
+        jlist,
+        rcut_r,
+        key,
+        i_idx,
+        MAGIC_NUMBER
+    );
+    const int ITEMS_PER_THREAD = 16;
+    const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD;
+    // BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
+    BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAGIC_NUMBER);
+}
+
 void DescrptSeALauncher(const VALUETYPE* coord,
                             const int* type,
                             const int* ilist,
@@ -270,22 +427,20 @@ void DescrptSeALauncher(const VALUETYPE* coord,
                             const int* jlist,
                             int* array_int,
                             unsigned long long* array_longlong,
-                            compute_t* array_double,
                             const VALUETYPE* avg,
                             const VALUETYPE* std,
                             VALUETYPE* descript,
                             VALUETYPE* descript_deriv,
                             VALUETYPE* rij,
                             int* nlist,
-                            const int& ntypes,
                             const int& nloc,       
-                            const int& nall,       
                             const int& nnei,       
                             const float& rcut_r,     
                             const float& rcut_r_smth,
                             const int& ndescrpt, 
                             const std::vector<int>& sec_a,      
-                            const bool& fill_nei_a
+                            const bool& fill_nei_a,
+                            const int MAGIC_NUMBER
 )
 {   
     const int LEN = 256;
@@ -294,42 +449,76 @@ void DescrptSeALauncher(const VALUETYPE* coord,
     int * nei_iter = array_int + sec_a.size(); // = new int[sec_a_size];
     int * i_idx = array_int + sec_a.size() + nloc * sec_a.size();
     int_64 * key = array_longlong;
-    compute_t * sel_a_diff = array_double;    // = new VALUETYPE *[nlist_size]; nnei
-    // int_64 * key = NULL;
-    // VALUETYPE * sel_a_diff = NULL;    // = new VALUETYPE *[nlist_size]; nnei
-
+    
     cudaError_t res = cudaSuccess;
-    // res = cudaMalloc((void**)&sec_a_dev, sizeof(int) * sec_a.size()); cudaErrcheck(res);
-    // res = cudaMalloc((void**)&nei_iter, sizeof(int) * nloc * sec_a.size()); cudaErrcheck(res);
-    // res = cudaMalloc((void**)&i_idx, sizeof(int) * nloc); cudaErrcheck(res);
-    // res = cudaMalloc((void**)&key, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2); cudaErrcheck(res);
-    // res = cudaMalloc((void**)&sel_a_diff, sizeof(VALUETYPE) * nnei * 3 * nloc); cudaErrcheck(res);
     res = cudaMemcpy(sec_a_dev, &sec_a[0], sizeof(int) * sec_a.size(), cudaMemcpyHostToDevice); cudaErrcheck(res);    
     res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * MAGIC_NUMBER); cudaErrcheck(res);
     res = cudaMemset(nlist, -1, sizeof(int) * nloc * nnei); cudaErrcheck(res);
     res = cudaMemset(descript, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt); cudaErrcheck(res);
     res = cudaMemset(descript_deriv, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt * 3); cudaErrcheck(res);
-    // res = cudaMemset(rij, 0.0, sizeof(VALUETYPE) * nloc * nnei * 3); cudaErrcheck(res);
 
     if (fill_nei_a) {
         // ~~~
         // cudaProfilerStart();
         get_i_idx_se_a<<<nblock, LEN>>> (nloc, ilist, i_idx);
 
-        format_nlist_fill_a_se_a<<<nloc, MAGIC_NUMBER>>> (
-                            coord,
-                            type,
-                            jrange,
-                            jlist,
-                            rcut_r,
-                            key,
-                            i_idx
-        );
-        const int ITEMS_PER_THREAD = 4;
-        const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD;
-        // BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
-        BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAGIC_NUMBER);
-        
+        if (nnei <= 256) {
+            format_nbor_list_256 (
+                coord,
+                type,
+                jrange,
+                jlist,
+                nloc,       
+                rcut_r, 
+                i_idx, 
+                key
+            ); 
+        } else if (nnei <= 512) {
+            format_nbor_list_512 (
+                coord,
+                type,
+                jrange,
+                jlist,
+                nloc,       
+                rcut_r, 
+                i_idx, 
+                key
+            ); 
+        } else if (nnei <= 1024) {
+            format_nbor_list_1024 (
+                coord,
+                type,
+                jrange,
+                jlist,
+                nloc,       
+                rcut_r, 
+                i_idx, 
+                key
+            ); 
+        } else if (nnei <= 2048) {
+            format_nbor_list_2048 (
+                coord,
+                type,
+                jrange,
+                jlist,
+                nloc,       
+                rcut_r, 
+                i_idx, 
+                key
+            ); 
+        } else if (nnei <= 4096) {
+            format_nbor_list_4096 (
+                coord,
+                type,
+                jrange,
+                jlist,
+                nloc,       
+                rcut_r, 
+                i_idx, 
+                key
+            ); 
+        } 
+
         format_nlist_fill_b_se_a<<<nblock, LEN>>> (
                             nlist,
                             nnei,       
@@ -339,7 +528,8 @@ void DescrptSeALauncher(const VALUETYPE* coord,
                             key,
                             sec_a_dev,
                             sec_a.size(),
-                            nei_iter
+                            nei_iter,
+                            MAGIC_NUMBER
         );
     }
 
@@ -360,14 +550,6 @@ void DescrptSeALauncher(const VALUETYPE* coord,
                             coord,
                             rcut_r_smth,
                             rcut_r,
-                            sel_a_diff,
                             sec_a.back()
     );
-////
-    // res = cudaFree(sec_a_dev);                  cudaErrcheck(res);
-    // res = cudaFree(key);                        cudaErrcheck(res);
-    // res = cudaFree(i_idx);                      cudaErrcheck(res);
-    // res = cudaFree(nei_iter);                   cudaErrcheck(res);
-    // res = cudaFree(sel_a_diff);                 cudaErrcheck(res);
-    //output some interesting things...
 }  
\ No newline at end of file
diff --git a/source/op/descrpt_se_a_gpu.cc b/source/op/descrpt_se_a_gpu.cc
index 70dd9c7751..fd5ae632cb 100644
--- a/source/op/descrpt_se_a_gpu.cc
+++ b/source/op/descrpt_se_a_gpu.cc
@@ -67,6 +67,24 @@ REGISTER_OP("DescrptSeA")
     .Output("nlist: int32");
 #endif
 
+int get_magic_number(int const nnei) {
+    if (nnei <= 256) {
+        return 256;
+    }
+    else if (nnei <= 512) {
+        return 512;
+    }
+    else if (nnei <= 1024) {
+        return 1024;
+    }
+    else if (nnei <= 2048) {
+        return 2048;
+    }
+    else if (nnei <= 4096) {
+        return 4096;
+    }
+}
+
 void DescrptSeALauncher(const VALUETYPE* coord,
                             const int* type,
                             const int* ilist,
@@ -74,22 +92,20 @@ void DescrptSeALauncher(const VALUETYPE* coord,
                             const int* jlist,
                             int* array_int,
                             unsigned long long* array_longlong,
-                            compute_t* array_double,
                             const VALUETYPE* avg,
                             const VALUETYPE* std,
                             VALUETYPE* descript,
                             VALUETYPE* descript_deriv,
                             VALUETYPE* rij,
                             int* nlist,
-                            const int& ntypes,
                             const int& nloc,
-                            const int& nall,
                             const int& nnei,
                             const float& rcut_r,
                             const float& rcut_r_smth,
                             const int& ndescrpt,
                             const std::vector<int>& sec_a,
-                            const bool& fill_nei_a
+                            const bool& fill_nei_a,
+                            const int MAGIC_NUMBER
 );
 
 class DescrptSeAOp : public OpKernel {
@@ -112,6 +128,7 @@ class DescrptSeAOp : public OpKernel {
         nnei_r = sec_r.back();
         nnei = nnei_a + nnei_r;
         fill_nei_a = (rcut_a < 0);
+        magic_number = get_magic_number(nnei);
     }
 
     void Compute(OpKernelContext* context) override {
@@ -158,7 +175,7 @@ class DescrptSeAOp : public OpKernel {
         
         OP_REQUIRES (context, (ntypes == int(sel_a.size())),	errors::InvalidArgument ("number of types should match the length of sel array"));
         OP_REQUIRES (context, (ntypes == int(sel_r.size())),	errors::InvalidArgument ("number of types should match the length of sel array"));
-        OP_REQUIRES (context, (nnei <= 1024),	                errors::InvalidArgument ("Assert failed, max neighbor size of atom(nnei) " + std::to_string(nnei) + " is larger than 1024!, which currently is not supported by deepmd-kit."));
+        OP_REQUIRES (context, (nnei <= 4096),	                errors::InvalidArgument ("Assert failed, max neighbor size of atom(nnei) " + std::to_string(nnei) + " is larger than 4096, which currently is not supported by deepmd-kit."));
 
         // Create output tensors
         TensorShape descrpt_shape ;
@@ -192,14 +209,23 @@ class DescrptSeAOp : public OpKernel {
 	    					     nlist_shape,
 	    					     &nlist_tensor));
         
-	    int * ilist = NULL, *jrange = NULL, *jlist = NULL;
-        int * array_int = NULL; unsigned long long *array_longlong = NULL; compute_t *array_double = NULL;
+        // allocate temp memory, temp memory must not be used after this operation!
+        Tensor int_temp;
+        TensorShape int_shape;
+        int_shape.AddDim(sec_a.size() + nloc * sec_a.size() + nloc);
+        OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp));
+
+        Tensor uint64_temp;
+        TensorShape uint64_shape;
+        uint64_shape.AddDim(nloc * magic_number * 2);
+        OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp));
+
+	    int * ilist = NULL, * jrange = NULL, * jlist = NULL;
+        int * array_int = int_temp.flat<int>().data(); 
+        unsigned long long * array_longlong = uint64_temp.flat<unsigned long long>().data(); 
         cudaErrcheck(cudaMemcpy(&(ilist), 4 + mesh_tensor.flat<int>().data(), sizeof(int *), cudaMemcpyDeviceToHost));
         cudaErrcheck(cudaMemcpy(&(jrange), 8 + mesh_tensor.flat<int>().data(), sizeof(int *), cudaMemcpyDeviceToHost));
         cudaErrcheck(cudaMemcpy(&(jlist), 12 + mesh_tensor.flat<int>().data(), sizeof(int *), cudaMemcpyDeviceToHost));
-        cudaErrcheck(cudaMemcpy(&(array_int), 16 + mesh_tensor.flat<int>().data(), sizeof(int *), cudaMemcpyDeviceToHost));
-        cudaErrcheck(cudaMemcpy(&(array_longlong), 20 + mesh_tensor.flat<int>().data(), sizeof(unsigned long long *), cudaMemcpyDeviceToHost));
-        cudaErrcheck(cudaMemcpy(&(array_double), 24 + mesh_tensor.flat<int>().data(), sizeof(compute_t *), cudaMemcpyDeviceToHost));
 
         // Launch computation
         for (int II = 0; II < nsamples; II++) {
@@ -210,22 +236,20 @@ class DescrptSeAOp : public OpKernel {
                         jlist,
                         array_int,
                         array_longlong,
-                        array_double,
                         avg_tensor.matrix<VALUETYPE>().data(),
                         std_tensor.matrix<VALUETYPE>().data(),
                         descrpt_tensor->matrix<VALUETYPE>().data() + II * (nloc * ndescrpt),
                         descrpt_deriv_tensor->matrix<VALUETYPE>().data() + II * (nloc * ndescrpt * 3),
                         rij_tensor->matrix<VALUETYPE>().data() + II * (nloc * nnei * 3),
                         nlist_tensor->matrix<int>().data() + II * (nloc * nnei),
-                        ntypes,
                         nloc,
-                        nall,
                         nnei,
                         rcut_r,
                         rcut_r_smth,
                         ndescrpt,
                         sec_a,
-                        fill_nei_a
+                        fill_nei_a,
+                        magic_number
             );
         }
         // std::cout << "done" << std::endl;
@@ -242,7 +266,7 @@ class DescrptSeAOp : public OpKernel {
     std::vector<int> sec_a;
     std::vector<int> sec_r;
     int ndescrpt, ndescrpt_a, ndescrpt_r;
-    int nnei, nnei_a, nnei_r, nloc, nall;
+    int nnei, nnei_a, nnei_r, nloc, nall, magic_number;
     bool fill_nei_a;
 
     //private func