diff --git a/source/lib/include/CustomeOperation.h b/source/lib/include/CustomeOperation.h
index c446db8130..f7fd7c2496 100644
--- a/source/lib/include/CustomeOperation.h
+++ b/source/lib/include/CustomeOperation.h
@@ -169,7 +169,7 @@ void compute_descriptor_se_a_cpu (
 }
 
 template<typename FPTYPE>
-void DescrptSeACPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector<int> sec_a, const bool fill_nei_a, const int magic_number) {
+void DescrptSeACPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector<int> sec_a, const bool fill_nei_a, const int max_nbor_size) {
     // set & normalize coord
     std::vector<FPTYPE> d_coord3(nall * 3);
     for (int ii = 0; ii < nall; ++ii) {
@@ -235,8 +235,8 @@ void DescrptSeACPULauncher(const FPTYPE * coord, const int * type, const int * i
 
 #if GOOGLE_CUDA
 template<typename FPTYPE>
-void DescrptSeAGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector<int> sec_a, const bool fill_nei_a, const int magic_number) {
-    DescrptSeAGPUExecuteFunctor<FPTYPE>()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number);
+void DescrptSeAGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector<int> sec_a, const bool fill_nei_a, const int max_nbor_size) {
+    DescrptSeAGPUExecuteFunctor<FPTYPE>()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, max_nbor_size);
 }
 #endif // GOOGLE_CUDA
 // ******************************************************************************
@@ -432,7 +432,7 @@ void compute_descriptor_se_r_cpu (
 }
 
 template<typename FPTYPE>
-void DescrptSeRCPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector<int> sec_a, const bool fill_nei_a, const int magic_number) {
+void DescrptSeRCPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector<int> sec_a, const bool fill_nei_a, const int max_nbor_size) {
     // set & normalize coord
     std::vector<FPTYPE> d_coord3(nall * 3);
     for (int ii = 0; ii < nall; ++ii) {
@@ -498,8 +498,8 @@ void DescrptSeRCPULauncher(const FPTYPE * coord, const int * type, const int * i
 
 #if GOOGLE_CUDA
 template<typename FPTYPE>
-void DescrptSeRGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector<int> sec_a, const bool fill_nei_a, const int magic_number) {
-    DescrptSeRGPUExecuteFunctor<FPTYPE>()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number);
+void DescrptSeRGPULauncher(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector<int> sec_a, const bool fill_nei_a, const int max_nbor_size) {
+    DescrptSeRGPUExecuteFunctor<FPTYPE>()(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, max_nbor_size);
 }
 #endif // GOOGLE_CUDA
 // ******************************************************************************
diff --git a/source/op/cuda/descrpt_se_a.cu b/source/op/cuda/descrpt_se_a.cu
index 5965254111..a528c4c477 100644
--- a/source/op/cuda/descrpt_se_a.cu
+++ b/source/op/cuda/descrpt_se_a.cu
@@ -84,9 +84,9 @@ __global__ void format_nlist_fill_a_se_a(const FPTYPE * coord,
                             const float rcut,
                             int_64 * key,
                             int * i_idx,
-                            const int MAGIC_NUMBER)
+                            const int MAX_NBOR_SIZE)
 {   
-    // <<<nloc, MAGIC_NUMBER>>>
+    // <<<nloc, MAX_NBOR_SIZE>>>
     const unsigned int idx = blockIdx.x;
     const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
     
@@ -98,7 +98,7 @@ __global__ void format_nlist_fill_a_se_a(const FPTYPE * coord,
     const int * nei_idx = jlist + jrange[i_idx[idx]];
     // dev_copy(nei_idx, &jlist[jrange[i_idx]], nsize);
 
-    int_64 * key_in = key + idx * MAGIC_NUMBER;
+    int_64 * key_in = key + idx * MAX_NBOR_SIZE;
 
     FPTYPE diff[3];
     const int & j_idx = nei_idx[idy];
@@ -121,7 +121,7 @@ __global__ void format_nlist_fill_b_se_a(int * nlist,
                             const int * sec_a,
                             const int sec_a_size,
                             int * nei_iter_dev,
-                            const int MAGIC_NUMBER)
+                            const int MAX_NBOR_SIZE)
 { 
 
     const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x;
@@ -132,13 +132,13 @@ __global__ void format_nlist_fill_b_se_a(int * nlist,
     
     int * row_nlist = nlist + idy * nlist_size;
     int * nei_iter = nei_iter_dev + idy * sec_a_size;
-    int_64 * key_out = key + nloc * MAGIC_NUMBER + idy * MAGIC_NUMBER;
+    int_64 * key_out = key + nloc * MAX_NBOR_SIZE + idy * MAX_NBOR_SIZE;
 
     for (int ii = 0; ii < sec_a_size; ii++) {
         nei_iter[ii] = sec_a[ii];
     }
     
-    for (unsigned int kk = 0; key_out[kk] != key_out[MAGIC_NUMBER - 1]; kk++) {
+    for (unsigned int kk = 0; key_out[kk] != key_out[MAX_NBOR_SIZE - 1]; kk++) {
         const int & nei_type = key_out[kk] / 1E15;
         if (nei_iter[nei_type] < sec_a[nei_type + 1]) {
             row_nlist[nei_iter[nei_type]++] = key_out[kk] % 100000;
@@ -228,73 +228,6 @@ __global__ void compute_descriptor_se_a (FPTYPE* descript,
     }
 }
 
-template<typename FPTYPE>
-void format_nbor_list_256 (
-    const FPTYPE* coord,
-    const int* type,
-    const int* jrange,
-    const int* jlist,
-    const int& nloc,       
-    const float& rcut_r, 
-    int * i_idx, 
-    int_64 * key
-) 
-{   
-    const int LEN = 256;
-    const int MAGIC_NUMBER = 256;
-    const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN;
-    dim3 block_grid(nloc, nblock);
-    dim3 thread_grid(1, LEN);
-    format_nlist_fill_a_se_a
-    <<<block_grid, thread_grid>>> (
-        coord,
-        type,
-        jrange,
-        jlist,
-        rcut_r,
-        key,
-        i_idx,
-        MAGIC_NUMBER
-    );
-    const int ITEMS_PER_THREAD = 4;
-    const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD;
-    // BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
-    BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAGIC_NUMBER);
-}
-
-template<typename FPTYPE>
-void format_nbor_list_512 (
-    const FPTYPE* coord,
-    const int* type,
-    const int* jrange,
-    const int* jlist,
-    const int& nloc,       
-    const float& rcut_r, 
-    int * i_idx, 
-    int_64 * key
-) 
-{   
-    const int LEN = 256;
-    const int MAGIC_NUMBER = 512;
-    const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN;
-    dim3 block_grid(nloc, nblock);
-    dim3 thread_grid(1, LEN);
-    format_nlist_fill_a_se_a
-    <<<block_grid, thread_grid>>> (
-        coord,
-        type,
-        jrange,
-        jlist,
-        rcut_r,
-        key,
-        i_idx,
-        MAGIC_NUMBER
-    );
-    const int ITEMS_PER_THREAD = 4;
-    const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD;
-    // BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
-    BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAGIC_NUMBER);
-}
 
 template<typename FPTYPE>
 void format_nbor_list_1024 (
@@ -309,8 +242,8 @@ void format_nbor_list_1024 (
 ) 
 {   
     const int LEN = 256;
-    const int MAGIC_NUMBER = 1024;
-    const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN;
+    const int MAX_NBOR_SIZE = 1024;
+    const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN;
     dim3 block_grid(nloc, nblock);
     dim3 thread_grid(1, LEN);
     format_nlist_fill_a_se_a
@@ -322,12 +255,12 @@ void format_nbor_list_1024 (
         rcut_r,
         key,
         i_idx,
-        MAGIC_NUMBER
+        MAX_NBOR_SIZE
     );
     const int ITEMS_PER_THREAD = 8;
-    const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD;
+    const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
     // BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
-    BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAGIC_NUMBER);
+    BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAX_NBOR_SIZE);
 }
 
 template<typename FPTYPE>
@@ -343,8 +276,8 @@ void format_nbor_list_2048 (
 ) 
 {   
     const int LEN = 256;
-    const int MAGIC_NUMBER = 2048;
-    const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN;
+    const int MAX_NBOR_SIZE = 2048;
+    const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN;
     dim3 block_grid(nloc, nblock);
     dim3 thread_grid(1, LEN);
     format_nlist_fill_a_se_a
@@ -356,12 +289,12 @@ void format_nbor_list_2048 (
         rcut_r,
         key,
         i_idx,
-        MAGIC_NUMBER
+        MAX_NBOR_SIZE
     );
     const int ITEMS_PER_THREAD = 8;
-    const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD;
+    const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
     // BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
-    BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAGIC_NUMBER);
+    BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAX_NBOR_SIZE);
 }
 
 template<typename FPTYPE>
@@ -377,8 +310,8 @@ void format_nbor_list_4096 (
 ) 
 {   
     const int LEN = 256;
-    const int MAGIC_NUMBER = 4096;
-    const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN;
+    const int MAX_NBOR_SIZE = 4096;
+    const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN;
     dim3 block_grid(nloc, nblock);
     dim3 thread_grid(1, LEN);
     format_nlist_fill_a_se_a
@@ -390,16 +323,16 @@ void format_nbor_list_4096 (
         rcut_r,
         key,
         i_idx,
-        MAGIC_NUMBER
+        MAX_NBOR_SIZE
     );
     const int ITEMS_PER_THREAD = 16;
-    const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD;
+    const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
     // BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
-    BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAGIC_NUMBER);
+    BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAX_NBOR_SIZE);
 }
 
 template <typename FPTYPE>
-void DescrptSeAGPUExecuteFunctor<FPTYPE>::operator()(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descript, FPTYPE * descript_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector<int> sec_a, const bool fill_nei_a, const int MAGIC_NUMBER) {
+void DescrptSeAGPUExecuteFunctor<FPTYPE>::operator()(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descript, FPTYPE * descript_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector<int> sec_a, const bool fill_nei_a, const int max_nbor_size) {
     const int LEN = 256;
     int nblock = (nloc + LEN -1) / LEN;
     int * sec_a_dev = array_int;
@@ -409,7 +342,7 @@ void DescrptSeAGPUExecuteFunctor<FPTYPE>::operator()(const FPTYPE * coord, const
     
     cudaError_t res = cudaSuccess;
     res = cudaMemcpy(sec_a_dev, &sec_a[0], sizeof(int) * sec_a.size(), cudaMemcpyHostToDevice); cudaErrcheck(res);    
-    res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * MAGIC_NUMBER); cudaErrcheck(res);
+    res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * max_nbor_size); cudaErrcheck(res);
     res = cudaMemset(nlist, -1, sizeof(int) * nloc * nnei); cudaErrcheck(res);
     res = cudaMemset(descript, 0.0, sizeof(FPTYPE) * nloc * ndescrpt); cudaErrcheck(res);
     res = cudaMemset(descript_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3); cudaErrcheck(res);
@@ -419,29 +352,7 @@ void DescrptSeAGPUExecuteFunctor<FPTYPE>::operator()(const FPTYPE * coord, const
         // cudaProfilerStart();
         get_i_idx_se_a<<<nblock, LEN>>> (nloc, ilist, i_idx);
 
-        if (nnei <= 256) {
-            format_nbor_list_256 (
-                coord,
-                type,
-                jrange,
-                jlist,
-                nloc,       
-                rcut_r, 
-                i_idx, 
-                key
-            ); 
-        } else if (nnei <= 512) {
-            format_nbor_list_512 (
-                coord,
-                type,
-                jrange,
-                jlist,
-                nloc,       
-                rcut_r, 
-                i_idx, 
-                key
-            ); 
-        } else if (nnei <= 1024) {
+        if (max_nbor_size <= 1024) {
             format_nbor_list_1024 (
                 coord,
                 type,
@@ -452,7 +363,7 @@ void DescrptSeAGPUExecuteFunctor<FPTYPE>::operator()(const FPTYPE * coord, const
                 i_idx, 
                 key
             ); 
-        } else if (nnei <= 2048) {
+        } else if (max_nbor_size <= 2048) {
             format_nbor_list_2048 (
                 coord,
                 type,
@@ -463,7 +374,7 @@ void DescrptSeAGPUExecuteFunctor<FPTYPE>::operator()(const FPTYPE * coord, const
                 i_idx, 
                 key
             ); 
-        } else if (nnei <= 4096) {
+        } else if (max_nbor_size <= 4096) {
             format_nbor_list_4096 (
                 coord,
                 type,
@@ -486,7 +397,7 @@ void DescrptSeAGPUExecuteFunctor<FPTYPE>::operator()(const FPTYPE * coord, const
                             sec_a_dev,
                             sec_a.size(),
                             nei_iter,
-                            MAGIC_NUMBER
+                            max_nbor_size
         );
     }
 
diff --git a/source/op/cuda/descrpt_se_r.cu b/source/op/cuda/descrpt_se_r.cu
index a65ba5887a..0715f19c5e 100644
--- a/source/op/cuda/descrpt_se_r.cu
+++ b/source/op/cuda/descrpt_se_r.cu
@@ -84,9 +84,9 @@ __global__ void format_nlist_fill_a_se_r(const FPTYPE * coord,
                             const float rcut,
                             int_64 * key,
                             int * i_idx,
-                            const int MAGIC_NUMBER)
+                            const int MAX_NBOR_SIZE)
 {   
-    // <<<nloc, MAGIC_NUMBER>>>
+    // <<<nloc, MAX_NBOR_SIZE>>>
     const unsigned int idx = blockIdx.x;
     const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
     
@@ -98,7 +98,7 @@ __global__ void format_nlist_fill_a_se_r(const FPTYPE * coord,
     const int * nei_idx = jlist + jrange[i_idx[idx]];
     // dev_copy(nei_idx, &jlist[jrange[i_idx]], nsize);
 
-    int_64 * key_in = key + idx * MAGIC_NUMBER;
+    int_64 * key_in = key + idx * MAX_NBOR_SIZE;
 
     FPTYPE diff[3];
     const int & j_idx = nei_idx[idy];
@@ -121,7 +121,7 @@ __global__ void format_nlist_fill_b_se_r(int * nlist,
                             const int * sec_a,
                             const int sec_a_size,
                             int * nei_iter_dev,
-                            const int MAGIC_NUMBER)
+                            const int MAX_NBOR_SIZE)
 { 
 
     const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x;
@@ -132,13 +132,13 @@ __global__ void format_nlist_fill_b_se_r(int * nlist,
     
     int * row_nlist = nlist + idy * nlist_size;
     int * nei_iter = nei_iter_dev + idy * sec_a_size;
-    int_64 * key_out = key + nloc * MAGIC_NUMBER + idy * MAGIC_NUMBER;
+    int_64 * key_out = key + nloc * MAX_NBOR_SIZE + idy * MAX_NBOR_SIZE;
 
     for (int ii = 0; ii < sec_a_size; ii++) {
         nei_iter[ii] = sec_a[ii];
     }
     
-    for (unsigned int kk = 0; key_out[kk] != key_out[MAGIC_NUMBER - 1]; kk++) {
+    for (unsigned int kk = 0; key_out[kk] != key_out[MAX_NBOR_SIZE - 1]; kk++) {
         const int & nei_type = key_out[kk] / 1E15;
         if (nei_iter[nei_type] < sec_a[nei_type + 1]) {
             row_nlist[nei_iter[nei_type]++] = key_out[kk] % 100000;
@@ -210,73 +210,6 @@ __global__ void compute_descriptor_se_r (FPTYPE* descript,
     }
 }
 
-template<typename FPTYPE>
-void format_nbor_list_256 (
-    const FPTYPE* coord,
-    const int* type,
-    const int* jrange,
-    const int* jlist,
-    const int& nloc,       
-    const float& rcut_r, 
-    int * i_idx, 
-    int_64 * key
-) 
-{   
-    const int LEN = 256;
-    const int MAGIC_NUMBER = 256;
-    const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN;
-    dim3 block_grid(nloc, nblock);
-    dim3 thread_grid(1, LEN);
-    format_nlist_fill_a_se_r
-    <<<block_grid, thread_grid>>> (
-        coord,
-        type,
-        jrange,
-        jlist,
-        rcut_r,
-        key,
-        i_idx,
-        MAGIC_NUMBER
-    );
-    const int ITEMS_PER_THREAD = 4;
-    const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD;
-    // BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
-    BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAGIC_NUMBER);
-}
-
-template<typename FPTYPE>
-void format_nbor_list_512 (
-    const FPTYPE* coord,
-    const int* type,
-    const int* jrange,
-    const int* jlist,
-    const int& nloc,       
-    const float& rcut_r, 
-    int * i_idx, 
-    int_64 * key
-) 
-{   
-    const int LEN = 256;
-    const int MAGIC_NUMBER = 512;
-    const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN;
-    dim3 block_grid(nloc, nblock);
-    dim3 thread_grid(1, LEN);
-    format_nlist_fill_a_se_r
-    <<<block_grid, thread_grid>>> (
-        coord,
-        type,
-        jrange,
-        jlist,
-        rcut_r,
-        key,
-        i_idx,
-        MAGIC_NUMBER
-    );
-    const int ITEMS_PER_THREAD = 4;
-    const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD;
-    // BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
-    BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAGIC_NUMBER);
-}
 
 template<typename FPTYPE>
 void format_nbor_list_1024 (
@@ -291,8 +224,8 @@ void format_nbor_list_1024 (
 ) 
 {   
     const int LEN = 256;
-    const int MAGIC_NUMBER = 1024;
-    const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN;
+    const int MAX_NBOR_SIZE = 1024;
+    const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN;
     dim3 block_grid(nloc, nblock);
     dim3 thread_grid(1, LEN);
     format_nlist_fill_a_se_r
@@ -304,12 +237,12 @@ void format_nbor_list_1024 (
         rcut_r,
         key,
         i_idx,
-        MAGIC_NUMBER
+        MAX_NBOR_SIZE
     );
     const int ITEMS_PER_THREAD = 8;
-    const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD;
+    const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
     // BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
-    BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAGIC_NUMBER);
+    BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAX_NBOR_SIZE);
 }
 
 template<typename FPTYPE>
@@ -325,8 +258,8 @@ void format_nbor_list_2048 (
 ) 
 {   
     const int LEN = 256;
-    const int MAGIC_NUMBER = 2048;
-    const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN;
+    const int MAX_NBOR_SIZE = 2048;
+    const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN;
     dim3 block_grid(nloc, nblock);
     dim3 thread_grid(1, LEN);
     format_nlist_fill_a_se_r
@@ -338,12 +271,12 @@ void format_nbor_list_2048 (
         rcut_r,
         key,
         i_idx,
-        MAGIC_NUMBER
+        MAX_NBOR_SIZE
     );
     const int ITEMS_PER_THREAD = 8;
-    const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD;
+    const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
     // BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
-    BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAGIC_NUMBER);
+    BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAX_NBOR_SIZE);
 }
 
 template<typename FPTYPE>
@@ -359,8 +292,8 @@ void format_nbor_list_4096 (
 ) 
 {   
     const int LEN = 256;
-    const int MAGIC_NUMBER = 4096;
-    const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN;
+    const int MAX_NBOR_SIZE = 4096;
+    const int nblock = (MAX_NBOR_SIZE + LEN - 1) / LEN;
     dim3 block_grid(nloc, nblock);
     dim3 thread_grid(1, LEN);
     format_nlist_fill_a_se_r
@@ -372,16 +305,16 @@ void format_nbor_list_4096 (
         rcut_r,
         key,
         i_idx,
-        MAGIC_NUMBER
+        MAX_NBOR_SIZE
     );
     const int ITEMS_PER_THREAD = 16;
-    const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD;
+    const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
     // BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
-    BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAGIC_NUMBER);
+    BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAX_NBOR_SIZE);
 }
 
 template <typename FPTYPE>
-void DescrptSeRGPUExecuteFunctor<FPTYPE>::operator()(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descript, FPTYPE * descript_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector<int> sec_a, const bool fill_nei_a, const int MAGIC_NUMBER) {
+void DescrptSeRGPUExecuteFunctor<FPTYPE>::operator()(const FPTYPE * coord, const int * type, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descript, FPTYPE * descript_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector<int> sec_a, const bool fill_nei_a, const int max_nbor_size) {
     const int LEN = 256;
     int nblock = (nloc + LEN -1) / LEN;
     int * sec_a_dev = array_int;
@@ -391,7 +324,7 @@ void DescrptSeRGPUExecuteFunctor<FPTYPE>::operator()(const FPTYPE * coord, const
     
     cudaError_t res = cudaSuccess;
     res = cudaMemcpy(sec_a_dev, &sec_a[0], sizeof(int) * sec_a.size(), cudaMemcpyHostToDevice); cudaErrcheck(res);    
-    res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * MAGIC_NUMBER); cudaErrcheck(res);
+    res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * max_nbor_size); cudaErrcheck(res);
     res = cudaMemset(nlist, -1, sizeof(int) * nloc * nnei); cudaErrcheck(res);
     res = cudaMemset(descript, 0.0, sizeof(FPTYPE) * nloc * ndescrpt); cudaErrcheck(res);
     res = cudaMemset(descript_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3); cudaErrcheck(res);
@@ -401,29 +334,7 @@ void DescrptSeRGPUExecuteFunctor<FPTYPE>::operator()(const FPTYPE * coord, const
         // cudaProfilerStart();
         get_i_idx_se_r<<<nblock, LEN>>> (nloc, ilist, i_idx);
 
-        if (nnei <= 256) {
-            format_nbor_list_256 (
-                coord,
-                type,
-                jrange,
-                jlist,
-                nloc,       
-                rcut_r, 
-                i_idx, 
-                key
-            ); 
-        } else if (nnei <= 512) {
-            format_nbor_list_512 (
-                coord,
-                type,
-                jrange,
-                jlist,
-                nloc,       
-                rcut_r, 
-                i_idx, 
-                key
-            ); 
-        } else if (nnei <= 1024) {
+        if (max_nbor_size <= 1024) {
             format_nbor_list_1024 (
                 coord,
                 type,
@@ -434,7 +345,7 @@ void DescrptSeRGPUExecuteFunctor<FPTYPE>::operator()(const FPTYPE * coord, const
                 i_idx, 
                 key
             ); 
-        } else if (nnei <= 2048) {
+        } else if (max_nbor_size <= 2048) {
             format_nbor_list_2048 (
                 coord,
                 type,
@@ -445,7 +356,7 @@ void DescrptSeRGPUExecuteFunctor<FPTYPE>::operator()(const FPTYPE * coord, const
                 i_idx, 
                 key
             ); 
-        } else if (nnei <= 4096) {
+        } else if (max_nbor_size <= 4096) {
             format_nbor_list_4096 (
                 coord,
                 type,
@@ -468,7 +379,7 @@ void DescrptSeRGPUExecuteFunctor<FPTYPE>::operator()(const FPTYPE * coord, const
                             sec_a_dev,
                             sec_a.size(),
                             nei_iter,
-                            MAGIC_NUMBER
+                            max_nbor_size
         );
     }
 
diff --git a/source/op/descrpt_se_a_multi_device.cc b/source/op/descrpt_se_a_multi_device.cc
index ae5e623171..93e2cdccac 100644
--- a/source/op/descrpt_se_a_multi_device.cc
+++ b/source/op/descrpt_se_a_multi_device.cc
@@ -34,13 +34,13 @@ struct DeviceFunctor {
 
 template <typename FPTYPE>
 struct DescrptSeAFunctor {
-    void operator()(const CPUDevice& d, const FPTYPE * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector<int> sec_a, const bool fill_nei_a, const int magic_number) {
-        DescrptSeACPULauncher(coord, type, ilist, jrange, jlist, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ntypes, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number);
+    void operator()(const CPUDevice& d, const FPTYPE * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector<int> sec_a, const bool fill_nei_a, const int max_nbor_size) {
+        DescrptSeACPULauncher(coord, type, ilist, jrange, jlist, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ntypes, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, max_nbor_size);
     }
 
     #if GOOGLE_CUDA
-    void operator()(const GPUDevice& d, const FPTYPE * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector<int> sec_a, const bool fill_nei_a, const int magic_number) {
-        DescrptSeAGPULauncher(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number);
+    void operator()(const GPUDevice& d, const FPTYPE * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const FPTYPE * avg, const FPTYPE * std, FPTYPE * descrpt, FPTYPE * descrpt_deriv, FPTYPE * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector<int> sec_a, const bool fill_nei_a, const int max_nbor_size) {
+        DescrptSeAGPULauncher(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, max_nbor_size);
     }
     #endif // GOOGLE_CUDA 
 };
@@ -66,7 +66,7 @@ class DescrptSeAOp : public OpKernel {
         nnei_r = sec_r.back();
         nnei = nnei_a + nnei_r;
         fill_nei_a = (rcut_a < 0);
-        magic_number = get_magic_number(nnei);
+        max_nbor_size = 1024;
     }
 
     void Compute(OpKernelContext* context) override {
@@ -117,7 +117,6 @@ class DescrptSeAOp : public OpKernel {
         
         OP_REQUIRES (context, (ntypes == int(sel_a.size())),	errors::InvalidArgument ("number of types should match the length of sel array"));
         OP_REQUIRES (context, (ntypes == int(sel_r.size())),	errors::InvalidArgument ("number of types should match the length of sel array"));
-        OP_REQUIRES (context, (nnei <= 4096),	                errors::InvalidArgument ("Assert failed, max neighbor size of atom(nnei) " + std::to_string(nnei) + " is larger than 4096, which currently is not supported by deepmd-kit."));
 
         // Create output tensors
         TensorShape descrpt_shape ;
@@ -159,13 +158,14 @@ class DescrptSeAOp : public OpKernel {
             OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp));
             Tensor uint64_temp;
             TensorShape uint64_shape;
-            uint64_shape.AddDim(nloc * magic_number * 2);
+            uint64_shape.AddDim(nloc * max_nbor_size * 2);
             OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp));
 
             array_int = int_temp.flat<int>().data(); 
             array_longlong = uint64_temp.flat<unsigned long long>().data();
 
             nbor_update(mesh_tensor.flat<int>().data(), static_cast<int>(mesh_tensor.NumElements()));
+            OP_REQUIRES (context, (max_nbor_size <= 4096),	                errors::InvalidArgument ("Assert failed, max neighbor size of atom(lammps) " + std::to_string(max_nbor_size) + " is larger than 4096, which currently is not supported by deepmd-kit."));
         }
         else if (device == "CPU") {
             memcpy (&ilist,  4  + mesh_tensor.flat<int>().data(), sizeof(int *));
@@ -198,7 +198,7 @@ class DescrptSeAOp : public OpKernel {
             rcut_r_smth,
             sec_a,
             fill_nei_a,
-            magic_number
+            max_nbor_size
         );
     }
 
@@ -212,7 +212,7 @@ class DescrptSeAOp : public OpKernel {
     std::vector<int> sec_a;
     std::vector<int> sec_r;
     int ndescrpt, ndescrpt_a, ndescrpt_r;
-    int nnei, nnei_a, nnei_r, nloc, nall, magic_number;
+    int nnei, nnei_a, nnei_r, nloc, nall, max_nbor_size;
     bool fill_nei_a;
 
     //private func
@@ -266,27 +266,15 @@ class DescrptSeAOp : public OpKernel {
             cudaErrcheck(cudaMemcpy(ilist,  ilist_host,  sizeof(int) * mesh_host[1], cudaMemcpyHostToDevice));
             cudaErrcheck(cudaMemcpy(jrange, jrange_host, sizeof(int) * mesh_host[2], cudaMemcpyHostToDevice));
             cudaErrcheck(cudaMemcpy(jlist,  jlist_host,  sizeof(int) * mesh_host[3], cudaMemcpyHostToDevice));
+            
+            max_nbor_size = 1024;
+            for(int ii = 0; ii < mesh_host[2]; ii++) {
+                max_nbor_size = (jrange_host[ii + 1] - jrange_host[ii]) > max_nbor_size ? (jrange_host[ii + 1] - jrange_host[ii]) : max_nbor_size;
+            }
         }
         delete [] mesh_host;
     }
 
-    int get_magic_number(int const nnei) {
-        if (nnei <= 256) {
-            return 256;
-        }
-        else if (nnei <= 512) {
-            return 512;
-        }
-        else if (nnei <= 1024) {
-            return 1024;
-        }
-        else if (nnei <= 2048) {
-            return 2048;
-        }
-        else if (nnei <= 4096) {
-            return 4096;
-        }
-    }
 };
 
 // Register the CPU kernels.
diff --git a/source/op/descrpt_se_r_multi_device.cc b/source/op/descrpt_se_r_multi_device.cc
index c5eaff616c..b94f97d6e1 100644
--- a/source/op/descrpt_se_r_multi_device.cc
+++ b/source/op/descrpt_se_r_multi_device.cc
@@ -31,13 +31,13 @@ struct DeviceFunctor {
 
 template <typename T>
 struct DescrptSeRFunctor {
-    void operator()(const CPUDevice& d, const T * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector<int> sec_a, const bool fill_nei_a, const int magic_number) {
-        DescrptSeRCPULauncher(coord, type, ilist, jrange, jlist, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ntypes, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number);
+    void operator()(const CPUDevice& d, const T * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector<int> sec_a, const bool fill_nei_a, const int max_nbor_size) {
+        DescrptSeRCPULauncher(coord, type, ilist, jrange, jlist, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ntypes, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, max_nbor_size);
     }
 
     #if GOOGLE_CUDA
-    void operator()(const GPUDevice& d, const T * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector<int> sec_a, const bool fill_nei_a, const int magic_number) {
-        DescrptSeRGPULauncher(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, magic_number);
+    void operator()(const GPUDevice& d, const T * coord, const int * type, const int * mesh, const int * ilist, const int * jrange, const int * jlist, int * array_int, unsigned long long * array_longlong, const T * avg, const T * std, T * descrpt, T * descrpt_deriv, T * rij, int * nlist, const int nloc, const int nall, const int nnei, const int ntypes, const int ndescrpt, const float rcut_r, const float rcut_r_smth, const std::vector<int> sec_a, const bool fill_nei_a, const int max_nbor_size) {
+        DescrptSeRGPULauncher(coord, type, ilist, jrange, jlist, array_int, array_longlong, avg, std, descrpt, descrpt_deriv, rij, nlist, nloc, nall, nnei, ndescrpt, rcut_r, rcut_r_smth, sec_a, fill_nei_a, max_nbor_size);
     }
     #endif // GOOGLE_CUDA 
 };
@@ -55,9 +55,7 @@ class DescrptSeROp : public OpKernel {
         ndescrpt = sec.back() * 1;
         nnei = sec.back();
         fill_nei_a = true;
-        magic_number = get_magic_number(nnei);
-        // count_nei_idx_overflow = 0;
-        // std::cout << "I'm in descrpt_se_r_gpu.cc" << std::endl;
+        max_nbor_size = 1024;
     }
 
     void Compute(OpKernelContext* context) override {
@@ -149,13 +147,14 @@ class DescrptSeROp : public OpKernel {
             OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp));
             Tensor uint64_temp;
             TensorShape uint64_shape;
-            uint64_shape.AddDim(nloc * magic_number * 2);
+            uint64_shape.AddDim(nloc * max_nbor_size * 2);
             OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp));
 
             array_int = int_temp.flat<int>().data(); 
             array_longlong = uint64_temp.flat<unsigned long long>().data();
 
             nbor_update(mesh_tensor.flat<int>().data(), static_cast<int>(mesh_tensor.NumElements()));
+            OP_REQUIRES (context, (max_nbor_size <= 4096),	                errors::InvalidArgument ("Assert failed, max neighbor size of atom(lammps) " + std::to_string(max_nbor_size) + " is larger than 4096, which currently is not supported by deepmd-kit."));
         }
         else if (device == "CPU") {
             memcpy (&ilist,  4  + mesh_tensor.flat<int>().data(), sizeof(int *));
@@ -188,7 +187,7 @@ class DescrptSeROp : public OpKernel {
             rcut_smth,
             sec,
             fill_nei_a,
-            magic_number
+            max_nbor_size
         );
     }
 
@@ -213,7 +212,7 @@ class DescrptSeROp : public OpKernel {
         }
     }
 
-    int magic_number;
+    int max_nbor_size;
     std::string device;
     int *array_int;
     unsigned long long*array_longlong;
@@ -256,27 +255,15 @@ class DescrptSeROp : public OpKernel {
             cudaErrcheck(cudaMemcpy(ilist,  ilist_host,  sizeof(int) * mesh_host[1], cudaMemcpyHostToDevice));
             cudaErrcheck(cudaMemcpy(jrange, jrange_host, sizeof(int) * mesh_host[2], cudaMemcpyHostToDevice));
             cudaErrcheck(cudaMemcpy(jlist,  jlist_host,  sizeof(int) * mesh_host[3], cudaMemcpyHostToDevice));
+
+            max_nbor_size = 1024;
+            for(int ii = 0; ii < mesh_host[2]; ii++) {
+                max_nbor_size = (jrange_host[ii + 1] - jrange_host[ii]) > max_nbor_size ? (jrange_host[ii + 1] - jrange_host[ii]) : max_nbor_size;
+            }
         }
         delete [] mesh_host;
     }
 
-    int get_magic_number(int const nnei) {
-        if (nnei <= 256) {
-            return 256;
-        }
-        else if (nnei <= 512) {
-            return 512;
-        }
-        else if (nnei <= 1024) {
-            return 1024;
-        }
-        else if (nnei <= 2048) {
-            return 2048;
-        }
-        else if (nnei <= 4096) {
-            return 4096;
-        }
-    }
 };
 
 // Register the CPU kernels.