diff --git a/source/lib/src/NNPInter.cc b/source/lib/src/NNPInter.cc
index 58042019c7..f4f39945ff 100644
--- a/source/lib/src/NNPInter.cc
+++ b/source/lib/src/NNPInter.cc
@@ -3,7 +3,7 @@
 #include "SimulationRegion.h"
 #include <stdexcept>	
 
-#define MAGIC_NUMBER 256
+#define MAGIC_NUMBER 1024
 
 #ifdef  USE_CUDA_TOOLKIT
 #include "cuda_runtime.h"
diff --git a/source/op/cuda/descrpt_se_a.cu b/source/op/cuda/descrpt_se_a.cu
index 39434306f2..4b309522fa 100644
--- a/source/op/cuda/descrpt_se_a.cu
+++ b/source/op/cuda/descrpt_se_a.cu
@@ -18,7 +18,7 @@ limitations under the License.
 #include <cub/block/block_radix_sort.cuh>
 #include <cuda_runtime.h>
 
-#define MAGIC_NUMBER 256
+#define MAGIC_NUMBER 1024
 
 #ifdef HIGH_PREC
     typedef double  VALUETYPE;
@@ -326,7 +326,7 @@ void DescrptSeALauncher(const VALUETYPE* coord,
                             i_idx
         );
         const int ITEMS_PER_THREAD = 4;
-        const int BLOCK_THREADS = 64;
+        const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD;
         // BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
         BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (key, key + nloc * MAGIC_NUMBER);
         
diff --git a/source/op/descrpt_se_a_gpu.cc b/source/op/descrpt_se_a_gpu.cc
index 93c83016fb..70dd9c7751 100644
--- a/source/op/descrpt_se_a_gpu.cc
+++ b/source/op/descrpt_se_a_gpu.cc
@@ -7,7 +7,6 @@
 #include "tensorflow/core/framework/shape_inference.h"
 
 using namespace tensorflow;  // NOLINT(build/namespaces)
-#define MAGIC_NUMBER 256
 
 #ifdef HIGH_PREC
     typedef double VALUETYPE ;
@@ -159,7 +158,8 @@ class DescrptSeAOp : public OpKernel {
         
         OP_REQUIRES (context, (ntypes == int(sel_a.size())),	errors::InvalidArgument ("number of types should match the length of sel array"));
         OP_REQUIRES (context, (ntypes == int(sel_r.size())),	errors::InvalidArgument ("number of types should match the length of sel array"));
-        
+        OP_REQUIRES (context, (nnei <= 1024),	                errors::InvalidArgument ("Assert failed, max neighbor size of atom(nnei) " + std::to_string(nnei) + " is larger than 1024!, which currently is not supported by deepmd-kit."));
+
         // Create output tensors
         TensorShape descrpt_shape ;
         descrpt_shape.AddDim (nsamples);
@@ -201,7 +201,6 @@ class DescrptSeAOp : public OpKernel {
         cudaErrcheck(cudaMemcpy(&(array_longlong), 20 + mesh_tensor.flat<int>().data(), sizeof(unsigned long long *), cudaMemcpyDeviceToHost));
         cudaErrcheck(cudaMemcpy(&(array_double), 24 + mesh_tensor.flat<int>().data(), sizeof(compute_t *), cudaMemcpyDeviceToHost));
 
-        // cudaErrcheck(cudaMemcpy(jlist, host_jlist, sizeof(int) * nloc * MAGIC_NUMBER, cudaMemcpyHostToDevice));
         // Launch computation
         for (int II = 0; II < nsamples; II++) {
             DescrptSeALauncher(coord_tensor.matrix<VALUETYPE>().data() + II * (nall * 3),    // related to the kk argument
diff --git a/source/train/CMakeLists.txt b/source/train/CMakeLists.txt
index 8be6b6c819..1875d2097c 100644
--- a/source/train/CMakeLists.txt
+++ b/source/train/CMakeLists.txt
@@ -2,7 +2,7 @@
 
 configure_file("RunOptions.py.in" "${CMAKE_CURRENT_BINARY_DIR}/RunOptions.py" @ONLY)
 
-file(GLOB LIB_PY main.py common.py env.py compat.py calculator.py Network.py Deep*.py Data.py DataSystem.py Model*.py Descrpt*.py Fitting.py Loss.py LearningRate.py Trainer.py TabInter.py EwaldRecp.py DataModifier.py ${CMAKE_CURRENT_BINARY_DIR}/RunOptions.py)
+file(GLOB LIB_PY main.py common.py env.py compat.py calculator.py Network.py Deep*.py Data.py DataSystem.py Model*.py Descrpt*.py Fitting.py Loss.py LearningRate.py Trainer.py TabInter.py EwaldRecp.py DataModifier.py ${CMAKE_CURRENT_BINARY_DIR}/RunOptions.py transform.py)
 
 file(GLOB CLS_PY  Local.py Slurm.py)