From 7328be08982fbbbe12725ee2121a1899f206f2ce Mon Sep 17 00:00:00 2001
From: denghuilu <denghuilu@pku.edu.cn>
Date: Wed, 3 Mar 2021 13:54:24 +0800
Subject: [PATCH] Fix incorrect usage of max_nbor_size

---
 source/op/descrpt_se_a_multi_device.cc | 19 +++++++++++++++----
 source/op/descrpt_se_r_multi_device.cc | 18 +++++++++++++++---
 2 files changed, 30 insertions(+), 7 deletions(-)
diff --git a/source/op/descrpt_se_a_multi_device.cc b/source/op/descrpt_se_a_multi_device.cc
index 93e2cdccac..141b2d89bc 100644
--- a/source/op/descrpt_se_a_multi_device.cc
+++ b/source/op/descrpt_se_a_multi_device.cc
@@ -21,6 +21,8 @@ REGISTER_OP("DescrptSeA")
     .Output("nlist: int32");
     // only sel_a and rcut_r uesd.
 
+#define GPU_MAX_NBOR_SIZE 4096
+
 struct DeviceFunctor {
     void operator()(const CPUDevice& d, std::string& device) {
         device = "CPU";
@@ -158,14 +160,14 @@ class DescrptSeAOp : public OpKernel {
             OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp));
             Tensor uint64_temp;
             TensorShape uint64_shape;
-            uint64_shape.AddDim(nloc * max_nbor_size * 2);
+            uint64_shape.AddDim(nloc * GPU_MAX_NBOR_SIZE * 2);
             OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp));
 
             array_int = int_temp.flat<int>().data(); 
             array_longlong = uint64_temp.flat<unsigned long long>().data();
 
             nbor_update(mesh_tensor.flat<int>().data(), static_cast<int>(mesh_tensor.NumElements()));
-            OP_REQUIRES (context, (max_nbor_size <= 4096),	                errors::InvalidArgument ("Assert failed, max neighbor size of atom(lammps) " + std::to_string(max_nbor_size) + " is larger than 4096, which currently is not supported by deepmd-kit."));
+            OP_REQUIRES (context, (max_nbor_size <= GPU_MAX_NBOR_SIZE),	                errors::InvalidArgument ("Assert failed, max neighbor size of atom(lammps) " + std::to_string(max_nbor_size) + " is larger than 4096, which currently is not supported by deepmd-kit."));
         }
         else if (device == "CPU") {
             memcpy (&ilist,  4  + mesh_tensor.flat<int>().data(), sizeof(int *));
@@ -267,14 +269,23 @@ class DescrptSeAOp : public OpKernel {
             cudaErrcheck(cudaMemcpy(jrange, jrange_host, sizeof(int) * mesh_host[2], cudaMemcpyHostToDevice));
             cudaErrcheck(cudaMemcpy(jlist,  jlist_host,  sizeof(int) * mesh_host[3], cudaMemcpyHostToDevice));
             
-            max_nbor_size = 1024;
+            max_nbor_size = 0;
             for(int ii = 0; ii < mesh_host[2]; ii++) {
                 max_nbor_size = (jrange_host[ii + 1] - jrange_host[ii]) > max_nbor_size ? (jrange_host[ii + 1] - jrange_host[ii]) : max_nbor_size;
             }
+            assert(max_nbor_size <= GPU_MAX_NBOR_SIZE);
+            if (max_nbor_size <= 1024) {
+                max_nbor_size = 1024;
+            }
+            else if (max_nbor_size <= 2048) {
+                max_nbor_size = 2048;
+            }
+            else {
+                max_nbor_size = 4096;
+            }
         }
         delete [] mesh_host;
     }
-
 };
 
 // Register the CPU kernels.
diff --git a/source/op/descrpt_se_r_multi_device.cc b/source/op/descrpt_se_r_multi_device.cc
index b94f97d6e1..c355e34f12 100644
--- a/source/op/descrpt_se_r_multi_device.cc
+++ b/source/op/descrpt_se_r_multi_device.cc
@@ -18,6 +18,8 @@ REGISTER_OP("DescrptSeR")
     .Output("rij: T")
     .Output("nlist: int32");
 
+#define GPU_MAX_NBOR_SIZE 4096
+
 struct DeviceFunctor {
     void operator()(const CPUDevice& d, std::string& device) {
         device = "CPU";
@@ -147,14 +149,14 @@ class DescrptSeROp : public OpKernel {
             OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp));
             Tensor uint64_temp;
             TensorShape uint64_shape;
-            uint64_shape.AddDim(nloc * max_nbor_size * 2);
+            uint64_shape.AddDim(nloc * GPU_MAX_NBOR_SIZE * 2);
             OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp));
 
             array_int = int_temp.flat<int>().data(); 
             array_longlong = uint64_temp.flat<unsigned long long>().data();
 
             nbor_update(mesh_tensor.flat<int>().data(), static_cast<int>(mesh_tensor.NumElements()));
-            OP_REQUIRES (context, (max_nbor_size <= 4096),	                errors::InvalidArgument ("Assert failed, max neighbor size of atom(lammps) " + std::to_string(max_nbor_size) + " is larger than 4096, which currently is not supported by deepmd-kit."));
+            OP_REQUIRES (context, (max_nbor_size <= GPU_MAX_NBOR_SIZE),	                errors::InvalidArgument ("Assert failed, max neighbor size of atom(lammps) " + std::to_string(max_nbor_size) + " is larger than 4096, which currently is not supported by deepmd-kit."));
         }
         else if (device == "CPU") {
             memcpy (&ilist,  4  + mesh_tensor.flat<int>().data(), sizeof(int *));
@@ -256,10 +258,20 @@ class DescrptSeROp : public OpKernel {
             cudaErrcheck(cudaMemcpy(jrange, jrange_host, sizeof(int) * mesh_host[2], cudaMemcpyHostToDevice));
             cudaErrcheck(cudaMemcpy(jlist,  jlist_host,  sizeof(int) * mesh_host[3], cudaMemcpyHostToDevice));
 
-            max_nbor_size = 1024;
+            max_nbor_size = 0;
             for(int ii = 0; ii < mesh_host[2]; ii++) {
                 max_nbor_size = (jrange_host[ii + 1] - jrange_host[ii]) > max_nbor_size ? (jrange_host[ii + 1] - jrange_host[ii]) : max_nbor_size;
             }
+            assert(max_nbor_size <= GPU_MAX_NBOR_SIZE);
+            if (max_nbor_size <= 1024) {
+                max_nbor_size = 1024;
+            }
+            else if (max_nbor_size <= 2048) {
+                max_nbor_size = 2048;
+            }
+            else {
+                max_nbor_size = 4096;
+            }
         }
         delete [] mesh_host;
     }