From 7328be08982fbbbe12725ee2121a1899f206f2ce Mon Sep 17 00:00:00 2001 From: denghuilu Date: Wed, 3 Mar 2021 13:54:24 +0800 Subject: [PATCH] fix bug of max_nbor_size usage --- source/op/descrpt_se_a_multi_device.cc | 19 +++++++++++++++---- source/op/descrpt_se_r_multi_device.cc | 18 +++++++++++++++--- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/source/op/descrpt_se_a_multi_device.cc b/source/op/descrpt_se_a_multi_device.cc index 93e2cdccac..141b2d89bc 100644 --- a/source/op/descrpt_se_a_multi_device.cc +++ b/source/op/descrpt_se_a_multi_device.cc @@ -21,6 +21,8 @@ REGISTER_OP("DescrptSeA") .Output("nlist: int32"); // only sel_a and rcut_r uesd. +#define GPU_MAX_NBOR_SIZE 4096 + struct DeviceFunctor { void operator()(const CPUDevice& d, std::string& device) { device = "CPU"; @@ -158,14 +160,14 @@ class DescrptSeAOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); Tensor uint64_temp; TensorShape uint64_shape; - uint64_shape.AddDim(nloc * max_nbor_size * 2); + uint64_shape.AddDim(nloc * GPU_MAX_NBOR_SIZE * 2); OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp)); array_int = int_temp.flat().data(); array_longlong = uint64_temp.flat().data(); nbor_update(mesh_tensor.flat().data(), static_cast(mesh_tensor.NumElements())); - OP_REQUIRES (context, (max_nbor_size <= 4096), errors::InvalidArgument ("Assert failed, max neighbor size of atom(lammps) " + std::to_string(max_nbor_size) + " is larger than 4096, which currently is not supported by deepmd-kit.")); + OP_REQUIRES (context, (max_nbor_size <= GPU_MAX_NBOR_SIZE), errors::InvalidArgument ("Assert failed, max neighbor size of atom(lammps) " + std::to_string(max_nbor_size) + " is larger than 4096, which currently is not supported by deepmd-kit.")); } else if (device == "CPU") { memcpy (&ilist, 4 + mesh_tensor.flat().data(), sizeof(int *)); @@ -267,14 +269,23 @@ class DescrptSeAOp : public OpKernel { cudaErrcheck(cudaMemcpy(jrange, jrange_host, sizeof(int) * mesh_host[2], cudaMemcpyHostToDevice)); cudaErrcheck(cudaMemcpy(jlist, jlist_host, sizeof(int) * mesh_host[3], cudaMemcpyHostToDevice)); - max_nbor_size = 1024; + max_nbor_size = 0; for(int ii = 0; ii < mesh_host[2]; ii++) { max_nbor_size = (jrange_host[ii + 1] - jrange_host[ii]) > max_nbor_size ? (jrange_host[ii + 1] - jrange_host[ii]) : max_nbor_size; } + assert(max_nbor_size <= GPU_MAX_NBOR_SIZE); + if (max_nbor_size <= 1024) { + max_nbor_size = 1024; + } + else if (max_nbor_size <= 2048) { + max_nbor_size = 2048; + } + else { + max_nbor_size = 4096; + } } delete [] mesh_host; } - }; // Register the CPU kernels. diff --git a/source/op/descrpt_se_r_multi_device.cc b/source/op/descrpt_se_r_multi_device.cc index b94f97d6e1..c355e34f12 100644 --- a/source/op/descrpt_se_r_multi_device.cc +++ b/source/op/descrpt_se_r_multi_device.cc @@ -18,6 +18,8 @@ REGISTER_OP("DescrptSeR") .Output("rij: T") .Output("nlist: int32"); +#define GPU_MAX_NBOR_SIZE 4096 + struct DeviceFunctor { void operator()(const CPUDevice& d, std::string& device) { device = "CPU"; @@ -147,14 +149,14 @@ class DescrptSeROp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); Tensor uint64_temp; TensorShape uint64_shape; - uint64_shape.AddDim(nloc * max_nbor_size * 2); + uint64_shape.AddDim(nloc * GPU_MAX_NBOR_SIZE * 2); OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp)); array_int = int_temp.flat().data(); array_longlong = uint64_temp.flat().data(); nbor_update(mesh_tensor.flat().data(), static_cast(mesh_tensor.NumElements())); - OP_REQUIRES (context, (max_nbor_size <= 4096), errors::InvalidArgument ("Assert failed, max neighbor size of atom(lammps) " + std::to_string(max_nbor_size) + " is larger than 4096, which currently is not supported by deepmd-kit.")); + OP_REQUIRES (context, (max_nbor_size <= GPU_MAX_NBOR_SIZE), errors::InvalidArgument ("Assert failed, max neighbor size of atom(lammps) " + std::to_string(max_nbor_size) + " is larger than 4096, which currently is not supported by deepmd-kit.")); } else if (device == "CPU") { memcpy (&ilist, 4 + mesh_tensor.flat().data(), sizeof(int *)); @@ -256,10 +258,20 @@ class DescrptSeROp : public OpKernel { cudaErrcheck(cudaMemcpy(jrange, jrange_host, sizeof(int) * mesh_host[2], cudaMemcpyHostToDevice)); cudaErrcheck(cudaMemcpy(jlist, jlist_host, sizeof(int) * mesh_host[3], cudaMemcpyHostToDevice)); - max_nbor_size = 1024; + max_nbor_size = 0; for(int ii = 0; ii < mesh_host[2]; ii++) { max_nbor_size = (jrange_host[ii + 1] - jrange_host[ii]) > max_nbor_size ? (jrange_host[ii + 1] - jrange_host[ii]) : max_nbor_size; } + assert(max_nbor_size <= GPU_MAX_NBOR_SIZE); + if (max_nbor_size <= 1024) { + max_nbor_size = 1024; + } + else if (max_nbor_size <= 2048) { + max_nbor_size = 2048; + } + else { + max_nbor_size = 4096; + } } delete [] mesh_host; }