From d42968f193a8796847febcdb119ed9121c1f0dcf Mon Sep 17 00:00:00 2001
From: denghuilu <denghuilu@pku.edu.cn>
Date: Tue, 20 Apr 2021 22:34:45 +0800
Subject: [PATCH 1/2] use bit operations to construct nbor sort keys

---
 source/lib/include/device.h            |   2 +-
 source/lib/include/fmt_nlist.h         |  16 +++
 source/lib/include/gpu_cuda.h          |   8 ++
 source/lib/src/cuda/prod_env_mat.cu    | 144 +++++++++++++------------
 source/lib/tests/test_env_mat_a.cc     |   4 +-
 source/lib/tests/test_env_mat_r.cc     |   4 +-
 source/lib/tests/test_fmt_nlist.cc     | 121 ++++++++++++++++++++-
 source/op/prod_env_mat_multi_device.cc |   2 +-
 8 files changed, 226 insertions(+), 75 deletions(-)
diff --git a/source/lib/include/device.h b/source/lib/include/device.h
index dfd12a4f27..f0e36ae73d 100644
--- a/source/lib/include/device.h
+++ b/source/lib/include/device.h
@@ -6,7 +6,7 @@
 
 #define TPB 256
 #define SQRT_2_PI 0.7978845608028654 
-typedef unsigned long long int_64;
+typedef unsigned long long uint_64;
 
 #if GOOGLE_CUDA
 #include "gpu_cuda.h"
diff --git a/source/lib/include/fmt_nlist.h b/source/lib/include/fmt_nlist.h
index 13c9082240..cc3adf43c5 100644
--- a/source/lib/include/fmt_nlist.h
+++ b/source/lib/include/fmt_nlist.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <vector>
+#include "device.h"
 #include "neighbor_list.h"
 
 namespace deepmd{
@@ -16,6 +17,21 @@ void format_nlist_cpu(
     const float rcut, 
     const std::vector<int> sec);
 
+#if GOOGLE_CUDA
+template <typename FPTYPE>
+void format_nbor_list_gpu_cuda(    
+    int * nlist, 
+    const FPTYPE * coord, 
+    const int * type, 
+    const deepmd::InputNlist & gpu_inlist,
+    int * array_int,
+    uint_64 * array_longlong,
+    const int max_nbor_size,
+    const int nloc, 
+    const int nall, 
+    const float rcut, 
+    const std::vector<int> sec);
+#endif //GOOGLE_CUDA
 }
 
 
diff --git a/source/lib/include/gpu_cuda.h b/source/lib/include/gpu_cuda.h
index a9988b1caa..904ba5fafa 100644
--- a/source/lib/include/gpu_cuda.h
+++ b/source/lib/include/gpu_cuda.h
@@ -13,6 +13,14 @@ inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=
   }
 }
 
+#define nborErrcheck(res) {nborAssert((res), __FILE__, __LINE__);}
+inline void nborAssert(cudaError_t code, const char *file, int line, bool abort=true) {
+    if (code != cudaSuccess) {
+        fprintf(stderr,"cuda assert: %s %s %d\n", "DeePMD-kit:\tillegal nbor list sorting", file, line);
+        if (abort) exit(code);
+    }
+}
+
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
 static __inline__ __device__ double atomicAdd(
     double* address, 
diff --git a/source/lib/src/cuda/prod_env_mat.cu b/source/lib/src/cuda/prod_env_mat.cu
index 807b2c37de..bfb15dd785 100644
--- a/source/lib/src/cuda/prod_env_mat.cu
+++ b/source/lib/src/cuda/prod_env_mat.cu
@@ -1,4 +1,5 @@
 #include "gpu_cuda.h"
+#include "fmt_nlist.h"
 #include "prod_env_mat.h"
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_store.cuh>
@@ -86,7 +87,7 @@ __global__ void get_i_idx(
 
 template<typename FPTYPE>
 __global__ void format_nlist_fill_a(
-    int_64 * key,
+    uint_64 * key,
     const FPTYPE * coord,
     const int * type,
     const int * numneigh,
@@ -106,15 +107,18 @@ __global__ void format_nlist_fill_a(
 
   const int * nei_idx = firstneigh[i_idx[idx]];
   // dev_copy(nei_idx, &jlist[jrange[i_idx]], nsize);
-  int_64 * key_in = key + idx * MAX_NBOR_SIZE;
+  uint_64 * key_in = key + idx * MAX_NBOR_SIZE;
   FPTYPE diff[3];
   const int & j_idx = nei_idx[idy];
   for (int dd = 0; dd < 3; dd++) {
     diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd];
   }
   FPTYPE rr = sqrt(dev_dot(diff, diff)); 
+  if(type[j_idx] > 128 || rr > 128.0 || j_idx > (1 << 24)) {
+    asm("trap;");
+  }
   if (rr <= rcut) {
-    key_in[idy] = type[j_idx] * 1E14+ (int_64)(rr * 1.0E12) / 10000000 * 10000000 + j_idx;
+    key_in[idy] = ((uint_64)type[j_idx] << 57) + (uint_64)((double)rr * ((uint_64)1 << 50)) / (1 << 24) * (1 << 24) + j_idx;
   }
 }
 
@@ -142,16 +146,16 @@ __global__ void format_nlist_fill_b(
   }
   
   for (unsigned int kk = 0; key_out[kk] != key_out[max_nbor_size - 1]; kk++) {
-    const int & nei_type = key_out[kk] / 1E14;
+    const int & nei_type = key_out[kk] >> 57;
     if (nei_iter[nei_type] < sec[nei_type + 1]) {
-      row_nlist[nei_iter[nei_type]++] = key_out[kk] % 10000000;
+      row_nlist[nei_iter[nei_type]++] = key_out[kk] & 0xFFFFFF;
     }
   }
 }
 
 template<typename FPTYPE>
 void format_nbor_list_1024 (
-    int_64 * key,
+    uint_64 * key,
     const FPTYPE* coord,
     const int* type,
     const deepmd::InputNlist & gpu_inlist,
@@ -170,14 +174,14 @@ void format_nbor_list_1024 (
   const int ITEMS_PER_THREAD = 8;
   const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
   // BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
-  BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (
+  BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (
       key, 
       key + nloc * MAX_NBOR_SIZE);
 }
 
 template<typename FPTYPE>
 void format_nbor_list_2048 (
-    int_64 * key,
+    uint_64 * key,
     const FPTYPE* coord,
     const int* type,
     const deepmd::InputNlist & gpu_inlist,
@@ -196,14 +200,14 @@ void format_nbor_list_2048 (
   const int ITEMS_PER_THREAD = 8;
   const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
   // BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
-  BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (
+  BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (
       key, 
       key + nloc * MAX_NBOR_SIZE);
 }
 
 template<typename FPTYPE>
 void format_nbor_list_4096 (
-    int_64 * key,
+    uint_64 * key,
     const FPTYPE* coord,
     const int* type,
     const deepmd::InputNlist & gpu_inlist,
@@ -222,62 +226,11 @@ void format_nbor_list_4096 (
   const int ITEMS_PER_THREAD = 16;
   const int BLOCK_THREADS = MAX_NBOR_SIZE / ITEMS_PER_THREAD;
   // BlockSortKernel<NeighborInfo, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>> (
-  BlockSortKernel<int_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (
+  BlockSortKernel<uint_64, BLOCK_THREADS, ITEMS_PER_THREAD> <<<nloc, BLOCK_THREADS>>> (
       key, 
       key + nloc * MAX_NBOR_SIZE);
 }
 
-template <typename FPTYPE>
-void format_nbor_list(    
-    int * nlist, 
-    const FPTYPE * coord, 
-    const int * type, 
-    const deepmd::InputNlist & gpu_inlist,
-    int * array_int,
-    int_64 * array_longlong,
-    const int max_nbor_size,
-    const int nloc, 
-    const int nall, 
-    const float rcut, 
-    const std::vector<int> sec)
-{
-  const int LEN = 256;
-  const int nnei = sec.back();
-  const int nblock = (nloc + LEN -1) / LEN;
-  int * sec_dev = array_int;
-  int * nei_iter = array_int + sec.size(); // = new int[sec_size];
-  int * i_idx = array_int + sec.size() + nloc * sec.size();
-  int_64 * key = array_longlong;
-  assert(max_nbor_size == 1024 || max_nbor_size == 2048 || max_nbor_size == 4096);
-  cudaErrcheck(cudaMemset(nlist, -1, sizeof(int) * nloc * nnei));
-  cudaErrcheck(cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * max_nbor_size));
-  cudaErrcheck(cudaMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), cudaMemcpyHostToDevice));   
-
-  get_i_idx<<<nblock, LEN>>>(
-      i_idx,
-      nloc, gpu_inlist.ilist);
-
-  if (max_nbor_size == 1024) {
-    format_nbor_list_1024 (
-        key,
-        coord, type, gpu_inlist, nloc, rcut, i_idx); 
-  } 
-  else if (max_nbor_size == 2048) {
-    format_nbor_list_2048 (
-        key,
-        coord, type, gpu_inlist, nloc, rcut, i_idx); 
-  } 
-  else if (max_nbor_size == 4096) {
-    format_nbor_list_4096 (
-        key,
-        coord, type, gpu_inlist, nloc, rcut, i_idx); 
-  }
-
-  format_nlist_fill_b<<<nblock, LEN>>> (
-      nlist,
-      nnei, nloc, key, sec_dev, sec.size(), nei_iter, max_nbor_size);
-}
-
 template<
     typename FPTYPE,
     int      THREADS_PER_BLOCK>
@@ -432,6 +385,57 @@ __global__ void compute_env_mat_r(
 }
 
 namespace deepmd {
+template <typename FPTYPE>
+void format_nbor_list_gpu_cuda(    
+    int * nlist, 
+    const FPTYPE * coord, 
+    const int * type, 
+    const deepmd::InputNlist & gpu_inlist,
+    int * array_int,
+    uint_64 * array_longlong,
+    const int max_nbor_size,
+    const int nloc, 
+    const int nall, 
+    const float rcut, 
+    const std::vector<int> sec)
+{
+  const int LEN = 256;
+  const int nnei = sec.back();
+  const int nblock = (nloc + LEN -1) / LEN;
+  int * sec_dev = array_int;
+  int * nei_iter = array_int + sec.size(); // = new int[sec_size];
+  int * i_idx = array_int + sec.size() + nloc * sec.size();
+  uint_64 * key = array_longlong;
+  assert(max_nbor_size == 1024 || max_nbor_size == 2048 || max_nbor_size == 4096);
+  cudaErrcheck(cudaMemset(nlist, -1, sizeof(int) * nloc * nnei));
+  cudaErrcheck(cudaMemset(key, 0xffffffff, sizeof(uint_64) * nloc * max_nbor_size));
+  cudaErrcheck(cudaMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), cudaMemcpyHostToDevice));   
+
+  get_i_idx<<<nblock, LEN>>>(
+      i_idx,
+      nloc, gpu_inlist.ilist);
+
+  if (max_nbor_size == 1024) {
+    format_nbor_list_1024 (
+        key,
+        coord, type, gpu_inlist, nloc, rcut, i_idx); 
+  } 
+  else if (max_nbor_size == 2048) {
+    format_nbor_list_2048 (
+        key,
+        coord, type, gpu_inlist, nloc, rcut, i_idx); 
+  } 
+  else if (max_nbor_size == 4096) {
+    format_nbor_list_4096 (
+        key,
+        coord, type, gpu_inlist, nloc, rcut, i_idx); 
+  }
+
+  format_nlist_fill_b<<<nblock, LEN>>> (
+      nlist,
+      nnei, nloc, key, sec_dev, sec.size(), nei_iter, max_nbor_size);
+}
+
 template <typename FPTYPE>
 void prod_env_mat_a_gpu_cuda(    
     FPTYPE * em, 
@@ -442,7 +446,7 @@ void prod_env_mat_a_gpu_cuda(
     const int * type, 
     const InputNlist & gpu_inlist,
     int * array_int, 
-    int_64 * array_longlong,
+    uint_64 * array_longlong,
     const int max_nbor_size,
     const FPTYPE * avg, 
     const FPTYPE * std, 
@@ -457,9 +461,11 @@ void prod_env_mat_a_gpu_cuda(
   cudaErrcheck(cudaMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt));
   cudaErrcheck(cudaMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3));
 
-  format_nbor_list(
+  format_nbor_list_gpu_cuda(
       nlist, 
       coord, type, gpu_inlist, array_int, array_longlong, max_nbor_size, nloc, nall, rcut, sec);
+  nborErrcheck(cudaGetLastError());
+  nborErrcheck(cudaDeviceSynchronize());
 
   compute_env_mat_a<FPTYPE, TPB> <<<nloc, TPB>>> (
       em, em_deriv, rij, 
@@ -476,7 +482,7 @@ void prod_env_mat_r_gpu_cuda(
     const int * type, 
     const InputNlist & gpu_inlist,
     int * array_int, 
-    int_64 * array_longlong,
+    uint_64 * array_longlong,
     const int max_nbor_size,
     const FPTYPE * avg, 
     const FPTYPE * std, 
@@ -491,10 +497,12 @@ void prod_env_mat_r_gpu_cuda(
   cudaErrcheck(cudaMemset(em, 0.0, sizeof(FPTYPE) * nloc * ndescrpt));
   cudaErrcheck(cudaMemset(em_deriv, 0.0, sizeof(FPTYPE) * nloc * ndescrpt * 3));
 
-  format_nbor_list(
+  format_nbor_list_gpu_cuda(
       nlist, 
       coord, type, gpu_inlist, array_int, array_longlong, max_nbor_size, nloc, nall, rcut, sec);
-
+  nborErrcheck(cudaGetLastError());
+  nborErrcheck(cudaDeviceSynchronize());
+  
   compute_env_mat_r<FPTYPE, TPB> <<<nloc, TPB>>> (
       em, em_deriv, rij, 
       coord, avg, std, type, nlist, nnei, rcut_smth, rcut);
@@ -504,4 +512,6 @@ template void prod_env_mat_a_gpu_cuda<float>(float * em, float * em_deriv, float
 template void prod_env_mat_a_gpu_cuda<double>(double * em, double * em_deriv, double * rij, int * nlist, const double * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const double * avg, const double * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector<int> sec);
 template void prod_env_mat_r_gpu_cuda<float>(float * em, float * em_deriv, float * rij, int * nlist, const float * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const float * avg, const float * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector<int> sec);
 template void prod_env_mat_r_gpu_cuda<double>(double * em, double * em_deriv, double * rij, int * nlist, const double * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const double * avg, const double * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector<int> sec);
+template void format_nbor_list_gpu_cuda<float>(int * nlist, const float * coord, const int * type, const deepmd::InputNlist & gpu_inlist,int * array_int,uint_64 * array_longlong,const int max_nbor_size,const int nloc, const int nall, const float rcut, const std::vector<int> sec);
+template void format_nbor_list_gpu_cuda<double>(int * nlist, const double * coord, const int * type, const deepmd::InputNlist & gpu_inlist,int * array_int,uint_64 * array_longlong,const int max_nbor_size,const int nloc, const int nall, const float rcut, const std::vector<int> sec);
 }
diff --git a/source/lib/tests/test_env_mat_a.cc b/source/lib/tests/test_env_mat_a.cc
index d32203c692..f9e262cb2c 100644
--- a/source/lib/tests/test_env_mat_a.cc
+++ b/source/lib/tests/test_env_mat_a.cc
@@ -545,7 +545,7 @@ TEST_F(TestEnvMatA, prod_gpu_cuda)
   double * em_dev = NULL, * em_deriv_dev = NULL, * rij_dev = NULL;
   double * posi_cpy_dev = NULL, * avg_dev = NULL, * std_dev = NULL;
   int * atype_cpy_dev = NULL, * nlist_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL;
-  int_64 * array_longlong_dev = NULL;
+  uint_64 * array_longlong_dev = NULL;
   deepmd::malloc_device_memory_sync(em_dev, em);
   deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv);
   deepmd::malloc_device_memory_sync(rij_dev, rij);
@@ -635,7 +635,7 @@ TEST_F(TestEnvMatA, prod_gpu_cuda_equal_cpu)
   double * em_dev = NULL, * em_deriv_dev = NULL, * rij_dev = NULL;
   double * posi_cpy_dev = NULL, * avg_dev = NULL, * std_dev = NULL;
   int * atype_cpy_dev = NULL, * nlist_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL;
-  int_64 * array_longlong_dev = NULL;
+  uint_64 * array_longlong_dev = NULL;
   deepmd::malloc_device_memory_sync(em_dev, em);
   deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv);
   deepmd::malloc_device_memory_sync(rij_dev, rij);
diff --git a/source/lib/tests/test_env_mat_r.cc b/source/lib/tests/test_env_mat_r.cc
index aac71eff8c..c7b0db5e1d 100644
--- a/source/lib/tests/test_env_mat_r.cc
+++ b/source/lib/tests/test_env_mat_r.cc
@@ -387,7 +387,7 @@ TEST_F(TestEnvMatR, prod_gpu_cuda)
   double * em_dev = NULL, * em_deriv_dev = NULL, * rij_dev = NULL;
   double * posi_cpy_dev = NULL, * avg_dev = NULL, * std_dev = NULL;
   int * atype_cpy_dev = NULL, * nlist_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL;
-  int_64 * array_longlong_dev = NULL;
+  uint_64 * array_longlong_dev = NULL;
   deepmd::malloc_device_memory_sync(em_dev, em);
   deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv);
   deepmd::malloc_device_memory_sync(rij_dev, rij);
@@ -477,7 +477,7 @@ TEST_F(TestEnvMatR, prod_gpu_cuda_equal_cpu)
   double * em_dev = NULL, * em_deriv_dev = NULL, * rij_dev = NULL;
   double * posi_cpy_dev = NULL, * avg_dev = NULL, * std_dev = NULL;
   int * atype_cpy_dev = NULL, * nlist_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL;
-  int_64 * array_longlong_dev = NULL;
+  uint_64 * array_longlong_dev = NULL;
   deepmd::malloc_device_memory_sync(em_dev, em);
   deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv);
   deepmd::malloc_device_memory_sync(rij_dev, rij);
diff --git a/source/lib/tests/test_fmt_nlist.cc b/source/lib/tests/test_fmt_nlist.cc
index 6dc539b7d6..d1cd3eef0b 100644
--- a/source/lib/tests/test_fmt_nlist.cc
+++ b/source/lib/tests/test_fmt_nlist.cc
@@ -32,6 +32,7 @@ class TestFormatNlist : public ::testing::Test
     3 , 6 , -1, -1, -1, -1, -1, -1, -1, -1, 4 , 2 , 7 , -1, -1, -1, -1, -1, -1, -1
   };      
   std::vector<int> expect_nlist;
+  int max_nbor_size;
 
   void SetUp() override {
     double box[] = {13., 0., 0., 0., 13., 0., 0., 0., 13.};
@@ -53,7 +54,8 @@ class TestFormatNlist : public ::testing::Test
       else{
 	expect_nlist.push_back(-1);
       }
-    }    
+    } 
+    max_nbor_size = 0;   
   }
   void TearDown() override {
   }
@@ -90,6 +92,7 @@ class TestFormatNlistShortSel : public ::testing::Test
     3,  6,  4,  2, 
   };      
   std::vector<int> expect_nlist;
+  int max_nbor_size;
 
   void SetUp() override {
     double box[] = {13., 0., 0., 0., 13., 0., 0., 0., 13.};
@@ -111,7 +114,8 @@ class TestFormatNlistShortSel : public ::testing::Test
       else{
 	expect_nlist.push_back(-1);
       }
-    }    
+    } 
+    max_nbor_size = 0;      
   }
   void TearDown() override {
   }
@@ -301,3 +305,116 @@ TEST_F(TestFormatNlistShortSel, cpu)
   }
 }
 
+#if GOOGLE_CUDA
+TEST_F(TestFormatNlist, gpu_cuda)
+{
+  std::vector<std::vector<int>> nlist_a_0, nlist_r_0;
+  build_nlist(nlist_a_0, nlist_r_0, posi_cpy, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region, ncell);  
+  // make a input nlist
+  int inum = nlist_a_0.size();
+  std::vector<int > ilist(inum);
+  std::vector<int > numneigh(inum);
+  std::vector<int* > firstneigh(inum);
+  deepmd::InputNlist in_nlist(inum, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist;
+  convert_nlist(in_nlist, nlist_a_0);
+  // allocate the mem for the result
+  std::vector<int> nlist(inum * sec_a.back());
+  EXPECT_EQ(nlist.size(), expect_nlist_cpy.size());
+
+  double * posi_cpy_dev = NULL;
+  int * atype_cpy_dev = NULL, * nlist_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL;
+  uint_64 * array_longlong_dev = NULL;
+  for (int ii = 0; ii < inum; ii++) {
+    max_nbor_size = max_nbor_size >= numneigh[ii] ? max_nbor_size : numneigh[ii];
+  }
+  assert(max_nbor_size <= GPU_MAX_NBOR_SIZE);
+  if (max_nbor_size <= 1024) {
+    max_nbor_size = 1024;
+  }
+  else if (max_nbor_size <= 2048) {
+    max_nbor_size = 2048;
+  }
+  else {
+    max_nbor_size = 4096;
+  }
+  deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy);
+  deepmd::malloc_device_memory_sync(atype_cpy_dev, atype_cpy);
+  deepmd::malloc_device_memory_sync(nlist_dev, nlist);
+  deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc);
+  deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2);
+  deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size);
+  deepmd::convert_nlist_gpu_cuda(gpu_inlist, in_nlist, memory_dev, max_nbor_size);
+  // format nlist
+  format_nbor_list_gpu_cuda(
+      nlist_dev, 
+      posi_cpy_dev, atype_cpy_dev, gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, nloc, nall, rc, sec_a);
+  deepmd::memcpy_device_to_host(nlist_dev, nlist);
+  deepmd::delete_device_memory(nlist_dev);
+  deepmd::delete_device_memory(posi_cpy_dev);
+  deepmd::delete_device_memory(atype_cpy_dev);
+  deepmd::delete_device_memory(array_int_dev);
+  deepmd::delete_device_memory(array_longlong_dev);
+  deepmd::delete_device_memory(memory_dev);
+  deepmd::free_nlist_gpu_cuda(gpu_inlist);
+  // validate
+  for(int ii = 0; ii < nlist.size(); ++ii){
+    EXPECT_EQ(nlist[ii], expect_nlist_cpy[ii]);
+  }
+}
+
+TEST_F(TestFormatNlistShortSel, gpu_cuda)
+{
+  std::vector<std::vector<int>> nlist_a_0, nlist_r_0;
+  build_nlist(nlist_a_0, nlist_r_0, posi_cpy, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region, ncell);  
+  // make a input nlist
+  int inum = nlist_a_0.size();
+  std::vector<int > ilist(inum);
+  std::vector<int > numneigh(inum);
+  std::vector<int* > firstneigh(inum);
+  deepmd::InputNlist in_nlist(inum, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist;
+  convert_nlist(in_nlist, nlist_a_0);  
+  // mem
+  std::vector<int> nlist(inum * sec_a.back());
+  EXPECT_EQ(nlist.size(), expect_nlist_cpy.size());
+  // format nlist
+  double * posi_cpy_dev = NULL;
+  int * atype_cpy_dev = NULL, * nlist_dev = NULL, * array_int_dev = NULL, * memory_dev = NULL;
+  uint_64 * array_longlong_dev = NULL;
+  for (int ii = 0; ii < inum; ii++) {
+    max_nbor_size = max_nbor_size >= numneigh[ii] ? max_nbor_size : numneigh[ii];
+  }
+  assert(max_nbor_size <= GPU_MAX_NBOR_SIZE);
+  if (max_nbor_size <= 1024) {
+    max_nbor_size = 1024;
+  }
+  else if (max_nbor_size <= 2048) {
+    max_nbor_size = 2048;
+  }
+  else {
+    max_nbor_size = 4096;
+  }
+  deepmd::malloc_device_memory_sync(posi_cpy_dev, posi_cpy);
+  deepmd::malloc_device_memory_sync(atype_cpy_dev, atype_cpy);
+  deepmd::malloc_device_memory_sync(nlist_dev, nlist);
+  deepmd::malloc_device_memory(array_int_dev, sec_a.size() + nloc * sec_a.size() + nloc);
+  deepmd::malloc_device_memory(array_longlong_dev, nloc * GPU_MAX_NBOR_SIZE * 2);
+  deepmd::malloc_device_memory(memory_dev, nloc * max_nbor_size);
+  deepmd::convert_nlist_gpu_cuda(gpu_inlist, in_nlist, memory_dev, max_nbor_size);
+  // format nlist
+  format_nbor_list_gpu_cuda(
+      nlist_dev, 
+      posi_cpy_dev, atype_cpy_dev, gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, nloc, nall, rc, sec_a);
+  deepmd::memcpy_device_to_host(nlist_dev, nlist);
+  deepmd::delete_device_memory(nlist_dev);
+  deepmd::delete_device_memory(posi_cpy_dev);
+  deepmd::delete_device_memory(atype_cpy_dev);
+  deepmd::delete_device_memory(array_int_dev);
+  deepmd::delete_device_memory(array_longlong_dev);
+  deepmd::delete_device_memory(memory_dev);
+  deepmd::free_nlist_gpu_cuda(gpu_inlist);
+  // validate
+  for(int ii = 0; ii < nlist.size(); ++ii){
+    EXPECT_EQ(nlist[ii], expect_nlist_cpy[ii]);
+  }
+}
+#endif // GOOGLE_CUDA
\ No newline at end of file
diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc
index f1843f9afe..be812dbc9e 100644
--- a/source/op/prod_env_mat_multi_device.cc
+++ b/source/op/prod_env_mat_multi_device.cc
@@ -954,8 +954,8 @@ _prepare_coord_nlist_gpu(
     const int & max_cpy_trial,
     const int & max_nnei_trial)
 {    
-  inlist.inum = nloc;
   if(nei_mode != 3){
+    inlist.inum = nloc;
     // build nlist by myself
     // normalize and copy coord
     if(nei_mode == 1){

From 58e83ae5b053757c22dc66472eb93dacc9e824fc Mon Sep 17 00:00:00 2001
From: denghuilu <denghuilu@pku.edu.cn>
Date: Fri, 23 Apr 2021 12:53:12 +0800
Subject: [PATCH 2/2] add ut for encoding/decoding nbor info

---
 source/lib/include/fmt_nlist.h      |  10 +++
 source/lib/src/cuda/prod_env_mat.cu |  72 +++++++++++++++++--
 source/lib/tests/test_fmt_nlist.cc  | 108 ++++++++++++++++++++++++++++
 3 files changed, 184 insertions(+), 6 deletions(-)

diff --git a/source/lib/include/fmt_nlist.h b/source/lib/include/fmt_nlist.h
index cc3adf43c5..1c6f66059c 100644
--- a/source/lib/include/fmt_nlist.h
+++ b/source/lib/include/fmt_nlist.h
@@ -31,6 +31,16 @@ void format_nbor_list_gpu_cuda(
     const int nall, 
     const float rcut, 
     const std::vector<int> sec);
+
+template <typename FPTYPE>
+void test_encoding_decoding_nbor_info_gpu_cuda(
+    uint_64 * key,
+    int * out_type,
+    int * out_index,
+    const int * in_type,
+    const FPTYPE * in_dist,
+    const int * in_index,
+    const int size_of_array);
 #endif //GOOGLE_CUDA
 }
 
diff --git a/source/lib/src/cuda/prod_env_mat.cu b/source/lib/src/cuda/prod_env_mat.cu
index bfb15dd785..1ec317b685 100644
--- a/source/lib/src/cuda/prod_env_mat.cu
+++ b/source/lib/src/cuda/prod_env_mat.cu
@@ -72,6 +72,31 @@ __device__ inline void spline5_switch(
   }
 }
 
+template<typename FPTYPE>
+__device__ inline uint_64 encoding_nbor_info(
+    const int type,
+    const FPTYPE dist,
+    const int index)
+{
+  // nbor info checking:
+  // the type of nbor atom must be smaller than 128
+  // the distance of center atom between nbor atom must be smaller than 128
+  // the index of nbor atom(including ghost region) must be smaller than 16777216(1 << 24)
+  if(type >= 128 || dist >= 128.0 || index >= (1 << 24)) {
+    asm("trap;");
+  }
+  return ((uint_64)type << 57) + (uint_64)((double)dist * ((uint_64)1 << 50)) / (1 << 24) * (1 << 24) + index;
+}
+
+__device__ inline void decoding_nbor_info(
+    int &type,
+    int &index,
+    const uint_64 key)
+{
+  type = key >> 57;
+  index = key & 0xFFFFFF;
+}
+
 template<typename FPTYPE>
 __global__ void get_i_idx(
     FPTYPE * i_idx,
@@ -114,11 +139,8 @@ __global__ void format_nlist_fill_a(
     diff[dd] = coord[j_idx * 3 + dd] - coord[idx * 3 + dd];
   }
   FPTYPE rr = sqrt(dev_dot(diff, diff)); 
-  if(type[j_idx] > 128 || rr > 128.0 || j_idx > (1 << 24)) {
-    asm("trap;");
-  }
   if (rr <= rcut) {
-    key_in[idy] = ((uint_64)type[j_idx] << 57) + (uint_64)((double)rr * ((uint_64)1 << 50)) / (1 << 24) * (1 << 24) + j_idx;
+    key_in[idy] = encoding_nbor_info(type[j_idx], rr, j_idx);
   }
 }
 
@@ -145,14 +167,34 @@ __global__ void format_nlist_fill_b(
     nei_iter[ii] = sec[ii];
   }
   
+  int nei_type = 0, nbor_idx = 0;
   for (unsigned int kk = 0; key_out[kk] != key_out[max_nbor_size - 1]; kk++) {
-    const int & nei_type = key_out[kk] >> 57;
+    decoding_nbor_info(nei_type, nbor_idx, key_out[kk]);
     if (nei_iter[nei_type] < sec[nei_type + 1]) {
-      row_nlist[nei_iter[nei_type]++] = key_out[kk] & 0xFFFFFF;
+      row_nlist[nei_iter[nei_type]++] = nbor_idx;
     }
   }
 }
 
+template<typename FPTYPE>
+__global__ void encoding_decoding_nbor_info(
+    uint_64 * key,
+    int * out_type,
+    int * out_index,
+    const int * in_type,
+    const FPTYPE * in_dist,
+    const int * in_index,
+    const int size_of_array)
+{ 
+  const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if(idx >= size_of_array) {
+    return;
+  }
+  
+  key[idx] = encoding_nbor_info(in_type[idx], in_dist[idx], in_index[idx]);
+  decoding_nbor_info(out_type[idx], out_index[idx], key[idx]);
+}
+
 template<typename FPTYPE>
 void format_nbor_list_1024 (
     uint_64 * key,
@@ -508,10 +550,28 @@ void prod_env_mat_r_gpu_cuda(
       coord, avg, std, type, nlist, nnei, rcut_smth, rcut);
 }
 
+template <typename FPTYPE>
+void test_encoding_decoding_nbor_info_gpu_cuda(
+    uint_64 * key,
+    int * out_type,
+    int * out_index,
+    const int * in_type,
+    const FPTYPE * in_dist,
+    const int * in_index,
+    const int size_of_array)
+{
+  const int nblock = (size_of_array + TPB - 1) / TPB;
+  encoding_decoding_nbor_info<<<nblock, TPB>>> (
+      key, out_type, out_index,
+      in_type, in_dist, in_index, size_of_array);
+}
+
 template void prod_env_mat_a_gpu_cuda<float>(float * em, float * em_deriv, float * rij, int * nlist, const float * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const float * avg, const float * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector<int> sec);
 template void prod_env_mat_a_gpu_cuda<double>(double * em, double * em_deriv, double * rij, int * nlist, const double * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const double * avg, const double * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector<int> sec);
 template void prod_env_mat_r_gpu_cuda<float>(float * em, float * em_deriv, float * rij, int * nlist, const float * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const float * avg, const float * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector<int> sec);
 template void prod_env_mat_r_gpu_cuda<double>(double * em, double * em_deriv, double * rij, int * nlist, const double * coord, const int * type, const InputNlist & gpu_inlist, int * array_int, unsigned long long * array_longlong, const int max_nbor_size, const double * avg, const double * std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector<int> sec);
 template void format_nbor_list_gpu_cuda<float>(int * nlist, const float * coord, const int * type, const deepmd::InputNlist & gpu_inlist,int * array_int,uint_64 * array_longlong,const int max_nbor_size,const int nloc, const int nall, const float rcut, const std::vector<int> sec);
 template void format_nbor_list_gpu_cuda<double>(int * nlist, const double * coord, const int * type, const deepmd::InputNlist & gpu_inlist,int * array_int,uint_64 * array_longlong,const int max_nbor_size,const int nloc, const int nall, const float rcut, const std::vector<int> sec);
+template void test_encoding_decoding_nbor_info_gpu_cuda(uint_64 * key, int * out_type, int * out_index, const int * in_type, const float * in_dist, const int * in_index, const int size_of_array);
+template void test_encoding_decoding_nbor_info_gpu_cuda(uint_64 * key, int * out_type, int * out_index, const int * in_type, const double * in_dist, const int * in_index, const int size_of_array);
 }
diff --git a/source/lib/tests/test_fmt_nlist.cc b/source/lib/tests/test_fmt_nlist.cc
index d1cd3eef0b..df2a68534f 100644
--- a/source/lib/tests/test_fmt_nlist.cc
+++ b/source/lib/tests/test_fmt_nlist.cc
@@ -121,6 +121,44 @@ class TestFormatNlistShortSel : public ::testing::Test
   }
 };
 
+class TestEncodingDecodingNborInfo : public ::testing::Test
+{
+protected:
+  std::vector<int > valid_type = {
+    0, 1, 127, 77, 47, 9, 11
+  };
+  std::vector<double > valid_dist = {
+    23.3333, 0.001234, 1.456, 127.7, 2.021, 0.409, 11.2
+  };
+  std::vector<int > valid_index = {
+    0, 16777215, 1000000, 10000000, 202149, 478910, 5006
+  };
+  std::vector<uint_64 > expect_key = {
+    26270960290103296UL, 144116577447444479UL, 18304268195882549824UL, 11240646899941283456UL, 6775689283274741157UL, 1297497185738772158UL, 1597877147777635214UL
+  };
+
+  std::vector<int > invalid_type = {
+    0, 256, 128, 77, 47, 126, 1100
+  };
+  std::vector<double > invalid_dist = {
+    128.0, 0.001234, 1.456, 130.7, 2.021, 0.409, 11.2
+  };
+  std::vector<int > invalid_index = {
+    0, 16777215, 1 << 24, 10000000, 20210409, 478910, 5006
+  };
+  std::vector<bool> expect_cuda_error_check = {
+    false, false, false, false, false, true, false
+  };
+
+  std::vector<int > expect_type = valid_type;
+  std::vector<int > expect_index = valid_index;
+  int size_of_array = valid_type.size();
+
+  void SetUp() override {
+  }
+  void TearDown() override {
+  }
+};
 
 // orginal implementation. copy ghost
 TEST_F(TestFormatNlist, orig_cpy)
@@ -417,4 +455,74 @@ TEST_F(TestFormatNlistShortSel, gpu_cuda)
     EXPECT_EQ(nlist[ii], expect_nlist_cpy[ii]);
   }
 }
+
+TEST_F(TestEncodingDecodingNborInfo, valid_nbor_info_gpu_cuda) 
+{
+  int * valid_type_dev = NULL, * valid_index_dev = NULL, * out_type_dev = NULL, * out_index_dev = NULL;
+  double * valid_dist_dev = NULL;
+  uint_64 * key_dev = NULL;
+  std::vector<int> out_type(size_of_array, 0);
+  std::vector<int> out_index(size_of_array, 0);
+  std::vector<uint_64> key(size_of_array, 0);
+  deepmd::malloc_device_memory_sync(valid_type_dev, valid_type);
+  deepmd::malloc_device_memory_sync(valid_dist_dev, valid_dist);
+  deepmd::malloc_device_memory_sync(valid_index_dev, valid_index);
+  deepmd::malloc_device_memory_sync(out_type_dev, out_type);
+  deepmd::malloc_device_memory_sync(out_index_dev, out_index);
+  deepmd::malloc_device_memory_sync(key_dev, key);
+
+  deepmd::test_encoding_decoding_nbor_info_gpu_cuda(
+      key_dev, out_type_dev, out_index_dev,
+      valid_type_dev, valid_dist_dev, valid_index_dev, size_of_array
+  );
+
+  deepmd::memcpy_device_to_host(key_dev, key);
+  deepmd::memcpy_device_to_host(out_type_dev, out_type);
+  deepmd::memcpy_device_to_host(out_index_dev, out_index);
+  deepmd::delete_device_memory(valid_type_dev);
+  deepmd::delete_device_memory(valid_dist_dev);
+  deepmd::delete_device_memory(valid_index_dev);
+  deepmd::delete_device_memory(out_type_dev);
+  deepmd::delete_device_memory(out_index_dev);
+  deepmd::delete_device_memory(key_dev);
+  // validate
+  for(int ii = 0; ii < size_of_array; ii++) {
+    EXPECT_EQ(key[ii], expect_key[ii]);
+    EXPECT_EQ(out_type[ii], expect_type[ii]);
+    EXPECT_EQ(out_index[ii], expect_index[ii]);
+  }
+}
+
+// TEST_F(TestEncodingDecodingNborInfo, invalid_nbor_info_gpu_cuda) 
+// {
+//   int * invalid_type_dev = NULL, * invalid_index_dev = NULL, * out_type_dev = NULL, * out_index_dev = NULL;
+//   double * invalid_dist_dev = NULL;
+//   uint_64 * key_dev = NULL;
+//   std::vector<int> out_type(size_of_array, 0);
+//   std::vector<int> out_index(size_of_array, 0);
+//   std::vector<uint_64> key(size_of_array, 0);
+//   deepmd::malloc_device_memory_sync(invalid_type_dev, invalid_type);
+//   deepmd::malloc_device_memory_sync(invalid_dist_dev, invalid_dist);
+//   deepmd::malloc_device_memory_sync(invalid_index_dev, invalid_index);
+//   deepmd::malloc_device_memory_sync(out_type_dev, out_type);
+//   deepmd::malloc_device_memory_sync(out_index_dev, out_index);
+//   deepmd::malloc_device_memory_sync(key_dev, key);
+  
+//   EXPECT_EQ(cudaGetLastError() == cudaSuccess && cudaDeviceSynchronize() == cudaSuccess, true);
+//   deepmd::test_encoding_decoding_nbor_info_gpu_cuda(
+//       key_dev, out_type_dev, out_index_dev,
+//       invalid_type_dev, invalid_dist_dev, invalid_index_dev, size_of_array
+//   );
+//   EXPECT_EQ(cudaGetLastError() == cudaSuccess && cudaDeviceSynchronize() == cudaSuccess, false);
+//   cudaErrcheck(cudaDeviceReset());
+//   deepmd::memcpy_device_to_host(key_dev, key);
+//   deepmd::memcpy_device_to_host(out_type_dev, out_type);
+//   deepmd::memcpy_device_to_host(out_index_dev, out_index);
+//   deepmd::delete_device_memory(invalid_type_dev);
+//   deepmd::delete_device_memory(invalid_dist_dev);
+//   deepmd::delete_device_memory(invalid_index_dev);
+//   deepmd::delete_device_memory(out_type_dev);
+//   deepmd::delete_device_memory(out_index_dev);
+//   deepmd::delete_device_memory(key_dev);
+// }
 #endif // GOOGLE_CUDA
\ No newline at end of file