diff --git a/.travis.yml b/.travis.yml index 4db9e48dd4..dd4b0eed9e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -83,7 +83,7 @@ jobs: - CXX=g++-7 - TENSORFLOW_VERSION=2.1 install: - - python -m pip install twine cibuildwheel==1.1.0 scikit-build + - python -m pip install twine cibuildwheel==1.1.0 scikit-build setuptools_scm script: - python -m cibuildwheel --output-dir wheelhouse - python setup.py sdist diff --git a/pyproject.toml b/pyproject.toml index b1251c5890..2e9e3239ab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,3 @@ [build-system] -requires = ["setuptools", "wheel", "scikit-build", "cmake", "ninja", "m2r"] +requires = ["setuptools", "setuptools_scm", "wheel", "scikit-build", "cmake", "ninja", "m2r"] diff --git a/setup.py b/setup.py index 8c6a335ad4..99a86d3da8 100644 --- a/setup.py +++ b/setup.py @@ -1,9 +1,21 @@ from skbuild import setup from skbuild.exceptions import SKBuildError from skbuild.cmaker import get_cmake_version +from setuptools_scm import get_version from packaging.version import LegacyVersion from os import path, makedirs -import imp +import imp, sys, platform + +def get_dp_install_path() : + site_packages_path = path.join(path.dirname(path.__file__), 'site-packages') + dp_scm_version = get_version(root="./", relative_to=__file__) + python_version = 'py' + str(sys.version_info.major) + '.' + str(sys.version_info.minor) + os_info = sys.platform + machine_info = platform.machine() + dp_pip_install_path = site_packages_path + '/deepmd' + dp_setup_install_path = site_packages_path + '/deepmd_kit-' + dp_scm_version + '-' + python_version + '-' + os_info + '-' + machine_info + '.egg/deepmd' + + return dp_pip_install_path, dp_setup_install_path readme_file = path.join(path.dirname(path.abspath(__file__)), 'README.md') try: @@ -34,6 +46,8 @@ except OSError: pass +dp_pip_install_path, dp_setup_install_path = get_dp_install_path() + setup( name="deepmd-kit", setup_requires=setup_requires, @@ -56,6 +70,8 @@ '-DBUILD_PY_IF:BOOL=TRUE', 
'-DBUILD_CPP_IF:BOOL=FALSE', '-DFLOAT_PREC:STRING=high', + '-DDP_PIP_INSTALL_PATH=%s' % dp_pip_install_path, + '-DDP_SETUP_INSTALL_PATH=%s' % dp_setup_install_path, ], cmake_source_dir='source', cmake_minimum_required_version='3.0', diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 6b18cb95ac..84c3d326da 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -186,7 +186,7 @@ if (BUILD_CPP_IF) set (LIB_DEEPMD_OP "deepmd_op") if (USE_CUDA_TOOLKIT) set (LIB_DEEPMD_OP_CUDA "deepmd_op_cuda") - else() + else () set (LIB_DEEPMD_OP_CUDA "deepmd_op") endif() if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 4.9) diff --git a/source/lib/include/NNPInter.h b/source/lib/include/NNPInter.h index 69a0ecf7d1..a32940a738 100644 --- a/source/lib/include/NNPInter.h +++ b/source/lib/include/NNPInter.h @@ -93,10 +93,8 @@ class NNPInter compute_t *array_double; InternalNeighborList nlist; NNPAtomMap nnpmap; - unsigned long long *array_longlong; - int *ilist, *jrange, *jlist, *array_int; + int *ilist, *jrange, *jlist; int ilist_size, jrange_size, jlist_size; - int arr_int_size, arr_ll_size, arr_dou_size; // function used for neighbor list copy vector get_sel_a() const; @@ -191,13 +189,10 @@ class NNPInterModelDevi vector > sec; InternalNeighborList nlist; NNPAtomMap nnpmap; - unsigned long long *array_longlong; - int max_sec_size = 0, max_sec_back = 0; - int *ilist, *jrange, *jlist, *array_int; - int ilist_size, jrange_size, jlist_size, arr_int_size, arr_ll_size, arr_dou_size; + int *ilist, *jrange, *jlist; + int ilist_size, jrange_size, jlist_size; // function used for nborlist copy - void get_max_sec(); vector > get_sel() const; void cum_sum(const std::vector > n_sel); #ifdef USE_CUDA_TOOLKIT diff --git a/source/lib/include/SimulationRegion_Impl.h b/source/lib/include/SimulationRegion_Impl.h index 2b4ed1a72b..8fa318c561 100644 --- a/source/lib/include/SimulationRegion_Impl.h +++ b/source/lib/include/SimulationRegion_Impl.h @@ -5,6 +5,7 @@ #include #include 
#include +#include using namespace std; @@ -500,6 +501,9 @@ computeVolume() boxt[0*3+1] * (boxt[1*3+0]*boxt[2*3+2] - boxt[2*3+0]*boxt[1*3+2]) + boxt[0*3+2] * (boxt[1*3+0]*boxt[2*3+1] - boxt[2*3+0]*boxt[1*3+1]); volumei = static_cast(1.)/volume; + if (volume < 0) { + throw std::runtime_error("Negative volume detected. Please make sure the simulation cell obeys the right-hand rule."); + } } template diff --git a/source/lib/include/common.h b/source/lib/include/common.h index 8db7d817ce..3912f21f7f 100644 --- a/source/lib/include/common.h +++ b/source/lib/include/common.h @@ -144,9 +144,6 @@ session_input_tensors (vector>& input_tensors, const int * ilist, const int * jrange, const int * jlist, - int * array_int, - unsigned long long * array_longlong, - double * array_double, const vector & fparam_, const vector & aparam_, const NNPAtomMap & nnpmap, diff --git a/source/lib/src/NNPInter.cc b/source/lib/src/NNPInter.cc index f4f39945ff..aea9d48c9b 100644 --- a/source/lib/src/NNPInter.cc +++ b/source/lib/src/NNPInter.cc @@ -3,7 +3,6 @@ #include "SimulationRegion.h" #include -#define MAGIC_NUMBER 1024 #ifdef USE_CUDA_TOOLKIT #include "cuda_runtime.h" @@ -14,7 +13,7 @@ #define cudaErrcheck(res) { cudaAssert((res), __FILE__, __LINE__); } inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { - if (code != cudaSuccess) + if (code != cudaSuccess) { fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); if (abort) exit(code); @@ -273,9 +272,6 @@ NNPInter::~NNPInter() { cudaErrcheck(cudaFree(ilist)); cudaErrcheck(cudaFree(jrange)); cudaErrcheck(cudaFree(jlist)); - cudaErrcheck(cudaFree(array_int)); - cudaErrcheck(cudaFree(array_longlong)); - cudaErrcheck(cudaFree(array_double)); } #endif } @@ -283,23 +279,12 @@ NNPInter::~NNPInter() { #ifdef USE_CUDA_TOOLKIT void NNPInter::update_nbor(const InternalNeighborList & nlist, const int nloc) { if (!init_nbor) { - sec_a = cum_sum(get_sel_a()); 
cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size())); cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size())); cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); - cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (sec_a.size() + nloc * sec_a.size() + nloc))); - cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2)); - #ifdef HIGH_PREC - cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * sec_a.back() * 3)); - #else - cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * sec_a.back() * 3)); - #endif ilist_size = nlist.ilist.size(); jrange_size = nlist.jrange.size(); jlist_size = nlist.jlist.size(); - arr_int_size = sec_a.size() + nloc * sec_a.size() + nloc; - arr_ll_size = nloc * MAGIC_NUMBER * 2; - arr_dou_size = nloc * sec_a.back() * 3; init_nbor = true; } if (ilist_size < nlist.ilist.size()) { @@ -317,25 +302,7 @@ void NNPInter::update_nbor(const InternalNeighborList & nlist, const int nloc) { cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); jlist_size = nlist.jlist.size(); } - if (arr_int_size < sec_a.size() + nloc * sec_a.size() + nloc) { - cudaErrcheck(cudaFree(array_int)); - cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (sec_a.size() + nloc * sec_a.size() + nloc))); - arr_int_size = sec_a.size() + nloc * sec_a.size() + nloc; - } - if (arr_ll_size < nloc * MAGIC_NUMBER * 2) { - cudaErrcheck(cudaFree(array_longlong)); - cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2)); - arr_ll_size = nloc * MAGIC_NUMBER * 2; - } - if (arr_dou_size < nloc * sec_a.back() * 3) { - cudaErrcheck(cudaFree(array_double)); - #ifdef HIGH_PREC - cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * sec_a.back() * 3)); - #else - cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * 
sec_a.back() * 3)); - #endif - arr_dou_size = nloc * sec_a.back() * 3; - } + cudaErrcheck(cudaMemcpy(ilist, &nlist.ilist[0], sizeof(int) * nlist.ilist.size(), cudaMemcpyHostToDevice)); cudaErrcheck(cudaMemcpy(jrange, &nlist.jrange[0], sizeof(int) * nlist.jrange.size(), cudaMemcpyHostToDevice)); cudaErrcheck(cudaMemcpy(jlist, &nlist.jlist[0], sizeof(int) * nlist.jlist.size(), cudaMemcpyHostToDevice)); @@ -378,14 +345,10 @@ init (const string & model, const int & gpu_rank) if (dfparam < 0) dfparam = 0; if (daparam < 0) daparam = 0; inited = true; - + init_nbor = false; - array_int = NULL; - array_double = NULL; - array_longlong = NULL; ilist = NULL; jrange = NULL; jlist = NULL; ilist_size = 0; jrange_size = 0; jlist_size = 0; - arr_int_size = 0; arr_ll_size = 0; arr_dou_size = 0; } #else void @@ -415,12 +378,8 @@ init (const string & model, const int & gpu_rank) inited = true; init_nbor = false; - array_int = NULL; - array_double = NULL; - array_longlong = NULL; ilist = NULL; jrange = NULL; jlist = NULL; ilist_size = 0; jrange_size = 0; jlist_size = 0; - arr_int_size = 0; arr_ll_size = 0; arr_dou_size = 0; } #endif @@ -602,7 +561,7 @@ compute_inner (ENERGYTYPE & dener, } #ifdef USE_CUDA_TOOLKIT - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost); + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost); #else int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); #endif @@ -669,7 +628,7 @@ compute (ENERGYTYPE & dener, } #ifdef USE_CUDA_TOOLKIT - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost); + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, 
ilist, jrange, jlist, fparam, aparam, nnpmap, nghost); #else int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); #endif @@ -710,9 +669,6 @@ NNPInterModelDevi::~NNPInterModelDevi() { cudaErrcheck(cudaFree(ilist)); cudaErrcheck(cudaFree(jrange)); cudaErrcheck(cudaFree(jlist)); - cudaErrcheck(cudaFree(array_int)); - cudaErrcheck(cudaFree(array_longlong)); - cudaErrcheck(cudaFree(array_double)); } #endif } @@ -761,14 +717,10 @@ init (const vector & models, const int & gpu_rank) // cell_size = rcut; // ntypes = get_ntypes(); inited = true; - + init_nbor = false; - array_int = NULL; - array_double = NULL; - array_longlong = NULL; ilist = NULL; jrange = NULL; jlist = NULL; ilist_size = 0; jrange_size = 0; jlist_size = 0; - arr_int_size = 0; arr_ll_size = 0; arr_dou_size = 0; } #else void @@ -798,14 +750,10 @@ init (const vector & models, const int & gpu_rank) // cell_size = rcut; // ntypes = get_ntypes(); inited = true; - + init_nbor = false; - array_int = NULL; - array_double = NULL; - array_longlong = NULL; ilist = NULL; jrange = NULL; jlist = NULL; ilist_size = 0; jrange_size = 0; jlist_size = 0; - arr_int_size = 0; arr_ll_size = 0; arr_dou_size = 0; } #endif @@ -873,40 +821,18 @@ cum_sum (const std::vector > n_sel) } } -void -NNPInterModelDevi:: -get_max_sec() -{ - for (int ii = 0; ii < numb_models; ii++) { - this->max_sec_size = max_sec_size < sec[ii].size() ? sec[ii].size() : max_sec_size; - this->max_sec_back = max_sec_back < sec[ii].back() ? 
sec[ii].back() : max_sec_back; - } -} - #ifdef USE_CUDA_TOOLKIT void NNPInterModelDevi:: update_nbor(const InternalNeighborList & nlist, const int nloc) { if (!init_nbor) { - cum_sum(get_sel()); - get_max_sec(); cudaErrcheck(cudaMalloc((void**)&ilist, sizeof(int) * nlist.ilist.size())); cudaErrcheck(cudaMalloc((void**)&jrange, sizeof(int) * nlist.jrange.size())); cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); - cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (max_sec_size + nloc * max_sec_size + nloc))); - cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2)); - #ifdef HIGH_PREC - cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3)); - #else - cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3)); - #endif ilist_size = nlist.ilist.size(); jrange_size = nlist.jrange.size(); jlist_size = nlist.jlist.size(); - arr_int_size = max_sec_size + nloc * max_sec_size + nloc; - arr_ll_size = nloc * MAGIC_NUMBER * 2; - arr_dou_size = nloc * max_sec_back * 3; init_nbor = true; } if (ilist_size < nlist.ilist.size()) { @@ -924,25 +850,7 @@ update_nbor(const InternalNeighborList & nlist, const int nloc) cudaErrcheck(cudaMalloc((void**)&jlist, sizeof(int) * nlist.jlist.size())); jlist_size = nlist.jlist.size(); } - if (arr_int_size < max_sec_size + nloc * max_sec_size + nloc) { - cudaErrcheck(cudaFree(array_int)); - cudaErrcheck(cudaMalloc((void**)&array_int, sizeof(int) * (max_sec_size + nloc * max_sec_size + nloc))); - arr_int_size = max_sec_size + nloc * max_sec_size + nloc; - } - if (arr_ll_size < nloc * MAGIC_NUMBER * 2) { - cudaErrcheck(cudaFree(array_longlong)); - cudaErrcheck(cudaMalloc((void**)&array_longlong, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2)); - arr_ll_size = nloc * MAGIC_NUMBER * 2; - } - if (arr_dou_size < nloc * max_sec_back * 3) { - cudaErrcheck(cudaFree(array_double)); - #ifdef 
HIGH_PREC - cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3)); - #else - cudaErrcheck(cudaMalloc((void**)&array_double, sizeof(compute_t) * nloc * max_sec_back * 3)); - #endif - arr_dou_size = nloc * max_sec_back * 3; - } + cudaErrcheck(cudaMemcpy(ilist, &nlist.ilist[0], sizeof(int) * nlist.ilist.size(), cudaMemcpyHostToDevice)); cudaErrcheck(cudaMemcpy(jrange, &nlist.jrange[0], sizeof(int) * nlist.jrange.size(), cudaMemcpyHostToDevice)); cudaErrcheck(cudaMemcpy(jlist, &nlist.jlist[0], sizeof(int) * nlist.jlist.size(), cudaMemcpyHostToDevice)); @@ -1044,7 +952,7 @@ compute (vector & all_energy, } #ifdef USE_CUDA_TOOLKIT - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost); + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost); #else int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); #endif @@ -1094,7 +1002,7 @@ compute (vector & all_energy, } #ifdef USE_CUDA_TOOLKIT - int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, array_int, array_longlong, array_double, fparam, aparam, nnpmap, nghost); + int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, ilist, jrange, jlist, fparam, aparam, nnpmap, nghost); #else int ret = session_input_tensors (input_tensors, dcoord_, ntypes, datype_, dbox, nlist, fparam, aparam, nnpmap, nghost); #endif diff --git a/source/lib/src/common.cc b/source/lib/src/common.cc index 296800a396..dd4c3f672a 100644 --- a/source/lib/src/common.cc +++ b/source/lib/src/common.cc @@ -479,9 +479,6 @@ session_input_tensors ( const int * ilist, const int * jrange, const int * jlist, - int * array_int, - unsigned long long * array_longlong, - double * array_double, const 
vector & fparam_, const vector & aparam_, const NNPAtomMap & nnpmap, @@ -511,7 +508,7 @@ session_input_tensors ( box_shape.AddDim (nframes); box_shape.AddDim (9); TensorShape mesh_shape; - mesh_shape.AddDim (32); + mesh_shape.AddDim (16); TensorShape natoms_shape; natoms_shape.AddDim (2 + ntypes); TensorShape fparam_shape; @@ -565,7 +562,7 @@ session_input_tensors ( } } - for (int ii = 0; ii < 32; ++ii) mesh(ii) = 0; + for (int ii = 0; ii < 16; ++ii) mesh(ii) = 0; mesh (0) = sizeof(int *) / sizeof(int); assert (mesh(0) * sizeof(int) == sizeof(int *)); @@ -577,9 +574,6 @@ session_input_tensors ( memcpy (&mesh(4), &(ilist), sizeof(int *)); memcpy (&mesh(8), &(jrange), sizeof(int *)); memcpy (&mesh(12), &(jlist), sizeof(int *)); - memcpy (&mesh(16), &(array_int), sizeof(int *)); - memcpy (&mesh(20), &(array_longlong), sizeof(unsigned long long *)); - memcpy (&mesh(24), &(array_double), sizeof(double *)); natoms (0) = nloc; natoms (1) = nall; diff --git a/source/lmp/env.sh.in b/source/lmp/env.sh.in index 00bc3b18b6..2e091432f6 100644 --- a/source/lmp/env.sh.in +++ b/source/lmp/env.sh.in @@ -8,4 +8,4 @@ TF_RPATH=`echo $TENSORFLOW_LIBRARY_PATH | sed "s/;/ -Wl,-rpath=/g"` NNP_INC=" -std=c++11 @PREC_DEF@ @TTM_DEF@ @OLD_LMP_PPPM_DEF@ -I$TF_INCLUDE_DIRS -I$DEEPMD_ROOT/include/deepmd " NNP_PATH=" -L$TF_LIBRARY_PATH -L$DEEPMD_ROOT/lib" -NNP_LIB=" -Wl,--no-as-needed -l@LIB_DEEPMD_OP@ -l@LIB_DEEPMD_OP_CUDA@ -l@LIB_DEEPMD@ -ltensorflow_cc -ltensorflow_framework -Wl,-rpath=$TF_RPATH -Wl,-rpath=$DEEPMD_ROOT/lib" +NNP_LIB=" -Wl,--no-as-needed -l@LIB_DEEPMD_OP_CUDA@ -l@LIB_DEEPMD_OP@ -l@LIB_DEEPMD@ -ltensorflow_cc -ltensorflow_framework -Wl,-rpath=$TF_RPATH -Wl,-rpath=$DEEPMD_ROOT/lib" diff --git a/source/lmp/fix_dplr.cpp b/source/lmp/fix_dplr.cpp index c0f0bb930c..03fef55420 100644 --- a/source/lmp/fix_dplr.cpp +++ b/source/lmp/fix_dplr.cpp @@ -24,6 +24,7 @@ is_key (const string& input) keys.push_back("model"); keys.push_back("type_associate"); keys.push_back("bond_type"); + 
keys.push_back("efield"); for (int ii = 0; ii < keys.size(); ++ii){ if (input == keys[ii]) { return true; } } @@ -34,7 +35,11 @@ is_key (const string& input) FixDPLR::FixDPLR(LAMMPS *lmp, int narg, char **arg) - :Fix(lmp, narg, arg) + :Fix(lmp, narg, arg), + efield(3, 0.0), + efield_fsum(4, 0.0), + efield_fsum_all(4, 0.0), + efield_force_flag(0) { virial_flag = 1; @@ -54,7 +59,14 @@ FixDPLR::FixDPLR(LAMMPS *lmp, int narg, char **arg) model = string(arg[iarg+1]); iarg += 2; } - if (string(arg[iarg]) == string("type_associate")) { + else if (string(arg[iarg]) == string("efield")) { + if (iarg+4 > narg) error->all(FLERR,"Illegal fix dplr command, efield should be provided 3 float numbers"); + efield[0] = atof(arg[iarg+1]); + efield[1] = atof(arg[iarg+2]); + efield[2] = atof(arg[iarg+3]); + iarg += 4; + } + else if (string(arg[iarg]) == string("type_associate")) { int iend = iarg+1; while (iend < narg && (! is_key(arg[iend]) )) { map_vec.push_back(atoi(arg[iend])-1); @@ -62,7 +74,7 @@ FixDPLR::FixDPLR(LAMMPS *lmp, int narg, char **arg) } iarg = iend; } - if (string(arg[iarg]) == string("bond_type")) { + else if (string(arg[iarg]) == string("bond_type")) { int iend = iarg+1; while (iend < narg && (! 
is_key(arg[iend]) )) { bond_type.push_back(atoi(arg[iend])-1); @@ -105,6 +117,7 @@ FixDPLR::FixDPLR(LAMMPS *lmp, int narg, char **arg) int FixDPLR::setmask() { int mask = 0; + mask |= THERMO_ENERGY; mask |= POST_INTEGRATE; mask |= PRE_FORCE; mask |= POST_FORCE; @@ -347,6 +360,7 @@ void FixDPLR::post_force(int vflag) int nall = nlocal + nghost; vector dcoord(nall*3, 0.0), dbox(9, 0.0), dfele(nlocal*3, 0.0); vector dtype(nall, 0); + // set values for dcoord, dbox, dfele { int *type = atom->type; for (int ii = 0; ii < nall; ++ii){ @@ -366,9 +380,40 @@ void FixDPLR::post_force(int vflag) } } assert(dfele_.size() == nlocal * 3); + // revise force according to efield for (int ii = 0; ii < nlocal*3; ++ii){ dfele[ii] = dfele_[ii]; } + // revise force and virial according to efield + double * q = atom->q; + imageint *image = atom->image; + double unwrap[3]; + double v[6]; + efield_fsum[0] = efield_fsum[1] = efield_fsum[2] = efield_fsum[3] = 0.0; + efield_force_flag = 0; + for (int ii = 0; ii < nlocal; ++ii){ + double tmpf[3]; + for (int dd = 0; dd < 3; ++dd){ + tmpf[dd] = q[ii] * efield[dd]; + } + for (int dd = 0; dd < 3; ++dd){ + dfele[ii*3+dd] += tmpf[dd]; + } + domain->unmap(x[ii],image[ii],unwrap); + efield_fsum[0] -= tmpf[0]*unwrap[0]+tmpf[1]*unwrap[1]+tmpf[2]*unwrap[2]; + efield_fsum[1] += tmpf[0]; + efield_fsum[2] += tmpf[1]; + efield_fsum[3] += tmpf[2]; + if (evflag) { + v[0] = tmpf[0] *unwrap[0]; + v[1] = tmpf[1] *unwrap[1]; + v[2] = tmpf[2] *unwrap[2]; + v[3] = tmpf[0] *unwrap[1]; + v[4] = tmpf[0] *unwrap[2]; + v[5] = tmpf[1] *unwrap[2]; + v_tally(ii, v); + } + } } // lmp nlist NeighList * list = pair_nnp->list; @@ -466,5 +511,28 @@ void FixDPLR::unpack_reverse_comm(int n, int *list, double *buf) } } +/* ---------------------------------------------------------------------- + return energy added by fix +------------------------------------------------------------------------- */ +double FixDPLR::compute_scalar(void) +{ + if (efield_force_flag == 0) { + 
MPI_Allreduce(&efield_fsum[0],&efield_fsum_all[0],4,MPI_DOUBLE,MPI_SUM,world); + efield_force_flag = 1; + } + return efield_fsum_all[0]; +} + +/* ---------------------------------------------------------------------- + return total extra force due to fix +------------------------------------------------------------------------- */ +double FixDPLR::compute_vector(int n) +{ + if (efield_force_flag == 0) { + MPI_Allreduce(&efield_fsum[0],&efield_fsum_all[0],4,MPI_DOUBLE,MPI_SUM,world); + efield_force_flag = 1; + } + return efield_fsum_all[n+1]; +} diff --git a/source/lmp/fix_dplr.h b/source/lmp/fix_dplr.h index edc9204874..a0f45185b4 100644 --- a/source/lmp/fix_dplr.h +++ b/source/lmp/fix_dplr.h @@ -32,6 +32,8 @@ namespace LAMMPS_NS { void post_force(int); int pack_reverse_comm(int, int, double *); void unpack_reverse_comm(int, int *, double *); + double compute_scalar(void); + double compute_vector(int); private: PairNNP * pair_nnp; DeepTensor dpt; @@ -45,6 +47,9 @@ namespace LAMMPS_NS { map bk_type_asso; vector dipole_recd; vector dfcorr_buff; + vector efield; + vector efield_fsum, efield_fsum_all; + int efield_force_flag; void get_valid_pairs(vector >& pairs); }; } diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt index 89416056e1..993a1b6fd4 100644 --- a/source/op/CMakeLists.txt +++ b/source/op/CMakeLists.txt @@ -3,8 +3,9 @@ set(OP_LIB ${PROJECT_SOURCE_DIR}/lib/src/SimulationRegion.cpp ${PROJECT_SOURCE_DIR}/lib/src/NeighborList.cpp) set (OP_CXX_FLAG -D_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI} ) -file(GLOB OP_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc) -file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_gpu.cc descrpt_se_r_gpu.cc tab_inter.cc prod_force_se_a_gpu.cc prod_virial_se_a_gpu.cc prod_force_se_r_gpu.cc prod_virial_se_r_gpu.cc soft_min.cc 
soft_min_force.cc soft_min_virial.cc ) +file(GLOB OP_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu.cc) +file(GLOB OP_PY_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a.cc descrpt_se_r.cc tab_inter.cc prod_force_se_a.cc prod_virial_se_a.cc prod_force_se_r.cc prod_virial_se_r.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu_gpu.cc) +file(GLOB OP_CUDA_SRC prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_gpu.cc descrpt_se_r_gpu.cc tab_inter.cc prod_force_se_a_gpu.cc prod_virial_se_a_gpu.cc prod_force_se_r_gpu.cc prod_virial_se_r_gpu.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_gpu.cc) file(GLOB OP_GRADS_SRC prod_force_grad.cc prod_force_se_a_grad.cc prod_force_se_r_grad.cc prod_virial_grad.cc prod_virial_se_a_grad.cc prod_virial_se_r_grad.cc soft_min_force_grad.cc soft_min_virial_grad.cc ) file(GLOB OP_PY *.py) @@ -23,8 +24,22 @@ if (BUILD_CPP_IF) endif (BUILD_CPP_IF) if (BUILD_PY_IF) - add_library(op_abi SHARED ${OP_SRC} ${OP_LIB}) - add_library(op_grads SHARED ${OP_GRADS_SRC}) + set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) + set(CMAKE_INSTALL_RPATH DESTINATION ${DP_PIP_INSTALL_PATH} ${DP_SETUP_INSTALL_PATH} ${CMAKE_BINARY_DIR}/op/cuda) + if (USE_CUDA_TOOLKIT) + add_library(op_abi SHARED ${OP_PY_CUDA_SRC} ${OP_LIB}) + add_library(op_grads SHARED ${OP_GRADS_SRC}) + add_subdirectory(cuda) + find_package(CUDA REQUIRED) + include_directories(${CUDA_INCLUDE_DIRS}) + set (EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_cuda) + target_link_libraries (op_abi ${EXTRA_LIBS}) + target_link_libraries (op_grads ${EXTRA_LIBS}) + else (USE_CUDA_TOOLKIT) + add_library(op_abi SHARED ${OP_SRC} ${OP_LIB}) + add_library(op_grads SHARED ${OP_GRADS_SRC}) + endif(USE_CUDA_TOOLKIT) + message(STATUS ${TensorFlowFramework_LIBRARY}) target_link_libraries( op_abi 
${TensorFlowFramework_LIBRARY} ) diff --git a/source/op/_gelu.py b/source/op/_gelu.py new file mode 100644 index 0000000000..ac0585da78 --- /dev/null +++ b/source/op/_gelu.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 +""" +First-order derivatives and second-order derivatives for gelu function. +""" + +from tensorflow.python.framework import ops +from deepmd.env import op_module + +@ops.RegisterGradient("Gelu") +def _gelu_cc (op, dy) : + return op_module.gelu_grad(dy, op.inputs[0]) + +@ops.RegisterGradient("GeluGrad") +def _gelu_grad_cc (op, dy) : + return [op_module.gelu_grad(dy, op.inputs[1]), op_module.gelu_grad_grad(dy, op.inputs[0], op.inputs[1])] diff --git a/source/op/cuda/CMakeLists.txt b/source/op/cuda/CMakeLists.txt index 25f796500b..89dd0b5922 100644 --- a/source/op/cuda/CMakeLists.txt +++ b/source/op/cuda/CMakeLists.txt @@ -80,9 +80,14 @@ else () endif() set (SOURCE_FILES - descrpt_se_a.cu descrpt_se_r.cu prod_force_se_a.cu prod_force_se_r.cu prod_virial_se_a.cu prod_virial_se_r.cu + descrpt_se_a.cu descrpt_se_r.cu prod_force_se_a.cu prod_force_se_r.cu prod_virial_se_a.cu prod_virial_se_r.cu gelu.cu ) cuda_add_library(deepmd_op_cuda SHARED ${SOURCE_FILES}) -install(TARGETS deepmd_op_cuda DESTINATION lib/) +if (BUILD_CPP_IF) + install(TARGETS deepmd_op_cuda DESTINATION lib/) +endif (BUILD_CPP_IF) +if (BUILD_PY_IF) + install(TARGETS deepmd_op_cuda DESTINATION deepmd/) +endif (BUILD_PY_IF) diff --git a/source/op/cuda/descrpt_se_a.cu b/source/op/cuda/descrpt_se_a.cu index 4b309522fa..1636df8ff5 100644 --- a/source/op/cuda/descrpt_se_a.cu +++ b/source/op/cuda/descrpt_se_a.cu @@ -18,8 +18,6 @@ limitations under the License. 
#include #include -#define MAGIC_NUMBER 1024 - #ifdef HIGH_PREC typedef double VALUETYPE; #else @@ -118,11 +116,12 @@ __global__ void format_nlist_fill_a_se_a(const VALUETYPE * coord, const int * jlist, const float rcut, int_64 * key, - int * i_idx) + int * i_idx, + const int MAGIC_NUMBER) { // <<>> - const unsigned int idx = blockIdx.x; - const unsigned int idy = threadIdx.x; + const unsigned int idx = blockIdx.y; + const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; const int nsize = jrange[i_idx[idx] + 1] - jrange[i_idx[idx]]; if (idy >= nsize) { @@ -154,7 +153,8 @@ __global__ void format_nlist_fill_b_se_a(int * nlist, int_64 * key, const int * sec_a, const int sec_a_size, - int * nei_iter_dev) + int * nei_iter_dev, + const int MAGIC_NUMBER) { const unsigned int idy = blockIdx.x * blockDim.x + threadIdx.x; @@ -194,7 +194,6 @@ __global__ void compute_descriptor_se_a (VALUETYPE* descript, const VALUETYPE* coord, const VALUETYPE rmin, const VALUETYPE rmax, - compute_t* sel_a_diff_dev, const int sec_a_size) { // <<>> @@ -208,16 +207,14 @@ __global__ void compute_descriptor_se_a (VALUETYPE* descript, VALUETYPE * row_descript = descript + idx * ndescrpt; VALUETYPE * row_descript_deriv = descript_deriv + idx * descript_deriv_size; VALUETYPE * row_rij = rij + idx * rij_size; - compute_t * sel_a_diff = sel_a_diff_dev + idx * nlist_size * 3; int * row_nlist = nlist + idx * nlist_size; if (row_nlist[idy] >= 0) { const int & j_idx = row_nlist[idy]; for (int kk = 0; kk < 3; kk++) { - sel_a_diff[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk]; - row_rij[idy * 3 + kk] = sel_a_diff[idy * 3 + kk]; + row_rij[idy * 3 + kk] = coord[j_idx * 3 + kk] - coord[idx * 3 + kk]; } - const compute_t * rr = &sel_a_diff[idy * 3 + 0]; + const compute_t * rr = &row_rij[idy * 3 + 0]; compute_t nr2 = dev_dot(rr, rr); compute_t inr = 1./sqrt(nr2); compute_t nr = nr2 * inr; @@ -263,6 +260,166 @@ __global__ void compute_descriptor_se_a (VALUETYPE* descript, } } +void 
format_nbor_list_256 ( + const VALUETYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 256; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nblock, nloc); + format_nlist_fill_a_se_a + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 4; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +void format_nbor_list_512 ( + const VALUETYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 512; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nblock, nloc); + format_nlist_fill_a_se_a + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 4; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +void format_nbor_list_1024 ( + const VALUETYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 1024; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nblock, nloc); + format_nlist_fill_a_se_a + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 8; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +void format_nbor_list_2048 ( + const VALUETYPE* coord, + const int* type, + const int* jrange, 
+ const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 2048; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nblock, nloc); + format_nlist_fill_a_se_a + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 8; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + +void format_nbor_list_4096 ( + const VALUETYPE* coord, + const int* type, + const int* jrange, + const int* jlist, + const int& nloc, + const float& rcut_r, + int * i_idx, + int_64 * key +) +{ + const int LEN = 256; + const int MAGIC_NUMBER = 4096; + const int nblock = (MAGIC_NUMBER + LEN - 1) / LEN; + dim3 block_grid(nblock, nloc); + format_nlist_fill_a_se_a + <<>> ( + coord, + type, + jrange, + jlist, + rcut_r, + key, + i_idx, + MAGIC_NUMBER + ); + const int ITEMS_PER_THREAD = 16; + const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; + // BlockSortKernel<<>> ( + BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); +} + void DescrptSeALauncher(const VALUETYPE* coord, const int* type, const int* ilist, @@ -270,22 +427,20 @@ void DescrptSeALauncher(const VALUETYPE* coord, const int* jlist, int* array_int, unsigned long long* array_longlong, - compute_t* array_double, const VALUETYPE* avg, const VALUETYPE* std, VALUETYPE* descript, VALUETYPE* descript_deriv, VALUETYPE* rij, int* nlist, - const int& ntypes, const int& nloc, - const int& nall, const int& nnei, const float& rcut_r, const float& rcut_r_smth, const int& ndescrpt, const std::vector& sec_a, - const bool& fill_nei_a + const bool& fill_nei_a, + const int MAGIC_NUMBER ) { const int LEN = 256; @@ -294,42 +449,76 @@ void DescrptSeALauncher(const VALUETYPE* coord, int * nei_iter = array_int + sec_a.size(); // = new int[sec_a_size]; int * i_idx = array_int + sec_a.size() + 
nloc * sec_a.size(); int_64 * key = array_longlong; - compute_t * sel_a_diff = array_double; // = new VALUETYPE *[nlist_size]; nnei - // int_64 * key = NULL; - // VALUETYPE * sel_a_diff = NULL; // = new VALUETYPE *[nlist_size]; nnei - + cudaError_t res = cudaSuccess; - // res = cudaMalloc((void**)&sec_a_dev, sizeof(int) * sec_a.size()); cudaErrcheck(res); - // res = cudaMalloc((void**)&nei_iter, sizeof(int) * nloc * sec_a.size()); cudaErrcheck(res); - // res = cudaMalloc((void**)&i_idx, sizeof(int) * nloc); cudaErrcheck(res); - // res = cudaMalloc((void**)&key, sizeof(unsigned long long) * nloc * MAGIC_NUMBER * 2); cudaErrcheck(res); - // res = cudaMalloc((void**)&sel_a_diff, sizeof(VALUETYPE) * nnei * 3 * nloc); cudaErrcheck(res); res = cudaMemcpy(sec_a_dev, &sec_a[0], sizeof(int) * sec_a.size(), cudaMemcpyHostToDevice); cudaErrcheck(res); res = cudaMemset(key, 0xffffffff, sizeof(int_64) * nloc * MAGIC_NUMBER); cudaErrcheck(res); res = cudaMemset(nlist, -1, sizeof(int) * nloc * nnei); cudaErrcheck(res); res = cudaMemset(descript, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt); cudaErrcheck(res); res = cudaMemset(descript_deriv, 0.0, sizeof(VALUETYPE) * nloc * ndescrpt * 3); cudaErrcheck(res); - // res = cudaMemset(rij, 0.0, sizeof(VALUETYPE) * nloc * nnei * 3); cudaErrcheck(res); if (fill_nei_a) { // ~~~ // cudaProfilerStart(); get_i_idx_se_a<<>> (nloc, ilist, i_idx); - format_nlist_fill_a_se_a<<>> ( - coord, - type, - jrange, - jlist, - rcut_r, - key, - i_idx - ); - const int ITEMS_PER_THREAD = 4; - const int BLOCK_THREADS = MAGIC_NUMBER / ITEMS_PER_THREAD; - // BlockSortKernel<<>> ( - BlockSortKernel <<>> (key, key + nloc * MAGIC_NUMBER); - + if (nnei <= 256) { + format_nbor_list_256 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 512) { + format_nbor_list_512 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 1024) { + format_nbor_list_1024 ( + coord, + type, + jrange, + 
jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 2048) { + format_nbor_list_2048 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } else if (nnei <= 4096) { + format_nbor_list_4096 ( + coord, + type, + jrange, + jlist, + nloc, + rcut_r, + i_idx, + key + ); + } + format_nlist_fill_b_se_a<<>> ( nlist, nnei, @@ -339,7 +528,8 @@ void DescrptSeALauncher(const VALUETYPE* coord, key, sec_a_dev, sec_a.size(), - nei_iter + nei_iter, + MAGIC_NUMBER ); } @@ -360,14 +550,6 @@ void DescrptSeALauncher(const VALUETYPE* coord, coord, rcut_r_smth, rcut_r, - sel_a_diff, sec_a.back() ); -//// - // res = cudaFree(sec_a_dev); cudaErrcheck(res); - // res = cudaFree(key); cudaErrcheck(res); - // res = cudaFree(i_idx); cudaErrcheck(res); - // res = cudaFree(nei_iter); cudaErrcheck(res); - // res = cudaFree(sel_a_diff); cudaErrcheck(res); - //output some interesting things... } \ No newline at end of file diff --git a/source/op/cuda/gelu.cu b/source/op/cuda/gelu.cu new file mode 100644 index 0000000000..99b7b1aed4 --- /dev/null +++ b/source/op/cuda/gelu.cu @@ -0,0 +1,77 @@ +#include +#include + +#define SQRT_2_PI 0.7978845608028654 + +template +__global__ void gelu(const T * in, T * out, int const size) { + int const idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size) {return;} + + out[idx] = in[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx]))); +} + +template +__global__ void gelu_grad(const T * dy, const T * in, T * out, int const size) { + int const idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size) {return;} + + // out[idx] = in[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx]))); + T const var1 = tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx])); + out[idx] = dy[idx] * (0.5 * SQRT_2_PI * in[idx] * (1 - var1 * var1) * (0.134145 * in[idx] * in[idx] + 1) + 0.5 * var1 + 0.5); +} + +template +__global__ void 
gelu_grad_grad(const T * dy, const T * dy_, const T * in, T * out, int const size) { + int const idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size) {return;} + + // out[idx] = in[idx] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx]))); + T const var1 = tanh(SQRT_2_PI * (in[idx] + 0.044715 * in[idx] * in[idx] *in[idx])); + T const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[idx] * in[idx] + 1); + + out[idx] = dy[idx] * dy_[idx] * (0.134145 * SQRT_2_PI * in[idx] * in[idx] * (1 - var1 * var1) - SQRT_2_PI * in[idx] * var2 * (0.134145 * in[idx] * in[idx] + 1) * var1 + var2); +} + + +void GeluLauncher(const float * in, float * out, int const size) { + int const THREAD_ITEMS = 1024; + int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; + + gelu<<>>(in, out, size); +} + +void GeluLauncher(const double * in, double * out, int const size) { + int const THREAD_ITEMS = 1024; + int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; + + gelu<<>>(in, out, size); +} + +void GeluGradLauncher(const float * dy, const float * in, float * out, int const size) { + int const THREAD_ITEMS = 1024; + int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; + + gelu_grad<<>>(dy, in, out, size); +} + +void GeluGradLauncher(const double * dy, const double * in, double * out, int const size) { + int const THREAD_ITEMS = 1024; + int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; + + gelu_grad<<>>(dy, in, out, size); +} + +void GeluGradGradLauncher(const float * dy, const float * dy_, const float * in, float * out, int const size) { + int const THREAD_ITEMS = 1024; + int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; + + gelu_grad_grad<<>>(dy, dy_, in, out, size); +} + +void GeluGradGradLauncher(const double * dy, const double * dy_, const double * in, double * out, int const size) { + int const THREAD_ITEMS = 1024; + int const BLOCK_NUMS = (size + THREAD_ITEMS - 1) / THREAD_ITEMS; + 
+ gelu_grad_grad<<>>(dy, dy_, in, out, size); +} diff --git a/source/op/descrpt_se_a_gpu.cc b/source/op/descrpt_se_a_gpu.cc index 70dd9c7751..fd5ae632cb 100644 --- a/source/op/descrpt_se_a_gpu.cc +++ b/source/op/descrpt_se_a_gpu.cc @@ -67,6 +67,24 @@ REGISTER_OP("DescrptSeA") .Output("nlist: int32"); #endif +int get_magic_number(int const nnei) { + if (nnei <= 256) { + return 256; + } + else if (nnei <= 512) { + return 512; + } + else if (nnei <= 1024) { + return 1024; + } + else if (nnei <= 2048) { + return 2048; + } + else if (nnei <= 4096) { + return 4096; + } +} + void DescrptSeALauncher(const VALUETYPE* coord, const int* type, const int* ilist, @@ -74,22 +92,20 @@ void DescrptSeALauncher(const VALUETYPE* coord, const int* jlist, int* array_int, unsigned long long* array_longlong, - compute_t* array_double, const VALUETYPE* avg, const VALUETYPE* std, VALUETYPE* descript, VALUETYPE* descript_deriv, VALUETYPE* rij, int* nlist, - const int& ntypes, const int& nloc, - const int& nall, const int& nnei, const float& rcut_r, const float& rcut_r_smth, const int& ndescrpt, const std::vector& sec_a, - const bool& fill_nei_a + const bool& fill_nei_a, + const int MAGIC_NUMBER ); class DescrptSeAOp : public OpKernel { @@ -112,6 +128,7 @@ class DescrptSeAOp : public OpKernel { nnei_r = sec_r.back(); nnei = nnei_a + nnei_r; fill_nei_a = (rcut_a < 0); + magic_number = get_magic_number(nnei); } void Compute(OpKernelContext* context) override { @@ -158,7 +175,7 @@ class DescrptSeAOp : public OpKernel { OP_REQUIRES (context, (ntypes == int(sel_a.size())), errors::InvalidArgument ("number of types should match the length of sel array")); OP_REQUIRES (context, (ntypes == int(sel_r.size())), errors::InvalidArgument ("number of types should match the length of sel array")); - OP_REQUIRES (context, (nnei <= 1024), errors::InvalidArgument ("Assert failed, max neighbor size of atom(nnei) " + std::to_string(nnei) + " is larger than 1024!, which currently is not supported by 
deepmd-kit.")); + OP_REQUIRES (context, (nnei <= 4096), errors::InvalidArgument ("Assert failed, max neighbor size of atom(nnei) " + std::to_string(nnei) + " is larger than 4096, which currently is not supported by deepmd-kit.")); // Create output tensors TensorShape descrpt_shape ; @@ -192,14 +209,23 @@ class DescrptSeAOp : public OpKernel { nlist_shape, &nlist_tensor)); - int * ilist = NULL, *jrange = NULL, *jlist = NULL; - int * array_int = NULL; unsigned long long *array_longlong = NULL; compute_t *array_double = NULL; + // allocate temp memory, temp memory must not be used after this operation! + Tensor int_temp; + TensorShape int_shape; + int_shape.AddDim(sec_a.size() + nloc * sec_a.size() + nloc); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, int_shape, &int_temp)); + + Tensor uint64_temp; + TensorShape uint64_shape; + uint64_shape.AddDim(nloc * magic_number * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, &uint64_temp)); + + int * ilist = NULL, * jrange = NULL, * jlist = NULL; + int * array_int = int_temp.flat().data(); + unsigned long long * array_longlong = uint64_temp.flat().data(); cudaErrcheck(cudaMemcpy(&(ilist), 4 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); cudaErrcheck(cudaMemcpy(&(jrange), 8 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); cudaErrcheck(cudaMemcpy(&(jlist), 12 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(array_int), 16 + mesh_tensor.flat().data(), sizeof(int *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(array_longlong), 20 + mesh_tensor.flat().data(), sizeof(unsigned long long *), cudaMemcpyDeviceToHost)); - cudaErrcheck(cudaMemcpy(&(array_double), 24 + mesh_tensor.flat().data(), sizeof(compute_t *), cudaMemcpyDeviceToHost)); // Launch computation for (int II = 0; II < nsamples; II++) { @@ -210,22 +236,20 @@ class DescrptSeAOp : public OpKernel { jlist, array_int, array_longlong, 
- array_double, avg_tensor.matrix().data(), std_tensor.matrix().data(), descrpt_tensor->matrix().data() + II * (nloc * ndescrpt), descrpt_deriv_tensor->matrix().data() + II * (nloc * ndescrpt * 3), rij_tensor->matrix().data() + II * (nloc * nnei * 3), nlist_tensor->matrix().data() + II * (nloc * nnei), - ntypes, nloc, - nall, nnei, rcut_r, rcut_r_smth, ndescrpt, sec_a, - fill_nei_a + fill_nei_a, + magic_number ); } // std::cout << "done" << std::endl; @@ -242,7 +266,7 @@ class DescrptSeAOp : public OpKernel { std::vector sec_a; std::vector sec_r; int ndescrpt, ndescrpt_a, ndescrpt_r; - int nnei, nnei_a, nnei_r, nloc, nall; + int nnei, nnei_a, nnei_r, nloc, nall, magic_number; bool fill_nei_a; //private func diff --git a/source/op/gelu.cc b/source/op/gelu.cc new file mode 100644 index 0000000000..26c53c8511 --- /dev/null +++ b/source/op/gelu.cc @@ -0,0 +1,164 @@ +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/shape_inference.h" +#define SQRT_2_PI 0.7978845608028654 + +using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +REGISTER_OP("Gelu") + .Attr("T: {float, double}") + .Input("x: T") + .Output("output: T"); + +REGISTER_OP("GeluGrad") + .Attr("T: {float, double}") + .Input("dy: T") + .Input("x: T") + .Output("output: T"); + +REGISTER_OP("GeluGradGrad") + .Attr("T: {float, double}") + .Input("dy: T") + .Input("dy_: T") + .Input("x: T") + .Output("output: T"); + +template +struct GeluFunctor { + void operator()(const Device& d, const T * in, T * out, int const size) { + #pragma omp parallel for + for (int ii = 0; ii < size; ii++) { + out[ii] = in[ii] * 0.5 * (1.0 + tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] * in[ii]))); + } + } +}; + +template +struct GeluGradFunctor { + void operator()(const Device& d, const T * dy, const T * in, T * out, int const size) { + 
#pragma omp parallel for + for (int ii = 0; ii < size; ii++) { + T const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); + out[ii] = dy[ii] * (0.5 * SQRT_2_PI * in[ii] * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1) + 0.5 * var1 + 0.5); + } + } +}; + +template +struct GeluGradGradFunctor { + void operator()(const Device& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { + #pragma omp parallel for + for (int ii = 0; ii < size; ii++) { + T const var1 = tanh(SQRT_2_PI * (in[ii] + 0.044715 * in[ii] * in[ii] *in[ii])); + T const var2 = SQRT_2_PI * (1 - var1 * var1) * (0.134145 * in[ii] * in[ii] + 1); + + out[ii] = dy[ii] * dy_[ii] * (0.134145 * SQRT_2_PI * in[ii] * in[ii] * (1 - var1 * var1) - SQRT_2_PI * in[ii] * var2 * (0.134145 * in[ii] * in[ii] + 1) * var1 + var2); + } + } +}; + +// OpKernel definition. +// template parameter is the datatype of the tensors. +template +class GeluOp : public OpKernel { + public : + explicit GeluOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + const Tensor& x = context->input(0); + + Tensor * output = NULL; + int context_output_index = 0; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + x.shape(), + &output)); + + GeluFunctor()( + context->eigen_device(), + x.flat().data(), + output->flat().data(), + static_cast(output->NumElements()) + ); + // GeluLauncher(x.flat().data(), output->flat().data(), static_cast(output->NumElements())); + } +}; + +// OpKernel definition. +// template parameter is the datatype of the tensors. 
+template +class GeluGradOp : public OpKernel { + public : + explicit GeluGradOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + const Tensor& dy = context->input(0); + const Tensor& x = context->input(1); + + Tensor * output = NULL; + int context_output_index = 0; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + x.shape(), + &output)); + + GeluGradFunctor()( + context->eigen_device(), + dy.flat().data(), + x.flat().data(), + output->flat().data(), + static_cast(output->NumElements()) + ); + // GeluGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); + } +}; + +// OpKernel definition. +// template parameter is the datatype of the tensors. +template +class GeluGradGradOp : public OpKernel { + public : + explicit GeluGradGradOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + const Tensor& dy = context->input(0); + const Tensor& dy_ = context->input(1); + const Tensor& x = context->input(2); + + Tensor * output = NULL; + int context_output_index = 0; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + x.shape(), + &output)); + + GeluGradGradFunctor()( + context->eigen_device(), + dy.flat().data(), + dy_.flat().data(), + x.flat().data(), + output->flat().data(), + static_cast(output->NumElements()) + ); + // GeluGradGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); + } +}; + +#define REGISTER_CPU(T) \ + /* Declare explicit instantiations in kernel_example.cu.cc. */ \ + REGISTER_KERNEL_BUILDER( \ + Name("Gelu").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluOp); \ + /* Declare explicit instantiations in kernel_example.cu.cc. 
*/ \ + REGISTER_KERNEL_BUILDER( \ + Name("GeluGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluGradOp); \ + /* Declare explicit instantiations in kernel_example.cu.cc. */ \ + REGISTER_KERNEL_BUILDER( \ + Name("GeluGradGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + GeluGradGradOp); + REGISTER_CPU(float); + REGISTER_CPU(double); \ No newline at end of file diff --git a/source/op/gelu_gpu.cc b/source/op/gelu_gpu.cc new file mode 100644 index 0000000000..34d4183f98 --- /dev/null +++ b/source/op/gelu_gpu.cc @@ -0,0 +1,159 @@ +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/shape_inference.h" + +using namespace tensorflow; +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +REGISTER_OP("Gelu") + .Attr("T: {float, double}") + .Input("x: T") + .Output("output: T"); + +REGISTER_OP("GeluGrad") + .Attr("T: {float, double}") + .Input("dy: T") + .Input("x: T") + .Output("output: T"); + +REGISTER_OP("GeluGradGrad") + .Attr("T: {float, double}") + .Input("dy: T") + .Input("dy_: T") + .Input("x: T") + .Output("output: T"); + +// maybe instead use cudnn activation forward +void GeluLauncher(const float * in, float * out, int const size); +void GeluLauncher(const double * in, double * out, int const size); + +void GeluGradLauncher(const float * dy, const float * in, float * out, int const size); +void GeluGradLauncher(const double * dy, const double * in, double * out, int const size); + +void GeluGradGradLauncher(const float * dy, const float * dy_, const float * in, float * out, int const size); +void GeluGradGradLauncher(const double * dy, const double * dy_, const double * in, double * out, int const size); + +template +struct GeluFunctor { + void operator()(const Device& d, const T * in, T * out, int const size) { + GeluLauncher(in, out, size); + } +}; + +template +struct GeluGradFunctor { + void 
operator()(const Device& d, const T * dy, const T * in, T * out, int const size) { + GeluGradLauncher(dy, in, out, size); + } +}; + +template +struct GeluGradGradFunctor { + void operator()(const Device& d, const T * dy, const T * dy_, const T * in, T * out, int const size) { + GeluGradGradLauncher(dy, dy_, in, out, size); + } +}; + +// OpKernel definition. +// template parameter is the datatype of the tensors. +template +class GeluOp : public OpKernel { + public : + explicit GeluOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + const Tensor& x = context->input(0); + Tensor * output = NULL; + int context_output_index = 0; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + x.shape(), + &output)); + + GeluFunctor()( + context->eigen_device(), + x.flat().data(), + output->flat().data(), + static_cast(output->NumElements()) + ); + // GeluLauncher(x.flat().data(), output->flat().data(), static_cast(output->NumElements())); + } +}; + +// OpKernel definition. +// template parameter is the datatype of the tensors. +template +class GeluGradOp : public OpKernel { + public : + explicit GeluGradOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + const Tensor& dy = context->input(0); + const Tensor& x = context->input(1); + + Tensor * output = NULL; + int context_output_index = 0; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + x.shape(), + &output)); + + GeluGradFunctor()( + context->eigen_device(), + dy.flat().data(), + x.flat().data(), + output->flat().data(), + static_cast(output->NumElements()) + ); + // GeluGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); + } +}; + +// OpKernel definition. +// template parameter is the datatype of the tensors. 
+template +class GeluGradGradOp : public OpKernel { + public : + explicit GeluGradGradOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab the input tensor + const Tensor& dy = context->input(0); + const Tensor& dy_ = context->input(1); + const Tensor& x = context->input(2); + + Tensor * output = NULL; + int context_output_index = 0; + OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, + x.shape(), + &output)); + + GeluGradGradFunctor()( + context->eigen_device(), + dy.flat().data(), + dy_.flat().data(), + x.flat().data(), + output->flat().data(), + static_cast(output->NumElements()) + ); + // GeluGradGradLauncher(dy.flat().data(), x.flat().data(), output->flat().data(), static_cast(output->NumElements())); + } +}; + +#define REGISTER_GPU(T) \ + /* Declare explicit instantiations in kernel_example.cu.cc. */ \ + REGISTER_KERNEL_BUILDER( \ + Name("Gelu").Device(DEVICE_GPU).TypeConstraint("T"), \ + GeluOp); \ + /* Declare explicit instantiations in kernel_example.cu.cc. */ \ + REGISTER_KERNEL_BUILDER( \ + Name("GeluGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ + GeluGradOp); \ + /* Declare explicit instantiations in kernel_example.cu.cc. 
*/ \ + REGISTER_KERNEL_BUILDER( \ + Name("GeluGradGrad").Device(DEVICE_GPU).TypeConstraint("T"), \ + GeluGradGradOp); + REGISTER_GPU(float); + REGISTER_GPU(double); diff --git a/source/train/Data.py b/source/train/Data.py index f3ecaeeb62..149c76bbe8 100644 --- a/source/train/Data.py +++ b/source/train/Data.py @@ -97,6 +97,8 @@ def check_batch_size (self, batch_size) : tmpe = np.load(os.path.join(ii, "coord.npy")).astype(global_ener_float_precision) else: tmpe = np.load(os.path.join(ii, "coord.npy")).astype(global_np_float_precision) + if tmpe.ndim == 1: + tmpe = tmpe.reshape([1,-1]) if tmpe.shape[0] < batch_size : return ii, tmpe.shape[0] return None @@ -106,6 +108,8 @@ def check_test_size (self, test_size) : tmpe = np.load(os.path.join(self.test_dir, "coord.npy")).astype(global_ener_float_precision) else: tmpe = np.load(os.path.join(self.test_dir, "coord.npy")).astype(global_np_float_precision) + if tmpe.ndim == 1: + tmpe = tmpe.reshape([1,-1]) if tmpe.shape[0] < test_size : return self.test_dir, tmpe.shape[0] else : @@ -271,6 +275,8 @@ def _load_set(self, set_name) : coord = np.load(path).astype(global_ener_float_precision) else: coord = np.load(path).astype(global_np_float_precision) + if coord.ndim == 1: + coord = coord.reshape([1,-1]) nframes = coord.shape[0] assert(coord.shape[1] == self.data_dict['coord']['ndof'] * self.natoms) # load keys diff --git a/source/train/DescrptSeA.py b/source/train/DescrptSeA.py index d409f7134f..e46265231f 100644 --- a/source/train/DescrptSeA.py +++ b/source/train/DescrptSeA.py @@ -17,6 +17,7 @@ def __init__ (self, jdata): .add('resnet_dt',bool, default = False) \ .add('trainable',bool, default = True) \ .add('seed', int) \ + .add('type_one_side', bool, default = False) \ .add('exclude_types', list, default = []) \ .add('set_davg_zero', bool, default = False) \ .add('activation_function', str, default = 'tanh') \ @@ -39,6 +40,9 @@ def __init__ (self, jdata): self.exclude_types.add((tt[0], tt[1])) self.exclude_types.add((tt[1], 
tt[0])) self.set_davg_zero = class_data['set_davg_zero'] + self.type_one_side = class_data['type_one_side'] + if self.type_one_side and len(exclude_types) != 0: + raise RuntimeError('"type_one_side" is not compatible with "exclude_types"') # descrpt config self.sel_r = [ 0 for ii in range(len(self.sel_a)) ] @@ -244,17 +248,27 @@ def _pass_filter(self, inputs = tf.reshape(inputs, [-1, self.ndescrpt * natoms[0]]) output = [] output_qmat = [] - for type_i in range(self.ntypes): - inputs_i = tf.slice (inputs, - [ 0, start_index* self.ndescrpt], - [-1, natoms[2+type_i]* self.ndescrpt] ) + if not self.type_one_side: + for type_i in range(self.ntypes): + inputs_i = tf.slice (inputs, + [ 0, start_index* self.ndescrpt], + [-1, natoms[2+type_i]* self.ndescrpt] ) + inputs_i = tf.reshape(inputs_i, [-1, self.ndescrpt]) + layer, qmat = self._filter(tf.cast(inputs_i, self.filter_precision), type_i, name='filter_type_'+str(type_i)+suffix, natoms=natoms, reuse=reuse, seed = self.seed, trainable = trainable, activation_fn = self.filter_activation_fn) + layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[2+type_i] * self.get_dim_out()]) + qmat = tf.reshape(qmat, [tf.shape(inputs)[0], natoms[2+type_i] * self.get_dim_rot_mat_1() * 3]) + output.append(layer) + output_qmat.append(qmat) + start_index += natoms[2+type_i] + else : + inputs_i = inputs inputs_i = tf.reshape(inputs_i, [-1, self.ndescrpt]) - layer, qmat = self._filter(tf.cast(inputs_i, self.filter_precision), type_i, name='filter_type_'+str(type_i)+suffix, natoms=natoms, reuse=reuse, seed = self.seed, trainable = trainable, activation_fn = self.filter_activation_fn) - layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[2+type_i] * self.get_dim_out()]) - qmat = tf.reshape(qmat, [tf.shape(inputs)[0], natoms[2+type_i] * self.get_dim_rot_mat_1() * 3]) + type_i = -1 + layer, qmat = self._filter(tf.cast(inputs_i, self.filter_precision), type_i, name='filter_type_all'+suffix, natoms=natoms, reuse=reuse, seed = self.seed, 
trainable = trainable, activation_fn = self.filter_activation_fn) + layer = tf.reshape(layer, [tf.shape(inputs)[0], natoms[0] * self.get_dim_out()]) + qmat = tf.reshape(qmat, [tf.shape(inputs)[0], natoms[0] * self.get_dim_rot_mat_1() * 3]) output.append(layer) output_qmat.append(qmat) - start_index += natoms[2+type_i] output = tf.concat(output, axis = 1) output_qmat = tf.concat(output_qmat, axis = 1) return output, output_qmat @@ -353,6 +367,7 @@ def _filter(self, self.filter_precision, tf.random_normal_initializer(stddev=stddev, mean = bavg, seed = seed), trainable = trainable) + hidden = tf.reshape(activation_fn(tf.matmul(xyz_scatter, w) + b), [-1, outputs_size[ii]]) if self.filter_resnet_dt : idt = tf.get_variable('idt_'+str(ii)+'_'+str(type_i), [1, outputs_size[ii]], @@ -361,16 +376,16 @@ def _filter(self, trainable = trainable) if outputs_size[ii] == outputs_size[ii-1]: if self.filter_resnet_dt : - xyz_scatter += activation_fn(tf.matmul(xyz_scatter, w) + b) * idt + xyz_scatter += hidden * idt else : - xyz_scatter += activation_fn(tf.matmul(xyz_scatter, w) + b) + xyz_scatter += hidden elif outputs_size[ii] == outputs_size[ii-1] * 2: if self.filter_resnet_dt : - xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + activation_fn(tf.matmul(xyz_scatter, w) + b) * idt + xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + hidden * idt else : - xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + activation_fn(tf.matmul(xyz_scatter, w) + b) + xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + hidden else: - xyz_scatter = activation_fn(tf.matmul(xyz_scatter, w) + b) + xyz_scatter = hidden else: w = tf.zeros((outputs_size[0], outputs_size[-1]), dtype=global_tf_float_precision) xyz_scatter = tf.matmul(xyz_scatter, w) @@ -440,6 +455,7 @@ def _filter_type_ext(self, self.filter_precision, tf.random_normal_initializer(stddev=stddev, mean = bavg, seed = seed), trainable = trainable) + hidden = tf.reshape(activation_fn(tf.matmul(xyz_scatter, w) + b), [-1, 
outputs_size[ii]]) if self.filter_resnet_dt : idt = tf.get_variable('idt_'+str(ii)+'_'+str(type_i), [1, outputs_size[ii]], @@ -448,16 +464,16 @@ def _filter_type_ext(self, trainable = trainable) if outputs_size[ii] == outputs_size[ii-1]: if self.filter_resnet_dt : - xyz_scatter += activation_fn(tf.matmul(xyz_scatter, w) + b) * idt + xyz_scatter += hidden * idt else : - xyz_scatter += activation_fn(tf.matmul(xyz_scatter, w) + b) + xyz_scatter += hidden elif outputs_size[ii] == outputs_size[ii-1] * 2: if self.filter_resnet_dt : - xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + activation_fn(tf.matmul(xyz_scatter, w) + b) * idt + xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + hidden * idt else : - xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + activation_fn(tf.matmul(xyz_scatter, w) + b) + xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + hidden else: - xyz_scatter = activation_fn(tf.matmul(xyz_scatter, w) + b) + xyz_scatter = hidden # natom x nei_type_i x out_size xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1]//4, outputs_size[-1])) # natom x nei_type_i x 4 diff --git a/source/train/DescrptSeR.py b/source/train/DescrptSeR.py index ed53a3bcd4..5c4af52d4d 100644 --- a/source/train/DescrptSeR.py +++ b/source/train/DescrptSeR.py @@ -298,6 +298,7 @@ def _filter_r(self, self.filter_precision, tf.random_normal_initializer(stddev=stddev, mean = bavg, seed = seed), trainable = trainable) + hidden = tf.reshape(activation_fn(tf.matmul(xyz_scatter, w) + b), [-1, outputs_size[ii]]) if self.filter_resnet_dt : idt = tf.get_variable('idt_'+str(ii)+'_'+str(type_i), [1, outputs_size[ii]], @@ -306,16 +307,16 @@ def _filter_r(self, trainable = trainable) if outputs_size[ii] == outputs_size[ii-1]: if self.filter_resnet_dt : - xyz_scatter += activation_fn(tf.matmul(xyz_scatter, w) + b) * idt + xyz_scatter += hidden * idt else : - xyz_scatter += activation_fn(tf.matmul(xyz_scatter, w) + b) + xyz_scatter += hidden elif outputs_size[ii] == 
outputs_size[ii-1] * 2: if self.filter_resnet_dt : - xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + activation_fn(tf.matmul(xyz_scatter, w) + b) * idt + xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + hidden * idt else : - xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + activation_fn(tf.matmul(xyz_scatter, w) + b) + xyz_scatter = tf.concat([xyz_scatter,xyz_scatter], 1) + hidden else: - xyz_scatter = activation_fn(tf.matmul(xyz_scatter, w) + b) + xyz_scatter = hidden else: w = tf.zeros((outputs_size[0], outputs_size[-1]), dtype=global_tf_float_precision) xyz_scatter = tf.matmul(xyz_scatter, w) diff --git a/source/train/Fitting.py b/source/train/Fitting.py index 49ca40b2c7..1effa8d7c2 100644 --- a/source/train/Fitting.py +++ b/source/train/Fitting.py @@ -194,12 +194,14 @@ def build (self, if self.numb_fparam > 0 : ext_fparam = tf.tile(fparam, [1, natoms[2+type_i]]) ext_fparam = tf.reshape(ext_fparam, [-1, self.numb_fparam]) + ext_fparam = tf.cast(ext_fparam,self.fitting_precision) layer = tf.concat([layer, ext_fparam], axis = 1) if self.numb_aparam > 0 : ext_aparam = tf.slice(aparam, [ 0, start_index * self.numb_aparam], [-1, natoms[2+type_i] * self.numb_aparam]) ext_aparam = tf.reshape(ext_aparam, [-1, self.numb_aparam]) + ext_aparam = tf.cast(ext_aparam,self.fitting_precision) layer = tf.concat([layer, ext_aparam], axis = 1) start_index += natoms[2+type_i] diff --git a/source/train/Network.py b/source/train/Network.py index ed188c085d..1afa83bd1c 100644 --- a/source/train/Network.py +++ b/source/train/Network.py @@ -41,9 +41,9 @@ def one_layer(inputs, # return activation_fn(hidden_bn) else: if use_timestep : - return activation_fn(hidden) * idt + return tf.reshape(activation_fn(hidden), [-1, outputs_size]) * idt else : - return activation_fn(hidden) + return tf.reshape(activation_fn(hidden), [-1, outputs_size]) else: if useBN: None diff --git a/source/train/Trainer.py b/source/train/Trainer.py index 50db1adfbc..b6428c987e 100644 --- 
a/source/train/Trainer.py +++ b/source/train/Trainer.py @@ -28,6 +28,7 @@ import deepmd._prod_virial_se_r_grad import deepmd._soft_min_force_grad import deepmd._soft_min_virial_grad +import deepmd._gelu from deepmd.common import j_must_have, ClassArg diff --git a/source/train/calculator.py b/source/train/calculator.py index 37fb7ad412..de439c3f78 100644 --- a/source/train/calculator.py +++ b/source/train/calculator.py @@ -42,10 +42,13 @@ def __init__(self, model, label="DP", type_dict=None, **kwargs): def calculate(self, atoms=None, properties=["energy", "forces", "stress"], system_changes=all_changes): coord = atoms.get_positions().reshape([1, -1]) - cell = atoms.get_cell().reshape([1, -1]) + if sum(atoms.get_pbc())>0: + cell = atoms.get_cell().reshape([1, -1]) + else: + cell = None symbols = atoms.get_chemical_symbols() atype = [self.type_dict[k] for k in symbols] - e, f, v = self.dp.eval(coord, cell, atype) + e, f, v = self.dp.eval(coords=coord, cells=cell, atom_types=atype) self.results['energy'] = e[0] self.results['forces'] = f[0] self.results['stress'] = v[0] diff --git a/source/train/common.py b/source/train/common.py index c250e4cdfc..887669a278 100644 --- a/source/train/common.py +++ b/source/train/common.py @@ -2,19 +2,22 @@ import numpy as np import math from deepmd.env import tf +from deepmd.env import op_module from deepmd.RunOptions import global_tf_float_precision -def gelu(x): - """Gaussian Error Linear Unit. - This is a smoother version of the RELU. - Original paper: https://arxiv.org/abs/1606.08415 - Args: - x: float Tensor to perform activation. - Returns: - `x` with the GELU activation applied. - """ - cdf = 0.5 * (1.0 + tf.tanh((math.sqrt(2 / math.pi) * (x + 0.044715 * tf.pow(x, 3))))) - return x * cdf +# def gelu(x): +# """Gaussian Error Linear Unit. +# This is a smoother version of the RELU. +# Original paper: https://arxiv.org/abs/1606.08415 +# Args: +# x: float Tensor to perform activation. 
+# Returns: +# `x` with the GELU activation applied. +# """ +# cdf = 0.5 * (1.0 + tf.tanh((math.sqrt(2 / math.pi) * (x + 0.044715 * tf.pow(x, 3))))) +# return x * cdf +def gelu(x) : + return op_module.gelu(x) data_requirement = {} activation_fn_dict = { @@ -40,7 +43,6 @@ def add_data_requirement(key, 'repeat': repeat, } - def select_idx_map(atom_type, type_sel): sort_type_sel = np.sort(type_sel) diff --git a/source/train/test.py b/source/train/test.py index 2674e9c01e..d8639020ce 100644 --- a/source/train/test.py +++ b/source/train/test.py @@ -32,12 +32,12 @@ def test (args): dp = DeepWFC(args.model) else : raise RuntimeError('unknow model type '+de.model_type) - for ii in all_sys: + for cc,ii in enumerate(all_sys): args.system = ii print ("# ---------------output of dp test--------------- ") print ("# testing system : " + ii) if de.model_type == 'ener': - err, siz = test_ener(dp, args) + err, siz = test_ener(dp, args, append_detail = (cc!=0)) elif de.model_type == 'dipole': err, siz = test_dipole(dp, args) elif de.model_type == 'polar': @@ -89,7 +89,14 @@ def weighted_average(err_coll, siz_coll): return sum_err -def test_ener (dp, args) : +def save_txt_file(fname, data, header = "", append = False): + fp = fname + if append : fp = open(fp, 'ab') + np.savetxt(fp, data, header = header) + if append : fp.close() + + +def test_ener (dp, args, append_detail = False) : if args.rand_seed is not None : np.random.seed(args.rand_seed % (2**32)) @@ -122,10 +129,7 @@ def test_ener (dp, args) : else : aparam = None detail_file = args.detail_file - if detail_file is not None: - atomic = True - else: - atomic = False + atomic = False ret = dp.eval(coord, box, atype, fparam = fparam, aparam = aparam, atomic = atomic) energy = ret[0] @@ -158,18 +162,21 @@ def test_ener (dp, args) : pe = np.concatenate((np.reshape(test_data["energy"][:numb_test], [-1,1]), np.reshape(energy, [-1,1])), axis = 1) - np.savetxt(detail_file+".e.out", pe, - header = 'data_e pred_e') + 
save_txt_file(detail_file+".e.out", pe, + header = '%s: data_e pred_e' % args.system, + append = append_detail) pf = np.concatenate((np.reshape(test_data["force"] [:numb_test], [-1,3]), np.reshape(force, [-1,3])), axis = 1) - np.savetxt(detail_file+".f.out", pf, - header = 'data_fx data_fy data_fz pred_fx pred_fy pred_fz') + save_txt_file(detail_file+".f.out", pf, + header = '%s: data_fx data_fy data_fz pred_fx pred_fy pred_fz' % args.system, + append = append_detail) pv = np.concatenate((np.reshape(test_data["virial"][:numb_test], [-1,9]), np.reshape(virial, [-1,9])), axis = 1) - np.savetxt(detail_file+".v.out", pv, - header = 'data_vxx data_vxy data_vxz data_vyx data_vyy data_vyz data_vzx data_vzy data_vzz pred_vxx pred_vxy pred_vxz pred_vyx pred_vyy pred_vyz pred_vzx pred_vzy pred_vzz') + save_txt_file(detail_file+".v.out", pv, + header = '%s: data_vxx data_vxy data_vxz data_vyx data_vyy data_vyz data_vzx data_vzy data_vzz pred_vxx pred_vxy pred_vxz pred_vyx pred_vyy pred_vyz pred_vzx pred_vzy pred_vzz' % args.system, + append = append_detail) return [l2ea, l2f, l2va], [energy.size, force.size, virial.size] diff --git a/source/train/transform.py b/source/train/transform.py index 6ae7723f7e..1d587ae531 100644 --- a/source/train/transform.py +++ b/source/train/transform.py @@ -1,6 +1,21 @@ from deepmd.env import tf import re import numpy as np + +def convertNumber(number): + binary = bin(number).replace("0b", "").zfill(16) + sign = int(binary[0]) * (-2) + 1 + exp = int(binary[1:6], 2) + frac = (int(binary[6:], 2) + 2 ** 10) * (2 ** -10) + return sign * (2 ** (exp - 15)) * frac + + +def convertMatrix(matrix, shape): + matrix = matrix.flatten() + tmp = np.array([convertNumber(matrix[i]) for i in range(len(matrix))]) + return tmp.reshape(shape) + + def transform(args): raw_graph = load_graph(args.raw_model) old_graph = load_graph(args.old_model) @@ -34,31 +49,59 @@ def transform_graph(raw_graph,old_graph): for node in raw_graph_def.node: if node.name in 
raw_graph_node.keys(): - if precision_dict[old_graph_node[node.name].dtype][1] == "float16" or precision_dict[raw_graph_node[node.name].dtype][1] == "float16": - raise RuntimeError("float16 conversions not currently supported") check_dim(raw_graph_node, old_graph_node, node.name) + tensor_shape = [dim.size for dim in raw_graph_node[node.name].tensor_shape.dim] + old_graph_dtype = precision_dict[old_graph_node[node.name].dtype] + raw_graph_dtype = precision_dict[raw_graph_node[node.name].dtype] + print("%s is passed from old graph(%s) to raw graph(%s)" % (node.name, old_graph_dtype[1],raw_graph_dtype[1])) + + if raw_graph_dtype[1] == "float16": + if old_graph_dtype[1] == "float64" or old_graph_dtype[1] == "float32": + if (len(tensor_shape) != 1) or (tensor_shape[0] != 1): + tensor_value = np.frombuffer(old_graph_node[node.name].tensor_content, dtype=old_graph_dtype[0]) + tensor_value = tensor_value.astype(np.float16) + node.attr["value"].CopyFrom(tf.AttrValue(tensor=tf.make_tensor_proto(tensor_value, tf.float16, tensor_shape))) - if re.fullmatch("final_layer_type_\d+/bias",node.name) == None: - tensor_value = np.frombuffer(old_graph_node[node.name].tensor_content,dtype = precision_dict[old_graph_node[node.name].dtype][0]) - tensor_value = tensor_value.astype(dtype=precision_dict[raw_graph_node[node.name].dtype][0]) - node.attr["value"].tensor.tensor_content = tensor_value.tostring() + else: + if old_graph_dtype[1] == "float64": + tensor_value = (np.array(old_graph_node[node.name].double_val)).astype(np.float16) + node.attr["value"].CopyFrom(tf.AttrValue(tensor=tf.make_tensor_proto(tensor_value,tf.float16, [1]))) - else: - if precision_dict[old_graph_node[node.name].dtype][1] == "float64": - tensor_value = (np.array(old_graph_node[node.name].double_val)).astype(precision_dict[raw_graph_node[node.name].dtype][0]) - node.attr["value"].CopyFrom(tf.AttrValue(tensor=tf.make_tensor_proto(tensor_value,precision_dict[raw_graph_node[node.name].dtype][0], [1]))) - - elif 
precision_dict[old_graph_node[node.name].dtype][1] == "float32": - tensor_value = (np.array(old_graph_node[node.name].float_val)).astype(precision_dict[raw_graph_node[node.name].dtype][0]) - node.attr["value"].CopyFrom(tf.AttrValue(tensor=tf.make_tensor_proto(tensor_value, precision_dict[raw_graph_node[node.name].dtype][0], [1]))) - - elif precision_dict[old_graph_node[node.name].dtype][1] == "float16": - tensor_value = (np.array(old_graph_node[node.name].half_val)).astype(precision_dict[raw_graph_node[node.name].dtype][0]) - node.attr["value"].CopyFrom(tf.AttrValue(tensor=tf.make_tensor_proto(tensor_value, precision_dict[raw_graph_node[node.name].dtype][0], [1]))) + elif old_graph_dtype[1] == "float32": + tensor_value = (np.array(old_graph_node[node.name].float_val)).astype(np.float16) + node.attr["value"].CopyFrom(tf.AttrValue(tensor=tf.make_tensor_proto(tensor_value,tf.float16, [1]))) + + elif old_graph_dtype[1] == "float16": + tensor_value = convertMatrix(np.array(old_graph_node[node.name].half_val), tensor_shape) + node.attr["value"].CopyFrom(tf.AttrValue(tensor=tf.make_tensor_proto(tensor_value, tf.float16, tensor_value.shape))) - print("%s is passed from old graph(%s) to raw graph(%s)" % (node.name,precision_dict[old_graph_node[node.name].dtype][1],precision_dict[raw_graph_node[node.name].dtype][1])) - + elif raw_graph_dtype[1] == "float64" or raw_graph_dtype[1] == "float32": + if old_graph_dtype[1] == "float64" or old_graph_dtype[1] == "float32": + if (len(tensor_shape) != 1) or (tensor_shape[0] != 1): + tensor_value = np.frombuffer(old_graph_node[node.name].tensor_content,dtype = old_graph_dtype[0]) + tensor_value = tensor_value.astype(dtype=raw_graph_dtype[0]) + node.attr["value"].tensor.tensor_content = tensor_value.tostring() + + else: + if old_graph_dtype[1] == "float64": + tensor_value = (np.array(old_graph_node[node.name].double_val)).astype(raw_graph_dtype[0]) + 
node.attr["value"].CopyFrom(tf.AttrValue(tensor=tf.make_tensor_proto(tensor_value,raw_graph_dtype[0], [1]))) + + elif old_graph_dtype[1] == "float32": + tensor_value = (np.array(old_graph_node[node.name].float_val)).astype(raw_graph_dtype[0]) + node.attr["value"].CopyFrom(tf.AttrValue(tensor=tf.make_tensor_proto(tensor_value,raw_graph_dtype[0], [1]))) + + elif old_graph_dtype[1] == "float16": + if (len(tensor_shape) != 1) or (tensor_shape[0] != 1): + tensor_value = convertMatrix(np.array(old_graph_node[node.name].half_val), tensor_shape) + tensor_value = tensor_value.astype(raw_graph_dtype[0]) + node.attr["value"].tensor.tensor_content = tensor_value.tostring() + else: + tensor_value = convertMatrix(np.array(old_graph_node[node.name].half_val), tensor_shape) + tensor_value = tensor_value.astype(raw_graph_dtype[0]) + node.attr["value"].CopyFrom(tf.AttrValue(tensor=tf.make_tensor_proto(tensor_value,raw_graph_dtype[0], tensor_value.shape))) + return raw_graph_def def check_dim(raw_graph_node, old_graph_node, node_name): @@ -77,8 +120,16 @@ def load_transform_node(graph): layer_\d+_type_\d+/matrix|\ layer_\d+_type_\d+/bias|\ layer_\d+_type_\d+/idt|\ +final_layer_type_\d+/matrix|\ +descrpt_attr/t_avg|\ +descrpt_attr/t_std|\ final_layer_type_\d+/bias|\ -final_layer_type_\d+/matrix\ +fitting_attr/t_fparam_avg|\ +fitting_attr/t_fparam_istd|\ +fitting_attr/t_aparam_avg|\ +fitting_attr/t_aparam_istd|\ +model_attr/t_tab_info|\ +model_attr/t_tab_data|\ " for node in graph.node: if re.fullmatch(transform_node_pattern,node.name) != None: