-
Notifications
You must be signed in to change notification settings - Fork 599
Add Errcheck after every kernel function runs And merge redundant code #855
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
70cb226
d6e9d13
8588bbb
1f0cb9a
82de35a
eea2ab1
c0f57f6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,8 +5,9 @@ | |
| #include <cuda_runtime.h> | ||
|
|
||
| #define GPU_MAX_NBOR_SIZE 4096 | ||
| #define cudaErrcheck(res) {cudaAssert((res), __FILE__, __LINE__);} | ||
| inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { | ||
| #define DPErrcheck(res) {DPAssert((res), __FILE__, __LINE__);} | ||
| inline void DPAssert(cudaError_t code, const char *file, int line, bool abort=true) | ||
| { | ||
| if (code != cudaSuccess) { | ||
| fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); | ||
| if (code == 2) { | ||
|
|
@@ -27,7 +28,8 @@ inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort= | |
| } | ||
|
|
||
| #define nborErrcheck(res) {nborAssert((res), __FILE__, __LINE__);} | ||
| inline void nborAssert(cudaError_t code, const char *file, int line, bool abort=true) { | ||
| inline void nborAssert(cudaError_t code, const char *file, int line, bool abort=true) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There's only 'nborErrcheck' that differs. Shall we output the same information as in 'DPErrcheck' when any kernel meets an error, or should we output specific information when a specific kernel meets an error (like 'illegal nbor list sorting' and so on)?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think we should use specific information for more important kernels, but not all kernels. |
||
| { | ||
| if (code != cudaSuccess) { | ||
| fprintf(stderr,"cuda assert: %s %s %d\n", "DeePMD-kit:\tillegal nbor list sorting", file, line); | ||
| if (code == 2) { | ||
|
|
@@ -65,12 +67,17 @@ static __inline__ __device__ double atomicAdd( | |
| #endif | ||
|
|
||
| namespace deepmd { | ||
|
|
||
| inline void DPGetDeviceCount(int &gpu_num) { cudaGetDeviceCount(&gpu_num) ;} | ||
|
|
||
| inline cudaError_t DPSetDevice(int rank) { return cudaSetDevice(rank); } | ||
|
|
||
| template <typename FPTYPE> | ||
| void memcpy_host_to_device( | ||
| FPTYPE * device, | ||
| const std::vector<FPTYPE> &host) | ||
| { | ||
| cudaErrcheck(cudaMemcpy(device, &host[0], sizeof(FPTYPE) * host.size(), cudaMemcpyHostToDevice)); | ||
| DPErrcheck(cudaMemcpy(device, &host[0], sizeof(FPTYPE) * host.size(), cudaMemcpyHostToDevice)); | ||
| } | ||
|
|
||
| template <typename FPTYPE> | ||
|
|
@@ -79,15 +86,15 @@ void memcpy_host_to_device( | |
| const FPTYPE * host, | ||
| const int size) | ||
| { | ||
| cudaErrcheck(cudaMemcpy(device, host, sizeof(FPTYPE) * size, cudaMemcpyHostToDevice)); | ||
| DPErrcheck(cudaMemcpy(device, host, sizeof(FPTYPE) * size, cudaMemcpyHostToDevice)); | ||
| } | ||
|
|
||
| template <typename FPTYPE> | ||
| void memcpy_device_to_host( | ||
| const FPTYPE * device, | ||
| std::vector<FPTYPE> &host) | ||
| { | ||
| cudaErrcheck(cudaMemcpy(&host[0], device, sizeof(FPTYPE) * host.size(), cudaMemcpyDeviceToHost)); | ||
| DPErrcheck(cudaMemcpy(&host[0], device, sizeof(FPTYPE) * host.size(), cudaMemcpyDeviceToHost)); | ||
| } | ||
|
|
||
| template <typename FPTYPE> | ||
|
|
@@ -96,31 +103,31 @@ void memcpy_device_to_host( | |
| FPTYPE * host, | ||
| const int size) | ||
| { | ||
| cudaErrcheck(cudaMemcpy(host, device, sizeof(FPTYPE) * size, cudaMemcpyDeviceToHost)); | ||
| DPErrcheck(cudaMemcpy(host, device, sizeof(FPTYPE) * size, cudaMemcpyDeviceToHost)); | ||
| } | ||
|
|
||
| template <typename FPTYPE> | ||
| void malloc_device_memory( | ||
| FPTYPE * &device, | ||
| const std::vector<FPTYPE> &host) | ||
| { | ||
| cudaErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * host.size())); | ||
| DPErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * host.size())); | ||
| } | ||
|
|
||
| template <typename FPTYPE> | ||
| void malloc_device_memory( | ||
| FPTYPE * &device, | ||
| const int size) | ||
| { | ||
| cudaErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * size)); | ||
| DPErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * size)); | ||
| } | ||
|
|
||
| template <typename FPTYPE> | ||
| void malloc_device_memory_sync( | ||
| FPTYPE * &device, | ||
| const std::vector<FPTYPE> &host) | ||
| { | ||
| cudaErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * host.size())); | ||
| DPErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * host.size())); | ||
| memcpy_host_to_device(device, host); | ||
| } | ||
|
|
||
|
|
@@ -130,7 +137,7 @@ void malloc_device_memory_sync( | |
| const FPTYPE * host, | ||
| const int size) | ||
| { | ||
| cudaErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * size)); | ||
| DPErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * size)); | ||
| memcpy_host_to_device(device, host, size); | ||
| } | ||
|
|
||
|
|
@@ -139,7 +146,7 @@ void delete_device_memory( | |
| FPTYPE * &device) | ||
| { | ||
| if (device != NULL) { | ||
| cudaErrcheck(cudaFree(device)); | ||
| DPErrcheck(cudaFree(device)); | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -149,6 +156,6 @@ void memset_device_memory( | |
| const FPTYPE var, | ||
| const int size) | ||
| { | ||
| cudaErrcheck(cudaMemset(device, var, sizeof(FPTYPE) * size)); | ||
| DPErrcheck(cudaMemset(device, var, sizeof(FPTYPE) * size)); | ||
| } | ||
| } // end of namespace deepmd | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I feel that we could define
`GOOGLE_CUDA || TENSORFLOW_USE_ROCM` in another place (for example `USE_DEVICE`), so when we add more devices, we do not need to modify these conditions. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
May not be a good idea, because we are supporting more accelerators (devices) whose name may not be "gpu".