diff --git a/source/source_base/module_device/cuda_compat.h b/source/source_base/module_device/cuda_compat.h
new file mode 100644
index 0000000000..78c0f6420c
--- /dev/null
+++ b/source/source_base/module_device/cuda_compat.h
@@ -0,0 +1,34 @@
+/**
+ * @file cuda_compat.h
+ * @brief Compatibility layer for CUDA and NVTX headers across different CUDA Toolkit versions.
+ *
+ * This header abstracts the differences in NVTX (NVIDIA Tools Extension) header locations
+ * between CUDA Toolkit versions.
+ *
+ * @note Depends on the CUDA_VERSION macro defined in <cuda.h>.
+ *
+ */
+
+#ifndef CUDA_COMPAT_H_
+#define CUDA_COMPAT_H_
+
+#include <cuda.h> // defines CUDA_VERSION
+
+// NVTX header for CUDA versions prior to 12.9 vs. 12.9+
+// This block ensures the correct NVTX header path is used based on CUDA_VERSION.
+// - For CUDA Toolkit < 12.9, the legacy header "nvToolsExt.h" is included.
+// - For CUDA Toolkit >= 12.9, the modern header "nvtx3/nvToolsExt.h" is included,
+//   and NVTX v2 is removed from 12.9.
+// This allows NVTX profiling APIs (e.g. nvtxRangePush) to be used consistently
+// across different CUDA versions.
+// See: +// https://docs.nvidia.com/cuda/archive/12.9.0/cuda-toolkit-release-notes/index.html#id4 +#if defined(__CUDA) && defined(__USE_NVTX) +#if CUDA_VERSION < 12090 + #include "nvToolsExt.h" +#else + #include "nvtx3/nvToolsExt.h" +#endif +#endif + +#endif // CUDA_COMPAT_H_ diff --git a/source/source_base/timer.cpp b/source/source_base/timer.cpp index f55bd6776a..c8cd3bc6ac 100644 --- a/source/source_base/timer.cpp +++ b/source/source_base/timer.cpp @@ -15,11 +15,7 @@ #include "source_base/formatter.h" #if defined(__CUDA) && defined(__USE_NVTX) -#if CUDA_VERSION < 12090 -#include "nvToolsExt.h" -#else -#include "nvtx3/nvToolsExt.h" -#endif +#include "source_base/module_device/cuda_compat.h" #include "source_io/module_parameter/parameter.h" #endif diff --git a/source/source_hsolver/kernels/cuda/diag_cusolver.cuh b/source/source_hsolver/kernels/cuda/diag_cusolver.cuh index faf4ec0a09..e47d43be50 100644 --- a/source/source_hsolver/kernels/cuda/diag_cusolver.cuh +++ b/source/source_hsolver/kernels/cuda/diag_cusolver.cuh @@ -3,12 +3,6 @@ #include #include -#if CUDA_VERSION < 12090 -#include "nvToolsExt.h" -#else -#include "nvtx3/nvToolsExt.h" -#endif - #include #include @@ -39,7 +33,7 @@ class Diag_Cusolver_gvd{ double *d_A = nullptr; double *d_B = nullptr; double *d_work = nullptr; - + cuDoubleComplex *d_A2 = nullptr; cuDoubleComplex *d_B2 = nullptr; cuDoubleComplex *d_work2 = nullptr; @@ -54,7 +48,7 @@ class Diag_Cusolver_gvd{ // - init_double : initializing relevant double type data structures and gpu apis' handle and memory // - init_complex : initializing relevant complex type data structures and gpu apis' handle and memory // Input Parameters -// N: the dimension of the matrix +// N: the dimension of the matrix void init_double(int N); void init_complex(int N); @@ -70,17 +64,17 @@ public: // - Dngvd_double : dense double type matrix // - Dngvd_complex : dense complex type matrix // Input Parameters -// N: the number of rows of the matrix -// M: the number of 
cols of the matrix -// A: the hermitian matrix A in A x=lambda B (column major) -// B: the SPD matrix B in A x=lambda B (column major) +// N: the number of rows of the matrix +// M: the number of cols of the matrix +// A: the hermitian matrix A in A x=lambda B (column major) +// B: the SPD matrix B in A x=lambda B (column major) // Output Parameter // W: generalized eigenvalues // V: generalized eigenvectors (column major) void Dngvd_double(int N, int M, double *A, double *B, double *W, double *V); void Dngvd_complex(int N, int M, std::complex *A, std::complex *B, double *W, std::complex *V); - + void Dngvd(int N, int M, double *A, double *B, double *W, double *V) { return Dngvd_double(N, M, A, B, W, V);