deepmodeling · mohanchen · Dec 11, 2025 · Dec 9, 2025 · Dec 9, 2025
diff --git a/source/source_base/module_device/cuda_compat.h b/source/source_base/module_device/cuda_compat.h
@@ -0,0 +1,34 @@
+/**
+ * @file cuda_compat.h
+ * @brief Compatibility layer for CUDA and NVTX headers across different CUDA Toolkit versions.
+ *
+ * This header abstracts the differences in NVTX (NVIDIA Tools Extension) header locations
+ * between CUDA Toolkit versions.
+ *
+ * @note Depends on the CUDA_VERSION macro defined in <cuda.h>.
+ *
+ */
+
+#ifndef CUDA_COMPAT_H_
+#define CUDA_COMPAT_H_
+
+#include <cuda.h> // defines CUDA_VERSION
+
+// NVTX header selection (only active when both __CUDA and __USE_NVTX are defined).
+// The NVTX header path depends on the CUDA Toolkit version reported by CUDA_VERSION:
+// - For CUDA Toolkit < 12.9, the legacy header "nvToolsExt.h" is included.
+// - For CUDA Toolkit >= 12.9, the NVTX v3 header "nvtx3/nvToolsExt.h" is included,
+//   because NVTX v2 was removed from the toolkit in 12.9.
+// This allows NVTX profiling APIs (e.g. nvtxRangePush) to be used consistently
+// across different CUDA versions.
+// See:
+// https://docs.nvidia.com/cuda/archive/12.9.0/cuda-toolkit-release-notes/index.html#id4
+#if defined(__CUDA) && defined(__USE_NVTX) // NVTX is included only when both macros are defined
+#if CUDA_VERSION < 12090 // 12090 encodes CUDA Toolkit 12.9
+    #include "nvToolsExt.h" // legacy NVTX header location
+#else
+    #include "nvtx3/nvToolsExt.h" // NVTX v3 header location
+#endif // CUDA_VERSION < 12090
+#endif // defined(__CUDA) && defined(__USE_NVTX)
+
+#endif // CUDA_COMPAT_H_
diff --git a/source/source_base/timer.cpp b/source/source_base/timer.cpp
@@ -15,11 +15,7 @@
 #include "source_base/formatter.h"
 
 #if defined(__CUDA) && defined(__USE_NVTX)
-#if CUDA_VERSION < 12090
-#include "nvToolsExt.h"
-#else
-#include "nvtx3/nvToolsExt.h"
-#endif
+#include "source_base/module_device/cuda_compat.h"
 #include "source_io/module_parameter/parameter.h"
 #endif
 

diff --git a/source/source_hsolver/kernels/cuda/diag_cusolver.cuh b/source/source_hsolver/kernels/cuda/diag_cusolver.cuh
@@ -3,12 +3,6 @@
 #include <cuda.h>
 #include <complex>
 
-#if CUDA_VERSION < 12090
-#include "nvToolsExt.h"
-#else
-#include "nvtx3/nvToolsExt.h"
-#endif
-
 #include <cuda_runtime.h>
 #include <cusolverDn.h>
 
@@ -39,7 +33,7 @@ class Diag_Cusolver_gvd{
     double *d_A = nullptr;
     double *d_B = nullptr;
     double *d_work = nullptr;
-    
+
     cuDoubleComplex *d_A2 = nullptr;
     cuDoubleComplex *d_B2 = nullptr;
     cuDoubleComplex *d_work2 = nullptr;
@@ -54,7 +48,7 @@ class Diag_Cusolver_gvd{
 //  - init_double : initializing relevant double type data structures and gpu apis' handle and memory
 //  - init_complex : initializing relevant complex type data structures and gpu apis' handle and memory
 //      Input Parameters
-//          N: the dimension of the matrix 
+//          N: the dimension of the matrix
     void init_double(int N);
     void init_complex(int N);
 
@@ -70,17 +64,17 @@ public:
 //  - Dngvd_double : dense double type matrix
 //  - Dngvd_complex : dense complex type matrix
 //      Input Parameters
-//          N: the number of rows of the matrix 
-//          M: the number of cols of the matrix  
-//          A: the hermitian matrix A in A x=lambda B (column major) 
-//          B: the SPD matrix B in A x=lambda B (column major) 
+//          N: the number of rows of the matrix
+//          M: the number of cols of the matrix
+//          A: the hermitian matrix A in A x=lambda B (column major)
+//          B: the SPD matrix B in A x=lambda B (column major)
 //      Output Parameter
 //          W: generalized eigenvalues
 //          V: generalized eigenvectors (column major)
 
     void Dngvd_double(int N, int M, double *A, double *B, double *W, double *V);
     void Dngvd_complex(int N, int M, std::complex<double> *A, std::complex<double> *B, double *W, std::complex<double> *V);
-    
+
     void Dngvd(int N, int M, double *A, double *B, double *W, double *V)
     {
         return Dngvd_double(N, M, A, B, W, V);