From 6688f515cb0d990e6aede4fd66e46239f956127c Mon Sep 17 00:00:00 2001 From: "tianxiang.wang@metax-tech.com" Date: Tue, 9 Sep 2025 06:35:26 +0000 Subject: [PATCH 1/2] =?UTF-8?q?Fix:=20switch=20to=20nvtx3=20headers=20when?= =?UTF-8?q?=20CUDA=5FVERSION=20>=3D=2012090=20Signed-off-by=EF=BC=9ATianxi?= =?UTF-8?q?ang=20Wang,=20Contributed=20unde?= =?UTF-8?q?r=20MetaX=20Integrated=20Circuits=20(Shanghai)=20Co.,=20Ltd.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- source/source_base/timer.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/source/source_base/timer.cpp b/source/source_base/timer.cpp index e7f22df70d..f55bd6776a 100644 --- a/source/source_base/timer.cpp +++ b/source/source_base/timer.cpp @@ -15,7 +15,11 @@ #include "source_base/formatter.h" #if defined(__CUDA) && defined(__USE_NVTX) -#include +#if CUDA_VERSION < 12090 +#include "nvToolsExt.h" +#else +#include "nvtx3/nvToolsExt.h" +#endif #include "source_io/module_parameter/parameter.h" #endif From 2a0fc700bd8b5175a082223f35b11343195ba1ec Mon Sep 17 00:00:00 2001 From: "tianxiang.wang@metax-tech.com" Date: Tue, 9 Sep 2025 06:37:29 +0000 Subject: [PATCH 2/2] =?UTF-8?q?Fix:=20resolve=20compile=20error=20with=20U?= =?UTF-8?q?SE=5FELPA=3DOFF=20+=20BUILD=5FTESTING=3DON=20Signed-off-by?= =?UTF-8?q?=EF=BC=9ATianxiang=20Wang,=20Con?= =?UTF-8?q?tributed=20under=20MetaX=20Integrated=20Circuits=20(Shanghai)?= =?UTF-8?q?=20Co.,=20Ltd.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- source/source_hsolver/test/CMakeLists.txt | 19 ++++++--- .../test/test_diago_hs_para.cpp | 42 ++++++++++++++----- 2 files changed, 45 insertions(+), 16 deletions(-) diff --git a/source/source_hsolver/test/CMakeLists.txt b/source/source_hsolver/test/CMakeLists.txt index e3fa6550fa..95b4362958 100644 --- a/source/source_hsolver/test/CMakeLists.txt +++ b/source/source_hsolver/test/CMakeLists.txt @@ -153,12 +153,19 @@ install(FILES diago_pexsi_parallel_test.sh DESTINATION ${CMAKE_CURRENT_BINARY_DI install(FILES parallel_k2d_test.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) - -AddTest( - TARGET MODULE_HSOLVER_diago_hs_parallel - LIBS parameter ${math_libs} ELPA::ELPA base device MPI::MPI_CXX genelpa psi - SOURCES test_diago_hs_para.cpp ../diag_hs_para.cpp ../diago_pxxxgvx.cpp ../diago_elpa.cpp ../diago_scalapack.cpp -) +if (USE_ELPA) + AddTest( + TARGET MODULE_HSOLVER_diago_hs_parallel + LIBS parameter ${math_libs} ELPA::ELPA base device MPI::MPI_CXX genelpa psi + SOURCES test_diago_hs_para.cpp ../diag_hs_para.cpp ../diago_pxxxgvx.cpp ../diago_elpa.cpp ../diago_scalapack.cpp + ) +else() + AddTest( + TARGET MODULE_HSOLVER_diago_hs_parallel + LIBS parameter ${math_libs} base device MPI::MPI_CXX psi + SOURCES test_diago_hs_para.cpp ../diag_hs_para.cpp ../diago_pxxxgvx.cpp ../diago_scalapack.cpp + ) +endif() AddTest( TARGET MODULE_HSOLVER_linear_trans diff --git a/source/source_hsolver/test/test_diago_hs_para.cpp b/source/source_hsolver/test/test_diago_hs_para.cpp index 425fd3b238..ad7d05c716 100644 --- a/source/source_hsolver/test/test_diago_hs_para.cpp +++ b/source/source_hsolver/test/test_diago_hs_para.cpp @@ -160,7 +160,9 @@ void test_performance(int lda, int nb, int nbands, MPI_Comm comm,int case_numb, MPI_Comm_size(comm, &nproc); std::vector h_mat, s_mat, wfc, h_psi, s_psi; +#ifdef __ELPA std::vector::type> ekb_elpa(lda); +#endif std::vector::type> ekb_scalap(lda); std::vector::type> ekb_lapack(lda); @@ -176,32 +178,36 @@ void test_performance(int lda, int nb, int nbands, MPI_Comm comm,int case_numb, } // store all the times in a vector +#ifdef __ELPA std::vector time_elpa(case_numb, 0); +#endif std::vector time_scalap(case_numb, 0); std::vector time_lapack(case_numb, 0); if (my_rank == 0) { std::cout << "Random matrix "; } - for (int randomi = 0; randomi < case_numb; ++randomi) + for (int randomi = 0; randomi < case_numb; ++randomi) { - + if (my_rank == 0) { std::cout << randomi << " "; generate_random_hs(lda, randomi, h_mat, s_mat); } - + auto start = std::chrono::high_resolution_clock::now(); + auto end = std::chrono::high_resolution_clock::now(); +#ifdef __ELPA // ELPA MPI_Barrier(comm); - auto start = std::chrono::high_resolution_clock::now(); + start = std::chrono::high_resolution_clock::now(); for (int j=0;j(h_mat.data(), s_mat.data(), lda, nbands,ekb_elpa.data(), wfc.data(), comm, 1, nb); MPI_Barrier(comm); } MPI_Barrier(comm); - auto end = std::chrono::high_resolution_clock::now(); + end = std::chrono::high_resolution_clock::now(); time_elpa[randomi] = std::chrono::duration_cast(end - start).count(); - +#endif // scalapack start = std::chrono::high_resolution_clock::now(); @@ -215,8 +221,8 @@ void test_performance(int lda, int nb, int nbands, MPI_Comm comm,int case_numb, time_scalap[randomi] = std::chrono::duration_cast(end - start).count(); //LApack - if (my_rank == 0) - { + if (my_rank == 0) + { std::vector h_tmp, s_tmp; start = std::chrono::high_resolution_clock::now(); base_device::DEVICE_CPU* ctx = {}; @@ -239,26 +245,34 @@ void test_performance(int lda, int nb, int nbands, MPI_Comm comm,int case_numb, //COMPARE EKB for (int i = 0; i < nbands; ++i) { - typename GetTypeReal::type diff_elpa_lapack = std::abs(ekb_elpa[i] - ekb_lapack[i]); typename GetTypeReal::type diff_scalap_lapack = std::abs(ekb_scalap[i] - ekb_lapack[i]); +#ifdef __ELPA + typename GetTypeReal::type diff_elpa_lapack = std::abs(ekb_elpa[i] - ekb_lapack[i]); if (diff_elpa_lapack > 1e-6 || diff_scalap_lapack > 1e-6) +#else + if (diff_scalap_lapack > 1e-6) +#endif { +#ifdef __ELPA std::cout << "eigenvalue " << i << " by ELPA: " << ekb_elpa[i] << std::endl; +#endif std::cout << "eigenvalue " << i << " by Scalapack: " << ekb_scalap[i] << std::endl; std::cout << "eigenvalue " << i << " by Lapack: " << ekb_lapack[i] << std::endl; } } } - MPI_Barrier(comm); + MPI_Barrier(comm); } if (my_rank == 0) { +#ifdef __ELPA std::cout << "\nELPA Time : "; for (int i=0; i < case_numb;i++) {std::cout << time_elpa[i] << " ";} std::cout << std::endl; +#endif std::cout << "scalapack Time: "; for (int i=0; i < case_numb;i++) @@ -271,21 +285,29 @@ void test_performance(int lda, int nb, int nbands, MPI_Comm comm,int case_numb, std::cout << std::endl; // print out the average time and speedup +#ifdef __ELPA double avg_time_elpa = 0; +#endif double avg_time_scalap = 0; double avg_time_lapack = 0; for (int i=0; i < case_numb;i++) { +#ifdef __ELPA avg_time_elpa += time_elpa[i]; +#endif avg_time_scalap += time_scalap[i]; avg_time_lapack += time_lapack[i]; } +#ifdef __ELPA avg_time_elpa /= case_numb; +#endif avg_time_scalap /= case_numb; avg_time_lapack /= case_numb; std::cout << "Average Lapack Time : " << avg_time_lapack << " ms" << std::endl; +#ifdef __ELPA std::cout << "Average ELPA Time : " << avg_time_elpa << " ms, Speedup: " << avg_time_lapack / avg_time_elpa << std::endl; +#endif std::cout << "Average Scalapack Time: " << avg_time_scalap << " ms, Speedup: " << avg_time_lapack / avg_time_scalap << std::endl; } }