From 81b7fb7272f84f819520f08fb1e8210195c39a9d Mon Sep 17 00:00:00 2001 From: "tianxiang.wang@metax-tech.com" Date: Mon, 8 Sep 2025 09:44:52 +0000 Subject: [PATCH 1/3] =?UTF-8?q?Feature:=20support=20NVTX=20profiling=20via?= =?UTF-8?q?=20timer=5Fenable=5Fnvtx=20flag=20Signed-off-by=EF=BC=9ATianxia?= =?UTF-8?q?ng=20Wang,=20Contributed=20under?= =?UTF-8?q?=20MetaX=20Integrated=20Circuits=20(Shanghai)=20Co.,=20Ltd.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/advanced/input_files/input-main.md | 9 +++++ source/source_base/timer.cpp | 38 +++++++++++++------ .../module_parameter/input_parameter.h | 21 +++++----- source/source_io/read_input_item_system.cpp | 6 +++ 4 files changed, 53 insertions(+), 21 deletions(-) diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md index db4ff3b4f4..91f1f1f770 100644 --- a/docs/advanced/input_files/input-main.md +++ b/docs/advanced/input_files/input-main.md @@ -706,6 +706,15 @@ If only one value is set (such as `kspacing 0.5`), then kspacing values of a/b/c - double: double precision - **Default**: double +### timer_enable_nvtx + +- **Type**: Boolean +- **Description**: + + - True: Enable NVTX profiling labels in the timer. + - False: Disable NVTX profiling labels in the timer. +- **Default**: False + ### nb2d - **Type**: Integer diff --git a/source/source_base/timer.cpp b/source/source_base/timer.cpp index f7a4be636d..9cd05d74c0 100644 --- a/source/source_base/timer.cpp +++ b/source/source_base/timer.cpp @@ -14,6 +14,11 @@ #include "chrono" #include "source_base/formatter.h" +#ifdef __CUDA +#include +#include "source_io/module_parameter/parameter.h" +#endif + namespace ModuleBase { @@ -93,6 +98,12 @@ void timer::tick(const std::string &class_name,const std::string &name) #endif ++timer_one.calls; timer_one.start_flag = false; +#ifdef __CUDA + if (PARAM.inp.timer_enable_nvtx){ + std::string label = class_name + ":" + name; + nvtxRangePushA(label.data()); + } +#endif } else { @@ -107,6 +118,11 @@ void timer::tick(const std::string &class_name,const std::string &name) timer_one.cpu_second += (cpu_time() - timer_one.cpu_start); #endif timer_one.start_flag = true; +#ifdef __CUDA + if (PARAM.inp.timer_enable_nvtx){ + nvtxRangePop(); + } +#endif } } // end if(!omp_get_thread_num()) } @@ -128,7 +144,7 @@ void timer::write_to_json(std::string file_name) int is_initialized = 0; MPI_Initialized(&is_initialized); if (!is_initialized) { - return; + return; } int my_rank = 0; MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); @@ -195,12 +211,12 @@ void timer::write_to_json(std::string file_name) const Timer_One timer_one = timer_pool_B.second; ofs << indent << indent << indent << indent << "{\n"; ofs << indent << indent << indent << indent << "\"name\": \"" << name << "\",\n"; - ofs << indent << indent << indent << indent << "\"cpu_second\": " + ofs << indent << indent << indent << indent << "\"cpu_second\": " << std::setprecision(15) << timer_one.cpu_second << ",\n"; ofs << indent << indent << indent << indent << "\"calls\": " << timer_one.calls << ",\n"; - ofs << indent << indent << indent << indent << "\"cpu_second_per_call\": " + ofs << indent << indent << indent << indent << "\"cpu_second_per_call\": " << double_to_string(timer_one.cpu_second/timer_one.calls) << ",\n"; - ofs << indent << indent << indent << indent << "\"cpu_second_per_total\": " + ofs << indent << indent << indent << indent << "\"cpu_second_per_total\": " << double_to_string(timer_one.cpu_second/timer_pool[""]["total"].cpu_second) << "\n"; if (order_b == timer_pool_A.second.size()) @@ -283,11 +299,11 @@ void timer::print_all(std::ofstream &ofs) // if the total time is too small, we do not calculate the percentage - if (timer_pool_order[0].second.cpu_second < 1e-9) + if (timer_pool_order[0].second.cpu_second < 1e-9) { pers.push_back(0); - } - else + } + else { pers.push_back(percentage); } @@ -300,10 +316,10 @@ void timer::print_all(std::ofstream &ofs) std::vector titles = {"CLASS_NAME", "NAME", "TIME/s", "CALLS", "AVG/s", "PER/%"}; std::vector formats = {"%-10s", "%-10s", "%6.2f", "%8d", "%6.2f", "%6.2f"}; - FmtTable time_statistics(/*titles=*/titles, - /*nrows=*/pers.size(), - /*formats=*/formats, - /*indent=*/0, + FmtTable time_statistics(/*titles=*/titles, + /*nrows=*/pers.size(), + /*formats=*/formats, + /*indent=*/0, /*align=*/{/*value*/FmtTable::Align::LEFT, /*title*/FmtTable::Align::CENTER}); time_statistics << class_names << names << times << calls << avgs << pers; const std::string table = "\nTIME STATISTICS\n" + time_statistics.str(); diff --git a/source/source_io/module_parameter/input_parameter.h b/source/source_io/module_parameter/input_parameter.h index 65a82ac2ac..db2b5d5a72 100644 --- a/source/source_io/module_parameter/input_parameter.h +++ b/source/source_io/module_parameter/input_parameter.h @@ -67,6 +67,7 @@ struct Input_para std::string device = "auto"; std::string precision = "double"; + bool timer_enable_nvtx = false; // ============== #Parameters (2.Electronic structure) =========================== std::string ks_solver = "default"; ///< xiaohui add 2013-09-01 @@ -375,7 +376,7 @@ struct Input_para bool out_proj_band = false; ///< projected band structure calculation jiyy add 2022-05-11 std::string out_level = "ie"; ///< control the output information. std::vector out_dmr = {0, 8}; ///< output density matrix in real space DM(R) - std::vector out_dmk = {0, 8}; ///< output density matrix in reciprocal space DM(k) + std::vector out_dmk = {0, 8}; ///< output density matrix in reciprocal space DM(k) bool out_bandgap = false; ///< QO added for bandgap printing std::vector out_mat_hs = {0, 8}; ///< output H matrix and S matrix in local basis. std::vector out_mat_tk = {0, 8}; ///< output T(k) matrix in local basis. @@ -659,29 +660,29 @@ struct Input_para * the following two sets of parameters are for the XC parameterization. * The first element should be the LibXC id, to assign the analytical * form of the eXchange and Correlation part of the functional. - * + * * Starting from the second parameter, the parameters are the coefficients * of the functional. For example the M06-L functional, one should refer * to the source file (source code of LibXC) - * + * * src/mgga_x_m06l.c - * + * * the implementation can be found in the file - * + * * src/maple2c/mgga_exc/mgga_x_m06l.c. - * + * * There are 18 parameters for the exchange part, so the whole length of * the xc_exch_ext should be 19. (MGGA_X_M06L, id = 203) - * + * * Likewise, the correlation part can be found in corresponding files. - * + * * PBE functional is used as the default functional for XCPNet. */ // src/gga_x_pbe.c std::vector xc_exch_ext = { - 101, 0.8040, 0.2195149727645171}; + 101, 0.8040, 0.2195149727645171}; // src/gga_c_pbe.c std::vector xc_corr_ext = { - 130, 0.06672455060314922, 0.031090690869654895034, 1.00000}; + 130, 0.06672455060314922, 0.031090690869654895034, 1.00000}; }; #endif diff --git a/source/source_io/read_input_item_system.cpp b/source/source_io/read_input_item_system.cpp index 2eb2234d9f..c9854c9ece 100644 --- a/source/source_io/read_input_item_system.cpp +++ b/source/source_io/read_input_item_system.cpp @@ -830,6 +830,12 @@ void ReadInput::item_system() }; this->add_item(item); } + { + Input_Item item("timer_enable_nvtx"); + item.annotation = "enable NVTX labeling for profiling or not"; + read_sync_bool(input.timer_enable_nvtx); + this->add_item(item); + } } } // namespace ModuleIO From fd5c37c3c1bdcff3fdd13763555bf9bf8058aad0 Mon Sep 17 00:00:00 2001 From: "tianxiang.wang@metax-tech.com" Date: Mon, 8 Sep 2025 10:17:48 +0000 Subject: [PATCH 2/3] =?UTF-8?q?Add=20timer=5Fenable=5Fnvtx=20section=20in?= =?UTF-8?q?=20markdown=20Signed-off-by=EF=BC=9ATianxiang=20Wang,=20Contributed=20under=20MetaX=20Integrate?= =?UTF-8?q?d=20Circuits=20(Shanghai)=20Co.,=20Ltd.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/advanced/input_files/input-main.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md index 91f1f1f770..0bf425ffe6 100644 --- a/docs/advanced/input_files/input-main.md +++ b/docs/advanced/input_files/input-main.md @@ -22,6 +22,7 @@ - [min\_dist\_coef](#min_dist_coef) - [device](#device) - [precision](#precision) + - [timer_enable_nvtx](#timer_enable_nvtx) - [nb2d](#nb2d) - [Input Files](#variables-related-to-input-files) - [stru\_file](#stru_file) From 933b8e337526f695e24aa644366417f7d0af189c Mon Sep 17 00:00:00 2001 From: "tianxiang.wang@metax-tech.com" Date: Tue, 9 Sep 2025 02:47:53 +0000 Subject: [PATCH 3/3] =?UTF-8?q?Fix:=20Use=20=5F=5FUSE=5FNVTX=20macro=20to?= =?UTF-8?q?=20avoid=20NVTX=20linking=20errors=20in=20tests.=20Clarify=20in?= =?UTF-8?q?=20docs=20that=20timer=5Fenable=5Fnvtx=20parameter=20only=20tak?= =?UTF-8?q?es=20effect=20on=20CUDA=20platforms.=20Signed-off-by=EF=BC=9ATi?= =?UTF-8?q?anxiang=20Wang,=20Contributed=20?= =?UTF-8?q?under=20MetaX=20Integrated=20Circuits=20(Shanghai)=20Co.,=20Ltd?= =?UTF-8?q?.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CMakeLists.txt | 7 ++++--- docs/advanced/input_files/input-main.md | 2 +- source/source_base/timer.cpp | 6 +++--- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 277f1924ec..c0c7b83bf8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -287,7 +287,7 @@ if (USE_SW) set(SW ON) include_directories(${SW_MATH}/include) include_directories(${SW_FFT}/include) - + target_link_libraries(${ABACUS_BIN_NAME} ${SW_FFT}/lib/libfftw3.a) target_link_libraries(${ABACUS_BIN_NAME} ${SW_MATH}/libswfft.a) target_link_libraries(${ABACUS_BIN_NAME} ${SW_MATH}/libswscalapack.a) @@ -373,6 +373,7 @@ if(USE_CUDA) if(USE_CUDA) add_compile_definitions(__CUDA) add_compile_definitions(__UT_USE_CUDA) + target_compile_definitions(${ABACUS_BIN_NAME} PRIVATE __USE_NVTX) if (CMAKE_BUILD_TYPE STREQUAL "Debug") set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G" CACHE STRING "CUDA flags for debug build" FORCE) endif() @@ -520,7 +521,7 @@ if(ENABLE_MLALGO) include_directories(${libnpy_INCLUDE_DIR}) endif() include_directories(${libnpy_SOURCE_DIR}/include) - + add_compile_definitions(__MLALGO) endif() @@ -560,7 +561,7 @@ if (ENABLE_CNPY) include_directories(${cnpy_INCLUDE_DIR}) endif() include_directories(${cnpy_SOURCE_DIR}) - + # find ZLIB and link find_package(ZLIB REQUIRED) target_link_libraries(${ABACUS_BIN_NAME} cnpy ZLIB::ZLIB) diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md index 0bf425ffe6..a2024dec66 100644 --- a/docs/advanced/input_files/input-main.md +++ b/docs/advanced/input_files/input-main.md @@ -710,7 +710,7 @@ If only one value is set (such as `kspacing 0.5`), then kspacing values of a/b/c ### timer_enable_nvtx - **Type**: Boolean -- **Description**: +- **Description**: Controls whether NVTX profiling labels are emitted by the timer. This feature is only effective on CUDA platforms. - True: Enable NVTX profiling labels in the timer. - False: Disable NVTX profiling labels in the timer. diff --git a/source/source_base/timer.cpp b/source/source_base/timer.cpp index 9cd05d74c0..e7f22df70d 100644 --- a/source/source_base/timer.cpp +++ b/source/source_base/timer.cpp @@ -14,7 +14,7 @@ #include "chrono" #include "source_base/formatter.h" -#ifdef __CUDA +#if defined(__CUDA) && defined(__USE_NVTX) #include #include "source_io/module_parameter/parameter.h" #endif @@ -98,7 +98,7 @@ void timer::tick(const std::string &class_name,const std::string &name) #endif ++timer_one.calls; timer_one.start_flag = false; -#ifdef __CUDA +#if defined(__CUDA) && defined(__USE_NVTX) if (PARAM.inp.timer_enable_nvtx){ std::string label = class_name + ":" + name; nvtxRangePushA(label.data()); @@ -118,7 +118,7 @@ void timer::tick(const std::string &class_name,const std::string &name) timer_one.cpu_second += (cpu_time() - timer_one.cpu_start); #endif timer_one.start_flag = true; -#ifdef __CUDA +#if defined(__CUDA) && defined(__USE_NVTX) if (PARAM.inp.timer_enable_nvtx){ nvtxRangePop(); }